Example #1
def gen_episode_video(models_path, map_type, n_agents, video_path):
    os.makedirs(video_path + "/imgs", exist_ok=True)

    STEPS = 1000

    selected_map = MAP[map_type]
    AGENT_VIEW_RANGE = 5

    # Create environment
    env = HarvestCommonsEnv(ascii_map=selected_map, num_agents=n_agents, render=True,
                            agent_view_range=AGENT_VIEW_RANGE)

    # Instantiate DDQN agent models
    ddqn_models = {}
    for agent_id, agent in env.agents.items():
        obs_shape = (2 * AGENT_VIEW_RANGE + 1, 2 * AGENT_VIEW_RANGE + 1, 3)
        model_path = os.path.join(models_path, agent_id)
        ddqn_models[agent_id] = DDQNAgent.from_trained_policy(path_to_model=model_path, obs_shape=obs_shape,
                                                              env=env)

    obs = env.reset()

    # Scale initial observations to [0, 1] float images
    for agent_id in env.agents.keys():
        # Convert observation image from uint8 to float32
        obs[agent_id] = (obs[agent_id] / 255.).astype(np.float32)

    for t in tqdm(range(1, STEPS), desc="Steps", position=0, leave=True):
        # Select agent actions to take
        actions = {}
        for agent_id, agent in env.agents.items():
            # Follow policy using Q(s,a) function approximator
            best_action, q_values = ddqn_models[agent_id].model.action_value(tf.expand_dims(obs[agent_id], axis=0))
            actions[agent_id] = ddqn_models[agent_id].get_action(best_action)

        # Apply the agents' actions to the environment
        next_obs, rewards, dones, info = env.step(actions)

        # Update current state observations
        obs = next_obs

        # Update the environment social metrics
        env.update_social_metrics(rewards)
        env.render(video_path + "/imgs/t=%04d.png" % t, title="t=%04d" % t)

    # Make video of episode
    utility_funcs.make_video_from_image_dir(vid_path=video_path,
                                            img_folder=video_path + "/imgs",
                                            video_name="learned_policy",
                                            fps=10)
    # Delete images
    shutil.rmtree(video_path + "/imgs", ignore_errors=True)
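For reference, a hypothetical call to gen_episode_video; the paths, map name, and agent count below are placeholders rather than values from the original project:

# Hypothetical usage: render one episode with previously trained policies
gen_episode_video(models_path="logs/<run_dir>/model/episode=0100",
                  map_type="small", n_agents=4,
                  video_path="videos/harvest_small_4agents")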
Example #2
def init_ddqn_agent(input_dims: int, start_state: Tuple[List[int], bool], maze_dim: int) -> DDQNAgent:
    """
    Initialize a double deep Q-learning agent.

    Parameters
    ----------
    input_dims : int
        Dimensionality of the state space (2 for a 2D maze, 3 for a 3D maze).
    start_state : Tuple[List[int], bool]
        Fixed start state, or None to start from a random state.
    maze_dim : int
        Maze size, used to name the saved model file.

    Returns
    -------
    DDQNAgent
        The initialized double DQN agent; its Q-networks are Keras sequential
        models (deep neural networks).
    """
    if start_state is not None:
        start_s = 'fixed'
    else:
        start_s = 'random'
    if input_dims == 2:
        ddqn_agent = DDQNAgent(alpha=0.0005,
                               gamma=0.99,
                               n_actions=4,
                               epsilon=1.0,
                               batch_size=64,
                               input_dims=input_dims,
                               fname=f'ddqn_model_2D_{maze_dim}_{start_s}.h5')

    elif input_dims == 3:  # init ddqn_agent in 3D with the x and y weights of the 2D agent
        ddqn_agent2D = keras.models.load_model(f'ddqn_model_2D_{maze_dim}_{start_s}.h5')

        ddqn_agent = DDQNAgent(alpha=0.0005,
                               gamma=0.99,
                               n_actions=6,
                               epsilon=1.0,
                               batch_size=64,
                               input_dims=input_dims,
                               fname=f'ddqn_model_3D_{maze_dim}_{start_s}.h5')

        # Initialize the 3D agent's networks with the weights of the agent trained in 2D
        ddqn_agent.q_eval = set_weights_3D_2D(ddqn_agent2D, ddqn_agent.q_eval)
        ddqn_agent.q_target = set_weights_3D_2D(ddqn_agent2D, ddqn_agent.q_target)

    else:
        raise ValueError(f"input_dims must be 2 or 3, got {input_dims}")

    return ddqn_agent
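The helper set_weights_3D_2D is referenced but not shown in this listing. Below is a minimal sketch of what such a transfer helper could look like, assuming both Q-networks are Keras Sequential models built only from Dense layers; only the weight slices present in both networks are copied, so extra input dimensions and extra actions keep their fresh initialization:

def set_weights_3D_2D(model_2d, model_3d):
    """Copy overlapping weight slices from a trained 2D-input network into a
    3D-input network (hypothetical sketch; the real helper is not shown)."""
    for layer_2d, layer_3d in zip(model_2d.layers, model_3d.layers):
        kernel_2d, bias_2d = layer_2d.get_weights()
        kernel_3d, bias_3d = layer_3d.get_weights()
        # Copy only the kernel/bias slice that exists in both layers
        rows = min(kernel_2d.shape[0], kernel_3d.shape[0])
        cols = min(kernel_2d.shape[1], kernel_3d.shape[1])
        kernel_3d[:rows, :cols] = kernel_2d[:rows, :cols]
        n_bias = min(bias_2d.shape[0], bias_3d.shape[0])
        bias_3d[:n_bias] = bias_2d[:n_bias]
        layer_3d.set_weights([kernel_3d, bias_3d])
    return model_3d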
Example #3
# Main method
if __name__ == '__main__':

    # Initialise
    env = make_env('PongNoFrameskip-v4')
    best_score = -np.inf
    load = False
    n_games = 500
    agent = DDQNAgent(gamma=0.99,
                      epsilon=1.0,
                      lr=0.0001,
                      input_dims=(env.observation_space.shape),
                      n_actions=env.action_space.n,
                      mem_size=50000,
                      eps_min=0.1,
                      batch_size=32,
                      replace=1000,
                      eps_dec=1e-5,
                      model_dir='models/',
                      algo='DDQN',
                      env_name='PongNoFrameskip-v4')

    if load:
        agent.load_models()

    file_name = agent.algo + '_' + agent.env_name + '_lr' + str(
        agent.lr) + '_' + str(n_games) + 'games'
    plot_file = 'plots/' + file_name + '.png'

    n_steps = 0
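    # --- Hypothetical continuation (the listing above is truncated) ---
    # A minimal sketch of how this style of training loop typically proceeds.
    # The method/attribute names below (choose_action, store_transition, learn,
    # save_models, epsilon) and the plotting helper are assumptions, not taken
    # from the original script.
    scores, eps_history, steps_array = [], [], []
    for i in range(n_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            if not load:
                agent.store_transition(observation, action, reward, observation_, done)
                agent.learn()
            observation = observation_
            n_steps += 1
        scores.append(score)
        steps_array.append(n_steps)
        eps_history.append(agent.epsilon)

        avg_score = np.mean(scores[-100:])
        if avg_score > best_score:
            if not load:
                agent.save_models()
            best_score = avg_score

    # plot_learning_curve(steps_array, scores, eps_history, plot_file)  # assumed helper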
Example #4
def train_agents(n_agents=4, map_type="small", logs_path="logs", n_episodes=EPISODES, n_steps=STEPS,
                 batch_size=BATCH_SIZE, lr=0.0015, gamma=0.99, epsilon=0.10, epsilon_decay=0.995, log=True):

    # Configure experiment logs
    logdir = logs_path + "/MAP=%s-AGENTS=%d-lr=%.5f-e=%.2f-ed=%.3f-g=%.2f-b=%d" % (map_type, n_agents, lr, epsilon,
                                                                                   epsilon_decay, gamma, batch_size)
    os.makedirs(logdir, exist_ok=True)
    # sys.stdout = open(os.path.join(logdir, "console_output.out"), "w+")

    social_metrics_writer = tf.summary.create_file_writer(logdir + "/social_metrics")

    env = HarvestCommonsEnv(ascii_map=MAP[map_type], num_agents=n_agents, render=True,
                            agent_view_range=AGENT_VIEW_RANGE)
    obs = env.reset()

    # Instantiate DDQN agent models
    ddqn_models = {}
    for agent_id, agent in env.agents.items():
        obs_shape = (2 * AGENT_VIEW_RANGE + 1, 2 * AGENT_VIEW_RANGE + 1, 3)
        q_net_model = DeepQNet(num_actions=env.action_space.n, input_shape=obs_shape,
                               kernel_initializer=KERNEL_INITIALIZER)
        q_net_target = DeepQNet(num_actions=env.action_space.n, input_shape=obs_shape,
                                kernel_initializer=KERNEL_INITIALIZER)
        ddqn_models[agent_id] = DDQNAgent(model=q_net_model, target_model=q_net_target, obs_shape=obs_shape,
                                          env=env, buffer_size=REPLAY_BUFFER_SIZE, learning_rate=lr, epsilon=epsilon,
                                          epsilon_decay=epsilon_decay, gamma=gamma, batch_size=batch_size)

    for episode in range(n_episodes + 1):
        start_t = time.time()
        print("- A:%d Episode %d" % (n_agents, episode))
        episode_path = logdir + "/episodes/episode=%04d" % episode
        models_path = logdir + "/model/episode=%04d" % episode
        if episode % EPISODE_RECORD_FREQ == 0:
            os.makedirs(episode_path, exist_ok=True)

        obs = env.reset()

        # Scale initial observations to [0, 1] float images
        for agent_id in env.agents.keys():
            # Convert observation image from uint8 to float32
            obs[agent_id] = (obs[agent_id] / 255.).astype(np.float32)
            # Reset replay buffers
            ddqn_models[agent_id].reset_replay_buffer()
            ddqn_models[agent_id].e_decay()

        for t in tqdm(range(1, n_steps), position=0, leave=True):
            # Select agent actions to take
            actions = {}
            for agent_id, agent in env.agents.items():
                # Follow an ε-greedy policy using the Q(s, a) function approximator
                best_action, q_values = ddqn_models[agent_id].model.action_value(tf.expand_dims(obs[agent_id], axis=0))
                actions[agent_id] = ddqn_models[agent_id].get_action(best_action)
            # Apply the agents' actions to the environment
            next_obs, rewards, dones, info = env.step(actions)
            # Store the transition in each agent's replay buffer
            for agent_id in env.agents.keys():
                next_obs[agent_id] = (next_obs[agent_id] / 255.).astype(np.float32)
                ddqn_models[agent_id].store_transition(obs[agent_id], actions[agent_id], rewards[agent_id],
                                                       next_obs[agent_id], dones[agent_id])
                ddqn_models[agent_id].num_in_buffer = min(ddqn_models[agent_id].num_in_buffer + 1, REPLAY_BUFFER_SIZE)

            # When enough experience is collected, start on-line learning
            if t > START_LEARNING:
                losses = []
                for agent_id in env.agents.keys():
                    loss = ddqn_models[agent_id].train_step()
                    losses.append(loss)

                if t % TARGET_UPDATE_ITERATION == 0:  # Update target model with learned changes
                    for agent_id in env.agents.keys():
                        ddqn_models[agent_id].update_target_model()

                    # with social_metrics_writer.as_default():
                    # tf.summary.histogram('mse_Q', data=losses, step=t + (n_steps * episode))

            # Update current state observations
            obs = next_obs
            # Update the environment social metrics
            env.update_social_metrics(rewards)

            if episode % EPISODE_RECORD_FREQ == 0:
                env.render(episode_path + "/t=%04d.png" % t, title="t=%04d" % t)

        # Log metrics to tensorboard
        social_metrics = env.get_social_metrics(episode_steps=n_steps)
        efficiency, equality, sustainability, peace = social_metrics
        with social_metrics_writer.as_default():
            tf.summary.scalar('efficiency', data=efficiency, step=episode)
            tf.summary.scalar('equality', data=equality, step=episode)
            tf.summary.scalar('sustainability', data=sustainability, step=episode)
            tf.summary.scalar('peace', data=peace, step=episode)
            # Log agent accumulated reward distribution
            agent_rewards = [np.sum(rewards) for rewards in env.rewards_record.values()]
            tf.summary.histogram('accumulated_reward', agent_rewards, step=episode)

        # Make video of episode
        if episode % EPISODE_RECORD_FREQ == 0:
            utility_funcs.make_video_from_image_dir(vid_path=logdir + "/episodes",
                                                    img_folder=episode_path,
                                                    video_name="episode=%04d" % episode,
                                                    fps=10)
            # Delete images
            shutil.rmtree(episode_path, ignore_errors=True)
            # Save each agent's Q-value network (function approximator)
            for agent_id in env.agents.keys():
                ddqn_models[agent_id].save_policy(path=models_path + "/%s" % agent_id)

        print("- A:%d Episode %d - DONE in: %.3f min" % (n_agents, episode, (time.time() - start_t)/60))
Example #5
# The code is mostly based on Phil's video: https://www.youtube.com/watch?v=UCgsv6tMReY&t=2207s

import envs
import numpy as np
from DDQN import DDQNAgent


if __name__ == '__main__':
    
    
    # env = envs.Env(30, 40, [(1, 1)], {(0, 3): 1, (1, 3): -1})
    env = envs.ShmupEnv(12, 12, 5)
    ddqn_agent = DDQNAgent(alpha=0.0001, gamma=0.99,
                           n_actions=4, epsilon=1.0, epsilon_dec=1,
                           batch_size=64, input_dims=12)
    n_games = 1500
    print('---TRAINING STARTED---')
    ddqn_scores = []
    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        # Decay the exploration rate at the start of each game
        ddqn_agent.epsilon *= 0.9
        if i % 150 == 0:
            # 'alphas' (a per-game learning-rate schedule) is assumed to be
            # defined elsewhere; it is not shown in this snippet
            ddqn_agent.alpha = alphas[i]
        while not done:
            action = ddqn_agent.choose_action(observation)
            observation_, reward, done = env.step(action)
            observation_ = np.array(observation_)
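            # --- Hypothetical continuation (the listing above is truncated) ---
            # A minimal sketch of how tutorial-style DDQN loops usually finish a
            # step; remember() and learn() are assumed method names, not taken
            # from the original script.
            ddqn_agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            ddqn_agent.learn()
            score += reward

        ddqn_scores.append(score)
        print('episode', i, 'score %.2f' % score,
              'avg score (last 100) %.2f' % np.mean(ddqn_scores[-100:]))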
Example #6
def test_model():
    env = gym.make('CartPole-v0')
    print('num_actions: ', env.action_space.n)
    model = DQNModel(env.action_space.n, 'DQN')

    obs = env.reset()
    print('obs_shape: ', obs.shape)

    # tensorflow 2.0: no feed_dict or tf.Session() needed at all
    best_action, q_values = model.action_value(obs[None])
    print('res of test model: ', best_action,
          q_values)  # 0 [ 0.00896799 -0.02111824]


if __name__ == '__main__':
    test_model()

    env = gym.make("CartPole-v0")
    num_actions = env.action_space.n
    model = DQNModel(num_actions, 'DQN1')
    target_model = DQNModel(num_actions, 'DQN2')
    agent = DDQNAgent(model, target_model, env)
    # test before
    rewards_sum = agent.evaluation(env)
    print("Before Training: %d out of 200" % rewards_sum)  # 10 out of 200

    agent.train()
    # test after
    rewards_sum = agent.evaluation(env)
    print("After Training: %d out of 200" % rewards_sum)  # 200 out of 200
Example #7
import gym
import warnings
from collections import namedtuple, deque, OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from DDQN import DDQNAgent
from QNetwork import QNetwork
from experience_replay_buf import experienceReplayBuffer
from myWrapper import StateDiscretizerWrapper

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Silence UserWarnings for the rest of the script
warnings.filterwarnings("ignore", category=UserWarning)

er_buf = experienceReplayBuffer()
env = StateDiscretizerWrapper(gym.make('LunarLander-v2'))
theModel = QNetwork(env=env, n_hidden_nodes=64)
ddqn = DDQNAgent(env=env, network=theModel, buffer=er_buf, batch_size=64)
ddqn.train()