def gen_episode_video(models_path, map_type, n_agents, video_path):
    os.makedirs(video_path + "/imgs", exist_ok=True)
    STEPS = 1000
    selected_map = MAP[map_type]
    AGENT_VIEW_RANGE = 5

    # Create environment
    env = HarvestCommonsEnv(ascii_map=selected_map, num_agents=n_agents, render=True,
                            agent_view_range=AGENT_VIEW_RANGE)

    # Instantiate DDQN agent models from the trained policies on disk
    ddqn_models = {}
    for agent_id, agent in env.agents.items():
        obs_shape = (2 * AGENT_VIEW_RANGE + 1, 2 * AGENT_VIEW_RANGE + 1, 3)
        model_path = os.path.join(models_path, agent_id)
        ddqn_models[agent_id] = DDQNAgent.from_trained_policy(path_to_model=model_path,
                                                              obs_shape=obs_shape, env=env)

    obs = env.reset()
    # Convert initial observations to [0,1] float imgs
    for agent_id in env.agents.keys():
        # Convert observation image from int to float
        obs[agent_id] = (obs[agent_id] / 255.).astype(np.float32)

    for t in tqdm(range(1, STEPS), desc="Steps", position=0, leave=True):
        # Select agent actions to take
        actions = {}
        for agent_id, agent in env.agents.items():
            # Follow policy using Q(s,a) function approximator
            best_action, q_values = ddqn_models[agent_id].model.action_value(
                tf.expand_dims(obs[agent_id], axis=0))
            actions[agent_id] = ddqn_models[agent_id].get_action(best_action)

        # Apply agents' actions on the environment
        next_obs, rewards, dones, info = env.step(actions)

        # Keep observations in the same [0,1] float range the models were queried with
        for agent_id in env.agents.keys():
            next_obs[agent_id] = (next_obs[agent_id] / 255.).astype(np.float32)

        # Update current state observations
        obs = next_obs
        # Update the environment social metrics
        env.update_social_metrics(rewards)
        env.render(video_path + "/imgs/t=%04d.png" % t, title="t=%04d" % t)

    # Make video of episode
    utility_funcs.make_video_from_image_dir(vid_path=video_path,
                                            img_folder=video_path + "/imgs",
                                            video_name="learned_policy", fps=10)
    # Delete images
    shutil.rmtree(video_path + "/imgs", ignore_errors=True)
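# Hedged usage sketch (not in the original module): the paths below are placeholders
# that assume the per-agent policy folders written by train_agents() further down,
# i.e. <logdir>/model/episode=XXXX/<agent_id>.
# gen_episode_video(models_path="logs/example_run/model/episode=0100",
#                   map_type="small", n_agents=4,
#                   video_path="videos/example_run")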
def init_ddqn_agent(input_dims: int, start_state: Tuple[List[int], bool], maze_dim: int) -> "DDQNAgent":
    """
    Initialise a double deep Q-learning agent.

    Parameters
    ----------
    input_dims : int
        Dimensionality of the state space (2 for a 2D maze, 3 for a 3D maze).
    start_state : Tuple[List[int], bool]
        Fixed start state; if None, a random start state is used.
    maze_dim : int
        Maze size, used to name the saved model file.

    Returns
    -------
    DDQNAgent
        A double deep Q-learning agent wrapping a Keras sequential model.
    """
    if start_state is not None:
        start_s = 'fixed'
    else:
        start_s = 'random'

    if input_dims == 2:
        ddqn_agent = DDQNAgent(alpha=0.0005, gamma=0.99, n_actions=4, epsilon=1.0,
                               batch_size=64, input_dims=input_dims,
                               fname=f'ddqn_model_2D_{maze_dim}_{start_s}.h5')
    elif input_dims == 3:
        # Initialise the 3D agent with the x and y weights of the trained 2D agent
        ddqn_agent2D = keras.models.load_model(f'ddqn_model_2D_{maze_dim}_{start_s}.h5')
        ddqn_agent = DDQNAgent(alpha=0.0005, gamma=0.99, n_actions=6, epsilon=1.0,
                               batch_size=64, input_dims=input_dims,
                               fname=f'ddqn_model_3D_{maze_dim}_{start_s}.h5')
        # Set the untrained weights of the 3D agent equal to those of the agent trained in 2D
        ddqn_agent.q_eval = set_weights_3D_2D(ddqn_agent2D, ddqn_agent.q_eval)
        ddqn_agent.q_target = set_weights_3D_2D(ddqn_agent2D, ddqn_agent.q_target)

    return ddqn_agent
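# Hedged sketch (an assumption, not the original set_weights_3D_2D): one way to copy
# weights from a 2D-trained Keras Sequential model into a 3D model whose layers have
# extra inputs/outputs. Layers whose shapes already match are copied verbatim; otherwise
# only the overlapping sub-block of the kernel and bias is reused, and the remaining
# entries keep their fresh initialisation. Assumes both models share the layer ordering.
def copy_2d_weights_into_3d(model_2d, model_3d):
    for layer_2d, layer_3d in zip(model_2d.layers, model_3d.layers):
        w_2d = layer_2d.get_weights()
        w_3d = layer_3d.get_weights()
        if not w_2d:
            continue  # skip parameter-free layers (e.g. activations)
        if all(a.shape == b.shape for a, b in zip(w_2d, w_3d)):
            layer_3d.set_weights(w_2d)  # shapes already match: copy kernel and bias
        else:
            kernel, bias = w_3d[0].copy(), w_3d[1].copy()
            r = min(w_2d[0].shape[0], kernel.shape[0])  # shared input rows (x, y)
            c = min(w_2d[0].shape[1], kernel.shape[1])  # shared output units
            kernel[:r, :c] = w_2d[0][:r, :c]
            bias[:c] = w_2d[1][:c]
            layer_3d.set_weights([kernel, bias])
    return model_3d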
# Main method
if __name__ == '__main__':
    # Initialise
    env = make_env('PongNoFrameskip-v4')
    best_score = -np.inf
    load = False
    n_games = 500

    agent = DDQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001,
                      input_dims=env.observation_space.shape,
                      n_actions=env.action_space.n, mem_size=50000,
                      eps_min=0.1, batch_size=32, replace=1000, eps_dec=1e-5,
                      model_dir='models/', algo='DDQN',
                      env_name='PongNoFrameskip-v4')

    if load:
        agent.load_models()

    file_name = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) \
                + '_' + str(n_games) + 'games'
    plot_file = 'plots/' + file_name + '.png'

    n_steps = 0
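    # Hedged sketch (not the original file's remainder, which is truncated here) of how
    # such a DDQN main loop usually continues. It assumes this DDQNAgent also exposes
    # choose_action(), store_transition(), learn() and save_models(), matching the
    # load_models() call above; adapt if the real interface differs.
    scores, eps_history, steps_array = [], [], []
    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            if not load:
                agent.store_transition(observation, action, reward, observation_, done)
                agent.learn()
            observation = observation_
            n_steps += 1
        scores.append(score)
        eps_history.append(agent.epsilon)
        steps_array.append(n_steps)

        avg_score = np.mean(scores[-100:])
        if avg_score > best_score:
            if not load:
                agent.save_models()
            best_score = avg_score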
def train_agents(n_agents=4, map_type="small", logs_path="logs", n_episodes=EPISODES,
                 n_steps=STEPS, batch_size=BATCH_SIZE, lr=0.0015, gamma=0.99,
                 epsilon=0.10, epsilon_decay=0.995, log=True):
    # Configure experiment logs
    logdir = logs_path + "/MAP=%s-AGENTS=%d-lr=%.5f-e=%.2f-ed=%.3f-g=%.2f-b=%d" \
             % (map_type, n_agents, lr, epsilon, epsilon_decay, gamma, batch_size)
    os.makedirs(logdir, exist_ok=True)
    # sys.stdout = open(os.path.join(logdir, "console_output.out"), "w+")
    social_metrics_writer = tf.summary.create_file_writer(logdir + "/social_metrics")

    env = HarvestCommonsEnv(ascii_map=MAP[map_type], num_agents=n_agents, render=True,
                            agent_view_range=AGENT_VIEW_RANGE)
    obs = env.reset()

    # Instantiate DDQN agent models
    ddqn_models = {}
    for agent_id, agent in env.agents.items():
        obs_shape = (2 * AGENT_VIEW_RANGE + 1, 2 * AGENT_VIEW_RANGE + 1, 3)
        q_net_model = DeepQNet(num_actions=env.action_space.n, input_shape=obs_shape,
                               kernel_initializer=KERNEL_INITIALIZER)
        q_net_target = DeepQNet(num_actions=env.action_space.n, input_shape=obs_shape,
                                kernel_initializer=KERNEL_INITIALIZER)
        ddqn_models[agent_id] = DDQNAgent(model=q_net_model, target_model=q_net_target,
                                          obs_shape=obs_shape, env=env,
                                          buffer_size=REPLAY_BUFFER_SIZE, learning_rate=lr,
                                          epsilon=epsilon, epsilon_decay=epsilon_decay,
                                          gamma=gamma, batch_size=batch_size)

    for episode in range(n_episodes + 1):
        start_t = time.time()
        print("- A:%d Episode %d" % (n_agents, episode))
        episode_path = logdir + "/episodes/episode=%04d" % episode
        models_path = logdir + "/model/episode=%04d" % episode
        if episode % EPISODE_RECORD_FREQ == 0:
            os.makedirs(episode_path, exist_ok=True)

        obs = env.reset()
        # Convert initial observations to [0,1] float imgs
        for agent_id in env.agents.keys():
            # Convert observation image from int to float
            obs[agent_id] = (obs[agent_id] / 255.).astype(np.float32)
            # Reset replay buffers and decay the exploration rate
            ddqn_models[agent_id].reset_replay_buffer()
            ddqn_models[agent_id].e_decay()

        for t in tqdm(range(1, n_steps), position=0, leave=True):
            # Select agent actions to take
            actions = {}
            for agent_id, agent in env.agents.items():
                # Follow epsilon-greedy policy using Q(s,a) function approximator
                best_action, q_values = ddqn_models[agent_id].model.action_value(
                    tf.expand_dims(obs[agent_id], axis=0))
                actions[agent_id] = ddqn_models[agent_id].get_action(best_action)

            # Apply agents' actions on the environment
            next_obs, rewards, dones, info = env.step(actions)

            # Store transition in each agent's replay buffer
            for agent_id in env.agents.keys():
                next_obs[agent_id] = (next_obs[agent_id] / 255.).astype(np.float32)
                ddqn_models[agent_id].store_transition(obs[agent_id], actions[agent_id],
                                                       rewards[agent_id], next_obs[agent_id],
                                                       dones[agent_id])
                ddqn_models[agent_id].num_in_buffer = min(
                    ddqn_models[agent_id].num_in_buffer + 1, REPLAY_BUFFER_SIZE)

            # When enough experience is collected, start online learning
            if t > START_LEARNING:
                losses = []
                for agent_id in env.agents.keys():
                    loss = ddqn_models[agent_id].train_step()
                    losses.append(loss)
                if t % TARGET_UPDATE_ITERATION == 0:
                    # Update target model with learned changes
                    for agent_id in env.agents.keys():
                        ddqn_models[agent_id].update_target_model()
                # with social_metrics_writer.as_default():
                #     tf.summary.histogram('mse_Q', data=losses, step=t + (n_steps * episode))

            # Update current state observations
            obs = next_obs
            # Update the environment social metrics
            env.update_social_metrics(rewards)
            if episode % EPISODE_RECORD_FREQ == 0:
                env.render(episode_path + "/t=%04d.png" % t, title="t=%04d" % t)

        # Log metrics to tensorboard
        social_metrics = env.get_social_metrics(episode_steps=n_steps)
        efficiency, equality, sustainability, peace = social_metrics
        with social_metrics_writer.as_default():
            tf.summary.scalar('efficiency', data=efficiency, step=episode)
            tf.summary.scalar('equality', data=equality, step=episode)
            tf.summary.scalar('sustainability', data=sustainability, step=episode)
            tf.summary.scalar('peace', data=peace, step=episode)
            # Log agent accumulated reward distribution
            agent_rewards = [np.sum(rewards) for rewards in env.rewards_record.values()]
            tf.summary.histogram('accumulated_reward', agent_rewards, step=episode)

        # Make video of episode
        if episode % EPISODE_RECORD_FREQ == 0:
            utility_funcs.make_video_from_image_dir(vid_path=logdir + "/episodes",
                                                    img_folder=episode_path,
                                                    video_name="episode=%04d" % episode,
                                                    fps=10)
            # Delete images
            shutil.rmtree(episode_path, ignore_errors=True)

        # Save each agent's Q-value NN function approximator
        for agent_id in env.agents.keys():
            ddqn_models[agent_id].save_policy(path=models_path + "/%s" % agent_id)

        print("- A:%d Episode %d - DONE in: %.3f min"
              % (n_agents, episode, (time.time() - start_t) / 60))
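# Hedged usage sketch (not in the original module): the argument values are illustrative
# only; EPISODES, STEPS and BATCH_SIZE are the module-level defaults already referenced
# in the signature of train_agents() above.
if __name__ == "__main__":
    train_agents(n_agents=4, map_type="small", logs_path="logs",
                 lr=0.0015, gamma=0.99, epsilon=0.10, epsilon_decay=0.995)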
# The code is mostly based on Phil's tutorial:
# https://www.youtube.com/watch?v=UCgsv6tMReY&t=2207s
import envs
import numpy as np
from DDQN import DDQNAgent

if __name__ == '__main__':
    # env = envs.Env(30, 40, [(1, 1)], {(0, 3): 1, (1, 3): -1})
    env = envs.ShmupEnv(12, 12, 5)
    ddqn_agent = DDQNAgent(alpha=0.0001, gamma=0.99, n_actions=4, epsilon=1.0,
                           epsilon_dec=1, batch_size=64, input_dims=12)
    n_games = 1500

    print('---TRAINING STARTED---')
    ddqn_scores = []
    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        ddqn_agent.epsilon *= 0.9
        if i % 150 == 0:
            ddqn_agent.alpha = alphas[i]  # `alphas` is expected to be defined elsewhere in the original script
        while not done:
            action = ddqn_agent.choose_action(observation)
            observation_, reward, done = env.step(action)
            observation_ = np.array(observation_)
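            # Hedged sketch of how this episode loop typically continues (the original
            # snippet is truncated at this point); it assumes the DDQNAgent exposes
            # remember() and learn(), as in the tutorial credited at the top of the file.
            ddqn_agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            ddqn_agent.learn()
            score += reward
        ddqn_scores.append(score)
        print('episode %d, score %.2f, avg score (last 100) %.2f'
              % (i, score, np.mean(ddqn_scores[-100:])))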
def test_model():
    env = gym.make('CartPole-v0')
    print('num_actions: ', env.action_space.n)
    model = DQNModel(env.action_space.n, 'DQN')

    obs = env.reset()
    print('obs_shape: ', obs.shape)

    # tensorflow 2.0: no feed_dict or tf.Session() needed at all
    best_action, q_values = model.action_value(obs[None])
    print('res of test model: ', best_action, q_values)  # 0 [ 0.00896799 -0.02111824]


if __name__ == '__main__':
    test_model()
    env = gym.make("CartPole-v0")
    num_actions = env.action_space.n

    model = DQNModel(num_actions, 'DQN1')
    target_model = DQNModel(num_actions, 'DQN2')
    agent = DDQNAgent(model, target_model, env)

    # test before training
    rewards_sum = agent.evaluation(env)
    print("Before Training: %d out of 200" % rewards_sum)  # 10 out of 200

    agent.train()

    # test after training
    rewards_sum = agent.evaluation(env)
    print("After Training: %d out of 200" % rewards_sum)  # 200 out of 200
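# Hedged illustration (not part of the original script) of the Double DQN target that
# distinguishes this agent from plain DQN: the online model picks the greedy next
# action, while the target model evaluates it. Tensor/array names are placeholders,
# and it assumes DQNModel behaves like a standard tf.keras.Model (exposes predict()).
import numpy as np


def double_q_targets(model, target_model, rewards, next_states, dones, gamma=0.99):
    q_online = model.predict(next_states)          # used only to select actions
    q_target = target_model.predict(next_states)   # used to evaluate the selected actions
    best_actions = np.argmax(q_online, axis=1)
    chosen_q = q_target[np.arange(len(best_actions)), best_actions]
    # Terminal transitions bootstrap nothing beyond the immediate reward.
    return rewards + gamma * chosen_q * (1.0 - dones)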
import gym
import warnings
from collections import namedtuple, deque, OrderedDict

import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from DDQN import DDQNAgent
from QNetwork import QNetwork
from experience_replay_buf import experienceReplayBuffer
from myWrapper import StateDiscretizerWrapper

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)

    er_buf = experienceReplayBuffer()
    env = StateDiscretizerWrapper(gym.make('LunarLander-v2'))
    theModel = QNetwork(env=env, n_hidden_nodes=64)
    ddqn = DDQNAgent(env=env, network=theModel, buffer=er_buf, batch_size=64)
    ddqn.train()
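# Hedged post-training check (not part of the original script): a greedy rollout that
# only assumes the trained QNetwork is a torch nn.Module mapping a state tensor to
# per-action Q-values; the episode count is an arbitrary illustrative choice.
def evaluate_greedy(env, network, episodes=5):
    returns = []
    for _ in range(episodes):
        state, done, total = env.reset(), False, 0.0
        while not done:
            with torch.no_grad():
                q_values = network(torch.as_tensor(state, dtype=torch.float32,
                                                   device=device).unsqueeze(0))
            action = int(torch.argmax(q_values, dim=1).item())
            state, reward, done, _ = env.step(action)
            total += reward
        returns.append(total)
    return float(np.mean(returns))

# Example (reuses the env and theModel created above):
# print("Mean greedy return:", evaluate_greedy(env, theModel))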