class MADDPGAgent():
    def __init__(self, seed, checkpoint_filename=None):
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, DEVICE, seed)
        self.t = 0
        self.agents = [
            DDPGAgent(index, NUM_AGENTS, seed, DEVICE)
            for index in range(NUM_AGENTS)
        ]
        if checkpoint_filename:
            for i, to_load in enumerate(self.agents):
                actor_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights",
                    map_location=DEVICE)
                critic_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_critic_{i}.weights",
                    map_location=DEVICE)
                to_load.actor_local.load_state_dict(actor_file)
                to_load.actor_target.load_state_dict(actor_file)
                to_load.critic_local.load_state_dict(critic_file)
                to_load.critic_target.load_state_dict(critic_file)
            print(f'Files loaded with prefix {checkpoint_filename}')

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)
        self.t = (self.t + 1) % UPDATE_FREQUENCY
        if self.t == 0 and (len(self.memory) > BATCH_SIZE):
            experiences = [self.memory.sample() for _ in range(NUM_AGENTS)]
            self.learn(experiences, GAMMA)

    def act(self, all_states, random):
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, random=random)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        all_actions = []
        all_next_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(DEVICE)
            state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            all_actions.append(agent.actor_local(state))
            all_next_actions.append(agent.actor_target(next_state))
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions, all_actions)
class MADDPG():
    def __init__(self, num_agents, state_size, action_size, random_seed):
        """Initialize multiple agents, each with its own Actor-Critic networks;
        they share a single replay buffer to learn from experience."""
        self.num_agents = num_agents
        self.agents = []
        for _ in range(num_agents):
            agent = Agent(state_size, action_size, random_seed)
            self.agents.append(agent)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def act(self, states, add_noise=True):
        clipped_actions = []
        for state, agent in zip(states, self.agents):
            clipped_actions.append(agent.act(state, add_noise))
        return clipped_actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def learn(self, experiences, gamma):
        for agent in self.agents:
            agent.learn(experiences, gamma)

    def saveCheckPoints(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoints/actor_agent_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoints/critic_agent_{i}.pth")

    def loadCheckPoints(self):
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(
                torch.load(f"checkpoints/actor_agent_{i}.pth"))
            agent.critic_local.load_state_dict(
                torch.load(f"checkpoints/critic_agent_{i}.pth"))

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random samples from the
        buffer to learn."""
        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])
        # Learn, if enough samples are available in memory: draw a fresh
        # sample for each agent and update that agent once (rather than
        # re-updating every agent on every draw)
        if len(self.memory) > BATCH_SIZE:
            for agent in self.agents:
                experiences = self.memory.sample()
                agent.learn(experiences, GAMMA)
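# Several of the classes in this section construct
# ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) but never define it
# (a few others use slightly different constructors). A minimal uniform replay
# buffer consistent with that call signature might look like the sketch below;
# the tensor handling and device logic here are assumptions, not taken from
# the original code.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer storing experience tuples, sampled uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)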
class MADDPG: def __init__(self, num_agents=2, random_seed=1): #np.random.randint(1000) super(MADDPG, self).__init__() self.maddpg_agent = [ DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed), DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed) ] self.num_agents = num_agents # Replay memory action_size = 2 self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def act(self, obs_all_agents, noise_ampl=1): """get actions from all agents in the MADDPG object""" actions = [ agent.act(obs, noise_ampl) for agent, obs in zip(self.maddpg_agent, obs_all_agents) ] return actions def add_memory(self, state, action, reward, next_state, done): # Save experience / reward self.memory.num_agents = self.num_agents self.memory.add(state, action, reward, next_state, done) def step(self): """Save experience in replay memory, and use random sample from buffer to learn.""" # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for n in range(0, self.num_agents): experiences = self.memory.sample() self.maddpg_agent[n].step(experiences) def reset(self): for n in range(0, self.num_agents): self.maddpg_agent[n].reset()
class MultiAgent: """Interacts with and learns from the environment.""" def __init__(self, agent_count, state_size, action_size, random_seed): """Initialize a MultiAgent object. Params ====== agent_count (int): Number of agents """ self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.agents = [ Agent( memory=self.memory, state_size=state_size, action_size=action_size, random_seed=random_seed, ) for _ in range(agent_count) ] def step(self, states, actions, rewards, next_states, dones, timestep): # Save experience in replay memory for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0: for agent in self.agents: agent.learn(self.memory.sample(), GAMMA) def act(self, all_states): """Get actions from all agents""" actions = [ agent.act(np.expand_dims(states, axis=0)) for agent, states in zip(self.agents, all_states) ] return actions def reset(self): for agent in self.agents: agent.reset()
def update(self,
           buffer: ReplayBuffer,
           batchsize: int = 1000,
           tau: float = 0.005,
           discount: float = 0.98):
    states, actions, rewards, states_next, dones = buffer.sample(
        batchsize=batchsize)

    # Bellman target: r + gamma * (1 - done) * Q'(s', mu'(s')).
    # Computed under no_grad so the target is detached from the graph.
    with torch.no_grad():
        actions_next = self.target_actor(torch.stack(states_next).float())
        input_target_critic = torch.cat(
            [torch.stack(states_next).float(), actions_next.float()], dim=1)
        q_next = self.target_critic(input_target_critic)
        rewards_t = torch.tensor(rewards).float().unsqueeze(1)
        dones_t = torch.tensor(dones).float().unsqueeze(1)
        state_value = rewards_t + discount * (1 - dones_t) * q_next

    # update critic
    input_critic = torch.cat(
        [torch.stack(states).float(), torch.stack(actions).float()], dim=1)
    state_value_local = self.critic(input_critic)
    critic_loss = (state_value - state_value_local).pow(2).mul(0.5).sum(-1).mean()
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()

    # update actor
    actions_new = self.actor(torch.stack(states).float())
    value_critic = self.critic(
        torch.cat([torch.stack(states).float(), actions_new], dim=1))
    loss_actor = -value_critic.mean()
    self.actor_optimizer.zero_grad()
    loss_actor.backward()
    self.actor_optimizer.step()

    soft_update(self.target_actor, self.actor, tau)
    soft_update(self.target_critic, self.critic, tau)
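# The update() above (and the MADDPG trainer later in this section) calls a
# free soft_update(target, source, tau) helper that is not shown. Judging from
# the argument order at the call sites, a plausible Polyak-averaging sketch is:
def soft_update(target_net, source_net, tau=0.005):
    """theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    for target_param, source_param in zip(target_net.parameters(),
                                          source_net.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)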
class MultiAgent: def __init__(self, state_size, action_size, num_agents, random_seed): self.agents = [ DDPGAgent(state_size, action_size, random_seed) for _ in range(num_agents) ] self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device, random_seed) self.t_step = 0 def step_all(self, states, actions, rewards, next_states, dones): # Save experience in replay memory for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: for agent in self.agents: experiences = self.memory.sample() agent.learn(experiences, GAMMA) def act_all(self, multi_states): actions = [ agent.act(np.expand_dims(states, axis=0)) for agent, states in zip(self.agents, multi_states) ] return actions def save_weights_all(self): for index, agent in enumerate(self.agents): torch.save(agent.actor_local.state_dict(), 'agent{}_checkpoint_actor.pth'.format(index + 1)) torch.save(agent.critic_local.state_dict(), 'agent{}_checkpoint_critic.pth'.format(index + 1)) def reset_all(self): for agent in self.agents: agent.reset()
class MultiAgent: def __init__(self, config): self.random_seeds = config['random_seeds'] self.params = config['params'] self.memory = ReplayBuffer(self.params['action_size'], self.params['buffer_size'], self.params['batch_size'], device, self.random_seeds[0]) self.params['memory'] = self.memory self.ddpg_agents = [ Agent(self.params, self.random_seeds[i]) for i in range(2) ] self.t_step = 0 def act(self, states): actions = [ agent.act(np.expand_dims(state, axis=0)) for agent, state in zip(self.ddpg_agents, states) ] #actions = [agent.act(states) for agent in self.ddpg_agents] return actions def step(self, states, actions, rewards, next_states, dones): self.t_step += 1 for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) if (len(self.memory) > self.params['batch_size']) and ( self.t_step % self.params['num_steps_per_update'] == 0): for agent in self.ddpg_agents: experiences = self.memory.sample() agent.learn(experiences, self.params['gamma']) def reset(self): for agent in self.ddpg_agents: agent.reset()
class DQNAgent: def __init__(self, env, state_size, action_size, batch_size, gamma, lr, update_every, tau, eps_start, eps_end, eps_decay, seed): for key, value in locals().items(): if key != 'self': setattr(self, key, value) random.seed(seed) torch.manual_seed(seed) np.random.seed(seed) self.Q_target = LinearModel(state_size, action_size) self.Q_local = LinearModel(state_size, action_size) self.memory = ReplayBuffer(batch_size=batch_size) self.optim = torch.optim.Adam(self.Q_local.parameters(), lr=lr) self.update_counter = 0 def env_reset(self, train_mode=True): return self.env.reset() def env_step(self, action): return self.env.step(action) def env_render(self, train_mode=False): return self.env.render() def env_close(self, train_mode=True): if not train_mode: return self.env.close() def get_action(self, state, epsilon=0.): if random.random() < epsilon: return np.random.choice(np.arange(self.action_size)) state = torch.tensor(state, dtype=torch.float32).unsqueeze(0) self.Q_local.eval() with torch.no_grad(): action = np.argmax(self.Q_local(state).data.numpy()) return action def step(self, state, action, reward, next_state, done): self.memory.store( (state, action, reward, next_state, 1 if done else 0)) self.update_counter = (self.update_counter + 1) % self.update_every if self.update_counter == 0: self.update_Q() def update_Q(self): states, actions, rewards, next_states, dones = self.memory.sample() Q_target_next = self.Q_target(next_states).detach().max( dim=1, keepdim=True)[0] Q_target_pred = rewards + self.gamma * Q_target_next * (1.0 - dones) self.Q_local.eval() Q = self.Q_local(states).gather(1, actions) loss = F.mse_loss(Q, Q_target_pred) self.Q_local.train() self.Q_local.zero_grad() loss.backward() self.optim.step() for t_param, l_param in zip(self.Q_target.parameters(), self.Q_local.parameters()): t_param.data.copy_(self.tau * l_param.data + (1.0 - self.tau) * t_param.data) def train(self, num_episodes, max_t=1000, is_finished=None, render=False): scores = [] eps = self.eps_start for i in range(num_episodes): state = self.env_reset(train_mode=True) score = 0 for _ in range(max_t): action = self.get_action(state, eps) if render: self.env_render(train_mode=True) next_state, reward, done, _ = self.env_step(action) self.step(state, action, reward, next_state, done) score += reward state = next_state if done: break eps = max(self.eps_end, eps * self.eps_decay) scores.append(score) if is_finished and is_finished(scores, num_episodes): break if render: self.env_close(train_mode=False) return scores def run(self, num_episodes=1, max_t=1000, render=None): if render == None: render = num_episodes == 1 scores = [] for i in range(num_episodes): state = self.env_reset(train_mode=False) score = 0 for _ in range(max_t): action = self.get_action(state) if render: self.env_render(train_mode=False) next_state, reward, done, _ = self.env_step(action) score += reward state = next_state if done: break scores.append(score) if render: self.env_close(train_mode=False) return scores
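# The DQNAgent above instantiates LinearModel(state_size, action_size) without
# showing it. A plausible minimal MLP Q-network with that constructor (the
# hidden sizes here are assumptions) could be:
import torch.nn as nn
import torch.nn.functional as F


class LinearModel(nn.Module):
    """Small fully connected Q-network: state -> Q-value per action."""

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)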
class DDQNAgent:
    def __init__(self, config: Config, training=True):
        self.config = config
        self.is_training = training
        self.buffer = ReplayBuffer(self.config.max_buff)
        self.model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optim = Adam(self.model.parameters(), lr=self.config.learning_rate)
        self.model.cuda()
        self.target_model.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learn(self, t):
        s, a, r, s2, done = self.buffer.sample(self.config.batch_size)

        s = torch.tensor(s, dtype=torch.float).cuda()
        a = torch.tensor(a, dtype=torch.long).cuda()
        r = torch.tensor(r, dtype=torch.float).cuda()
        s2 = torch.tensor(s2, dtype=torch.float).cuda()
        done = torch.tensor(done, dtype=torch.float).cuda()

        q_values = self.model(s)
        next_q_values = self.model(s2)
        next_q_state_values = self.target_model(s2)

        # Double DQN: the online network picks the next action,
        # the target network evaluates it
        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        if t % self.config.update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_checkpoint(self):
        os.makedirs('ckpt', exist_ok=True)
        torch.save(self.model.state_dict(), 'ckpt/model.pt')

    def load_checkpoint(self):
        # load_state_dict expects a state dict, not a path, so load from disk first
        state_dict = torch.load('ckpt/model.pt')
        self.model.load_state_dict(state_dict)
        self.target_model.load_state_dict(state_dict)
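# The DDQNAgent reads all of its hyperparameters from a Config object whose
# definition is not included. Judging only from the attributes accessed above,
# a compatible container might look like this sketch (the default values are
# placeholders, not taken from the original project):
from dataclasses import dataclass


@dataclass
class Config:
    state_shape: tuple = (4,)       # input shape expected by the DQN network
    action_dim: int = 2             # number of discrete actions
    max_buff: int = 100_000         # replay buffer capacity
    learning_rate: float = 1e-4
    epsilon_min: float = 0.01       # floor for epsilon-greedy exploration
    batch_size: int = 64
    gamma: float = 0.99
    update_interval: int = 1000     # steps between target-network syncs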
class MADDPG_Trainer: def __init__(self, n_agents, act_spcs, ob_spcs, writer, args): self.args = args self.memory = ReplayBuffer(args.buffer_length, n_agents, device) self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS, FINAL_STD, INITIAL_STD, warmup_steps=WARMUP_STEPS) self.n_agents = n_agents self.act_spcs = act_spcs self.ob_spcs = ob_spcs self.agents = [ DDPG_agent(self.act_spcs[i], self.ob_spcs[i], np.sum(self.ob_spcs), np.sum(self.act_spcs)) for i in range(n_agents) ] self.n_steps = 0 self.n_updates = 0 self.writer = writer self.criterion = nn.MSELoss() def get_actions(self, states): return [ agent.select_action(state)[0] for agent, state in zip(self.agents, states) ] def store_transitions(self, states, actions, rewards, next_states, dones): self.memory.add(states, actions, rewards, next_states, dones) def reset(self): pass def transform_states(self, states, N): obses = [] for i in range(N): states_ = [] for j in range(self.n_agents): states_.append(states[j][i]) obses.append(torch.cat([f.float().to(device) for f in states_])) return torch.stack(obses) def transform_actions(self, actions, N): acts = [] for i in range(N): actions_ = [] for j in range(self.n_agents): actions_.append(actions[j][i]) acts.append(torch.cat([f.float().to(device) for f in actions_])) return torch.stack(acts) def update_all_targets(self): for agent in self.agents: soft_update(agent.policy_targ, agent.policy, TAU) soft_update(agent.qnet_targ, agent.qnet, TAU) def prep_training(self): for agent in self.agents: agent.qnet.train() agent.policy.train() agent.qnet_targ.train() agent.policy_targ.train() def eval(self): for agent in self.agents: agent.qnet.eval() agent.policy.eval() agent.qnet_targ.eval() agent.policy_targ.eval() def sample_and_train(self, batch_size): # TODO ADD Model saving, optimize code batch = self.memory.sample(min(batch_size, len(self.memory))) states_i, actions_i, rewards_i, next_states_i, dones_i = batch states_all = torch.cat(states_i, 1) next_states_all = torch.cat(next_states_i, 1) actions_all = torch.cat(actions_i, 1) for i, agent in enumerate(self.agents): next_actions_all = [ onehot_from_logits(ag.policy_targ(next_state)) for ag, next_state in zip(self.agents, next_states_i) ] # computing target total_obs = torch.cat( [next_states_all, torch.cat(next_actions_all, 1)], 1) target_q = self.agents[i].qnet_targ(total_obs).detach() rewards = rewards_i[i].view(-1, 1) dones = dones_i[i].view(-1, 1) target_q = rewards + (1 - dones) * GAMMA * target_q # computing the inputs input_q = self.agents[i].qnet( torch.cat([states_all, actions_all], 1)) self.agents[i].q_optimizer.zero_grad() loss = self.criterion(input_q, target_q.detach()) # print("LOSS", loss) loss.backward() torch.nn.utils.clip_grad_norm_(self.agents[i].qnet.parameters(), 0.5) self.agents[i].q_optimizer.step() actor_loss = 0 # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø # use gumbel softmax max temp trick policy_out = self.agents[i].policy(states_i[i]) gumbel_sample = gumbel_softmax(policy_out, hard=True) actions_curr_pols = [ onehot_from_logits(agent_.policy(state)) for agent_, state in zip(self.agents, states_i) ] for action_batch in actions_curr_pols: action_batch.detach_() actions_curr_pols[i] = gumbel_sample actor_loss = -self.agents[i].qnet( torch.cat( [states_all.detach(), torch.cat(actions_curr_pols, 1)], 1)).mean() actor_loss += (policy_out**2).mean() * 1e-3 self.agents[i].p_optimizer.zero_grad() actor_loss.backward() # nn.utils.clip_grad_norm_(self.policy.parameters(), 5) 
torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(), 0.5) self.agents[i].p_optimizer.step() # detach the forward propagated action samples actions_i[i].detach_() if self.args.use_writer: self.writer.add_scalars("Agent_%i" % i, { "vf_loss": loss, "actor_loss": actor_loss }, self.n_updates) self.update_all_targets() self.n_updates += 1
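# sample_and_train() above relies on onehot_from_logits and gumbel_softmax to
# keep discrete MADDPG actions differentiable. A standard straight-through
# Gumbel-Softmax sketch compatible with those calls (these exact definitions
# are assumptions, not part of the original trainer) is:
import torch
import torch.nn.functional as F


def onehot_from_logits(logits):
    """Greedy one-hot vector for the argmax of the logits."""
    return (logits == logits.max(dim=-1, keepdim=True)[0]).float()


def sample_gumbel(shape, eps=1e-20):
    """Sample Gumbel(0, 1) noise."""
    u = torch.rand(shape)
    return -torch.log(-torch.log(u + eps) + eps)


def gumbel_softmax(logits, temperature=1.0, hard=False):
    """Differentiable sample from a categorical distribution.

    With hard=True the forward pass returns a one-hot vector while gradients
    flow through the soft sample (straight-through estimator).
    """
    y = F.softmax(
        (logits + sample_gumbel(logits.shape).to(logits.device)) / temperature,
        dim=-1)
    if hard:
        y_hard = onehot_from_logits(y)
        y = (y_hard - y).detach() + y
    return y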
class MADDPG(): """Agent that contains the two DDPG agents and shared replay buffer.""" def __init__(self, action_size=2, n_agents=2, seed=0): """ Params ====== action_size (int): dimension of each action seed (int): Random seed n_agents (int): number of agents """ self.n_agents = n_agents self.t_step = 0 self.noise_on = True # create two agents, each with their own actor and critic models = [ model.Actor_Critic_Models(n_agents=n_agents) for _ in range(n_agents) ] self.agents = [DDPG(i, models[i]) for i in range(n_agents)] # create shared replay buffer self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) def step(self, all_states, all_actions, all_rewards, all_next_states, all_dones): all_states = all_states.reshape(1, -1) all_next_states = all_next_states.reshape(1, -1) self.memory.add(all_states, all_actions, all_rewards, all_next_states, all_dones) self.t_step = self.t_step + 1 if self.t_step % UPDATE_EVERY == 0: if len(self.memory) > BATCH_SIZE: experiences = [ self.memory.sample() for _ in range(self.n_agents) ] self.learn(experiences, GAMMA) def act(self, all_states, add_noise=True): # pass each agent's state from the environment and calculate its action all_actions = [] for agent, state in zip(self.agents, all_states): action = agent.act(state, add_noise=self.noise_on) #self.noise_weight *= noise_decay all_actions.append(action) return np.array(all_actions).reshape( 1, -1) # reshape 2x2 into 1x4 dim vector def learn(self, experiences, gamma): all_next_actions = [] all_actions = [] for i, agent in enumerate(self.agents): states, _, _, next_states, _ = experiences[i] agent_id = torch.tensor([i]).to(device) # extract agent i's state and get action via actor network state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1) action = agent.actor_local(state) all_actions.append(action) # extract agent i's next state and get action via target actor network next_state = next_states.reshape(-1, 2, 24).index_select( 1, agent_id).squeeze(1) next_action = agent.actor_target(next_state) all_next_actions.append(next_action) for i, agent in enumerate(self.agents): agent.learn(i, experiences[i], gamma, all_next_actions, all_actions) def save_agents(self): for i, agent in enumerate(self.agents): torch.save(agent.actor_local.state_dict(), f"checkpoint_actor_agent_{i}.pth") torch.save(agent.critic_local.state_dict(), f"checkpoint_critic_agent_{i}.pth")
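# Both MADDPG wrappers that share this pattern (the first class in this
# section and the one above) hand the per-agent update off to
# agent.learn(agent_id, experiences, gamma, all_next_actions, all_actions)
# without showing it. The sketch below is one plausible implementation of that
# signature, assuming a centralized critic over the concatenated states and
# actions and module-level `device` and `TAU` constants; it is not the
# original DDPG class.
import torch
import torch.nn.functional as F


class DDPG:  # only the learn() expected by the callers above is sketched
    def learn(self, agent_id, experiences, gamma, all_next_actions, all_actions):
        states, actions, rewards, next_states, dones = experiences
        agent_idx = torch.tensor([agent_id]).to(device)

        # critic: y = r_i + gamma * (1 - d_i) * Q'_i(x', a1', ..., aN')
        self.critic_optimizer.zero_grad()
        with torch.no_grad():
            next_actions = torch.cat(all_next_actions, dim=1).to(device)
            q_targets_next = self.critic_target(next_states, next_actions)
        q_targets = rewards.index_select(1, agent_idx) + \
            gamma * q_targets_next * (1 - dones.index_select(1, agent_idx))
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss.backward()
        self.critic_optimizer.step()

        # actor: ascend Q_i(x, a1, ..., a_i = mu_i(o_i), ..., aN),
        # detaching the other agents' actions
        self.actor_optimizer.zero_grad()
        actions_pred = [a if i == agent_id else a.detach()
                        for i, a in enumerate(all_actions)]
        actor_loss = -self.critic_local(states,
                                        torch.cat(actions_pred, dim=1)).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        # soft-update both target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)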
noise = noise if hard_noise_reigime else noise * NOISE_DECAY # END EPISODE IF ANY AGENT IS DONE if any(dones): break if episode_i > HARD_NOISE_STEPS: hard_noise_reigime = False # POTENTIALLY START TAKING SAMPLES TO TRAIN FROM EXPERIENCE BUFFER if len(buffer) > MIN_BUFFER_SIZE: update_flag = "u" for _ in range(N_BATCHES_PER_UPDATE): for agent_i in range(N_AGENTS): # samples = buffer.sample(3) samples = buffer.sample(BATCH_SIZE) maddpg.update(samples, agent_i) if UPDATE_TARGET_AFTER_EACH_BATCH: maddpg.update_targets() if not UPDATE_TARGET_AFTER_EACH_BATCH: maddpg.update_targets() else: update_flag = " " # UPDATE EPISODE AND ROLLING MEAN SCORES agg_reward_this_episode = np.max(rewards_this_episode) rewards_deque.append(agg_reward_this_episode) rolling_mean_reward = np.mean(rewards_deque) history.append(agg_reward_this_episode) history_rolling_mean.append(rolling_mean_reward)
class MADDPG(object): """ The main class that defines and trains all the DDPG agents. """ def __init__( self, num_agents, state_size, action_size, buffer_size=int(1e6), batch_size=128, writer=None, actor_hidden_sizes=(256, 128), actor_lr=1e-4, actor_weight_decay=0., critic_hidden_sizes=(256, 128), critic_lr=1e-3, critic_weight_decay=0., model_folder_path=None, ): self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.batch_size = batch_size self.full_state_size = num_agents * state_size self.full_action_size = num_agents * action_size # Replay memory self.memory = ReplayBuffer(buffer_size) # TensorboardX Writer self.writer = writer # Actor Network Parameters self.actor_hidden_sizes = actor_hidden_sizes self.actor_lr = actor_lr self.actor_weight_decay = actor_weight_decay # Critic Network Parameters self.critic_hidden_sizes = critic_hidden_sizes self.critic_lr = critic_lr self.critic_weight_decay = critic_weight_decay # Model Folder self.folder_path = Path() if model_folder_path is None else Path( model_folder_path) # MADDPG Agents self.agents = [] self._init_agents() def reset(self): for agent in self.agents: agent.reset() def act(self, states, noise=0.): return [ agent.act(obs, noise) for agent, obs in zip(self.agents, states) ] def step(self, i_episode, states, actions, rewards, next_states, dones, tau=0.01, num_learns=1): # save to replay buffer self.memory.add(states, actions, rewards, next_states, dones) # train the model if len(self.memory) >= self.batch_size and num_learns > 0: actor_loss_list, critic_loss_list = [], [] for _ in range(num_learns): # learn multiple times at every step states, actions, rewards, next_states, dones = self.memory.sample( self.batch_size) for agent_id in range(self.num_agents): # Learn one time for the agents actor_loss, critic_loss = self._learn( agent_id, states, actions, next_states, rewards, dones) actor_loss_list.append(actor_loss) critic_loss_list.append(critic_loss) # Record Losses for actor & critic if self.writer: for agent_id in range(self.num_agents): self.writer.add_scalars( f'agent{agent_id}/losses', { 'critic loss': np.mean(critic_loss_list), 'actor_loss': np.mean(actor_loss_list) }, i_episode) # Soft update self._update_all(tau) def save(self): for agent in self.agents: torch.save( agent.actor_local.state_dict(), self.folder_path / f'checkpoint_actor_local_{agent.id}.pth') torch.save( agent.critic_local.state_dict(), self.folder_path / f'checkpoint_critic_local_{agent.id}.pth') def load(self, agent_id=None): for agent in self.agents: agent_id_ = agent.id if agent_id is None else agent_id agent.actor_local.load_state_dict( torch.load(self.folder_path / f'checkpoint_actor_local_{agent_id_}.pth')) agent.critic_local.load_state_dict( torch.load(self.folder_path / f'checkpoint_critic_local_{agent_id_}.pth')) def _init_agents(self): for i in range(self.num_agents): agent = DDPG(i, self.state_size, self.full_state_size, self.action_size, self.full_action_size, self.actor_hidden_sizes, self.actor_lr, self.actor_weight_decay, self.critic_hidden_sizes, self.critic_lr, self.critic_weight_decay) self.agents.append(agent) def _learn(self, agent_id, states, actions, next_states, rewards, dones): critic_full_actions, critic_full_next_actions = [], [] for agent in self.agents: # current actions actor_actions = agent.actor_local(states[:, agent.id, :]) critic_full_actions.append(actor_actions) # next actions actor_next_actions = agent.actor_target.forward( next_states[:, agent.id, :]) 
critic_full_next_actions.append(actor_next_actions) # learn for the agent current_agent = self.agents[agent_id] actor_loss, critic_loss = current_agent.learn( states, actions, rewards, next_states, dones, critic_full_actions, critic_full_next_actions) return actor_loss, critic_loss def _update_all(self, tau): for agent in self.agents: agent.update(agent.actor_local, agent.actor_target, tau) agent.update(agent.critic_local, agent.critic_target, tau)
def main(): seeding() # number of parallel agents parallel_envs = 4 # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 1000 episode_length = 80 batchsize = 1000 # how many episodes to save policy and gif save_interval = 1000 t = 0 # amplitude of OU noise # this slowly decreases to 0 noise = 2 noise_reduction = 0.9999 # how many episodes before update episode_per_update = 2 * parallel_envs log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) torch.set_num_threads(parallel_envs) env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(5000 * episode_length)) # initialize policy and critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] agent2_reward = [] # training loop # show progressbar widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting for episode in range(0, number_of_episodes + parallel_envs, parallel_envs): timer.update(episode) reward_this_episode = np.zeros((parallel_envs, 3)) all_obs = env.reset() obs, obs_full = transpose_list(all_obs) # for calculating rewards for this particular episode - addition of all time steps # save info or not save_info = (episode % save_interval < parallel_envs) frames = [] tmax = 0 if save_info: frames.append(env.render('rgb_array')) for episode_t in range(episode_length): t += parallel_envs # explore = only explore for a certain number of episodes # action input needs to be transposed actions = maddpg.act(transpose_to_tensor(obs), noise=noise) noise *= noise_reduction actions_array = torch.stack(actions).detach().numpy() # transpose the list of list # flip the first two indices # input to step requires the first index to correspond to number of parallel agents actions_for_env = np.rollaxis(actions_array, 1) # step forward one frame next_obs, next_obs_full, rewards, dones, info = env.step( actions_for_env) # add data to buffer transition = (obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full # save gif frame if save_info: frames.append(env.render('rgb_array')) tmax += 1 # update once after every episode_per_update if len(buffer ) > batchsize and episode % episode_per_update < parallel_envs: for a_i in range(3): samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) maddpg.update_targets( ) # soft update the target network towards the actual networks for i in range(parallel_envs): agent0_reward.append(reward_this_episode[i, 0]) agent1_reward.append(reward_this_episode[i, 1]) agent2_reward.append(reward_this_episode[i, 2]) if episode % 100 == 0 or episode == number_of_episodes - 1: avg_rewards = [ np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward) ] agent0_reward = [] agent1_reward = [] agent2_reward = [] for a_i, avg_rew in enumerate(avg_rewards): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) # saving model save_dict_list = [] if save_info: for i in range(3): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': 
maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # save gif files imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), frames, duration=.04) env.close() logger.close() timer.finish()
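# The training loops in this section call transpose_list and
# transpose_to_tensor to flip data from per-environment lists into per-agent
# lists before feeding the actors. Minimal versions consistent with that usage
# (assumed here, since the helpers themselves are not included) are:
import numpy as np
import torch


def transpose_list(mylist):
    """[[env1_agent1, env1_agent2, ...], [env2_agent1, ...]] -> per-position lists."""
    return list(map(list, zip(*mylist)))


def transpose_to_tensor(input_list):
    """Same transpose, but each per-agent group is stacked into a float tensor."""
    return [torch.from_numpy(np.stack(group)).float() for group in zip(*input_list)]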
class SoftActorCriticAgent(): def __init__(self): torch.autograd.set_detect_anomaly(True) self.conv_net = ConvNetwork() self.critic_v = StateValueNetwork() self.critic_v_target = StateValueNetwork() self.critic_q_1 = ActionValueNetwork() self.critic_q_2 = ActionValueNetwork() self.actor = PolicyNetwork() self.actor_optim = optim.Adam(self.actor.parameters(), lr=3*10e-4) #0.003 self.v_optim = optim.Adam(self.critic_v.parameters(), lr=0.003) self.q1_optim = optim.Adam(self.critic_q_1.parameters(), lr=0.003) self.q2_optim = optim.Adam(self.critic_q_2.parameters(), lr=0.003) self.gamma = 0.99 self.tau = 0.005 self.batch_size = 16 #256 self.reward_scale = 10 self.replay_buffer = ReplayBuffer(self.batch_size) self.update_target(1) def select_actions(self, state): self.actor.eval() self.conv_net.eval() with torch.no_grad(): state = self.conv_net(state.unsqueeze(0)) mean, log_variance = self.actor.forward(state) variance = log_variance.exp() gaussian = Normal(mean, variance) z = gaussian.sample() actions = torch.tanh(z) actions = actions.cpu().detach().squeeze(0) dim1 = actions[0:3] dim1_p = F.softmax(dim1, 0) action1 = torch.argmax(dim1_p) dim2 = actions[3:6] dim2_p = F.softmax(dim2, 0) action2 = torch.argmax(dim2_p) dim3 = actions[6:8] dim3_p = F.softmax(dim3, 0) action3 = torch.argmax(dim3_p) dim4 = actions[8:11] dim4_p = F.softmax(dim4, 0) action4 = torch.argmax(dim4_p) actions_env_format = [action1.item(), action2.item(), action3.item(), action4.item()] self.actor.train() self.conv_net.train() return actions, numpy.array(actions_env_format) def train(self): if(len(self.replay_buffer.replay_buffer) < self.batch_size): return states, actions, rewards, next_states, dones = self.replay_buffer.sample() states = self.conv_net(states).detach() next_states = self.conv_net(next_states).detach() current_q_1 = self.critic_q_1(states, actions) current_q_2 = self.critic_q_2(states, actions) current_critic_v = self.critic_v(states) mean, variance, z, log_pi = self.actor.sample(states) policy_actions = torch.tanh(z) # r(st,at) +γEst+1∼p[V ̄ψ(st+1)], target_q = rewards * self.reward_scale + (self.gamma * self.critic_v_target(next_states) * (1-dones)) q1_loss = F.mse_loss(current_q_1, target_q.detach()) q2_loss = F.mse_loss(current_q_2, target_q.detach()) self.q1_optim.zero_grad() q1_loss.backward() self.q1_optim.step() self.q2_optim.zero_grad() q2_loss.backward() self.q2_optim.step() q1 = self.critic_q_1(states, policy_actions) q2 = self.critic_q_2(states, policy_actions) predicted_new_q = torch.min(q1, q2) # Eat∼πφ[Qθ(st,at)−logπφ(at|st)] target_critic_v = predicted_new_q - log_pi critic_loss = F.mse_loss(current_critic_v, target_critic_v.detach()) self.v_optim.zero_grad() critic_loss.backward() self.v_optim.step() actor_loss = (log_pi - predicted_new_q).mean() self.actor_optim.zero_grad() actor_loss.backward() self.actor_optim.step() self.update_target(self.tau) def update_target(self, tau): for target_param, param in zip(self.critic_v_target.parameters(), self.critic_v.parameters()): target_param.data.copy_(tau * param.data + (1-tau) * target_param.data)
def main(): seeding() # number of parallel agents parallel_envs = 4 # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 10000 episode_length = 100 batchsize = 1000 # how many episodes to save policy and gif save_interval = 5000 # what is this ? t = 0 # amplitude of OU noise # this slowly decreases to 0 noise = 2 noise_reduction = 0.9999 # how many episodes before update episode_per_update = 2 * parallel_envs log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) torch.set_num_threads(parallel_envs) # this may be a list of all environments env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(5000 * episode_length)) # initialize policy and critic # this creates a list of models, each element in the list refers to an agent in the simulation # [agent_one_ddpg, agent_two_ddpg, ...] # agent_one_ddpg contains the agent actor and critic models,e.g., agent_one_ddpg.actor, agent_one_ddpg.critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] agent2_reward = [] # training loop # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)): # notice we jump forward by number of parallel environments for episode in range(0, number_of_episodes, parallel_envs): timer.update(episode) # i believe there are as many as number of agents times parallel env reward reward_this_episode = np.zeros((parallel_envs, 3)) # obs is the observation state space of all the three agents in the 4 parallel env. # for the Physical Dception environment with three agents it is of dimension 4x3x14. # obs_full is world state irrespective of the agents and its dimension is 4x14. # all_observation = array(number of environments 4, 2 elements) # element 0 : is a list that contains 3 arrays. contains the state for each agent, each state is of size 14 # element 1 : global state from the perspective of the target/green for its environment. contains 14 elements all_obs = env.reset() # obs : is a list that has 1 element per environment. each element contains a list of 3 array. # each array is the state of one agent in that environment. # obs_full: is the god eye view of each environment. 
So it is a list that has 1 element per environment # each element contains an array of 14 values which is the global state of that environment obs, obs_full = transpose_list(all_obs) # for calculating rewards for this particular episode - addition of all time steps # save info or not save_info = (episode % save_interval < parallel_envs or episode == number_of_episodes - parallel_envs) frames = [] tmax = 0 if save_info: frames.append(env.render('rgb_array')) for episode_t in range(episode_length): # we finish the episode before sampling the buffer for training # t jumps forward in multiples of the number of parallel environments t += parallel_envs # explore = only explore for a certain number of episodes # action input needs to be transposed # transpose_to_tensor(obs) changes the data to each agent's point of view # since we have 4 environments, there are 4 agent 1, 4 agent 2, and 4 agent 3 # each agent has a state in each environment, so the total states across 4 environments for agent 1 form a 4x14 tensor # transpose_to_tensor(obs) is a list of 3 elements, one per agent # pick element 1: this is an array of 4x14 elements of agent observations across the 4 environments. # maddpg.act has a for loop that takes each element of obs and passes it to the agents' actor models # to generate an action from each agent's actor. actions = maddpg.act(transpose_to_tensor(obs), noise=noise) noise *= noise_reduction # there are 4 actions per agent and 3 agents, a total of 12. Each action has 2 elements: force in the x and y directions # actions_array is a tensor of shape (3 agents, 4 envs, 2 actions) actions_array = torch.stack(actions).detach().numpy() # transpose the list of lists # flip the first two indices # input to step requires the first index to correspond to the number of parallel agents # the shape of actions_for_env is (4 envs, 3 agents, 2 actions) actions_for_env = np.rollaxis(actions_array, 1) # step forward one frame # obs is the observation state space of all three agents in the 4 parallel envs. # for the Physical Deception environment with three agents it is of dimension 4x3x14. # obs_full is the world state irrespective of the agents and its dimension is 4x14. # To gain more understanding, please see the code in the multiagent folder.
next_obs, next_obs_full, rewards, dones, info = env.step( actions_for_env) # add data to buffer transition = (obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full # save gif frame if save_info: frames.append(env.render('rgb_array')) tmax += 1 # update once after every episode_per_update if len(buffer ) > batchsize and episode % episode_per_update < parallel_envs: for a_i in range(3): # although samples are drawn randomly, for each sample we have all 3 agents data, and we know which # reward and actions belong to which agent # samples is a list of 7 elements: obs, obs_full, action, reward, next_obs, next_obs_full, done # each element of sample, say samples[0] is a list of 3 elements, one for each agent # each agent element contains their corresponding value, for example in case of obs it would be a # vector with 14 values # so when i ask for 2 samples for examples, i get 2 samples each containing all 3 agents states, rewards samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) maddpg.update_targets( ) #soft update the target network towards the actual networks for i in range(parallel_envs): agent0_reward.append(reward_this_episode[i, 0]) agent1_reward.append(reward_this_episode[i, 1]) agent2_reward.append(reward_this_episode[i, 2]) if episode % 100 == 0 or episode == number_of_episodes - 1: avg_rewards = [ np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward) ] agent0_reward = [] agent1_reward = [] agent2_reward = [] for a_i, avg_rew in enumerate(avg_rewards): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) #saving model save_dict_list = [] if save_info: for i in range(3): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # save gif files imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), frames, duration=.04) env.close() logger.close() timer.finish()
class MADDPG: def __init__(self, num_agents, state_size, action_size, hidden_layers, seed, gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, weight_decay=WEIGHT_DECAY, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE): """Initialize MADDPG agent.""" super(MADDPG, self).__init__() self.seed = random.seed(seed) self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.gamma = gamma self.tau = tau self.lr_actor = lr_actor self.lr_critic = lr_critic self.weight_decay = weight_decay self.buffer_size = buffer_size self.batch_size = batch_size self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma, \ tau, lr_actor, lr_critic, weight_decay, seed) \ for _ in range(num_agents)] self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size) def act(self, states): actions = np.zeros([self.num_agents, self.action_size]) for index, agent in enumerate(self.agents): actions[index, :] = agent.act(states[index]) return actions def step(self, states, actions, rewards, next_states, dones): """One step for MADDPG agent, include store the current transition and update parameters.""" self.replay_buffer.add(states, actions, rewards, next_states, dones) if len(self.replay_buffer) > self.batch_size: ''' experiences = self.replay_buffer.sample() states_list, _, _, _, _ = experiences next_actions_list = [self.agents[idx].target_actor(states).detach() \ for idx, states in enumerate(states_list)] for i in range(self.num_agents): self.agents[i].step_learn(experiences, next_actions_list, i) ''' for agent in self.agents: experiences = self.replay_buffer.sample() agent.step_learn(experiences) def save_weights(self): for index, agent in enumerate(self.agents): torch.save( agent.critic.state_dict(), 'agent{}_critic_trained_with_DDPG.pth'.format(index + 1)) torch.save(agent.actor.state_dict(), 'agent{}_actor_trained_with_DDPG.pth'.format(index + 1)) def reset(self): for agent in self.agents: agent.reset()
replay_buffer.add(obs[0], action, rew, new_obs[0], float(done)) obs = new_obs episode_rewards[-1] += rew if done: episode_end = t duration.append(episode_end - episode_start) episode_start = t obs = env.reset() obs = np.expand_dims(np.array(obs), axis=0) episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_indxes = np.ones_like(rewards), None obses_t, obses_tp1 = tf.constant(obses_t), tf.constant(obses_tp1) actions, rewards, dones = tf.constant( actions, dtype=tf.int64), tf.constant(rewards), tf.constant(dones) weights = tf.constant(weights) td_errors = agent.train(obses_t, actions, rewards, obses_tp1, dones, weights) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. agent.update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
class MADDPG(): def __init__(self, num_agents, state_size, action_size, random_seed): super(MADDPG, self).__init__() self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.random_seed = random_seed self.maddpg_agent = [ Agent(self.state_size, self.action_size, self.num_agents * self.state_size, self.num_agents * self.action_size, self.random_seed) for i in range(self.num_agents) ] self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.noise_amplitud = 1 self.noise_reduction = 0.9995 self.t_step = 0 def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) self.t_step += 1 if len(self.memory) > BATCH_SIZE and self.t_step % UPDATE_EVERY == 0: # Learn, if enough samples are available in memory for _ in range(round(UPDATE_AMOUNT)): for agent in range(self.num_agents): experiences = self.memory.sample() self.learn(experiences, agent, GAMMA) self.update_targets() def act(self, states): """get actions from all agents in the MADDPG object""" if self.t_step < NOISE_START: noise_amplitud = 0 else: noise_amplitud = self.noise_amplitud self.noise_amplitud = max( self.noise_amplitud * self.noise_reduction, 0.1) actions = np.array([ agent.act(state, noise_amplitud) for agent, state in zip(self.maddpg_agent, states) ]) return actions def target_actors(self, states): target_actions = torch.cat([ agent.actor_target(states[:, i, :]) for i, agent in enumerate(self.maddpg_agent) ], dim=1) return target_actions def actors(self, states): actions = torch.cat([ agent.actor(states[:, i, :]) for i, agent in enumerate(self.maddpg_agent) ], dim=1) return actions def learn(self, experiences, agent_number, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences agent = self.maddpg_agent[agent_number] # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models target_actions_full = self.target_actors(next_states) next_states_full = next_states.view(-1, self.num_agents * self.state_size) # target_critic_input = torch.cat((next_states_full,target_actions_full), dim = 1) Q_targets_next = agent.critic_target(next_states_full, target_actions_full) # Compute Q targets for current states (y_i) Q_targets = rewards[:, agent_number].view( -1, 1) + (gamma * Q_targets_next * (1 - dones[:, agent_number].view(-1, 1))) # Compute critic loss actions_full = actions.view(-1, self.action_size * self.num_agents) states_full = states.view(-1, self.num_agents * self.state_size) # critic_input = torch.cat((states_full,actions_full), dim = 1) Q_expected = agent.critic(states_full, actions_full) critic_loss = F.mse_loss(Q_expected, Q_targets) # critic_loss = huber_loss(Q_expected, Q_targets.detach()) # Minimize the loss agent.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 1) agent.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_full_pred = self.actors(states) # critic_input_loss = torch.cat((states_batch, actions_full), dim = 1) actor_loss = -agent.critic(states_full, actions_full_pred).mean() # Minimize the loss agent.actor_optimizer.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 1) agent.actor_optimizer.step() def update_targets(self): """soft update target networks""" for agent in self.maddpg_agent: self.soft_update(agent.actor, agent.actor_target, TAU) self.soft_update(agent.critic, agent.critic_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def reset(self): for ddpg_agent in self.maddpg_agent: ddpg_agent.noise.reset()
def run(config): data_folder = Path(config.data_path) building_attributes = data_folder / 'building_attributes.json' solar_profile = data_folder / 'solar_generation_1kW.csv' building_state_actions = 'buildings_state_action_space.json' # building_ids = ["Building_" + str(i) for i in range(1, config.num_buildings + 1)] config.num_buildings = 6 # customized log directory hidden = config.hidden_dim lr = config.lr tau = config.tau gamma = config.gamma batch_size = config.batch_size buffer_length = config.buffer_length to_print = lambda x: str(x) log_path = "log"+"_hidden"+to_print(hidden)+"_lr"+to_print(lr)+"_tau"+to_print(tau)+"_gamma"+to_print(gamma)+\ "_batch_size"+to_print(batch_size)+"_buffer_length"+to_print(buffer_length)+"_TIME_PERIOD_1008_MAXACTION_25"+"/" logger = SummaryWriter(log_dir=log_path) # TODO fix here building_ids = ["Building_" + str(i) for i in [1, 2, 5, 6, 7, 8]] #[1,2,5,6,7,8] env = CityLearn(building_attributes, solar_profile, building_ids, buildings_states_actions=building_state_actions, cost_function=[ 'ramping', '1-load_factor', 'peak_to_valley_ratio', 'peak_demand', 'net_electricity_consumption' ]) observations_spaces, actions_spaces = env.get_state_action_spaces() # Instantiating the control agent(s) if config.agent_alg == 'MADDPG': agents = MA_DDPG(observations_spaces, actions_spaces, hyper_params=vars(config)) else: raise NotImplementedError k, c = 0, 0 cost, cum_reward = {}, {} buffer = ReplayBuffer(max_steps=config.buffer_length, num_agents=config.num_buildings, obs_dims=[s.shape[0] for s in observations_spaces], ac_dims=[a.shape[0] for a in actions_spaces]) # TODO: store np or tensor in buffer? start = time.time() for e in range(config.n_episodes): cum_reward[e] = 0 rewards = [] state = env.reset() statecast = lambda x: [torch.FloatTensor(s) for s in x] done = False ss = 0 while not done: if k % (40000 * 4) == 0: print('hour: ' + str(k) + ' of ' + str(TIME_PERIOD * config.n_episodes)) action = agents.select_action(statecast(state), explore=False) action = [a.detach().numpy() for a in action] # if batch norm: action = [np.squeeze(a, axis=0) for a in action] ss += 1 #print("action is ", action) #print(action[0].shape) #raise NotImplementedError next_state, reward, done, _ = env.step(action) reward = reward_function( reward) # See comments in reward_function.py #buffer_reward = [-r for r in reward] # agents.add_to_buffer() buffer.push(statecast(state), action, reward, statecast(next_state), done) # if (len(buffer) >= config.batch_size and # (e % config.steps_per_update) < config.n_rollout_threads): if len(buffer) >= config.batch_size: if USE_CUDA: agents.to_train(device='gpu') else: agents.to_train(device='cpu') for a_i in range(agents.n_buildings): sample = buffer.sample(config.batch_size, to_gpu=USE_CUDA) agents.update(sample, a_i, logger=logger, global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag='net electric consumption', scalar_value=env.net_electric_consumption[-1], global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag='env cost total', scalar_value=env.cost()['total'], global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag="1 load factor", scalar_value=env.cost()['1-load_factor'], global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag="peak to valley ratio", scalar_value=env.cost()['peak_to_valley_ratio'], global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag="peak demand", scalar_value=env.cost()['peak_demand'], global_step=e * TIME_PERIOD + ss) logger.add_scalar( tag="net energy consumption", 
scalar_value=env.cost()['net_electricity_consumption'], global_step=e * TIME_PERIOD + ss) net_energy_consumption_wo_storage = env.net_electric_consumption[ -1] + env.electric_generation[ -1] - env.electric_consumption_cooling_storage[ -1] - env.electric_consumption_dhw_storage[-1] logger.add_scalar(tag="net energy consumption without storage", scalar_value=net_energy_consumption_wo_storage, global_step=e * TIME_PERIOD + ss) for id, r in enumerate(reward): logger.add_scalar(tag="agent {} reward ".format(id), scalar_value=r, global_step=e * TIME_PERIOD + ss) state = next_state cum_reward[e] += reward[0] k += 1 cur_time = time.time() # print("average time : {}s/iteration at iteration {}".format((cur_time - start) / (60.0 * k), k)) cost[e] = env.cost() if c % 1 == 0: print(cost[e]) # add env total cost and reward logger logger.add_scalar(tag='env cost total final', scalar_value=env.cost()['total'], global_step=e) logger.add_scalar(tag="1 load factor final", scalar_value=env.cost()['1-load_factor'], global_step=e) logger.add_scalar(tag="peak to valley ratio final", scalar_value=env.cost()['peak_to_valley_ratio'], global_step=e) logger.add_scalar(tag="peak demand final", scalar_value=env.cost()['peak_demand'], global_step=e) logger.add_scalar( tag="net energy consumption final", scalar_value=env.cost()['net_electricity_consumption'], global_step=e) net_energy_consumption_wo_storage = env.net_electric_consumption[ -1] + env.electric_generation[ -1] - env.electric_consumption_cooling_storage[ -1] - env.electric_consumption_dhw_storage[-1] logger.add_scalar(tag="net energy consumption without storage", scalar_value=net_energy_consumption_wo_storage, global_step=e) c += 1 rewards.append(reward) end = time.time() print((end - start) / 60.0)
def main(): env_info = env.reset(train_mode=False)[brain_name] num_agents = len(env_info.agents) print('Number of agents:', num_agents) # size of each action action_size = brain.vector_action_space_size print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] seeding() # number of parallel agents #parallel_envs = num_agents # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 10000 update_actor_after = 100 update_actor_every = 2 episode_length = 100 batchsize = 100 # how many episodes to save policy and gif save_interval = 1000 t = 0 LR_ACTOR = 1e-5 LR_CRITIC = 3e-3 # amplitude of OU noise # this slowly decreases to 0 noise = 1.0 noise_reduction = 0.999999 # how many episodes before update episode_per_update = 1 no_of_updates_perTime = 1 log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) #torch.set_num_threads(parallel_envs) #env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(10 * episode_length)) # initialize policy and critic maddpg = MADDPG(lr_actor=LR_ACTOR, lr_critic=LR_CRITIC) #logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] #agent2_reward = [] # training loop # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting for episode in range(0, number_of_episodes): timer.update(episode) env_info = env.reset( train_mode=False)[brain_name] # reset the environment states = env_info.vector_observations # get the current state (for each agent) scores = np.zeros(num_agents) # initialize the score (for each agent) reward_this_episode = np.zeros((1, num_agents)) #all_obs = env.reset() # obs = states obs_full = np.concatenate((states[0], states[1])) #for calculating rewards for this particular episode - addition of all time steps # save info or not save_info = ((episode) % save_interval < 1 or episode == number_of_episodes - 1) tmax = 0 #resetting noise for i in range(num_agents): maddpg.maddpg_agent[i].noise.reset() for episode_t in range(episode_length): t += 1 update_act = True if (episode > update_actor_after or episode % update_actor_every == 0) else False # explore = only explore for a certain number of episodes # action input needs to be transposed actions = maddpg.act(transpose_to_tensorAsitis(obs), noise=noise, batch=False) noise *= noise_reduction actions_array = torch.stack(actions).cpu().detach().numpy() # transpose the list of list # flip the first two indices # input to step requires the first index to correspond to number of parallel agents actions_for_env = np.rollaxis(actions_array, 1) # step forward one frame env_info = env.step(actions_for_env)[brain_name] next_states = env_info.vector_observations # get next state (for each agent) rewards = env_info.rewards # get reward (for each agent) dones = env_info.local_done # see if episode finished scores += env_info.rewards rewards_for_env = np.hstack(rewards) obs = states obs_full = np.concatenate((states[0], states[1])) next_obs = next_states next_obs_full = np.concatenate((next_states[0], next_states[1])) # add data to buffer transition = (np.array([obs]), np.array([obs_full]), 
np.array([actions_for_env]), np.array([rewards_for_env]), np.array([next_obs]), np.array([next_obs_full]), np.array([dones], dtype='float')) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full # update once after every episode_per_update if len(buffer) > batchsize and episode % episode_per_update == 0: for _ in range(no_of_updates_perTime): for a_i in range(num_agents): samples = buffer.sample(batchsize) #updating the weights of the n/w maddpg.update(samples, a_i, update_actor=update_act) maddpg.update_targets( ) #soft update the target network towards the actual networks if np.any(dones): # if the episode is done the loop is break to the next episode break for i in range(num_agents): agent0_reward.append(reward_this_episode[0, 0]) agent1_reward.append(reward_this_episode[0, 1]) if episode % 100 == 0 or episode == number_of_episodes - 1: avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)] agent0_reward = [] agent1_reward = [] for a_i, avg_rew in enumerate(avg_rewards): #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) print('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) #saving model save_dict_list = [] if save_info: for i in range(num_agents): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # save gif files #imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), #frames, duration=.04) timer.finish()
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, memory=None, random_seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory if memory is not None: self.memory = memory else: self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() if add_noise: action += self.noise.sample() self.actor_local.train() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
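# soft_update above implements Polyak averaging, θ_target <- τ*θ_local + (1 - τ)*θ_target.
# A minimal self-contained sketch (names are illustrative) showing the same update as a
# free function, plus a tiny usage example:
import torch.nn as nn


def soft_update_sketch(local_model: nn.Module, target_model: nn.Module, tau: float):
    """Blend local weights into target weights in place."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)


# Usage: with tau = 1e-3 the target network trails the local network slowly, which
# stabilizes the bootstrapped critic targets.
local, target = nn.Linear(4, 2), nn.Linear(4, 2)
target.load_state_dict(local.state_dict())
soft_update_sketch(local, target, tau=1e-3)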
class MADDPGAgent: """Interacts and learns from the environment using multiple DDPG agents""" def __init__(self): """Initialize a MADDPG Agent object.""" super(MADDPGAgent, self).__init__() self.config = Config.getInstance() self.action_num = self.config.action_size * self.config.num_agents self.t_step = 0 self.maddpg_agent = [ DDPGAgent() for _ in range(self.config.num_agents) ] self.memory = ReplayBuffer() def get_actors(self): """get actors of all the agents in the MADDPG object""" actors = [ddpg_agent.actor for ddpg_agent in self.maddpg_agent] return actors # def get_target_actors(self): # """get target_actors of all the agents in the MADDPG object""" # target_actors = [ # ddpg_agent.target_actor for ddpg_agent in self.maddpg_agent] # return target_actors def act(self, obs_all_agents, noise=0.0): """get actions from all agents in the MADDPG object""" actions = [ agent.act(obs, noise) for agent, obs in zip(self.maddpg_agent, obs_all_agents) ] return np.concatenate(actions) def update_act(self, obs_all_agents, agent_num, noise_decay_parameter=0.0): """ get target network actions from all the agents in the MADDPG object """ actions_ = [] for a_i, ddpg_agent in enumerate(self.maddpg_agent): obs = obs_all_agents[:, a_i, :].to(self.config.device) acn = ddpg_agent.actor( obs) + noise_decay_parameter * ddpg_agent.noise.sample() if a_i != agent_num: acn = acn.detach() actions_.append(acn) return actions_ def target_act(self, obs_all_agents, noise=0.0): """ get target network actions from all the agents in the MADDPG object """ target_actions = [ ddpg_agent.target_act(obs_all_agents[:, a_i, :], noise) for a_i, ddpg_agent in enumerate(self.maddpg_agent) ] return target_actions def step(self, _states, _actions, _rewards, _next_states, _dones): """Save experience in replay memory, and use random sample from buffer to learn.""" states_full = np.reshape(_states, newshape=(-1)) next_states_full = np.reshape(_next_states, newshape=(-1)) self.memory.add(_states, states_full, _actions, _rewards, _next_states, next_states_full, _dones) # Learn every UPDATE_EVERY time steps. 
self.t_step = (self.t_step + 1) % self.config.update_every if self.t_step == 0: if len(self.memory) > self.config.batch_size: for a_i in range(self.config.num_agents): samples = self.memory.sample() self.update(samples, a_i) self.update_targets() def update_critic(self, samples, agent_number): """Update critic weights""" states, states_full, actions, rewards, next_states, next_states_full, dones = samples agent = self.maddpg_agent[agent_number] agent.critic_optimizer.zero_grad() # ---------------------------- update critic ---------------------- # actions_next = self.target_act(next_states) actions_next = torch.cat(actions_next, dim=1) Q_target_next = agent.target_critic(next_states_full, actions_next) Q_targets = rewards[:, agent_number].view(-1, 1) + self.config.gamma * \ Q_target_next * (1 - dones[:, agent_number].view(-1, 1)) Q_expected = agent.critic(states_full, actions.reshape(-1, self.action_num)) critic_loss = F.mse_loss(Q_expected, Q_targets) critic_loss.backward() agent.critic_optimizer.step() def update_actor(self, samples, agent_number): """Update actor weights""" states, states_full, actions, rewards, next_states, next_states_full, dones = samples agent = self.maddpg_agent[agent_number] agent.actor_optimizer.zero_grad() actions_pred = self.update_act(states, agent_number) actions_pred = torch.cat(actions_pred, dim=1) actor_loss = -agent.critic(states_full, actions_pred).mean() actor_loss.backward() agent.actor_optimizer.step() def update(self, samples, agent_number): """update the critics and actors of all the agents """ # ---------------------------- update critic ---------------------- # self.update_critic(samples, agent_number) # ---------------------------- update actor ------------------------- # self.update_actor(samples, agent_number) def update_targets(self): """soft update targets""" for ddpg_agent in self.maddpg_agent: soft_update(ddpg_agent.target_actor, ddpg_agent.actor, self.config.tau) soft_update(ddpg_agent.target_critic, ddpg_agent.critic, self.config.tau) def reset(self): """Resets weight of all agents""" for ddpg_agent in self.maddpg_agent: ddpg_agent.reset()
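# In update_critic above, each agent's critic is centralized: it scores the full joint
# observation (states_full) together with the concatenation of every agent's action.
# The sketch below isolates how the per-agent TD target is assembled, assuming rewards
# and dones are (batch, num_agents) tensors and target_critic maps
# (joint_obs, joint_action) -> Q; the function name is illustrative.
import torch


def maddpg_critic_target_sketch(target_critic, next_states_full, next_actions_list,
                                rewards, dones, agent_number, gamma=0.99):
    """y_i = r_i + gamma * Q_target(o'_all, a'_1..a'_N) * (1 - done_i)."""
    joint_next_actions = torch.cat(next_actions_list, dim=1)
    q_next = target_critic(next_states_full, joint_next_actions)
    r_i = rewards[:, agent_number].view(-1, 1)
    d_i = dones[:, agent_number].view(-1, 1)
    return r_i + gamma * q_next * (1 - d_i)


# Usage (schematic): y = maddpg_critic_target_sketch(agent.target_critic,
#     next_states_full, target_actions_per_agent, rewards, dones, agent_number)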
class TD3: def __init__(self, config: Config): self.config = config self.is_training = True # self.buffer = deque(maxlen=self.config.max_buff) self.buffer = ReplayBuffer(self.config.max_buff) self.actor = Actor(self.config.state_dim, self.config.action_dim, self.config.max_action) self.actor_target = Actor(self.config.state_dim, self.config.action_dim, self.config.max_action) self.actor_target.load_state_dict(self.actor.state_dict()) self.actor_optimizer = Adam(self.actor.parameters(), lr=self.config.learning_rate) self.critic_1 = Critic(self.config.state_dim, self.config.action_dim) self.critic_1_target = Critic(self.config.state_dim, self.config.action_dim) self.critic_1_target.load_state_dict(self.critic_1.state_dict()) self.critic_1_optimizer = Adam(self.critic_1.parameters(), lr=self.config.learning_rate) self.critic_2 = Critic(self.config.state_dim, self.config.action_dim) self.critic_2_target = Critic(self.config.state_dim, self.config.action_dim) self.critic_2_target.load_state_dict(self.critic_2.state_dict()) self.critic_2_optimizer = Adam(self.critic_2.parameters(), lr=self.config.learning_rate) self.MseLoss = nn.MSELoss() if self.config.use_cuda: self.cuda() def act(self, state): state = torch.FloatTensor(state.reshape(1, -1)).to(device) action = self.actor(state) return action.cpu().data.numpy().flatten() #.detach() def learning(self, fr, t): for i in range(t): state, action_, reward, next_state, done = self.buffer.sample( self.config.batch_size) state = torch.tensor(state, dtype=torch.float).to(device) next_state = torch.tensor(next_state, dtype=torch.float).to(device) action = torch.tensor(action_, dtype=torch.float).to(device) reward = torch.tensor(reward, dtype=torch.float).reshape( (-1, 1)).to(device) done = torch.tensor(done, dtype=torch.float).reshape( (-1, 1)).to(device) # reward = torch.FloatTensor(reward).reshape((self.config.batch_size,1)).to(device) # done = torch.FloatTensor(done).reshape((self.config.batch_size,1)).to(device) # Select next action according to target policy: noise = torch.tensor(action_, dtype=torch.float).data.normal_( 0, self.config.policy_noise).to(device) noise = noise.clamp(-self.config.noise_clip, self.config.noise_clip) next_action = (self.actor_target(next_state) + noise) next_action = next_action.clamp(-self.config.max_action, self.config.max_action) # Compute target Q-value: target_Q1 = self.critic_1_target(next_state, next_action) target_Q2 = self.critic_2_target(next_state, next_action) target_Q = torch.min(target_Q1, target_Q2) target_Q = reward + ( (1 - done) * self.config.gamma * target_Q).detach() # Optimize Critic 1: current_Q1 = self.critic_1(state, action) loss_Q1 = F.mse_loss(current_Q1, target_Q) self.critic_1_optimizer.zero_grad() loss_Q1.backward() self.critic_1_optimizer.step() # Optimize Critic 2: current_Q2 = self.critic_2(state, action) loss_Q2 = F.mse_loss(current_Q2, target_Q) self.critic_2_optimizer.zero_grad() loss_Q2.backward() self.critic_2_optimizer.step() # Delayed policy updates: if i % self.config.policy_delay == 0: # Compute actor loss: actor_loss = -self.critic_1(state, self.actor(state)).mean() # Optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Polyak averaging update: for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_( (self.config.polyak * target_param.data) + ((1 - self.config.polyak) * param.data)) for param, target_param in zip( self.critic_1.parameters(), 
self.critic_1_target.parameters()): target_param.data.copy_( (self.config.polyak * target_param.data) + ((1 - self.config.polyak) * param.data)) for param, target_param in zip( self.critic_2.parameters(), self.critic_2_target.parameters()): target_param.data.copy_( (self.config.polyak * target_param.data) + ((1 - self.config.polyak) * param.data)) def cuda(self): self.actor.to(device) self.actor_target.to(device) self.critic_1.to(device) self.critic_1_target.to(device) self.critic_2.to(device) self.critic_2_target.to(device) def load_weights(self, model_path): policy = torch.load(model_path) if 'actor' in policy: self.actor.load_state_dict(policy['actor']) else: self.actor.load_state_dict(policy) def save_model(self, output, name=''): torch.save(self.actor.state_dict(), '%s/actor_%s.pkl' % (output, name)) def save_config(self, output): with open(output + '/config.txt', 'w') as f: attr_val = get_class_attr_val(self.config) for k, v in attr_val.items(): f.write(str(k) + " = " + str(v) + "\n") def save_checkpoint(self, fr, output): checkpath = output + '/checkpoint_policy' os.makedirs(checkpath, exist_ok=True) torch.save( { 'frames': fr, 'actor': self.actor.state_dict(), 'critic_1': self.critic_1.state_dict(), 'critic_2': self.critic_2.state_dict(), }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr)) def load_checkpoint(self, model_path): checkpoint = torch.load(model_path) fr = checkpoint['frames'] self.actor.load_state_dict(checkpoint['actor']) self.critic_1.load_state_dict(checkpoint['critic_1']) self.critic_2.load_state_dict(checkpoint['critic_2']) return fr
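# The TD3 learner above combines target policy smoothing (clipped Gaussian noise on
# the target action) with clipped double-Q learning (minimum of two target critics).
# A compact sketch of just the target computation; policy_noise, noise_clip and
# max_action defaults here are illustrative, not the project's configured values.
import torch


def td3_target_sketch(actor_target, critic_1_target, critic_2_target,
                      next_state, reward, done, gamma=0.99,
                      policy_noise=0.2, noise_clip=0.5, max_action=1.0):
    with torch.no_grad():
        mu = actor_target(next_state)
        noise = (torch.randn_like(mu) * policy_noise).clamp(-noise_clip, noise_clip)
        next_action = (mu + noise).clamp(-max_action, max_action)
        q1 = critic_1_target(next_state, next_action)
        q2 = critic_2_target(next_state, next_action)
        target_q = reward + (1 - done) * gamma * torch.min(q1, q2)
    return target_q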
class DQNAgent: """ DQN Agent, valid for discrete actioin space """ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #loss_fn = nn.MSELoss() loss_fn = nn.SmoothL1Loss() iter = 0 def __init__(self, net, o_dim, a_dim, lr=1e-3, batch_size=16, algorithm="ddqn", gamma=0.99, tau=1e-3, buffer_size=int(1e6)): """ o_dim: observation space dim (or # of channels) a_dim: action space dimension """ self.o_dim = o_dim self.a_dim = a_dim self.lr = lr self.batch_size = batch_size self.gamma = gamma self.tau = tau self.buffer_size = buffer_size if algorithm.lower() in ("dqn"): self.algorithm = "dqn" elif algorithm.lower() in ("ddqn", "double dqn", "doubledqn"): self.algorithm = "ddqn" else: raise TypeError("cannot recognize algorithm") self.buffer = ReplayBuffer(buffer_size, batch_size) self.online_net = net(o_dim, a_dim).to(self.device) self.target_net = net(o_dim, a_dim).to(self.device) self.optimizer = optim.Adam(self.online_net.parameters(), lr=lr) def get_action(self, state, eps=0.): """ Epsilon-greedy action selection """ if random.random() > eps: state_tensor = torch.FloatTensor(state).unsqueeze(0).to( self.device) self.online_net.eval() with torch.no_grad(): action = self.online_net(state_tensor).argmax(1).item() self.online_net.train() return action else: return random.choice(np.arange(self.a_dim)) def update(self, experiences): states, actions, rewards, next_states, dones = experiences states = torch.FloatTensor(states).to(self.device) next_states = torch.FloatTensor(next_states).to(self.device) actions = torch.LongTensor(actions).view(-1, 1).to(self.device) rewards = torch.FloatTensor(rewards).view(-1, 1).to(self.device) dones = torch.FloatTensor(dones).view(-1, 1).to(self.device) if self.algorithm == "ddqn": max_actions = self.online_net(next_states).max(1)[1].view(-1, 1) Q_next = self.target_net(next_states).gather(1, max_actions) elif self.algorithm == "dqn": Q_next = self.target_net(next_states).max(1)[0].view(-1, 1) else: raise TypeError("cannot recognize algorithm") Q_targets = rewards + self.gamma * Q_next * (1. - dones) Q_expected = self.online_net(states).gather(1, actions) loss = self.loss_fn(Q_expected, Q_targets.detach()) self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.online_net.parameters(), 10.) self.optimizer.step() def step(self, state, action, reward, next_state, done): self.buffer.push(state, action, reward, next_state, done) if len(self.buffer) > self.batch_size: experiences = self.buffer.sample() self.update(experiences) soft_update(self.target_net, self.online_net, self.tau) self.iter += 1
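# The agent above switches between vanilla DQN and Double DQN targets. The only
# difference is how the next-state action is chosen: DQN takes the max over the target
# network's own Q-values, while Double DQN selects the argmax with the online network
# and evaluates it with the target network. A side-by-side sketch (shapes assumed:
# next_states (B, obs_dim), rewards and dones (B, 1)):
import torch


def dqn_target(target_net, next_states, rewards, dones, gamma=0.99):
    q_next = target_net(next_states).max(1)[0].view(-1, 1)
    return rewards + gamma * q_next * (1.0 - dones)


def double_dqn_target(online_net, target_net, next_states, rewards, dones, gamma=0.99):
    best_actions = online_net(next_states).max(1)[1].view(-1, 1)
    q_next = target_net(next_states).gather(1, best_actions)
    return rewards + gamma * q_next * (1.0 - dones)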
class CnnDDQNAgent: def __init__(self, config: Config): self.config = config self.is_training = True self.buffer = ReplayBuffer(self.config.max_buff) self.model = CnnDQN(self.config.state_shape, self.config.action_dim) self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim) self.target_model.load_state_dict(self.model.state_dict()) self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate) if self.config.use_cuda: self.cuda() def act(self, state, epsilon=None): if epsilon is None: epsilon = self.config.epsilon_min if random.random() > epsilon or not self.is_training: state = torch.tensor(state, dtype=torch.float).unsqueeze(0) if self.config.use_cuda: state = state.cuda() q_value = self.model.forward(state) action = q_value.max(1)[1].item() else: action = random.randrange(self.config.action_dim) return action def learning(self, fr): s0, a, r, s1, done = self.buffer.sample(self.config.batch_size) s0 = torch.tensor(s0, dtype=torch.float) s1 = torch.tensor(s1, dtype=torch.float) a = torch.tensor(a, dtype=torch.long) r = torch.tensor(r, dtype=torch.float) done = torch.tensor(done, dtype=torch.float) if self.config.use_cuda: s0 = s0.cuda() s1 = s1.cuda() a = a.cuda() r = r.cuda() done = done.cuda() q_values = self.model(s0).cuda() next_q_values = self.model(s1).cuda() next_q_state_values = self.target_model(s1).cuda() q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1) next_q_value = next_q_state_values.gather(1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1) expected_q_value = r + self.config.gamma * next_q_value * (1 - done) # Notice that detach the expected_q_value loss = (q_value - expected_q_value.detach()).pow(2).mean() self.model_optim.zero_grad() loss.backward() self.model_optim.step() if fr % self.config.update_tar_interval == 0: self.target_model.load_state_dict(self.model.state_dict()) return loss.item() def cuda(self): self.model.cuda() self.target_model.cuda() def load_weights(self, model_path): model = torch.load(model_path) if 'model' in model: self.model.load_state_dict(model['model']) else: self.model.load_state_dict(model) def save_model(self, output, name=''): torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name)) def save_config(self, output): with open(output + '/config.txt', 'w') as f: attr_val = get_class_attr_val(self.config) for k, v in attr_val.items(): f.write(str(k) + " = " + str(v) + "\n") def save_checkpoint(self, fr, output): checkpath = output + '/checkpoint_model' os.makedirs(checkpath, exist_ok=True) torch.save({ 'frames': fr, 'model': self.model.state_dict() }, '%s/checkpoint_fr_%d.tar'% (checkpath, fr)) def load_checkpoint(self, model_path): checkpoint = torch.load(model_path) fr = checkpoint['frames'] self.model.load_state_dict(checkpoint['model']) self.target_model.load_state_dict(checkpoint['model']) return fr
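# act() above falls back to config.epsilon_min when no epsilon is passed, so the
# exploration schedule lives in the surrounding training loop. A minimal sketch of an
# exponentially decaying schedule that could feed act(state, epsilon) each frame;
# epsilon_start, epsilon_final and epsilon_decay are illustrative names, not the
# project's configuration keys.
import math


def epsilon_by_frame(frame_idx, epsilon_start=1.0, epsilon_final=0.01,
                     epsilon_decay=30000):
    """Smoothly anneal epsilon from epsilon_start toward epsilon_final."""
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1.0 * frame_idx / epsilon_decay)


# Usage: action = agent.act(state, epsilon_by_frame(fr))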
class DQNAgent: def __init__(self, config: Config): self.config = config self.is_training = True self.buffer = ReplayBuffer(self.config.max_buff) self.model = DQN(self.config.state_dim, self.config.action_dim).cuda() self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate) if self.config.use_cuda: self.cuda() def act(self, state, epsilon=None): if epsilon is None: epsilon = self.config.epsilon_min if random.random() > epsilon or not self.is_training: state = torch.tensor(state, dtype=torch.float).unsqueeze(0) if self.config.use_cuda: state = state.cuda() q_value = self.model.forward(state) action = q_value.max(1)[1].item() else: action = random.randrange(self.config.action_dim) return action def learning(self, fr): s0, a, r, s1, done = self.buffer.sample(self.config.batch_size) s0 = torch.tensor(s0, dtype=torch.float) s1 = torch.tensor(s1, dtype=torch.float) a = torch.tensor(a, dtype=torch.long) r = torch.tensor(r, dtype=torch.float) done = torch.tensor(done, dtype=torch.float) if self.config.use_cuda: s0 = s0.cuda() s1 = s1.cuda() a = a.cuda() r = r.cuda() done = done.cuda() q_values = self.model(s0).cuda() next_q_values = self.model(s1).cuda() next_q_value = next_q_values.max(1)[0] q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1) expected_q_value = r + self.config.gamma * next_q_value * (1 - done) # Notice that detach the expected_q_value loss = (q_value - expected_q_value.detach()).pow(2).mean() self.model_optim.zero_grad() loss.backward() self.model_optim.step() return loss.item() def cuda(self): self.model.cuda() def load_weights(self, model_path): if model_path is None: return self.model.load_state_dict(torch.load(model_path)) def save_model(self, output, tag=''): torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, tag)) def save_config(self, output): with open(output + '/config.txt', 'w') as f: attr_val = get_class_attr_val(self.config) for k, v in attr_val.items(): f.write(str(k) + " = " + str(v) + "\n")
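# The loss above regresses Q(s, a) toward r + gamma * max_a' Q(s', a') * (1 - done),
# with the target detached so gradients only flow through Q(s, a). A tiny worked
# example with made-up numbers:
import numpy as np

gamma = 0.99
r = np.array([1.0, 0.0])                       # rewards for two transitions
done = np.array([0.0, 1.0])                    # the second transition is terminal
q_next_max = np.array([2.5, 7.0])              # max_a' Q(s', a') from the network
target = r + gamma * q_next_max * (1 - done)   # -> [3.475, 0.0]
print(target)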
def train_main(exp_prefix="", fc_units=[128, 64, 64], env_list=[], num_envs=10, num_obstacls_ratio=[0.2, 0.3, 0.3, 0.2], n_step=1, max_episodes=10000, max_steps=120, per_num_envs=8, replay_buffer_len=400, no_replay=False, batch_size=64, learning_rate=1e-4, epsilon_min=0.05, epsilon_max=0.10, gamma=0.98, without_map_info=False, save_interval=1000, show=False): # create envs if len(env_list) == 0: env_list = create_or_load_envs(num_envs, num_obstacls_ratio) # create model if without_map_info: state_dims = 2 + 1 else: state_dims = 4 * (2 + 2) + 6 + 2 + 2 act_dims = 5 model = DQNModel(state_dims=state_dims, act_dims=act_dims, fc_units=fc_units) print("create model done") # optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate) optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate) # create replay buffer buffer = ReplayBuffer(replay_buffer_len) print("create buffer done") # construct save path suffix weight_dir = os.path.join("weights", exp_prefix) dir_util.mkpath(weight_dir) log_dir = os.path.join("logs", exp_prefix) dir_util.mkpath(log_dir) summary_writer = tf.summary.create_file_writer(log_dir) # run simulations mean_loss_vals = [] mean_ep_rewards = [] last_save_ep_idx = 0 for ep in range(max_episodes // per_num_envs): if no_replay: buffer.clear() num_new_samples = 0 ep_rewards = [] # randomly select an env and run rollout envs = np.random.choice(env_list, size=(per_num_envs)) env_indices = np.random.randint(len(env_list), size=(per_num_envs)) for roll_idx, env_idx in enumerate(env_indices): env = env_list[env_idx] episode_index = ep * per_num_envs + roll_idx epsilon = epsilon_max - ( epsilon_max - epsilon_min) / max_episodes * episode_index ship_state_trace, input_states, action_list, reward_list, done_list, is_random_act_list, qvals = run_one_episodes( env, model, epsilon, max_steps, without_map_info) # td_errors = (reward_list + qvals[1:] * gamma) - qvals[:-1] td_errors = get_n_step_estimated_qvals(reward_list, qvals[1:], gamma, n_step) - qvals[:-1] buffer.add_items(input_states, action_list, reward_list, done_list, td_errors) num_new_samples += len(input_states) ep_rewards.append(np.sum(reward_list)) print( "episode {:4d}, env-{:03d}, epsilon: {:4.2f}, episode length: {:3d}, ep_reward: {:8.2f}" .format(episode_index, env_idx, epsilon, len(input_states), np.sum(reward_list))) tot_ep_reward = np.sum(reward_list) avg_ep_reward = np.mean(reward_list) with summary_writer.as_default(): tf.summary.scalar('tot_ep_reward_trn', tot_ep_reward, step=episode_index) tf.summary.scalar('avg_ep_reward_trn', avg_ep_reward, step=episode_index) if episode_index % 100 == 0: # run an evaluation (eval_ship_state_trace, eval_input_states, eval_action_list, eval_reward_list, eval_done_list, eval_is_random_act_list, eval_qval_list) = run_one_episodes(env, model, 0, max_steps, without_map_info) # log episode reward with summary_writer.as_default(): eval_tot_ep_reward = np.sum(eval_reward_list) eval_avg_ep_reward = np.mean(eval_reward_list) tf.summary.scalar('tot_ep_reward_evl', eval_tot_ep_reward, step=episode_index) tf.summary.scalar('avg_ep_reward_evl', eval_avg_ep_reward, step=episode_index) # eval the loss eval_states_curr = np.array(eval_input_states[:-1]) eval_states_next = np.array(eval_input_states[1:]) eval_qvals_next = model(eval_states_next, training=False).numpy() eval_qvals_next_max = np.amax( eval_qvals_next, axis=1) * (1 - np.array(eval_done_list)) eval_qvals_esti = get_n_step_estimated_qvals( eval_reward_list, eval_qvals_next_max, gamma, n_step) # to tensor eval_states_curr = 
tf.convert_to_tensor( eval_states_curr, tf.float32) eval_action_list_tf = tf.convert_to_tensor(eval_action_list) eval_qvals_esti = tf.convert_to_tensor(eval_qvals_esti, tf.float32) # eval to get loss eval_loss = eval_step_v0(model, eval_states_curr, eval_action_list_tf, eval_qvals_esti).numpy() with summary_writer.as_default(): tf.summary.scalar('loss_evl', eval_loss, step=episode_index) # draw map and state trace env.show(eval_ship_state_trace, np.sum(eval_reward_list), eval_loss, eval_action_list, eval_is_random_act_list, save_path="pictures", prefix=exp_prefix, count=episode_index) # run update avg_ep_reward = float(np.mean(ep_rewards)) mean_ep_rewards.append(avg_ep_reward) curr_update_loss_vals = [] if no_replay: num_updates = 1 else: num_updates = max( 1, min(num_new_samples, replay_buffer_len) // batch_size) for _ in range(num_updates): # get qvals of next states if no_replay: batch_size = max(1, int(num_new_samples * 0.8)) # overwrite batch_size states_curr, states_next, actions, rewards, dones = buffer.sample( batch_size) states_next = tf.convert_to_tensor(states_next, tf.float32) qvals_next = model(states_next, training=False).numpy() qvals_next = np.amax(qvals_next, axis=1) * (1 - dones) qvals_esti = get_n_step_estimated_qvals(rewards, qvals_next, gamma, n_step) # to tensor states_curr = tf.convert_to_tensor(states_curr, tf.float32) actions = tf.convert_to_tensor(actions) qvals_esti = tf.convert_to_tensor(qvals_esti, tf.float32) # do an update loss_trn = train_step_v0(model, optimizer, states_curr, actions, qvals_esti).numpy() with summary_writer.as_default(): tf.summary.scalar('loss_trn', loss_trn, step=episode_index) curr_update_loss_vals.append(loss_trn) print("episode {:4d}, bs: {:4d}, loss_trn: {:6.2f}".format( episode_index, batch_size, loss_trn)) mean_loss_vals.append(float(np.mean(curr_update_loss_vals))) # draw loss if ep > 0 and ep % 10 == 0: draw_vals(mean_ep_rewards, mean_loss_vals, per_num_envs, exp_prefix=exp_prefix) # save to file for further use json.dump([mean_loss_vals, mean_ep_rewards], open("logs/{}_logs_info.json".format(exp_prefix), "w")) # Save the weights using the `checkpoint_path` format if (episode_index - last_save_ep_idx) > save_interval: save_path = os.path.join( weight_dir, "weights_{:05d}.ckpt".format(episode_index)) model.save_weights(save_path) last_save_ep_idx = episode_index print("episode-{}, save weights to: {}".format( episode_index, save_path))
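# train_main above relies on get_n_step_estimated_qvals, which is not defined in this
# file. The sketch below is one plausible implementation of an n-step bootstrapped
# target that reduces to the commented 1-step case (reward_list + qvals[1:] * gamma)
# when n_step == 1; the project's actual helper may handle episode ends differently.
import numpy as np


def n_step_estimated_qvals_sketch(rewards, next_values, gamma, n_step):
    """For each step t: sum_{k<n} gamma^k * r_{t+k} + gamma^n * V(s_{t+n}),
    truncated at the end of the trajectory."""
    rewards = np.asarray(rewards, dtype=np.float64)
    next_values = np.asarray(next_values, dtype=np.float64)
    T = len(rewards)
    out = np.zeros(T)
    for t in range(T):
        horizon = min(n_step, T - t)
        g = 0.0
        for k in range(horizon):
            g += (gamma ** k) * rewards[t + k]
        g += (gamma ** horizon) * next_values[t + horizon - 1]
        out[t] = g
    return out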
class CnnDDQNAgent: def __init__(self, config: Config): self.config = config self.is_training = True if self.config.prioritized_replay: self.buffer = PrioritizedReplayBuffer( self.config.max_buff, alpha=self.config.prioritized_replay_alpha) if self.config.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = self.config.frames self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.config.prioritized_replay_beta0, final_p=1.0) else: self.buffer = ReplayBuffer(self.config.max_buff) self.beta_schedule = None self.model = CnnDQN(self.config.state_shape, self.config.action_dim) self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim) self.target_model.load_state_dict(self.model.state_dict()) self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate) if self.config.use_cuda: self.cuda() def act(self, state, epsilon=None): if epsilon is None: epsilon = self.config.epsilon_min if random.random() > epsilon or not self.is_training: state = torch.tensor(state, dtype=torch.float).unsqueeze(0) if self.config.use_cuda: state = state.cuda() q_value = self.model.forward(state) action = q_value.max(1)[1].item() else: action = random.randrange(self.config.action_dim) return action def learning(self, fr): if self.config.prioritized_replay: experience = self.buffer.sample(self.config.batch_size, beta=self.beta_schedule.value(fr)) (s0, a, r, s1, done, weights, batch_idxes) = experience else: s0, a, r, s1, done = self.buffer.sample(self.config.batch_size) weights, batch_idxes = np.ones_like(r), None s0 = torch.tensor(s0, dtype=torch.float) s1 = torch.tensor(s1, dtype=torch.float) a = torch.tensor(a, dtype=torch.long) r = torch.tensor(r, dtype=torch.float) done = torch.tensor(done, dtype=torch.float) weights = torch.tensor(weights, dtype=torch.float) if self.config.use_cuda: s0 = s0.cuda() s1 = s1.cuda() a = a.cuda() r = r.cuda() done = done.cuda() weights = weights.cuda() q_values = self.model(s0).cuda() next_q_values = self.model(s1).cuda() next_q_state_values = self.target_model(s1).cuda() q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1) next_q_value = next_q_state_values.gather( 1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1) expected_q_value = r + self.config.gamma * next_q_value * (1 - done) td_errors = next_q_value - expected_q_value # Notice that detach the expected_q_value loss = F.smooth_l1_loss(q_value, expected_q_value.detach(), reduction='none') loss = (loss * weights).mean() self.model_optim.zero_grad() loss.backward() self.model_optim.step() if self.config.prioritized_replay: new_priorities = np.abs(td_errors.detach().cpu().numpy() ) + self.config.prioritized_replay_eps self.buffer.update_priorities(batch_idxes, new_priorities) if fr % self.config.update_tar_interval == 0: self.target_model.load_state_dict(self.model.state_dict()) return loss.item() def cuda(self): self.model.cuda() self.target_model.cuda() def load_weights(self, model_path): model = torch.load(model_path) if 'model' in model: self.model.load_state_dict(model['model']) else: self.model.load_state_dict(model) def save_model(self, output, name=''): torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name)) def save_config(self, output): with open(output + '/config.txt', 'w') as f: attr_val = get_class_attr_val(self.config) for k, v in attr_val.items(): f.write(str(k) + " = " + str(v) + "\n") def save_checkpoint(self, fr, output): checkpath = output + '/checkpoint_model' os.makedirs(checkpath, exist_ok=True) torch.save({ 
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
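# The prioritized-replay branch above anneals the importance-sampling exponent beta
# from prioritized_replay_beta0 toward 1.0 with a LinearSchedule and feeds
# |td_error| + eps back as new priorities. The real LinearSchedule utility lives
# elsewhere in the project; this is an illustrative stand-in with the same call pattern.
class LinearScheduleSketch:
    def __init__(self, schedule_timesteps, initial_p=0.4, final_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        """Linearly interpolate from initial_p to final_p over schedule_timesteps."""
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


# Usage: beta = schedule.value(fr); the sampled batch's importance weights then
# correct for the non-uniform sampling via w_i = (N * P(i)) ** (-beta), normalized
# by the maximum weight in the batch.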
def main(): seeding() number_of_episodes = 20000 episode_length = 1000 batchsize = 256 save_interval = 1000 rewards_deque = deque(maxlen=100) rewards_all = [] noise = 1.0 noise_reduction = 1.0 log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) """ Info about the UnityEnvironment brain_name: 'TennisBrain' brain: ['brain_name', 'camera_resolutions', 'num_stacked_vector_observations', 'number_visual_observations', 'vector_action_descriptions', 'vector_action_space_size', 'vector_action_space_type', 'vector_observation_space_size', 'vector_observation_space_type']] """ env = UnityEnvironment(file_name="Tennis.app") brain_name = env.brain_names[0] brain = env.brains[brain_name] buffer = ReplayBuffer(int(1e5)) # initialize policy and critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) # ------------------------------ training ------------------------------ # # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() for episode in range(1, number_of_episodes + 1): timer.update(episode) rewards_this_episode = np.zeros((2, )) """ Info about the UnityEnvironment env_info: ['agents', 'local_done', 'max_reached', 'memories', 'previous_text_actions', 'previous_vector_actions', 'rewards', 'text_observations', 'vector_observations', 'visual_observations'] actions: List(num_agents=2, action_size=2) states: List((24,), (24,)) rewards: List(2,) dones: List(2,) """ env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations for episode_t in range(episode_length): # reset the OUNoise for each agent. 
for i in range(2): maddpg.maddpg_agent[i].noise.reset() actions = maddpg.act(states, noise=noise) env_info = env.step(actions)[brain_name] noise *= noise_reduction next_states = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done # add data to buffer transition = (states, actions, rewards, next_states, dones) buffer.push(transition) rewards_this_episode += rewards states = next_states if any(dones): break # update the local and target network if len(buffer) > batchsize: # update the local network for _ in range(5): for a_i in range(2): samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) # soft update the target network maddpg.update_targets() rewards_all.append(rewards_this_episode) rewards_deque.append(np.max(rewards_this_episode)) average_score = np.mean(rewards_deque) # --------------------- Logging for TensorBoard --------------------- # logger.add_scalars('rewards', { 'agent0': rewards_this_episode[0], 'agent1': rewards_this_episode[1] }, episode) logger.add_scalars('global', { 'score': np.max(rewards_this_episode), 'average_score': average_score }, episode) # -------------------------- Save the model -------------------------- # save_dict_list = [] if episode % save_interval == 0 or average_score >= 0.5: for i in range(2): save_dict = \ {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()} save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) if average_score >= 3.0: print('\nEnvironment solved in {} episodes!'.format(episode - 100)) print('\nAverage Score: {:.2f}'.format(average_score)) break env.close() logger.close() timer.finish()
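# The checkpoint above stores one dict per agent (actor/critic weights plus their
# optimizer states) in a single episode-{N}.pt file. A minimal restore sketch,
# assuming a freshly constructed MADDPG() with the same architecture; the function
# name is illustrative and not part of the project.
import os
import torch


def load_maddpg_checkpoint_sketch(maddpg, model_dir, episode):
    path = os.path.join(model_dir, 'episode-{}.pt'.format(episode))
    save_dict_list = torch.load(path, map_location='cpu')
    for agent, save_dict in zip(maddpg.maddpg_agent, save_dict_list):
        agent.actor.load_state_dict(save_dict['actor_params'])
        agent.actor_optimizer.load_state_dict(save_dict['actor_optim_params'])
        agent.critic.load_state_dict(save_dict['critic_params'])
        agent.critic_optimizer.load_state_dict(save_dict['critic_optim_params'])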