class DDPG_Agent:
    """DDPG algorithm with a twin critic."""

    def __init__(self,
                 state_size,
                 action_size,
                 actor_model,
                 critic_model,
                 device,
                 num_agents=1,
                 seed=0,
                 tau=1e-3,
                 batch_size=1024,
                 discount_factor=0.99,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3):
        """Initialize the networks, the replay buffer and the noise process.

        Builds the actor pair (local and target) and two critic pairs, then
        copies the local weights into the targets.

        Args:
            state_size: dimension of each state.
            action_size: dimension of each action.
            actor_model: class used to build the actor networks.
            critic_model: class used to build the critic networks.
            device: torch device the networks are moved to.
            num_agents: number of parallel (non-interacting) agents.
            seed: random seed.
            tau: soft-update interpolation factor.
            batch_size: mini-batch size for learning.
            discount_factor: reward discount factor (gamma).
            actor_learning_rate: learning rate of the actor optimizer.
            critic_learning_rate: learning rate of the critic optimizers.
        """
        self.tau = tau
        self.state_size = state_size
        self.action_size = action_size
        self.actor_local = actor_model(state_size, action_size, seed)
        self.actor_target = actor_model(state_size, action_size, seed)
        self.critic_local = critic_model(state_size, action_size, seed)
        self.critic_target = critic_model(state_size, action_size, seed)
        self.critic2_local = critic_model(state_size, action_size, seed + 1)
        self.critic2_target = critic_model(state_size, action_size, seed + 1)
        # With tau=1.0 this is a hard copy: targets start identical to locals.
        self.soft_update(1.0)

        self.batch_size = batch_size
        self.replayBuffer = ReplayBuffer(batch_size=batch_size,
                                         buffer_size=300 * 1000,
                                         seed=seed,
                                         device=device)
        self.num_agents = num_agents
        self.noise_process = OUNoise(action_size * num_agents,
                                     seed,
                                     max_sigma=0.1,
                                     min_sigma=0.001,
                                     decay_period=300 * 300)
        self.discount_factor = discount_factor
        self.actor_opt = optim.Adam(self.actor_local.parameters(),
                                    lr=actor_learning_rate)
        self.critic_opt = optim.Adam(self.critic_local.parameters(),
                                     lr=critic_learning_rate)
        self.critic2_opt = optim.Adam(self.critic2_local.parameters(),
                                      lr=critic_learning_rate)
        self.critic_criterion = nn.MSELoss()
        self.critic2_criterion = nn.MSELoss()
        self.device = device
        for model in [
                self.actor_local, self.actor_target, self.critic_local,
                self.critic_target, self.critic2_local, self.critic2_target
        ]:
            model.to(device)

    def act(self, state, add_noise=True):
        """Create actions with the local actor network, optionally adding noise.

        Args:
            state: numpy array of shape (num_agents, state_size).
            add_noise: whether to add exploration noise to the actions.

        Returns:
            actions_with_noise: numpy array of shape (num_agents, action_size),
                or None if add_noise is False.
            actions_without_noise: numpy array of shape (num_agents, action_size).
        """
        state = torch.from_numpy(state).float().view(
            self.num_agents, self.state_size).to(self.device)
        self.actor_local.eval()
        actions_with_noise = None
        actions_without_noise = None
        with torch.no_grad():
            actions = self.actor_local(state)
            actions_without_noise = actions.cpu().numpy()
        self.actor_local.train()
        if add_noise:
            noise = self.noise_process.sample().reshape(self.num_agents,
                                                        self.action_size)
            actions_with_noise = actions_without_noise + noise
        return actions_with_noise, actions_without_noise

    def step(self, state, action, reward, next_state, done):
        """Save a sample in the replay buffer and learn once it holds more
        than one batch.

        Args:
            state, action, reward, next_state, done: one transition.

        Returns:
            None
        """
        self.replayBuffer.push(state, action, reward, next_state, done)
        if len(self.replayBuffer) > self.batch_size:
            self.learn(*self.replayBuffer.sample())

    def learn(self, states, actions, rewards, next_states, dones):
        """Update the critics and the actor from a sampled batch.

        * Build the target y from the reward and the target critic/actor networks.
        * Minimize the MSE between y and each local critic's estimate.
        * Update the actor with the sampled policy gradient (the critic is
          differentiated through by the chain rule).
        * Soft-update the target networks.

        Args:
            states, actions, rewards, next_states, dones: a sampled batch.

        Returns:
            None
        """
        # Update critics. The two target critics are averaged; a TD3-style
        # minimum is kept below as a commented-out alternative.
        next_actions = self.actor_target(next_states)
        value = (self.critic_target(next_states, next_actions).detach() +
                 self.critic2_target(next_states, next_actions).detach()) / 2.0
        # value = torch.min(self.critic_target(next_states, next_actions).detach(),
        #                   self.critic2_target(next_states, next_actions).detach())
        # Note: terminal transitions are not masked here (dones is unused).
        y = rewards + self.discount_factor * value
        Q = self.critic_local(states, actions)
        critic_loss = self.critic_criterion(Q, y)
        Q2 = self.critic2_local(states, actions)
        critic2_loss = self.critic2_criterion(Q2, y)

        # Update actor
        action_predictions = self.actor_local(states)
        actor_loss = -self.critic_local(states, action_predictions).mean()

        # Apply the updates
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        self.critic2_opt.zero_grad()
        critic2_loss.backward()
        self.critic2_opt.step()

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # Soft-update the targets
        self.soft_update(self.tau)

    def reset(self):
        self.noise_process.reset()

    def soft_update(self, tau):
        """Polyak-average the local parameters into the target networks."""
        for target_param, local_param in zip(self.actor_target.parameters(),
                                             self.actor_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.critic_target.parameters(),
                                             self.critic_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.critic2_target.parameters(),
                                             self.critic2_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
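# The agent above relies on two helpers defined elsewhere in this repository:
# ReplayBuffer (push / sample / __len__) and OUNoise. As a point of reference
# only, here is a minimal Ornstein-Uhlenbeck noise sketch compatible with the
# constructor arguments used above (max_sigma, min_sigma, decay_period, and
# the decay_delay used by the MADDPG agent below). The repository's actual
# OUNoise implementation may differ; the class name is deliberately distinct.
import numpy as np


class OUNoiseSketch:
    def __init__(self, size, seed, mu=0.0, theta=0.15, max_sigma=0.1,
                 min_sigma=0.001, decay_period=300 * 300, decay_delay=0):
        self.size = size
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.decay_delay = decay_delay
        self.rng = np.random.default_rng(seed)
        self.t = 0
        self.reset()

    def reset(self):
        # Reset the process state to the mean; the sigma schedule keeps decaying.
        self.state = self.mu.copy()

    def sample(self):
        # One Euler step of the OU process; sigma anneals linearly from
        # max_sigma to min_sigma after an optional delay.
        self.t += 1
        frac = max(0.0, self.t - self.decay_delay) / self.decay_period
        sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, frac)
        dx = self.theta * (self.mu - self.state) + \
            sigma * self.rng.standard_normal(self.size)
        self.state = self.state + dx
        return self.state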
parser.add_argument(
    '-l', '--load', type=int,
    help="Indicates the step to start from; the file must exist")
args = parser.parse_args()

start = 0
scenario = scenarios.load(args.scenario).Scenario()
world = scenario.make_world()
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                    scenario.observation)
policies = MultiAgent(env, "Multi-Agent")
REPLAY_BUFFER = ReplayBuffer(env, GeneralConfig())

if args.load is not None:
    start = args.load
    if args.dir is None:
        print("[!] Please indicate a path for model storing")
        exit(1)
    policies.load(args.dir, start)

for i in range(start, start + args.n_round):
    play(i, env, policies)
    if (i + 1) % args.every == 0:
        policies.save(args.dir, i)
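# Assumed invocation pattern for the training driver above. Only -l/--load is
# visible in this fragment; the flag spellings for scenario, dir, n_round and
# every are defined elsewhere in the script, so the example below is a guess,
# not a verified command line:
#
#   python train.py --scenario <scenario_name> --dir ./models \
#       --n_round 1000 --every 50 -l 500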
class MADDPG:
    """DDPG for multiple interacting agents."""

    def __init__(self,
                 state_size,
                 action_size,
                 actor_model,
                 critic_model,
                 device,
                 num_agents=1,
                 num_interacting_agents=2,
                 seed=0,
                 tau=1e-3,
                 batch_size=1024,
                 discount_factor=0.99,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 replayBuffer=None):
        """Initialize the networks, the replay buffer and the noise process.

        Builds the actor pair (local and target) and two critic pairs, then
        copies the local weights into the targets. The critics take the states
        and actions of all interacting agents.

        Args:
            state_size: dimension of each agent's state.
            action_size: dimension of each agent's action.
            actor_model: class used to build the actor networks.
            critic_model: class used to build the critic networks.
            device: torch device the networks are moved to.
            num_agents: number of parallel copies of this agent.
            num_interacting_agents: number of interacting agents.
            seed: random seed.
            tau: soft-update interpolation factor.
            batch_size: mini-batch size for learning.
            discount_factor: reward discount factor (gamma).
            actor_learning_rate: learning rate of the actor optimizer.
            critic_learning_rate: learning rate of the critic optimizers.
            replayBuffer: optional, externally provided replay buffer.
        """
        self.tau = tau
        self.state_size = state_size
        self.action_size = action_size
        actor_layers = [32, 64, 64]
        critic_layers = [64, 16, 128, 16]  # [64, 64, 256, 32]
        self.actor_local = actor_model(state_size,
                                       action_size,
                                       seed,
                                       layer_sizes=actor_layers)
        self.actor_target = actor_model(state_size,
                                        action_size,
                                        seed,
                                        layer_sizes=actor_layers)
        self.num_interacting_agents = num_interacting_agents
        self.critic_local = critic_model(state_size,
                                         state_size,
                                         action_size,
                                         action_size,
                                         seed,
                                         layer_sizes=critic_layers)
        self.critic_target = critic_model(state_size,
                                          state_size,
                                          action_size,
                                          action_size,
                                          seed,
                                          layer_sizes=critic_layers)
        self.critic2_local = critic_model(state_size,
                                          state_size,
                                          action_size,
                                          action_size,
                                          seed + 1,
                                          layer_sizes=critic_layers)
        self.critic2_target = critic_model(state_size,
                                           state_size,
                                           action_size,
                                           action_size,
                                           seed + 1,
                                           layer_sizes=critic_layers)
        # With tau=1.0 this is a hard copy: targets start identical to locals.
        self.soft_update(1.0)

        self.batch_size = batch_size
        if replayBuffer is None:
            self.replayBuffer = ReplayBuffer(batch_size=batch_size,
                                             buffer_size=300 * 1000,
                                             seed=seed,
                                             device=device)
        else:
            self.replayBuffer = replayBuffer
        self.num_agents = num_agents
        self.noise_process = OUNoise(action_size * num_agents,
                                     seed,
                                     max_sigma=0.1,
                                     min_sigma=0.001,
                                     decay_period=30 * 300,
                                     decay_delay=4048 / 30)
        self.discount_factor = discount_factor
        self.actor_opt = optim.Adam(self.actor_local.parameters(),
                                    lr=actor_learning_rate)
        self.critic_opt = optim.Adam(self.critic_local.parameters(),
                                     lr=critic_learning_rate)
        self.critic2_opt = optim.Adam(self.critic2_local.parameters(),
                                      lr=critic_learning_rate)
        self.critic_criterion = nn.MSELoss()
        self.critic2_criterion = nn.MSELoss()
        self.device = device
        self.other = None
        for model in [
                self.actor_local, self.actor_target, self.critic_local,
                self.critic_target, self.critic2_local, self.critic2_target
        ]:
            model.to(device)

    def set_other_agent(self, agent):
        self.other = agent

    def act(self, state, add_noise=True):
        """Create actions with the local actor network, optionally adding noise.

        Args:
            state: numpy array of shape (state_size,).
            add_noise: whether to add exploration noise to the actions.

        Returns:
            actions_with_noise: numpy array of shape (num_agents, action_size),
                or None if add_noise is False.
            actions_without_noise: numpy array of shape (num_agents, action_size).
        """
        state = torch.from_numpy(state).float().view(
            self.num_agents, self.state_size).to(self.device)
        self.actor_local.eval()
        actions_with_noise = None
        actions_without_noise = None
        with torch.no_grad():
            actions = self.actor_local(state)
            actions_without_noise = actions.cpu().numpy()
        self.actor_local.train()
        if add_noise:
            noise = self.noise_process.sample().reshape(self.num_agents,
                                                        self.action_size)
            actions_with_noise = actions_without_noise + noise
        return actions_with_noise, actions_without_noise

    def step(self, this_state, others_state, this_action, others_action,
             reward, this_next_states, others_next_states, done):
        """Save a sample in the replay buffer and learn once it holds more
        than two batches.

        Args:
            this_state: numpy array of shape (1, state_size).
            others_state: numpy array of shape (1, state_size * num_other_agents).
            this_action: numpy array of shape (1, action_size).
            others_action: numpy array of shape (1, action_size * num_other_agents).
            reward: reward of this agent.
            this_next_states: same as this_state, but for the next time step.
            others_next_states: same as others_state, but for the next time step.
            done: done flag of this agent.

        Returns:
            None
        """
        self.replayBuffer.push(state=np.hstack((this_state, others_state)),
                               action=np.hstack((this_action, others_action)),
                               reward=reward,
                               next_states=np.hstack(
                                   (this_next_states, others_next_states)),
                               next_actions=None,
                               done=done)
        if len(self.replayBuffer) > self.batch_size * 2:
            self.learn(*self.replayBuffer.sample())

    def learn(self, states, actions, rewards, next_states, next_actions,
              dones):
        """Update the critics and the actor from a sampled batch.

        * Build the target y from the reward and the target critics, using
          both agents' actors to predict the next joint action.
        * Minimize the MSE between y and each local critic's estimate.
        * Update the actor with the sampled policy gradient.
        * Soft-update the target networks.

        Args:
            states: states of all agents, concatenated row-wise.
            actions: actions of all agents, concatenated row-wise.
            rewards: rewards of this agent.
            next_states: next states of all agents, concatenated row-wise.
            next_actions: same layout as actions (unused; recomputed below).
            dones: done flags.

        Returns:
            None
        """
        all_states = states
        this_state = states[:, 0:self.state_size]
        next_actions = self.actor_local(next_states[:, 0:self.state_size])
        next_actions = torch.cat(
            (next_actions,
             self.other.actor_local(next_states[:, self.state_size:])),
            1).detach()

        # Update critics (average of the two target critics).
        value = (self.critic_target(next_states, next_actions).detach() +
                 self.critic2_target(next_states, next_actions).detach()) / 2.0
        # Note: terminal transitions are not masked here (dones is unused).
        y = rewards + self.discount_factor * value
        Q = self.critic_local(all_states, actions)
        critic_loss = self.critic_criterion(Q, y)
        Q2 = self.critic2_local(all_states, actions)
        critic2_loss = self.critic2_criterion(Q2, y)

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        self.critic2_opt.zero_grad()
        critic2_loss.backward()
        self.critic2_opt.step()

        # Update actor: re-predict this agent's action and keep the other
        # agent's slice of the detached predicted actions fixed.
        action_predictions = self.actor_local(this_state)
        actions_pred_and_others = torch.cat(
            (action_predictions, next_actions[:, self.action_size:]), dim=1)
        actor_loss = (
            -self.critic_local(all_states, actions_pred_and_others).mean() -
            self.critic2_local(all_states, actions_pred_and_others).mean()) * 0.5

        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # Soft-update the targets
        self.soft_update(self.tau)

    def reset(self):
        self.noise_process.reset()

    def soft_update(self, tau):
        """Polyak-average the local parameters into the target networks."""
        for target_param, local_param in zip(self.actor_target.parameters(),
                                             self.actor_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.critic_target.parameters(),
                                             self.critic_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.critic2_target.parameters(),
                                             self.critic2_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_agent(self, file_name):
        """Save the local networks and optimizers to the first unused
        {file_name}_{i}.pth path."""
        i = 0
        path = None
        while True:
            if not os.path.isfile(f'{file_name}_{i}.pth'):
                path = f'{file_name}_{i}.pth'
                break
            else:
                i += 1
        torch.save(
            {
                'actor_local': self.actor_local.state_dict(),
                'critic_local': self.critic_local.state_dict(),
                'critic2_local': self.critic2_local.state_dict(),
                'actor_opt': self.actor_opt.state_dict(),
                'critic_opt': self.critic_opt.state_dict(),
                'critic2_opt': self.critic2_opt.state_dict()
            }, path)

    def load_agent(self, file_name, is_exact_path=False):
        """Load the latest {file_name}_{i}.pth checkpoint, or file_name itself
        when is_exact_path is True."""
        i = 0
        path = file_name
        while not is_exact_path:
            if os.path.isfile(f'{file_name}_{i}.pth'):
                path = f'{file_name}_{i}.pth'
                i += 1
            else:
                break
        ckpt = torch.load(path)
        self.actor_local.load_state_dict(ckpt['actor_local'])
        self.critic_local.load_state_dict(ckpt['critic_local'])
        self.critic2_local.load_state_dict(ckpt['critic2_local'])
        self.actor_opt.load_state_dict(ckpt['actor_opt'])
        self.critic_opt.load_state_dict(ckpt['critic_opt'])
        self.critic2_opt.load_state_dict(ckpt['critic2_opt'])
        self.actor_local.to(self.device)
        self.critic_local.to(self.device)
        self.critic2_local.to(self.device)
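# Hypothetical usage sketch (not taken from this repository): constructing two
# interacting MADDPG agents and pointing them at each other via
# set_other_agent, which learn() relies on to predict the other agent's next
# action. `Actor` and `Critic` stand in for the repository's actual model
# classes, and the state/action sizes are placeholders.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent_a = MADDPG(state_size=24, action_size=2, actor_model=Actor,
                 critic_model=Critic, device=device, seed=0)
agent_b = MADDPG(state_size=24, action_size=2, actor_model=Actor,
                 critic_model=Critic, device=device, seed=1)
agent_a.set_other_agent(agent_b)
agent_b.set_other_agent(agent_a)
# Each agent then stores transitions from its own perspective, e.g.:
# agent_a.step(this_state, others_state, this_action, others_action,
#              reward, this_next_state, others_next_state, done)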
class DQN(object):
    def __init__(self, nn_module):
        self.eval_net, self.target_net = nn_module(), nn_module()
        self.eval_net.initialize_weights()
        self.target_net.load_state_dict(self.eval_net.state_dict())
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()
        self.consider_tasks, self.edges_num = CONSIDER_TASKS, EDGES_NUM
        self.bitrate_type, self.resolution_type = BIRATE_TYPE, RESOLUTION_TYPE
        self.learn_step_counter = 0  # for target updating
        self.memory_counter = 0  # for storing memory
        self.memory_size = MEMORY_SIZE
        self.memory = ReplayBuffer(MEMORY_SIZE)

    def soft_update(self):
        # Despite its name, this is a hard update: the target network is
        # overwritten with the evaluation network's weights.
        self.target_net.load_state_dict(self.eval_net.state_dict())

    def anneal_epsilon(self, anneal=0.999):
        # Decay the exploration rate until it reaches a floor of 0.1.
        if self.eval_net.epsilon <= 0.1:
            return
        self.eval_net.epsilon = self.eval_net.epsilon * anneal

    def train(self):
        self.learn_step_counter += 1
        # Wait until enough samples are available, and learn only every 5 steps.
        if self.learn_step_counter < BATCH_SIZE or self.learn_step_counter % 5 != 0:
            return

        # Target network parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

        # Sample a batch
        b_s, b_a, b_r, b_s_ = self.memory.sample(BATCH_SIZE)
        b_s = state2tensor(b_s, is_batch=True)
        b_a = [[bb[0], bb[1], bb[2]] for bb in b_a]  # adjust the subscripts

        # q_eval w.r.t. the actions in the sample
        g = torch.from_numpy(np.array(b_a)[:, 0:2])
        self.eval_net.train()
        if type(self.eval_net) == TSNet:
            q_eval = self.eval_net(b_s, g)  # q_eval.shape: (batch, data)
            b_a = [bb[-1:] for bb in b_a]
        else:
            q_eval = self.eval_net(b_s)
            b_a = [bb[:2] for bb in b_a]
        q_eval_wrt_a = torch.gather(q_eval, 1,
                                    index=torch.LongTensor(np.array(b_a)))
        q_eval_wrt_a = q_eval_wrt_a.sum(dim=1).unsqueeze(0).t()

        # q_target with the maximum q of next_state
        if type(self.eval_net) == TSNet:
            q_target = torch.FloatTensor(list(b_r)).unsqueeze(0).t()
        else:
            b_s_ = state2tensor(b_s_, is_batch=True)
            # detach() from the graph so the target network is not back-propagated
            q_next = self.target_net(b_s_).detach()
            max_q_next = torch.cat([
                q_next[:, 0:BIRATE_TYPE].max(1)[0],
                q_next[:, -EDGES_NUM:].max(1)[0]
            ], dim=0).reshape(2, BATCH_SIZE).t()
            max_q_next = max_q_next.sum(dim=1).unsqueeze(0).t()
            b_r = torch.FloatTensor(list(b_r)).unsqueeze(0).t()
            q_target = b_r + DISCOUNT * max_q_next

        # MSE loss and optimization step
        loss = self.loss_func(q_eval_wrt_a, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
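# Optional alternative (an assumption, not part of the original DQN class): a
# true Polyak soft update in the style of the DDPG/MADDPG agents above, for
# use when a gradual target drift is preferred over DQN.soft_update's hard
# copy every TARGET_REPLACE_ITER steps.
def polyak_update(target_net, eval_net, tau=1e-3):
    """Blend eval_net's parameters into target_net in place."""
    for target_param, eval_param in zip(target_net.parameters(),
                                        eval_net.parameters()):
        target_param.data.copy_(tau * eval_param.data +
                                (1.0 - tau) * target_param.data)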