def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    envs = SubprocVecEnv([make_env(env_name, i) for i in range(num_envs)])
    env = gym.make(env_name)

    num_inputs = envs.observation_space.shape
    num_outputs = envs.action_space.shape
    model = ActorCritic(num_inputs[0], num_outputs[0]).to(device)

    if os.path.isfile(modelpath):
        model.load_state_dict(torch.load(modelpath))

    ppo = PPO(model=model, envs=envs, device=device, lr=lr, modelpath=modelpath)

    if not play_mode:
        ppo.ppo_train(num_steps, mini_batch_size, ppo_epochs, max_frames,
                      max_pol_updates, save_interval,
import numpy as np
import torch


class Agent:
    def __init__(self, cell_nb, lr=4e-3, nb_blocks=5, gamma=0.99):
        self.cell_nb = cell_nb
        self.gamma = gamma
        self.ActorCritic = ActorCritic(lr, cell_nb**2, nb_blocks)
        self.log_probs = None

    def choose_action(self, state):
        # here state is simply the current f_map
        state_tensor = torch.tensor([state], dtype=torch.float).to(
            self.ActorCritic.device)
        (mu, sigma), _ = self.ActorCritic.forward(state_tensor)
        actions = np.zeros((self.cell_nb, self.cell_nb))
        log_probs = []
        for ir, (mu_r, sig_r) in enumerate(zip(mu, sigma)):
            for ic, (mu_c, sig_c) in enumerate(zip(mu_r, sig_r)):
                # mu_c and sig_c are the mu and sigma parameters of the Gaussian
                # distribution for the current cell
                sig_c = torch.exp(sig_c)
                dist = torch.distributions.Normal(mu_c, sig_c)
                action = dist.sample()
                log_prob = dist.log_prob(action)
                # bound the normalized transmit power between 0 and 1
                actions[ir, ic] = torch.sigmoid(action).item()
                # keep the log-probability as a tensor so the actor loss can
                # backpropagate through the policy later
                log_probs.append(log_prob)
        self.log_probs = torch.stack(log_probs).view(self.cell_nb, self.cell_nb)
        return actions

    def learn(self, episode):
        self.ActorCritic.optimizer.zero_grad()
        # s is the state, in the simplest case it is the f_map
        f_map = torch.tensor(episode["s"], dtype=torch.float).to(
            self.ActorCritic.device)  # current f_map
        r = torch.tensor(episode["r"], dtype=torch.float).to(
            self.ActorCritic.device)  # the embedded objective (sum-rate, capacity, SINR...)
        d = torch.tensor(episode["d"], dtype=torch.bool).to(
            self.ActorCritic.device)  # done flag, not strictly necessary
        f_map_ = torch.tensor(episode["s_"], dtype=torch.float).to(
            self.ActorCritic.device)  # new f_map
        lg_p = self.log_probs.to(
            self.ActorCritic.device)  # log_probs as given by choose_action

        # get critic values for the current and next state
        _, val = self.ActorCritic.forward(f_map)
        _, val_ = self.ActorCritic.forward(f_map_)
        # set the value of the next state to 0 if done
        val_[d] = 0.0

        # compute the temporal-difference error
        delta = r + self.gamma * val_ - val
        # delta is detached in the actor term so it acts only as a weight on the log-probs
        actor_loss = -torch.mean(lg_p.flatten() * delta.detach())
        critic_loss = torch.mean(delta**2)
        (actor_loss + critic_loss).backward()
        self.ActorCritic.optimizer.step()

    def compute_loss(self, gains, policy):
        # transmit powers chosen by the current policy for every device
        ps = np.array([d.getPowerFromPolicy(policy) for d in self.S.dList()])
        H = gains
        # per-link rate; the denominator sums the received power of all links
        rate = [
            np.log(1 + (H[i, i]**2 * p_ /
                        sum(H[i, j]**2 * p for j, p in enumerate(ps))))
            for i, p_ in enumerate(ps)
        ]
        # negative sum-rate, so minimizing this loss maximizes the sum-rate
        return -np.sum(rate)
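For context, a minimal sketch of how this Agent could be driven is given below. PowerControlEnv, n_episodes, and the reset/step interface returning the f_map, the reward, and the done flag are hypothetical names introduced only for illustration; they are not part of the code above.

# Illustrative only: a one-step actor-critic loop around the Agent class.
# "PowerControlEnv" is a hypothetical environment exposing the f_map as its state.
agent = Agent(cell_nb=4)
env = PowerControlEnv(cell_nb=4)          # hypothetical

for episode_idx in range(n_episodes):     # n_episodes: illustrative constant
    f_map = env.reset()
    done = False
    while not done:
        actions = agent.choose_action(f_map)       # sample per-cell transmit powers
        f_map_, reward, done = env.step(actions)   # apply powers, observe new f_map
        agent.learn({"s": f_map, "r": reward, "d": done, "s_": f_map_})
        f_map = f_map_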
def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    envs = SubprocVecEnv([make_env(env_name, i) for i in range(num_envs)])
    env = gym.make(env_name)

    img_size = envs.observation_space[0].shape
    sensor_size = envs.observation_space[1].shape
    num_outputs = envs.action_space.shape
    model = ActorCritic([img_size[1], img_size[0]], sensor_size[0],
                        num_outputs[0]).to(device)

    if args.onnx_converter and os.path.isfile(modelpath):
        model.load_state_dict(torch.load(modelpath))
        model.export("gvsets_early_fusion.onnx")
        exit(1)

    if os.path.isfile(modelpath):
        model.load_state_dict(torch.load(modelpath))

    ppo = PPO(model=model, envs=envs, device=device, lr=lr, modelpath=modelpath,
              tuple_ob=True)
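The call model.export("gvsets_early_fusion.onnx") belongs to this ActorCritic class and is not shown here. A minimal sketch of what such an export could wrap, assuming a forward pass that takes an image tensor and a sensor tensor, is given below; the helper name, dummy-input shapes, and input/output names are illustrative assumptions, not the actual implementation.

# Hypothetical sketch of an ONNX export helper for a two-branch (image + sensor) model.
import torch

def export_onnx(model, path, img_h, img_w, sensor_size):
    device = next(model.parameters()).device
    dummy_img = torch.zeros(1, 3, img_h, img_w, device=device)   # assumed image observation
    dummy_sensor = torch.zeros(1, sensor_size, device=device)    # assumed sensor observation
    torch.onnx.export(model, (dummy_img, dummy_sensor), path,
                      input_names=["image", "sensor"],
                      output_names=["policy", "value"])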