class DecoupledA3CAgent:

    def __init__(self, env, gamma, lr, global_max_episode):
        self.env = env
        self.gamma = gamma
        self.lr = lr
        self.global_episode = mp.Value('i', 0)
        self.GLOBAL_MAX_EPISODE = global_max_episode

        self.global_value_network = ValueNetwork(self.env.observation_space.shape[0], 1)
        self.global_value_network.share_memory()
        self.global_policy_network = PolicyNetwork(self.env.observation_space.shape[0], self.env.action_space.n)
        self.global_policy_network.share_memory()
        self.global_value_optimizer = optim.Adam(self.global_value_network.parameters(), lr=lr)
        self.global_policy_optimizer = optim.Adam(self.global_policy_network.parameters(), lr=lr)

        self.workers = [DecoupledWorker(i, env, self.gamma, self.global_value_network,
                                        self.global_policy_network, self.global_value_optimizer,
                                        self.global_policy_optimizer, self.global_episode,
                                        self.GLOBAL_MAX_EPISODE)
                        for i in range(mp.cpu_count())]

    def train(self):
        print("Training on {} cores".format(mp.cpu_count()))
        input("Enter to start")

        [worker.start() for worker in self.workers]
        [worker.join() for worker in self.workers]

    def save_model(self):
        torch.save(self.global_value_network.state_dict(), "a3c_value_model.pth")
        torch.save(self.global_policy_network.state_dict(), "a3c_policy_model.pth")
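# A minimal usage sketch for DecoupledA3CAgent (not from the original source).
# It assumes a discrete-action Gym environment and illustrative hyperparameters;
# adjust the env id, gamma, lr, and episode budget to your setup.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v1")          # any discrete-action env works here
    agent = DecoupledA3CAgent(env, gamma=0.99, lr=1e-3, global_max_episode=2000)
    agent.train()                          # spawns one worker per CPU core
    agent.save_model()                     # writes a3c_value_model.pth / a3c_policy_model.pth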
def __init__(self, env_id, action_space, trajectory_size=256, n_envs=1, max_timesteps=1500):
    self.env_id = env_id
    self.n_envs = n_envs
    self.trajectory_size = trajectory_size
    self.vecenv = VecEnv(env_id=self.env_id, n_envs=self.n_envs, max_timesteps=max_timesteps)
    self.policy = PolicyNetwork(action_space=action_space)
    self.old_policy = PolicyNetwork(action_space=action_space)
    self.critic = CriticNetwork()
    self.r_running_stats = util.RunningStats(shape=(action_space,))
    self._init_network()
def __init__(self, env, render, config_info):
    self.env = env
    self.render = render
    self._reset_env()

    # Create run folder to store parameters, figures, and tensorboard logs
    self.path_runs = create_run_folder(config_info)

    # Extract training parameters from yaml config file
    param = load_training_parameters(config_info["config_param"])
    self.train_param = param["training"]

    # Define device
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device in use : {self.device}")

    # Define state and action dimension spaces
    state_dim = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    # Define models
    hidden_size = param["model"]["hidden_size"]
    self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)
    self.target_q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)
    self.target_q_net.load_state_dict(self.q_net.state_dict())
    self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to(self.device)

    # Define loss criterion
    self.q_criterion = nn.MSELoss()

    # Define optimizers
    lr = float(param["optimizer"]["learning_rate"])
    self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr)
    self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr)

    # Initialize replay buffer
    self.replay_buffer = ReplayBuffer(param["training"]["replay_size"])
    self.transition = namedtuple(
        "transition",
        field_names=["state", "action", "reward", "done", "next_state"],
    )

    # Useful variables
    self.batch_size = param["training"]["batch_size"]
    self.gamma = param["training"]["gamma"]
    self.tau = param["training"]["tau"]
    self.start_step = param["training"]["start_step"]
    self.max_timesteps = param["training"]["max_timesteps"]
    self.alpha = param["training"]["alpha"]
def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr, buffer_maxlen):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = env
    self.action_range = [env.action_space.low, env.action_space.high]
    self.obs_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    # hyperparameters
    self.gamma = gamma
    self.tau = tau
    self.update_step = 0
    self.delay_step = 2

    # initialize networks
    self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
    self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
    self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
    self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
    self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

    # copy params to target param
    for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()):
        target_param.data.copy_(param)
    for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()):
        target_param.data.copy_(param)

    # initialize optimizers
    self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
    self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

    # entropy temperature
    self.alpha = alpha
    self.target_entropy = -torch.prod(torch.Tensor(self.env.action_space.shape).to(self.device)).item()
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

    self.replay_buffer = BasicBuffer(buffer_maxlen)
def main(args):
    env = gym.make(args.env_name)
    device = torch.device(args.device)

    # 1. Set the necessary seeds.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # 2. Create actor, critic, EnvSampler() and TRPO.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor = PolicyNetwork(state_size, action_size,
                          hidden_sizes=args.hidden_sizes,
                          init_std=args.init_std)
    critic = ValueNetwork(state_size, hidden_sizes=args.hidden_sizes)
    env_sampler = EnvSampler(env, args.max_episode_step)
    trpo = TRPO(actor, critic, args.value_lr, args.value_steps_per_update,
                args.cg_steps, args.linesearch_steps, args.gamma, args.tau,
                args.damping, args.max_kl, device)

    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = actor.select_action(state)
        return action.detach().cpu().numpy()[0]

    total_step = 0
    for episode in range(1, args.episodes + 1):
        episode_reward, samples = env_sampler(get_action, args.batch_size)
        actor_loss, value_loss = trpo.update(*samples)
        yield episode * args.batch_size, episode_reward, actor_loss, value_loss
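# A minimal driver sketch for the generator-style main() above (not part of the
# original source). The attribute names mirror what main() reads from args; the
# values here are illustrative assumptions, not the original defaults.
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(
        env_name="Pendulum-v0", device="cpu", seed=0, hidden_sizes=(64, 64),
        init_std=1.0, max_episode_step=1000, value_lr=1e-3,
        value_steps_per_update=80, cg_steps=10, linesearch_steps=10,
        gamma=0.99, tau=0.97, damping=0.1, max_kl=0.01,
        episodes=500, batch_size=2048,
    )
    for step, reward, actor_loss, value_loss in main(args):
        print(f"step={step} reward={reward} actor_loss={actor_loss} value_loss={value_loss}")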
def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.firsttime = 0

    self.env = env
    self.action_range = [env.action_space.low, env.action_space.high]
    # self.obs_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]  # 1
    self.conv_channels = 4
    self.kernel_size = (3, 3)
    self.img_size = (500, 500, 3)

    print("Diagnostics:")
    print(f"action_range: {self.action_range}")
    # print(f"obs_dim: {self.obs_dim}")
    print(f"action_dim: {self.action_dim}")

    # hyperparameters
    self.gamma = gamma
    self.tau = tau
    self.update_step = 0
    self.delay_step = 2

    # initialize networks
    self.feature_net = FeatureExtractor(self.img_size[2], self.conv_channels, self.kernel_size).to(self.device)
    print("Feature net init'd successfully")

    input_dim = self.feature_net.get_output_size(self.img_size)
    self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
    print(f"input_size: {self.input_size}")

    self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
    self.target_value_net = ValueNetwork(self.input_size, 1).to(self.device)
    self.q_net1 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
    self.q_net2 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
    self.policy_net = PolicyNetwork(self.input_size, self.action_dim).to(self.device)
    print("Finished initing all nets")

    # copy params to target param
    for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
        target_param.data.copy_(param)
    print("Finished copying targets")

    # initialize optimizers
    self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
    self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
    self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
    print("Finished initing optimizers")

    self.replay_buffer = BasicBuffer(buffer_maxlen)
    print("End of init")
class OldSACAgent: def __init__(self, env, render, config_info): self.env = env self.render = render self._reset_env() # Create run folder to store parameters, figures, and tensorboard logs self.path_runs = create_run_folder(config_info) # Extract training parameters from yaml config file param = load_training_parameters(config_info["config_param"]) self.train_param = param["training"] # Define device self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Device in use : {self.device}") # Define state and action dimension spaces state_dim = env.observation_space.shape[0] num_actions = env.action_space.shape[0] # Define models hidden_size = param["model"]["hidden_size"] self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device) self.v_net = VNetwork(state_dim, hidden_size).to(self.device) self.target_v_net = VNetwork(state_dim, hidden_size).to(self.device) self.target_v_net.load_state_dict(self.v_net.state_dict()) self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to( self.device ) # Define loss criterion self.q_criterion = nn.MSELoss() self.v_criterion = nn.MSELoss() # Define optimizers lr = float(param["optimizer"]["learning_rate"]) self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr) self.v_opt = optim.Adam(self.v_net.parameters(), lr=lr) self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr) # Initialize replay buffer self.replay_buffer = ReplayBuffer(param["training"]["replay_size"]) self.transition = namedtuple( "transition", field_names=["state", "action", "reward", "done", "next_state"], ) # Useful variables self.batch_size = param["training"]["batch_size"] self.gamma = param["training"]["gamma"] self.tau = param["training"]["tau"] self.start_step = param["training"]["start_step"] self.max_timesteps = param["training"]["max_timesteps"] self.alpha = param["training"]["alpha"] def _reset_env(self): # Reset the environment and initialize episode reward self.state, self.done = self.env.reset(), False self.episode_reward = 0.0 self.episode_step = 0 def train(self): # Main training loop total_timestep = 0 all_episode_rewards = [] all_mean_rewards = [] update = 0 # Create tensorboard writer writer = SummaryWriter(log_dir=self.path_runs, comment="-sac") for episode in itertools.count(1, 1): self._reset_env() while not self.done: # trick to improve exploration at the start of training if self.start_step > total_timestep: action = self.env.action_space.sample() # Sample random action else: action = self.policy_net.get_action( self.state, self.device ) # Sample action from policy # Fill the replay buffer up with transitions if len(self.replay_buffer) > self.batch_size: batch = self.replay_buffer.sample_buffer(self.batch_size) # Update parameters of all the networks q_loss, v_loss, policy_loss = self.train_on_batch(batch) writer.add_scalar("loss/q", q_loss, update) writer.add_scalar("loss/v", v_loss, update) writer.add_scalar("loss/policy", policy_loss, update) update += 1 if self.render: self.env.render() # Perform one step in the environment next_state, reward, self.done, _ = self.env.step(action) total_timestep += 1 self.episode_step += 1 self.episode_reward += reward # Create a tuple for the new transition new_transition = self.transition( self.state, action, reward, self.done, next_state ) # Append transition to the replay buffer self.replay_buffer.store_transition(new_transition) self.state = next_state if total_timestep > self.max_timesteps: break mean_reward = np.mean(all_episode_rewards[-100:]) 
all_episode_rewards.append(self.episode_reward) all_mean_rewards.append(mean_reward) print( "Episode n°{} ; total timestep [{}/{}] ; episode steps {} ; " "reward {} ; mean reward {}".format( episode, total_timestep, self.max_timesteps, self.episode_step, round(self.episode_reward, 2), round(mean_reward, 2), ) ) writer.add_scalar("reward", self.episode_reward, episode) writer.add_scalar("mean reward", mean_reward, episode) # Save networks' weights path_critic = os.path.join(self.path_runs, "critic.pth") path_actor = os.path.join(self.path_runs, "actor.pth") torch.save(self.q_net.state_dict(), path_critic) torch.save(self.policy_net.state_dict(), path_actor) # Plot reward self.plot_reward(all_episode_rewards, all_mean_rewards) # Close all writer.close() self.env.close() def train_on_batch(self, batch_samples): # Unpack batch_size of transitions randomly drawn from the replay buffer ( state_batch, action_batch, reward_batch, done_int_batch, next_state_batch, ) = batch_samples # Transform np arrays into tensors and send them to device state_batch = torch.tensor(state_batch).to(self.device) next_state_batch = torch.tensor(next_state_batch).to(self.device) action_batch = torch.tensor(action_batch).to(self.device) reward_batch = torch.tensor(reward_batch).unsqueeze(1).to(self.device) done_int_batch = torch.tensor(done_int_batch).unsqueeze(1).to(self.device) q_value, _ = self.q_net(state_batch, action_batch) value = self.v_net(state_batch) pi, log_pi = self.policy_net.sample(state_batch) ### Update Q target_next_value = self.target_v_net(next_state_batch) next_q_value = ( reward_batch + (1 - done_int_batch) * self.gamma * target_next_value ) q_loss = self.q_criterion(q_value, next_q_value.detach()) ### Update V q_pi, _ = self.q_net(state_batch, pi) next_value = q_pi - log_pi v_loss = self.v_criterion(value, next_value.detach()) ### Update policy log_pi_target = q_pi - value policy_loss = (log_pi * (log_pi - log_pi_target).detach()).mean() # Losses and optimizers self.q_opt.zero_grad() q_loss.backward() self.q_opt.step() self.v_opt.zero_grad() v_loss.backward() self.v_opt.step() self.policy_opt.zero_grad() policy_loss.backward() self.policy_opt.step() soft_update(self.target_v_net, self.v_net, self.tau) return q_loss.item(), v_loss.item(), policy_loss.item() def plot_reward(self, data, mean_data): plt.plot(data, label="reward") plt.plot(mean_data, label="mean reward") plt.xlabel("Episode") plt.ylabel("Reward") plt.title(f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment") plt.tight_layout() plt.legend() path_fig = os.path.join(self.path_runs, "figure.png") plt.savefig(path_fig) print(f"Figure saved to {path_fig}") plt.show()
class A2CAgent():

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory, adv_method):
        """
        When gamma is large, the NN loss does not converge, so we should use MC to estimate the advantage.
        When gamma is small (e.g. 0.9), the NN loss decreases after training, so we can use TD to estimate the advantage.
        """
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:]) for j in range(rewards.size(0))]  # sorry, not the most readable code.
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = logits
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist.mean() * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        # 0 for MC, 1 for TD
        if adv_method == 0:
            advantages = value_targets - values
        if adv_method == 1:
            # reshape rewards to (N, 1) so the TD error broadcasts element-wise
            advantages = rewards.view(-1, 1) - values + self.gamma * torch.cat(
                (values[1:], torch.FloatTensor([[0]])), dim=0)

        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantages.detach()
        policy_loss = policy_loss.sum() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory, adv_method):
        value_loss, policy_loss = self.compute_loss(trajectory, adv_method)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
class A2CAgent():

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:]) for j in range(rewards.size(0))]  # sorry, not the most readable code.
        value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist.mean() * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
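# A minimal training-loop sketch for the A2C agent above (not from the original
# source). It assumes a discrete-action Gym environment and illustrative
# hyperparameters; each trajectory entry matches what compute_loss() unpacks:
# [state, action, reward, next_state, done].
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v1")
    agent = A2CAgent(env, gamma=0.99, lr=1e-3)

    for episode in range(500):
        state, done, trajectory, episode_reward = env.reset(), False, [], 0.0
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append([state, action, reward, next_state, done])
            episode_reward += reward
            state = next_state
        agent.update(trajectory)
        print(f"episode {episode}: reward {episode_reward:.1f}")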
def run_no_baseline(discount_factors, learn_rates, hidden_dims, init_temps, stochasticity, n_runs, n_episodes): # no baseline best_result = 0 best_settings = dict() results_file = f'results/s{stochasticity}_no_baseline.csv' best_settings_file = f'results/s{stochasticity}_no_baseline_best_settings.pkl' with open(results_file, 'w') as f: f.write('discount_factor,learn_rate,hidden_dim,init_temp,result' + '\n') for discount_factor in discount_factors: for learn_rate in learn_rates: for hidden_dim in hidden_dims: for init_temp in init_temps: print('#' * 30) print('#' * 9 + ' NEW SEARCH ' + '#' * 9) print('#' * 30) print() st = time() # change this for learned baseline print( f'Search settings: baseline=run_episodes_no_baseline, discount_factor={discount_factor}, learn_rate={learn_rate}, hidden_dim={hidden_dim}, init_temp={init_temp}' ) # initialize the environment env = gym.make('CartPole-v1') # <---------- change this! result = 0 for i in range(n_runs): start_time = time() policy_model = PolicyNetwork( input_dim=4, hidden_dim=hidden_dim, output_dim=2 ) # change input_ and output_dim for gridworld env seed = 40 + i set_seeds(env, seed) episode_durations, _ = run_episodes_no_baseline( policy_model, env, n_episodes, discount_factor, learn_rate, init_temp, stochasticity) result += np.mean(episode_durations) del policy_model end_time = time() h, m, s = get_running_time(end_time - start_time) print( f'Done with run {i+1}/{n_runs} in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds' ) env.close() result /= n_runs with open(results_file, 'a') as f: f.write( f'{discount_factor},{learn_rate},{hidden_dim},{init_temp},{result}' + '\n') et = time() h, m, s = get_running_time(et - st) print( f'Done with search in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds' ) print(f'Average number of steps per episode: {result}') if result > best_result: best_result = result best_settings['discount_factor'] = discount_factor best_settings['learn_rate'] = learn_rate best_settings['hidden_dim'] = hidden_dim best_settings['init_temp'] = init_temp best_settings['result'] = best_result pkl.dump(best_settings, open(best_settings_file, 'wb')) print(f'New best result!: {result}') print(f'New best settings!: {best_settings}') print() print() print() print(f'Best settings after completing grid search: {best_settings}') # Choose what to run by uncommenting #run_no_baseline(discount_factors, learn_rates, hidden_dims, init_temps, stochasticity, n_runs, n_episodes) #run_learned_baseline(discount_factors, learn_rates, hidden_dims, init_temps, stochasticity, n_runs, n_episodes) #run_selfcritic_baseline(discount_factors, learn_rates, hidden_dims, init_temps, stochasticity, n_runs, n_episodes)
class Agent():

    def __init__(self, state_size, action_size, action_dim, config):
        self.state_size = state_size
        self.action_size = action_size
        self.action_dim = action_dim
        self.seed = 0
        self.device = 'cuda'
        self.batch_size = config["batch_size"]
        self.lr = 0.005
        self.gamma = 0.99
        self.q_shift_local = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.Q_local = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.Q_target = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.R_local = RNetwork(state_size, action_size, self.seed).to(self.device)
        self.R_target = RNetwork(state_size, action_size, self.seed).to(self.device)
        self.policy = PolicyNetwork(state_size, action_size, self.seed).to(self.device)
        self.predicter = Classifier(state_size, action_dim, self.seed).to(self.device)
        # self.criterion = nn.CrossEntropyLoss()
        # optimizer
        self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr)
        pathname = "lr {} batch_size {} seed {}".format(self.lr, self.batch_size, self.seed)
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        self.steps = 0
        self.ratio = 1. / action_dim
        self.all_actions = []
        for a in range(self.action_dim):
            action = torch.Tensor(1) * 0 + a
            self.all_actions.append(action.to(self.device))

    def act(self, state):
        dis, action, log_probs, ent = self.policy.sample_action(torch.Tensor(state).unsqueeze(0))
        return dis, action, log_probs, ent

    def learn(self, memory):
        states, next_states, actions = memory.expert_policy(self.batch_size)
        # actions = actions[0]
        # print("states ", states)
        self.state_action_frq(states, actions)
        self.get_action_prob(states, actions)
        self.compute_r_function(states, actions)
        return  # NOTE: the code below this early return is never executed

        # compute difference between Q_shift and y_sh
        q_sh_value = self.q_shift_local(next_states, actions)
        y_sh = np.empty((self.batch_size, 1), dtype=np.float32)
        for idx, s in enumerate(next_states):
            q = []
            for action in self.all_actions:
                q.append(self.Q_target(s.unsqueeze(0), action.unsqueeze(0)))
            q_max = max(q)
            np.copyto(y_sh[idx], q_max.detach().numpy())
        y_sh = torch.Tensor(y_sh)
        y_sh *= self.gamma
        q_shift_loss = F.mse_loss(y_sh, q_sh_value)
        # Minimize the loss
        self.optimizer_q_shift.zero_grad()
        q_shift_loss.backward()
        self.optimizer_q_shift.step()

        # minimize MSE between predicted Q and y = r'(s, a) + gamma * max Q'(s', a)
        q_current = self.Q_local(states, actions)
        r_hat = self.R_target(states, actions)
        # use y_sh as target
        y_q = r_hat + y_sh
        q_loss = F.mse_loss(q_current, y_q)
        # Minimize the loss
        self.optimizer_q.zero_grad()
        q_loss.backward()
        self.optimizer_q.step()

        # get predicted reward
        r = self.R_local(states, actions)

    def state_action_frq(self, states, action):
        """ Train classifier to compute state-action frequencies """
        self.steps += 1
        output = self.predicter(states)
        # create one-hot encoded y from actions
        y = action.type(torch.long)
        y = y.squeeze(1)
        loss = nn.CrossEntropyLoss()(output, y)
        self.optimizer_pre.zero_grad()
        loss.backward()
        self.optimizer_pre.step()
        self.writer.add_scalar('Predict_loss', loss, self.steps)

    def get_action_prob(self, states, actions, dim=False):
        """ """
        if dim:
            output = self.predicter(states)
            action_prob = output.gather(1, actions.type(torch.long))
            action_prob = torch.log(action_prob)
            return action_prob
        output = self.predicter(states)
        print("Output prob ", output)
        action_prob = output.gather(1, actions.type(torch.long))
        print("action prob ", action_prob)
        action_prob = torch.log(action_prob)
        print("action prob ", action_prob)
        return action_prob

    def compute_r_function(self, states, actions):
        """ """
        actions = actions.type(torch.float)
        y = self.R_local(states, actions)
        y_shift = self.q_shift_target(states, actions)
        y_r_part1 = self.get_action_prob(states, actions) - y_shift
        print("ratio ", self.ratio)
        # sum all other actions
        y_r_part2 = torch.empty((self.batch_size, 1), dtype=torch.float32)
        idx = 0
        for a, s in zip(actions, states):
            y_h = 0
            for b in self.all_actions:
                if torch.eq(a, b):
                    continue
                print("diff ac ", b)
                r_hat = self.R_target(s.unsqueeze(0), b.unsqueeze(0))
                n_b = self.get_action_prob(s.unsqueeze(0), b.unsqueeze(0), True) - self.q_shift_target(s.unsqueeze(0), b.unsqueeze(0))
                y_h += (r_hat - n_b)
            y_h = self.ratio * y_h
            y_r_part2[idx] = y_h
            idx += 1
        print("shape of r y ", y.shape)
        print("y r part 1 ", y_r_part1.shape)
        print("y r part 2 ", y_r_part2.shape)
def run_selfcritic_baseline(stochasticity, n_runs, n_episodes):
    # self-critic baseline
    dir_path = os.path.dirname(os.path.realpath(__file__))
    best_settings_file = dir_path + f'/cart_pole_parameter_search/s{stochasticity}_SC_baseline_best_settings.pkl'
    eval_file = f'cart_evals/s{stochasticity}_SC_baseline.pkl'
    with open(best_settings_file, 'rb') as pickle_file:
        best_settings = pkl.load(pickle_file)
    discount_factor = best_settings['discount_factor']
    learn_rate = best_settings['learn_rate']
    hidden_dim = best_settings['hidden_dim']
    init_temp = best_settings['init_temp']
    st = time()
    # change this for learned baseline
    print(f'Run settings: baseline=run_episodes_with_SC_baseline, discount_factor={discount_factor}, learn_rate={learn_rate}, hidden_dim={hidden_dim}, init_temp={init_temp}')
    # initialize the environment
    env = gym.make('CartPole-v1')
    episode_durations_list = []
    reinforce_loss_list = []
    for i in range(n_runs):
        start_time = time()
        # change input_dim and output_dim for gridworld env
        policy_model = PolicyNetwork(input_dim=4, hidden_dim=hidden_dim, output_dim=2)
        seed = 40 + i
        set_seeds(env, seed)
        episode_durations, reinforce_loss = run_episodes_with_SC_baseline(
            policy_model, env, n_episodes, discount_factor, learn_rate, init_temp, stochasticity)
        episode_durations_list.append(episode_durations)
        reinforce_loss_list.append(reinforce_loss)
        del policy_model
        end_time = time()
        h, m, s = get_running_time(end_time - start_time)
        print(f'Done with run {i+1}/{n_runs} in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds')
    env.close()
    et = time()
    h, m, s = get_running_time(et - st)
    evals = {}
    evals['episode_durations'] = episode_durations_list
    evals['reinforce_loss'] = reinforce_loss_list
    pkl.dump(evals, open(eval_file, 'wb'))
    print(f'Done with runs in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds')
class DRTRPOAgent():
    """
    DR TRPO
    """

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_adv_mc(self, trajectory):
        """
        Compute the advantage of all (st, at) in the trajectory.
        The advantage is estimated using MC:
        i.e. discounted reward sum (from trajectory) - value (from NN)
        """
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:]) for j in range(rewards.size(0))]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        advantages = value_targets - values
        return advantages, value_loss

    def compute_adv_td(self, state, next_state, reward):
        """
        Compute the advantage of a single (s, a) using TD:
        i.e. r + v(s') - v(s) - depends highly on the accuracy of the NN
        """
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        reward = torch.as_tensor(reward)
        state_value = self.value_network.forward(state)
        next_state_value = self.value_network.forward(next_state)
        value_target = reward + next_state_value
        advantage = value_target - state_value
        value_loss = F.mse_loss(state_value, value_target)
        return advantage, value_loss

    def compute_policy_loss_kl(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (KL constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits

        state_adv = torch.FloatTensor(state_adv).to(self.device)
        denom = torch.sum(torch.exp(state_adv / beta) * pi_dist)
        new_pi_dist = torch.exp(state_adv / beta) * pi_dist / denom
        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_policy_loss_wass(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (Wasserstein constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits

        state_adv = torch.FloatTensor(state_adv).to(self.device)

        # Find argmax_j {A(s,aj) - β*d(aj,ai)}.
        best_j = []
        for i in range(self.action_dim):
            opt_j = 0
            opt_val = state_adv[opt_j] - beta * self.compute_distance(opt_j, i)
            for j in range(self.action_dim):
                cur_val = state_adv[j] - beta * self.compute_distance(j, i)
                if cur_val > opt_val:
                    opt_j = j
                    opt_val = cur_val
            best_j.append(opt_j)

        new_pi_dist = torch.zeros(self.action_dim)
        for j in range(self.action_dim):
            for i in range(self.action_dim):
                if j == best_j[i]:
                    new_pi_dist[j] += pi_dist[i]
        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_distance(self, a1, a2):
        if a1 == a2:
            return 0
        else:
            return 1

    def update(self, value_loss, policy_loss):
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
class DecoupledWorker(mp.Process):

    def __init__(self, id, env, gamma, global_value_network, global_policy_network,
                 global_value_optimizer, global_policy_optimizer, global_episode, GLOBAL_MAX_EPISODE):
        super(DecoupledWorker, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.name = "w%i" % id

        self.env = env
        self.env.seed(id)
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.local_value_network = ValueNetwork(self.obs_dim, 1)
        self.local_policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.global_value_network = global_value_network
        self.global_policy_network = global_policy_network
        self.global_episode = global_episode
        self.global_value_optimizer = global_value_optimizer
        self.global_policy_optimizer = global_policy_optimizer
        self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE

        # sync local networks with global networks
        self.sync_with_global()

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.local_policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:]) for j in range(rewards.size(0))]  # sorry, not the most readable code.
        value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.local_value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.local_policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist.mean() * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update_global(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.global_value_optimizer.zero_grad()
        value_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(self.local_value_network.parameters(),
                                               self.global_value_network.parameters()):
            global_params._grad = local_params._grad
        self.global_value_optimizer.step()

        self.global_policy_optimizer.zero_grad()
        policy_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(self.local_policy_network.parameters(),
                                               self.global_policy_network.parameters()):
            global_params._grad = local_params._grad
            # print(global_params._grad)
        self.global_policy_optimizer.step()

    def sync_with_global(self):
        self.local_value_network.load_state_dict(self.global_value_network.state_dict())
        self.local_policy_network.load_state_dict(self.global_policy_network.state_dict())

    def run(self):
        state = self.env.reset()
        trajectory = []  # [[s, a, r, s', done], [], ...]
        episode_reward = 0

        while self.global_episode.value < self.GLOBAL_MAX_EPISODE:
            action = self.get_action(state)
            next_state, reward, done, _ = self.env.step(action)
            trajectory.append([state, action, reward, next_state, done])
            episode_reward += reward

            if done:
                with self.global_episode.get_lock():
                    self.global_episode.value += 1
                print(self.name + " | episode: " + str(self.global_episode.value) + " " + str(episode_reward))

                self.update_global(trajectory)
                self.sync_with_global()

                trajectory = []
                episode_reward = 0
                state = self.env.reset()
            else:
                state = next_state
class TRPOAgent:

    TRAJECTORY_SIZE = 1024
    VF_BATCHSIZE = 64
    MAX_KL = 0.01
    GAMMA = 0.99
    GAE_LAMBDA = 0.98
    ENV_ID = "Pendulum-v0"
    OBS_SPACE = 3
    ACTION_SPACE = 1

    def __init__(self):
        self.policy = PolicyNetwork(action_space=self.ACTION_SPACE)
        self.value_network = ValueNetwork()
        self.env = gym.make(self.ENV_ID)
        self.global_steps = 0
        self.history = []
        self.hiscore = None

    def play(self, n_iters):
        self.epi_reward = 0
        self.epi_steps = 0
        self.state = self.env.reset()

        for _ in range(n_iters):
            trajectory = self.generate_trajectory()
            trajectory = self.compute_advantage(trajectory)
            self.update_policy(trajectory)
            self.update_vf(trajectory)

        return self.history

    def generate_trajectory(self):
        """Generate a trajectory on the current policy."""
        trajectory = {
            "s": np.zeros((self.TRAJECTORY_SIZE, self.OBS_SPACE), dtype=np.float32),
            "a": np.zeros((self.TRAJECTORY_SIZE, self.ACTION_SPACE), dtype=np.float32),
            "r": np.zeros((self.TRAJECTORY_SIZE, 1), dtype=np.float32),
            "s2": np.zeros((self.TRAJECTORY_SIZE, self.OBS_SPACE), dtype=np.float32),
            "done": np.zeros((self.TRAJECTORY_SIZE, 1), dtype=np.float32),
        }

        state = self.state

        for i in range(self.TRAJECTORY_SIZE):
            action = self.policy.sample_action(state)
            next_state, reward, done, _ = self.env.step(action)

            trajectory["s"][i] = state
            trajectory["a"][i] = action
            trajectory["r"][i] = reward
            trajectory["s2"][i] = next_state
            trajectory["done"][i] = done

            self.epi_reward += reward
            self.epi_steps += 1
            self.global_steps += 1

            if done:
                state = self.env.reset()
                self.history.append(self.epi_reward)
                recent_score = sum(self.history[-10:]) / 10

                print("====" * 5)
                print("Episode:", len(self.history))
                print("Episode reward:", self.epi_reward)
                print("Global steps:", self.global_steps)

                if len(self.history) > 100 and (self.hiscore is None or recent_score > self.hiscore):
                    print("*HISCORE UPDATED:", recent_score)
                    self.save_model()
                    self.hiscore = recent_score

                self.epi_reward = 0
                self.epi_steps = 0
            else:
                state = next_state

        self.state = state

        return trajectory

    def compute_advantage(self, trajectory):
        """Compute GAE advantages and value-function targets for the trajectory."""
        trajectory["vpred"] = self.value_network(trajectory["s"]).numpy()
        trajectory["vpred_next"] = self.value_network(trajectory["s2"]).numpy()

        is_nonterminals = 1 - trajectory["done"]

        deltas = trajectory["r"] + self.GAMMA * is_nonterminals * trajectory["vpred_next"] - trajectory["vpred"]

        advantages = np.zeros_like(deltas, dtype=np.float32)
        lastgae = 0
        for i in reversed(range(len(deltas))):
            lastgae = deltas[i] + self.GAMMA * self.GAE_LAMBDA * is_nonterminals[i] * lastgae
            advantages[i] = lastgae

        trajectory["adv"] = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        # trajectory["adv"] = advantages
        trajectory["vftarget"] = trajectory["adv"] + trajectory["vpred"]

        return trajectory

    def update_policy(self, trajectory):

        def flattengrads(grads):
            flatgrads_list = [tf.reshape(grad, shape=[1, -1]) for grad in grads]
            flatgrads = tf.concat(flatgrads_list, axis=1)
            return flatgrads

        actions = tf.convert_to_tensor(trajectory["a"], dtype=tf.float32)
        states = tf.convert_to_tensor(trajectory["s"], dtype=tf.float32)
        advantages = tf.convert_to_tensor(trajectory["adv"], dtype=tf.float32)

        old_means, old_stdevs = self.policy(states)
        old_logp = compute_logprob(old_means, old_stdevs, actions)

        with tf.GradientTape() as tape:
            new_means, new_stdevs = self.policy(states)
            new_logp = compute_logprob(new_means, new_stdevs, actions)
            loss = tf.exp(new_logp - old_logp) * advantages
            loss = tf.reduce_mean(loss)

        g = tape.gradient(loss, self.policy.trainable_variables)
        g = tf.transpose(flattengrads(g))

        @tf.function
        def hvp_func(vector):
            """Compute the Hessian-vector product."""
            with tf.GradientTape() as t2:
                with tf.GradientTape() as t1:
                    new_means, new_stdevs = self.policy(states)
                    kl = compute_kl(old_means, old_stdevs, new_means, new_stdevs)
                    meankl = tf.reduce_mean(kl)
                kl_grads = t1.gradient(meankl, self.policy.trainable_variables)
                kl_grads = flattengrads(kl_grads)
                grads_vector_product = tf.matmul(kl_grads, vector)
            hvp = t2.gradient(grads_vector_product, self.policy.trainable_variables)
            hvp = tf.transpose(flattengrads(hvp))
            return hvp + vector * 1e-2  #: add a small damping term to stabilize conjugate gradient

        step_direction = cg(hvp_func, g)

        shs = tf.matmul(tf.transpose(step_direction), hvp_func(step_direction))
        lm = tf.sqrt(2 * self.MAX_KL / shs)
        fullstep = lm * step_direction

        expected_improve = tf.matmul(tf.transpose(g), fullstep)
        fullstep = restore_shape(fullstep, self.policy.trainable_variables)

        params_old = [var.numpy() for var in self.policy.trainable_variables]
        old_loss = loss

        for stepsize in [0.5 ** i for i in range(10)]:
            params_new = [p + step * stepsize for p, step in zip(params_old, fullstep)]
            self.policy.set_weights(params_new)

            new_means, new_stdevs = self.policy(states)
            new_logp = compute_logprob(new_means, new_stdevs, actions)
            new_loss = tf.reduce_mean(tf.exp(new_logp - old_logp) * advantages)
            improve = new_loss - old_loss

            kl = compute_kl(old_means, old_stdevs, new_means, new_stdevs)
            mean_kl = tf.reduce_mean(kl)

            print(f"Expected: {expected_improve} Actual: {improve}")
            print(f"KL {mean_kl}")

            if mean_kl > self.MAX_KL * 1.5:
                print("violated KL constraint. shrinking step.")
            elif improve < 0:
                print("surrogate didn't improve. shrinking step.")
            else:
                print("Stepsize OK!")
                break
        else:
            print("Line search failed; reverting to the old parameters.")
            self.policy.set_weights(params_old)

    def update_vf(self, trajectory):
        for _ in range(self.TRAJECTORY_SIZE // self.VF_BATCHSIZE):
            indx = np.random.choice(self.TRAJECTORY_SIZE, self.VF_BATCHSIZE, replace=True)

            with tf.GradientTape() as tape:
                vpred = self.value_network(trajectory["s"][indx])
                vtarget = trajectory["vftarget"][indx]
                loss = tf.reduce_mean(tf.square(vtarget - vpred))

            variables = self.value_network.trainable_variables
            grads = tape.gradient(loss, variables)
            self.value_network.optimizer.apply_gradients(zip(grads, variables))

    def save_model(self):
        self.policy.save_weights("checkpoints/actor")
        self.value_network.save_weights("checkpoints/critic")
        print()
        print("Model Saved")
        print()

    def load_model(self):
        self.policy.load_weights("checkpoints/actor")
        self.value_network.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):
        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID), monitordir, force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):
            total_reward = 0
            steps = 0
            done = False
            state = env.reset()

            while not done:
                action = self.policy.sample_action(state)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                total_reward += reward
                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print("Steps:", steps)
            print()
class SACAgent: def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr, buffer_maxlen): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.env = env self.action_range = [0, 250] self.obs_dim = env.state_dim self.action_dim = env.action_dim # hyperparameters self.gamma = gamma self.tau = tau self.update_step = 0 self.delay_step = 2 # initialize networks self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device) # copy params to target param for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()): target_param.data.copy_(param) for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): target_param.data.copy_(param) # initialize optimizers self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr) self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr) self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) # entropy temperature self.alpha = alpha self.target_entropy = -torch.prod( torch.Tensor([self.action_dim, 1]).to(self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr) self.replay_buffer = BasicBuffer(buffer_maxlen) def get_action(self, state): state = torch.FloatTensor(state).unsqueeze(0).to(self.device) mean, log_std = self.policy_net.forward(state) std = log_std.exp() normal = Normal(mean, std) z = normal.sample() action = torch.tanh(z) action = action.cpu().detach().squeeze(0).numpy() return self.rescale_action(action) def rescale_action(self, action): return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\ (self.action_range[1] + self.action_range[0]) / 2.0 def update(self, batch_size): states, actions, rewards, next_states, dones = self.replay_buffer.sample( batch_size) states = torch.FloatTensor(states).to(self.device) actions = torch.FloatTensor(actions).to(self.device) rewards = torch.FloatTensor(rewards).to(self.device) next_states = torch.FloatTensor(next_states).to(self.device) dones = torch.FloatTensor(dones).to(self.device) dones = dones.view(dones.size(0), -1) next_actions, next_log_pi = self.policy_net.sample(next_states) next_q1 = self.target_q_net1(next_states, next_actions) next_q2 = self.target_q_net2(next_states, next_actions) next_q_target = torch.min(next_q1, next_q2) - self.alpha * next_log_pi expected_q = rewards + (1 - dones) * self.gamma * next_q_target # q loss curr_q1 = self.q_net1.forward(states, actions) curr_q2 = self.q_net2.forward(states, actions) q1_loss = F.mse_loss(curr_q1, expected_q.detach()) q2_loss = F.mse_loss(curr_q2, expected_q.detach()) # update q networks self.q1_optimizer.zero_grad() q1_loss.backward() self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward() self.q2_optimizer.step() # delayed update for policy network and target q networks new_actions, log_pi = self.policy_net.sample(states) if self.update_step % self.delay_step == 0: min_q = torch.min(self.q_net1.forward(states, new_actions), self.q_net2.forward(states, new_actions)) policy_loss = (self.alpha * log_pi - min_q).mean() self.policy_optimizer.zero_grad() policy_loss.backward() 
self.policy_optimizer.step() # target networks for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) # update temperature alpha_loss = (self.log_alpha * (-log_pi - self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() self.update_step += 1
class SACAgent():

    def __init__(self, env: object, gamma: float, tau: float,
                 buffer_maxlen: int, critic_lr: float, actor_lr: float,
                 reward_scale: int):
        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Gym environment used for training and evaluation
        self.env = env

        # Get max and min values of the action space of this environment
        self.action_range = [
            self.env.action_space.low, self.env.action_space.high
        ]

        # Get dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.reward_scale = reward_scale

        # Scaling and bias factors for the actions -> we need to scale the actions
        # because each environment has different min and max action values
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy weight parameters to the target Q networks
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=self.critic_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=self.critic_lr)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int):
        # Sample experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        # in SAC we apply reward scaling to the sampled rewards
        rewards = self.reward_scale * torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Please refer to equation (6) in the paper for details

        # Sample actions for the next states (s_t+1) using the current policy
        next_actions, next_log_pi, _, _ = self.policy.sample(next_states, self.scale)
        next_actions = self.rescale_action(next_actions)

        # Compute Q(s_t+1, a_t+1) by feeding the next states and actions to the
        # target Q networks and taking the minimum of the two
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        min_q = torch.min(next_q1, next_q2)

        # Compute the next Q target: Q(s_t+1, a_t+1) - alpha * next_log_pi
        next_q_target = (min_q - next_log_pi)

        # Compute Q(s_t, a_t) using s_t and a_t from the replay buffer
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)

        # Expected Q, i.e., r_t + gamma * next_q_target
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # Compute the loss between the Q networks and the expected Q
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update the Q network parameters
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # Policy update (computing the loss)
        # Sample new actions for the current states (s_t) using the current policy
        new_actions, log_pi, _, _ = self.policy.sample(states, self.scale)
        new_actions = self.rescale_action(new_actions)

        # Compute Q(s_t, a_t) and take the minimum of the two Q networks
        new_q1 = self.q_net1.forward(states, new_actions)
        new_q2 = self.q_net2.forward(states, new_actions)
        min_q = torch.min(new_q1, new_q2)

        # Compute the policy loss, i.e., alpha * log_pi - Q(s_t, a_t), eq. (7)
        policy_loss = (log_pi - min_q).mean()

        # Backpropagate the loss and update the policy network parameters
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update the target networks with a soft update using update rate tau
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

    def get_action(self, state: np.ndarray,
                   stochastic: bool) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor]:
        # state: the state input to the policy network
        # stochastic: True -> sample a noisy action, False -> use the deterministic (mean) action
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)

        # Get mean and log standard deviation from the policy network
        mean, log_std = self.policy.forward(state)
        std = log_std.exp()

        # Stochastic mode is used for training, deterministic mode for evaluation
        if stochastic:
            normal = Normal(mean, std)
            z = normal.sample()
            action = torch.tanh(z)
            action = action.cpu().detach().squeeze(0).numpy()
        else:
            # deterministic action: equivalent to sampling from a zero-variance Gaussian
            action = torch.tanh(mean)
            action = action.cpu().detach().squeeze(0).numpy()

        # Return a rescaled action, along with the mean and standard deviation of the action.
        # We rescale because the policy network outputs values in [-1, 1] while the MuJoCo
        # environments can have action ranges [-n, n] for an arbitrary real value n.
        return self.rescale_action(action), mean, std

    def rescale_action(self, action: np.ndarray) -> np.ndarray:
        # Rescale the action because the policy network outputs values in [-1, 1] while the
        # MuJoCo environments can have action ranges [-n, n] for an arbitrary real value n.
        # scale -> scalar multiplication
        # bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # Save the policy model for each node
        print("Save the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"
        torch.save(self.policy.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # Load the policy model for each node
        print("load the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"  # Best
        self.policy = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy.load_state_dict(torch.load(savePath))
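# --- Usage sketch (not part of the original source) ---
# A minimal training-loop sketch for the SACAgent above, assuming a continuous-control
# Gym environment ("Pendulum-v0" is only an example) and that SoftQNetwork, PolicyNetwork,
# and BasicBuffer come from this repo. The buffer's push()/len() interface and all
# hyperparameter values here are assumptions for illustration, not the author's settings.
import gym

env = gym.make("Pendulum-v0")
agent = SACAgent(env, gamma=0.99, tau=0.005, buffer_maxlen=1_000_000,
                 critic_lr=3e-4, actor_lr=3e-4, reward_scale=5)

batch_size = 128
for episode in range(100):
    state = env.reset()
    done = False
    while not done:
        # stochastic=True -> sample a noisy action for exploration during training
        action, _, _ = agent.get_action(state, stochastic=True)
        next_state, reward, done, _ = env.step(action)
        # assumed BasicBuffer interface: push(...) and __len__()
        agent.replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)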
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to the target network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value net
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target network
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
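# --- Helper sketch (not part of the original source) ---
# Both SAC variants above repeat the same soft (Polyak) target update inline:
#     theta_target <- tau * theta + (1 - tau) * theta_target
# A small standalone helper, shown here only as an illustration, could replace those loops.
import torch


def soft_update(target_net: torch.nn.Module, source_net: torch.nn.Module, tau: float) -> None:
    """Blend source parameters into target parameters with rate tau (0 < tau <= 1)."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# Example: inside SACAgent.update(), the inlined loop over the target value network
# parameters would then read
#     soft_update(self.target_value_net, self.value_net, self.tau)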
class PPOAgent:

    GAMMA = 0.99
    GAE_LAMBDA = 0.95
    CLIPRANGE = 0.2
    OPT_ITER = 20
    BATCH_SIZE = 2048

    def __init__(self, env_id, action_space, trajectory_size=256, n_envs=1,
                 max_timesteps=1500):
        self.env_id = env_id
        self.n_envs = n_envs
        self.trajectory_size = trajectory_size
        self.vecenv = VecEnv(env_id=self.env_id, n_envs=self.n_envs,
                             max_timesteps=max_timesteps)
        self.policy = PolicyNetwork(action_space=action_space)
        self.old_policy = PolicyNetwork(action_space=action_space)
        self.critic = CriticNetwork()
        self.r_running_stats = util.RunningStats(shape=(action_space, ))
        self._init_network()

    def _init_network(self):
        env = gym.make(self.env_id)
        state = np.atleast_2d(env.reset())
        self.policy(state)
        self.old_policy(state)

    def run(self, n_updates, logdir):
        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        history = {"steps": [], "scores": []}

        states = self.vecenv.reset()
        hiscore = None

        for epoch in range(n_updates):
            for _ in range(self.trajectory_size):
                actions = self.policy.sample_action(states)
                next_states = self.vecenv.step(actions)
                states = next_states

            trajectories = self.vecenv.get_trajectories()

            for trajectory in trajectories:
                self.r_running_stats.update(trajectory["r"])

            trajectories = self.compute_advantage(trajectories)

            states, actions, advantages, vtargs = self.create_minibatch(trajectories)

            vloss = self.update_critic(states, vtargs)
            self.update_policy(states, actions, advantages)

            global_steps = (epoch + 1) * self.trajectory_size * self.n_envs
            train_scores = np.array([traj["r"].sum() for traj in trajectories])

            if epoch % 1 == 0:
                test_scores, total_steps = self.play(n=1)
                test_scores, total_steps = np.array(test_scores), np.array(total_steps)
                history["steps"].append(global_steps)
                history["scores"].append(test_scores.mean())
                ma_score = sum(history["scores"][-10:]) / 10
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores.mean(), step=epoch)
                    tf.summary.scalar("test_steps", total_steps.mean(), step=epoch)
                print(f"Epoch {epoch}, {global_steps//1000}K, {test_scores.mean()}")

            if epoch // 10 > 10 and (hiscore is None or ma_score > hiscore):
                self.save_model()
                hiscore = ma_score
                print("Model Saved")

            with self.summary_writer.as_default():
                tf.summary.scalar("value_loss", vloss, step=epoch)
                tf.summary.scalar("train_score", train_scores.mean(), step=epoch)

        return history

    def compute_advantage(self, trajectories):
        """Generalized Advantage Estimation (GAE, 2016)"""
        for trajectory in trajectories:
            trajectory["v_pred"] = self.critic(trajectory["s"]).numpy()
            trajectory["v_pred_next"] = self.critic(trajectory["s2"]).numpy()

            is_nonterminals = 1 - trajectory["done"]

            normed_rewards = (trajectory["r"] /
                              (np.sqrt(self.r_running_stats.var) + 1e-4))

            deltas = (normed_rewards
                      + self.GAMMA * is_nonterminals * trajectory["v_pred_next"]
                      - trajectory["v_pred"])

            advantages = np.zeros_like(deltas, dtype=np.float32)
            lastgae = 0
            for i in reversed(range(len(deltas))):
                lastgae = deltas[i] + self.GAMMA * self.GAE_LAMBDA * is_nonterminals[i] * lastgae
                advantages[i] = lastgae

            trajectory["advantage"] = advantages
            trajectory["R"] = advantages + trajectory["v_pred"]

        return trajectories

    def update_policy(self, states, actions, advantages):
        self.old_policy.set_weights(self.policy.get_weights())

        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))

        for i in range(self.OPT_ITER):
            idx = indices[i]

            old_means, old_stdevs = self.old_policy(states[idx])
            old_logprob = self.compute_logprob(old_means, old_stdevs, actions[idx])

            with tf.GradientTape() as tape:
                new_means, new_stdevs = self.policy(states[idx])
                new_logprob = self.compute_logprob(new_means, new_stdevs, actions[idx])

                ratio = tf.exp(new_logprob - old_logprob)
                ratio_clipped = tf.clip_by_value(ratio, 1 - self.CLIPRANGE, 1 + self.CLIPRANGE)

                loss_unclipped = ratio * advantages[idx]
                loss_clipped = ratio_clipped * advantages[idx]

                loss = tf.minimum(loss_unclipped, loss_clipped)
                loss = -1 * tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.policy.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.policy.optimizer.apply_gradients(
                zip(grads, self.policy.trainable_variables))

    def update_critic(self, states, v_targs):
        losses = []

        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))

        for i in range(self.OPT_ITER):
            idx = indices[i]

            old_vpred = self.critic(states[idx])

            with tf.GradientTape() as tape:
                vpred = self.critic(states[idx])
                vpred_clipped = old_vpred + tf.clip_by_value(
                    vpred - old_vpred, -self.CLIPRANGE, self.CLIPRANGE)
                loss = tf.maximum(tf.square(v_targs[idx] - vpred),
                                  tf.square(v_targs[idx] - vpred_clipped))
                loss = tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.critic.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.critic.optimizer.apply_gradients(
                zip(grads, self.critic.trainable_variables))

            losses.append(loss)

        return np.array(losses).mean()

    @tf.function
    def compute_logprob(self, means, stdevs, actions):
        """Compute log p(x) from the Gaussian probability density function:

        logp(x) = -0.5 log(2π) - log(std) - 0.5 * ((x - mean) / std)^2
        """
        logprob = -0.5 * np.log(2 * np.pi)
        logprob += -tf.math.log(stdevs)
        logprob += -0.5 * tf.square((actions - means) / stdevs)
        logprob = tf.reduce_sum(logprob, axis=1, keepdims=True)
        return logprob

    def create_minibatch(self, trajectories):
        states = np.vstack([traj["s"] for traj in trajectories])
        actions = np.vstack([traj["a"] for traj in trajectories])
        advantages = np.vstack([traj["advantage"] for traj in trajectories])
        v_targs = np.vstack([traj["R"] for traj in trajectories])
        return states, actions, advantages, v_targs

    def save_model(self):
        self.policy.save_weights("checkpoints/policy")
        self.critic.save_weights("checkpoints/critic")

    def load_model(self):
        self.policy.load_weights("checkpoints/policy")
        self.critic.load_weights("checkpoints/critic")

    def play(self, n=1, monitordir=None, verbose=False):
        if monitordir:
            env = wrappers.Monitor(gym.make(self.env_id), monitordir,
                                   force=True, video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_id)

        total_rewards = []
        total_steps = []

        for _ in range(n):
            state = env.reset()
            done = False
            total_reward = 0
            steps = 0

            while not done:
                steps += 1
                action = self.policy.sample_action(state)
                next_state, reward, done, _ = env.step(action[0])

                if verbose:
                    mean, sd = self.policy(np.atleast_2d(state))
                    print(mean, sd)
                    print(reward)

                total_reward += reward

                if done:
                    break
                else:
                    state = next_state

            total_rewards.append(total_reward)
            total_steps.append(steps)
            print()
            print(total_reward, steps)
            print()

        return total_rewards, total_steps
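# --- Usage sketch (not part of the original source) ---
# A minimal way to drive the PPOAgent above, assuming VecEnv, PolicyNetwork,
# CriticNetwork, and util.RunningStats come from this repo. The environment id, the
# log directory, and the hyperparameter values are illustrative placeholders;
# action_space=1 matches the 1-dimensional action space of "Pendulum-v0".
agent = PPOAgent(env_id="Pendulum-v0", action_space=1,
                 trajectory_size=256, n_envs=4, max_timesteps=1500)
history = agent.run(n_updates=500, logdir="logs/ppo_pendulum")

# After training, roll out a few evaluation episodes (optionally recording video
# by passing a monitordir):
scores, steps = agent.play(n=3, monitordir=None, verbose=False)
print("evaluation scores:", scores)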
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        # self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  # 1
        self.conv_channels = 4
        self.kernel_size = (3, 3)

        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        # print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2], self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net init'd successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size, self.action_dim).to(self.device)
        print("Finished initing all nets")

        # copy params to the target network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)
        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        print("Finished initing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")

    def get_action(self, state):
        if state.shape != self.img_size:
            print(f"Invalid size, expected shape {self.img_size}, got {state.shape}")
            return None

        inp = torch.from_numpy(state).float().permute(2, 0, 1).unsqueeze(0).to(self.device)
        features = self.feature_net(inp)
        features = features.view(-1, self.input_size)

        mean, log_std = self.policy_net.forward(features)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # states and next_states are lists of ndarrays; np.stack converts them to
        # ndarrays of shape (batch_size, height, width, num_channels)
        states = np.stack(states)
        next_states = np.stack(next_states)

        states = torch.FloatTensor(states).permute(0, 3, 1, 2).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).permute(0, 3, 1, 2).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Process images; the outputs are properly shaped due to batching
        features = self.feature_net(states)
        next_features = self.feature_net(next_states)

        # flatten the convolutional features to (batch_size, input_size)
        # (the original hard-coded a batch size of 64 here; -1 infers it instead)
        features = torch.reshape(features, (-1, self.input_size))
        next_features = torch.reshape(next_features, (-1, self.input_size))

        next_actions, next_log_pi = self.policy_net.sample(next_features)
        next_q1 = self.q_net1(next_features, next_actions)
        next_q2 = self.q_net2(next_features, next_actions)
        next_v = self.target_value_net(next_features)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(features)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        curr_q1 = self.q_net1.forward(features, actions)
        curr_q2 = self.q_net2.forward(features, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward(retain_graph=True)
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward(retain_graph=True)
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward(retain_graph=True)
        self.q2_optimizer.step()

        # delayed update for policy network and target value network
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(features)
            min_q = torch.min(self.q_net1.forward(features, new_actions),
                              self.q_net2.forward(features, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward(retain_graph=True)
            self.policy_optimizer.step()

            # target network
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
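# --- Usage sketch (not part of the original source) ---
# A minimal loop for the image-based SACAgent above. It assumes an environment whose
# observations are (500, 500, 3) images (matching self.img_size) and that FeatureExtractor,
# ValueNetwork, SoftQNetwork, PolicyNetwork, and BasicBuffer are the repo's own classes.
# make_image_env(), the buffer's push()/len() interface, and the hyperparameter values
# below are all hypothetical placeholders for illustration.
env = make_image_env()  # hypothetical factory returning an env with (500, 500, 3) observations
agent = SACAgent(env, gamma=0.99, tau=0.005,
                 v_lr=3e-4, q_lr=3e-4, policy_lr=3e-4, buffer_maxlen=100_000)

batch_size = 64
for episode in range(50):
    state = env.reset()          # expected shape: (500, 500, 3)
    done = False
    while not done:
        action = agent.get_action(state)
        if action is None:       # get_action returns None on an unexpected image shape
            break
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)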