def Actor_load(self, WORKSPACE: str):
    # Load the saved model for each node
    print("load the torch model")
    savePath = WORKSPACE + "/policy_model5_Hop_.pth"  # Best
    self.policy = PolicyNetwork(self.obs_dim,
                                self.action_dim).to(self.device)
    self.policy.load_state_dict(torch.load(savePath))
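For reference, a minimal save/load round trip with a stand-in module (PolicyNetwork is defined elsewhere in this project, so the tiny class below is only an assumption for illustration); passing map_location keeps the load working on CPU-only machines:

import torch
import torch.nn as nn

# Stand-in for the project's PolicyNetwork (assumed, not the original class).
class TinyPolicy(nn.Module):
    def __init__(self, obs_dim, action_dim):
        super().__init__()
        self.fc = nn.Linear(obs_dim, action_dim)

    def forward(self, x):
        return self.fc(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy = TinyPolicy(11, 3).to(device)

# Save only the state dict, then restore it on whatever device is available.
torch.save(policy.state_dict(), "policy_model.pth")
policy.load_state_dict(torch.load("policy_model.pth", map_location=device))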
Example #2
class DecoupledA3CAgent:
    def __init__(self, env, gamma, lr, global_max_episode):
        self.env = env

        self.gamma = gamma
        self.lr = lr
        self.global_episode = mp.Value('i', 0)
        self.GLOBAL_MAX_EPISODE = global_max_episode

        self.global_value_network = ValueNetwork(
            self.env.observation_space.shape[0], 1)
        self.global_value_network.share_memory()
        self.global_policy_network = PolicyNetwork(
            self.env.observation_space.shape[0], self.env.action_space.n)
        self.global_policy_network.share_memory()
        self.global_value_optimizer = optim.Adam(
            self.global_value_network.parameters(), lr=lr)
        self.global_policy_optimizer = optim.Adam(
            self.global_policy_network.parameters(), lr=lr)

        self.workers = [
            DecoupledWorker(i, env, self.gamma, self.global_value_network,
                            self.global_policy_network,
                            self.global_value_optimizer,
                            self.global_policy_optimizer, self.global_episode,
                            self.GLOBAL_MAX_EPISODE)
            for i in range(mp.cpu_count())
        ]

    def train(self):
        print("Training on {} cores".format(mp.cpu_count()))
        input("Enter to start")

        [worker.start() for worker in self.workers]
        [worker.join() for worker in self.workers]

    def save_model(self):
        torch.save(self.global_value_network.state_dict(),
                   "a3c_value_model.pth")
        torch.save(self.global_policy_network.state_dict(),
                   "a3c_policy_model.pth")
    def __init__(self, id, env, gamma, global_value_network,
                 global_policy_network, global_value_optimizer,
                 global_policy_optimizer, global_episode, GLOBAL_MAX_EPISODE):
        super(DecoupledWorker, self).__init__()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.name = "w%i" % id

        self.env = env
        self.env.seed(id)
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.local_value_network = ValueNetwork(self.obs_dim, 1)
        self.local_policy_network = PolicyNetwork(self.obs_dim,
                                                  self.action_dim)

        self.global_value_network = global_value_network
        self.global_policy_network = global_policy_network
        self.global_episode = global_episode
        self.global_value_optimizer = global_value_optimizer
        self.global_policy_optimizer = global_policy_optimizer
        self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE

        # sync local networks with global networks
        self.sync_with_global()
    def __init__(self,
                 env_id,
                 action_space,
                 trajectory_size=256,
                 n_envs=1,
                 max_timesteps=1500):

        self.env_id = env_id

        self.n_envs = n_envs

        self.trajectory_size = trajectory_size

        self.vecenv = VecEnv(env_id=self.env_id,
                             n_envs=self.n_envs,
                             max_timesteps=max_timesteps)

        self.policy = PolicyNetwork(action_space=action_space)

        self.old_policy = PolicyNetwork(action_space=action_space)

        self.critic = CriticNetwork()

        self.r_running_stats = util.RunningStats(shape=(action_space, ))

        self._init_network()
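util.RunningStats is not shown in this snippet; a common Welford-style running mean/std implementation looks roughly like the following (an assumption, not the project's code):

import numpy as np

class RunningStats:
    """Welford-style running mean/variance over a fixed shape."""

    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape, dtype=np.float64)
        self.m2 = np.zeros(shape, dtype=np.float64)

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    @property
    def std(self):
        return np.sqrt(self.m2 / max(self.n - 1, 1))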
Example #5
    def __init__(self, env, render, config_info):
        self.env = env
        self.render = render
        self._reset_env()

        # Create run folder to store parameters, figures, and tensorboard logs
        self.path_runs = create_run_folder(config_info)

        # Extract training parameters from yaml config file
        param = load_training_parameters(config_info["config_param"])
        self.train_param = param["training"]

        # Define device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        # Define models
        hidden_size = param["model"]["hidden_size"]
        self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)

        self.target_q_net = QNetwork(state_dim, num_actions, hidden_size).to(
            self.device
        )
        self.target_q_net.load_state_dict(self.q_net.state_dict())

        self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to(
            self.device
        )

        # Define loss criterion
        self.q_criterion = nn.MSELoss()

        # Define optimizers
        lr = float(param["optimizer"]["learning_rate"])
        self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(param["training"]["replay_size"])

        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

        # Useful variables
        self.batch_size = param["training"]["batch_size"]
        self.gamma = param["training"]["gamma"]
        self.tau = param["training"]["tau"]
        self.start_step = param["training"]["start_step"]
        self.max_timesteps = param["training"]["max_timesteps"]
        self.alpha = param["training"]["alpha"]
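ReplayBuffer is defined elsewhere; a minimal deque-based sketch compatible with the namedtuple transitions used above (field order assumed from the code) could look like this:

import random
from collections import deque, namedtuple

Transition = namedtuple(
    "transition", ["state", "action", "reward", "done", "next_state"])

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def store_transition(self, transition):
        self.buffer.append(transition)

    def sample_buffer(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into batched fields.
        states, actions, rewards, dones, next_states = zip(*batch)
        return states, actions, rewards, dones, next_states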
    def __init__(self):

        self.policy = PolicyNetwork(action_space=self.ACTION_SPACE)

        self.value_network = ValueNetwork()

        self.env = gym.make(self.ENV_ID)

        self.global_steps = 0

        self.history = []

        self.hiscore = None
    def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr,
                 buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        # entropy temperature
        self.alpha = alpha
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
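The log_alpha / target_entropy pair initialized above is typically trained with the loss -(log_alpha * (log_pi + target_entropy).detach()).mean(); a minimal sketch of that temperature update (illustrative, with a random stand-in for log_pi rather than this repo's update code):

import torch
import torch.optim as optim

device = torch.device("cpu")
target_entropy = -1.0  # e.g. -action_dim, as computed above
log_alpha = torch.zeros(1, requires_grad=True, device=device)
alpha_optim = optim.Adam([log_alpha], lr=3e-4)

log_pi = torch.randn(64, 1)  # would come from policy_net.sample(states)

alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()

alpha = log_alpha.exp().item()  # temperature plugged into the SAC losses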
Example #8
def main(args):
    env = gym.make(args.env_name)
    device = torch.device(args.device)

    # 1. Set the necessary seeds.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # 2. Create the actor, critic, EnvSampler and TRPO updater.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor = PolicyNetwork(state_size,
                          action_size,
                          hidden_sizes=args.hidden_sizes,
                          init_std=args.init_std)
    critic = ValueNetwork(state_size, hidden_sizes=args.hidden_sizes)
    env_sampler = EnvSampler(env, args.max_episode_step)
    trpo = TRPO(actor, critic, args.value_lr, args.value_steps_per_update,
                args.cg_steps, args.linesearch_steps, args.gamma, args.tau,
                args.damping, args.max_kl, device)

    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = actor.select_action(state)
        return action.detach().cpu().numpy()[0]

    total_step = 0
    for episode in range(1, args.episodes + 1):
        episode_reward, samples = env_sampler(get_action, args.batch_size)
        actor_loss, value_loss = trpo.update(*samples)
        yield episode * args.batch_size, episode_reward, actor_loss, value_loss
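Since main(args) is written as a generator, a hypothetical driver loop (args is assumed to carry the fields referenced inside main) would simply iterate it:

# Hypothetical driver; `args` must provide env_name, device, seed, etc. as used in main().
for step, episode_reward, actor_loss, value_loss in main(args):
    print(f"step={step} reward={episode_reward:.1f} "
          f"actor_loss={actor_loss:.4f} value_loss={value_loss:.4f}")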
Example #9
    def __init__(self, env, gamma, lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)
Example #10
    def __init__(self, state_size, action_size, action_dim, config):
        self.state_size = state_size
        self.action_size = action_size
        self.action_dim = action_dim
        self.seed = 0
        self.device = 'cuda'
        self.batch_size = config["batch_size"]
        self.lr = 0.005
        self.gamma = 0.99
        self.q_shift_local = QNetwork(state_size, action_size,
                                      self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size,
                                       self.seed).to(self.device)
        self.Q_local = QNetwork(state_size, action_size,
                                self.seed).to(self.device)
        self.Q_target = QNetwork(state_size, action_size,
                                 self.seed).to(self.device)
        self.R_local = RNetwork(state_size, action_size,
                                self.seed).to(self.device)
        self.R_target = RNetwork(state_size, action_size,
                                 self.seed).to(self.device)
        self.policy = PolicyNetwork(state_size, action_size,
                                    self.seed).to(self.device)
        self.predicter = Classifier(state_size, action_dim,
                                    self.seed).to(self.device)
        #self.criterion = nn.CrossEntropyLoss()
        # optimizer
        self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(),
                                            lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(),
                                        lr=self.lr)
        pathname = "lr {} batch_size {} seed {}".format(
            self.lr, self.batch_size, self.seed)
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        self.steps = 0
        self.ratio = 1. / action_dim
        self.all_actions = []
        for a in range(self.action_dim):
            action = torch.tensor([float(a)])  # 1-element tensor holding the action index
            self.all_actions.append(action.to(self.device))
Example #11
    def __init__(self, env, gamma, lr, global_max_episode):
        self.env = env

        self.gamma = gamma
        self.lr = lr
        self.global_episode = mp.Value('i', 0)
        self.GLOBAL_MAX_EPISODE = global_max_episode

        self.global_value_network = ValueNetwork(
            self.env.observation_space.shape[0], 1)
        self.global_policy_network = PolicyNetwork(
            self.env.observation_space.shape[0], self.env.action_space.n)
        self.global_value_optimizer = optim.Adam(
            self.global_value_network.parameters(), lr=lr)
        self.global_policy_optimizer = optim.Adam(
            self.global_policy_network.parameters(), lr=lr)

        self.workers = [
            DecoupledWorker(i, env, self.gamma, self.global_value_network,
                            self.global_policy_network,
                            self.global_value_optimizer,
                            self.global_policy_optimizer, self.global_episode,
                            self.GLOBAL_MAX_EPISODE)
            for i in range(mp.cpu_count())
        ]
Example #12
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
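Both SAC variants in these examples keep target networks in sync with Polyak averaging; a minimal soft_update sketch under the usual tau convention (not necessarily the helper these repos import):

import copy
import torch
import torch.nn as nn

def soft_update(target_net, source_net, tau):
    """Polyak-average source params into the target: theta' <- tau*theta + (1 - tau)*theta'."""
    with torch.no_grad():
        for tgt, src in zip(target_net.parameters(), source_net.parameters()):
            tgt.data.mul_(1.0 - tau).add_(tau * src.data)

# Usage with any pair of identically shaped networks:
net = nn.Linear(4, 2)
target = copy.deepcopy(net)
soft_update(target, net, tau=0.005)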
Example #13
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        #self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  #1

        self.conv_channels = 4
        self.kernel_size = (3, 3)

        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        #print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2],
                                            self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net init'd successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size,
                                             1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size,
                                        self.action_dim).to(self.device)

        print("Finished initing all nets")

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        print("Finished initing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")
class OldSACAgent:
    def __init__(self, env, render, config_info):
        self.env = env
        self.render = render
        self._reset_env()

        # Create run folder to store parameters, figures, and tensorboard logs
        self.path_runs = create_run_folder(config_info)

        # Extract training parameters from yaml config file
        param = load_training_parameters(config_info["config_param"])
        self.train_param = param["training"]

        # Define device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        # Define models
        hidden_size = param["model"]["hidden_size"]
        self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)
        self.v_net = VNetwork(state_dim, hidden_size).to(self.device)
        self.target_v_net = VNetwork(state_dim, hidden_size).to(self.device)
        self.target_v_net.load_state_dict(self.v_net.state_dict())
        self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to(
            self.device
        )

        # Define loss criterion
        self.q_criterion = nn.MSELoss()
        self.v_criterion = nn.MSELoss()

        # Define optimizers
        lr = float(param["optimizer"]["learning_rate"])
        self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr)
        self.v_opt = optim.Adam(self.v_net.parameters(), lr=lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(param["training"]["replay_size"])

        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

        # Useful variables
        self.batch_size = param["training"]["batch_size"]
        self.gamma = param["training"]["gamma"]
        self.tau = param["training"]["tau"]
        self.start_step = param["training"]["start_step"]
        self.max_timesteps = param["training"]["max_timesteps"]
        self.alpha = param["training"]["alpha"]

    def _reset_env(self):
        # Reset the environment and initialize episode reward
        self.state, self.done = self.env.reset(), False
        self.episode_reward = 0.0
        self.episode_step = 0

    def train(self):
        # Main training loop
        total_timestep = 0
        all_episode_rewards = []
        all_mean_rewards = []
        update = 0

        # Create tensorboard writer
        writer = SummaryWriter(log_dir=self.path_runs, comment="-sac")

        for episode in itertools.count(1, 1):
            self._reset_env()

            while not self.done:
                # trick to improve exploration at the start of training
                if self.start_step > total_timestep:
                    action = self.env.action_space.sample()  # Sample random action
                else:
                    action = self.policy_net.get_action(
                        self.state, self.device
                    )  # Sample action from policy

                # Fill the replay buffer up with transitions
                if len(self.replay_buffer) > self.batch_size:
                    batch = self.replay_buffer.sample_buffer(self.batch_size)

                    # Update parameters of all the networks
                    q_loss, v_loss, policy_loss = self.train_on_batch(batch)
                    writer.add_scalar("loss/q", q_loss, update)
                    writer.add_scalar("loss/v", v_loss, update)
                    writer.add_scalar("loss/policy", policy_loss, update)
                    update += 1

                if self.render:
                    self.env.render()

                # Perform one step in the environment
                next_state, reward, self.done, _ = self.env.step(action)
                total_timestep += 1
                self.episode_step += 1
                self.episode_reward += reward

                # Create a tuple for the new transition
                new_transition = self.transition(
                    self.state, action, reward, self.done, next_state
                )

                # Append transition to the replay buffer
                self.replay_buffer.store_transition(new_transition)

                self.state = next_state

            if total_timestep > self.max_timesteps:
                break

            all_episode_rewards.append(self.episode_reward)
            mean_reward = np.mean(all_episode_rewards[-100:])
            all_mean_rewards.append(mean_reward)

            print(
                "Episode n°{} ; total timestep [{}/{}] ; episode steps {} ; "
                "reward {} ; mean reward {}".format(
                    episode,
                    total_timestep,
                    self.max_timesteps,
                    self.episode_step,
                    round(self.episode_reward, 2),
                    round(mean_reward, 2),
                )
            )

            writer.add_scalar("reward", self.episode_reward, episode)
            writer.add_scalar("mean reward", mean_reward, episode)

        # Save networks' weights
        path_critic = os.path.join(self.path_runs, "critic.pth")
        path_actor = os.path.join(self.path_runs, "actor.pth")
        torch.save(self.q_net.state_dict(), path_critic)
        torch.save(self.policy_net.state_dict(), path_actor)

        # Plot reward
        self.plot_reward(all_episode_rewards, all_mean_rewards)

        # Close all
        writer.close()
        self.env.close()

    def train_on_batch(self, batch_samples):
        # Unpack batch_size of transitions randomly drawn from the replay buffer
        (
            state_batch,
            action_batch,
            reward_batch,
            done_int_batch,
            next_state_batch,
        ) = batch_samples

        # Transform np arrays into tensors and send them to device
        state_batch = torch.tensor(state_batch).to(self.device)
        next_state_batch = torch.tensor(next_state_batch).to(self.device)
        action_batch = torch.tensor(action_batch).to(self.device)
        reward_batch = torch.tensor(reward_batch).unsqueeze(1).to(self.device)
        done_int_batch = torch.tensor(done_int_batch).unsqueeze(1).to(self.device)

        q_value, _ = self.q_net(state_batch, action_batch)
        value = self.v_net(state_batch)
        pi, log_pi = self.policy_net.sample(state_batch)

        ### Update Q
        target_next_value = self.target_v_net(next_state_batch)
        next_q_value = (
            reward_batch + (1 - done_int_batch) * self.gamma * target_next_value
        )

        q_loss = self.q_criterion(q_value, next_q_value.detach())

        ### Update V
        q_pi, _ = self.q_net(state_batch, pi)
        next_value = q_pi - log_pi
        v_loss = self.v_criterion(value, next_value.detach())

        ### Update policy
        log_pi_target = q_pi - value
        policy_loss = (log_pi * (log_pi - log_pi_target).detach()).mean()

        # Losses and optimizers
        self.q_opt.zero_grad()
        q_loss.backward()
        self.q_opt.step()

        self.v_opt.zero_grad()
        v_loss.backward()
        self.v_opt.step()

        self.policy_opt.zero_grad()
        policy_loss.backward()
        self.policy_opt.step()

        soft_update(self.target_v_net, self.v_net, self.tau)

        return q_loss.item(), v_loss.item(), policy_loss.item()

    def plot_reward(self, data, mean_data):
        plt.plot(data, label="reward")
        plt.plot(mean_data, label="mean reward")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title(f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment")
        plt.tight_layout()
        plt.legend()

        path_fig = os.path.join(self.path_runs, "figure.png")
        plt.savefig(path_fig)
        print(f"Figure saved to {path_fig}")

        plt.show()
Example #15
class A2CAgent():
    def __init__(self, env, gamma, lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory, adv_method):
        """   
        When gamma is large, the NN loss does not converge, we should use MC to estimate advantage. 
        When gamma is small (i.e. 0.9), the NN loss decreases after training, we can use TD to estimate advantage. 
        """
        states = torch.FloatTensor([sars[0]
                                    for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory
                                    ]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2]
                                     for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory
                                         ]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory
                                   ]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\
             * rewards[j:]) for j in range(rewards.size(0))]  # sorry, not the most readable code; an O(T) rewrite is sketched after this class.
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = logits
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))  # H(p) = -sum_i p_i log p_i
        entropy = torch.stack(entropy).sum()

        # 0 for MC, 1 for TD
        if adv_method == 0:
            advantages = value_targets - values
        elif adv_method == 1:
            advantages = rewards.view(-1, 1) - values + self.gamma * torch.cat(
                (values[1:], torch.FloatTensor([[0]])), dim=0)

        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(
            -1, 1) * advantages.detach()
        policy_loss = policy_loss.sum() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory, adv_method):
        value_loss, policy_loss = self.compute_loss(trajectory, adv_method)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
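The list-comprehension return computation in compute_loss above is O(T^2) in the trajectory length; an equivalent O(T) backward sweep is sketched here (illustrative, not the repo's code):

import torch

def discounted_returns(rewards, gamma):
    """G_t = r_t + gamma * G_{t+1}, computed in a single backward pass."""
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(rewards.size(0))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Matches the per-step discounted sums used as value targets above.
print(discounted_returns(torch.tensor([1.0, 1.0, 1.0]), 0.9))  # tensor([2.7100, 1.9000, 1.0000])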
class A2CAgent():
    def __init__(self, env, gamma, lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0]
                                    for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory
                                    ]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2]
                                     for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory
                                         ]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory
                                   ]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\
             * rewards[j:]) for j in range(rewards.size(0))]  # sorry, not the most readable code.
        value_targets = rewards.view(
            -1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(
                self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))  # H(p) = -sum_i p_i log p_i
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(
            -1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
Example #17
def run_no_baseline(discount_factors, learn_rates, hidden_dims, init_temps,
                    stochasticity, n_runs, n_episodes):
    # no baseline
    best_result = 0
    best_settings = dict()
    results_file = f'results/s{stochasticity}_no_baseline.csv'
    best_settings_file = f'results/s{stochasticity}_no_baseline_best_settings.pkl'

    with open(results_file, 'w') as f:
        f.write('discount_factor,learn_rate,hidden_dim,init_temp,result' +
                '\n')

    for discount_factor in discount_factors:
        for learn_rate in learn_rates:
            for hidden_dim in hidden_dims:
                for init_temp in init_temps:
                    print('#' * 30)
                    print('#' * 9 + ' NEW SEARCH ' + '#' * 9)
                    print('#' * 30)
                    print()

                    st = time()

                    # change this for learned baseline
                    print(
                        f'Search settings: baseline=run_episodes_no_baseline, discount_factor={discount_factor}, learn_rate={learn_rate}, hidden_dim={hidden_dim}, init_temp={init_temp}'
                    )

                    # initialize the environment
                    env = gym.make('CartPole-v1')  # <---------- change this!

                    result = 0

                    for i in range(n_runs):
                        start_time = time()

                        policy_model = PolicyNetwork(
                            input_dim=4, hidden_dim=hidden_dim, output_dim=2
                        )  # change input_ and output_dim for gridworld env
                        seed = 40 + i
                        set_seeds(env, seed)

                        episode_durations, _ = run_episodes_no_baseline(
                            policy_model, env, n_episodes, discount_factor,
                            learn_rate, init_temp, stochasticity)
                        result += np.mean(episode_durations)

                        del policy_model

                        end_time = time()
                        h, m, s = get_running_time(end_time - start_time)

                        print(
                            f'Done with run {i+1}/{n_runs} in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds'
                        )

                    env.close()
                    result /= n_runs

                    with open(results_file, 'a') as f:
                        f.write(
                            f'{discount_factor},{learn_rate},{hidden_dim},{init_temp},{result}'
                            + '\n')

                    et = time()
                    h, m, s = get_running_time(et - st)

                    print(
                        f'Done with search in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds'
                    )
                    print(f'Average number of steps per episode: {result}')

                    if result > best_result:
                        best_result = result
                        best_settings['discount_factor'] = discount_factor
                        best_settings['learn_rate'] = learn_rate
                        best_settings['hidden_dim'] = hidden_dim
                        best_settings['init_temp'] = init_temp
                        best_settings['result'] = best_result

                        pkl.dump(best_settings, open(best_settings_file, 'wb'))

                        print(f'New best result!: {result}')
                        print(f'New best settings!: {best_settings}')
                    print()

    print()
    print()
    print(f'Best settings after completing grid search: {best_settings}')


# Choose what to run by uncommenting
#run_no_baseline(discount_factors, learn_rates, hidden_dims, init_temps, stochasticity, n_runs, n_episodes)
#run_learned_baseline(discount_factors, learn_rates, hidden_dims, init_temps, stochasticity, n_runs, n_episodes)
#run_selfcritic_baseline(discount_factors, learn_rates, hidden_dims, init_temps, stochasticity, n_runs, n_episodes)
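set_seeds and get_running_time are helpers not included in this snippet; plausible minimal versions (assumptions, shown only so the example is self-contained) are:

import random
import numpy as np
import torch

def set_seeds(env, seed):
    """Seed the environment and every RNG used during a run (assumed helper)."""
    env.seed(seed)
    env.action_space.seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

def get_running_time(seconds):
    """Split an elapsed time in seconds into (hours, minutes, seconds) (assumed helper)."""
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    return h, m, s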
Example #18
class Agent():
    def __init__(self, state_size, action_size, action_dim, config):
        self.state_size = state_size
        self.action_size = action_size
        self.action_dim = action_dim
        self.seed = 0
        self.device = 'cuda'
        self.batch_size = config["batch_size"]
        self.lr = 0.005
        self.gamma = 0.99
        self.q_shift_local = QNetwork(state_size, action_size,
                                      self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size,
                                       self.seed).to(self.device)
        self.Q_local = QNetwork(state_size, action_size,
                                self.seed).to(self.device)
        self.Q_target = QNetwork(state_size, action_size,
                                 self.seed).to(self.device)
        self.R_local = RNetwork(state_size, action_size,
                                self.seed).to(self.device)
        self.R_target = RNetwork(state_size, action_size,
                                 self.seed).to(self.device)
        self.policy = PolicyNetwork(state_size, action_size,
                                    self.seed).to(self.device)
        self.predicter = Classifier(state_size, action_dim,
                                    self.seed).to(self.device)
        #self.criterion = nn.CrossEntropyLoss()
        # optimizer
        self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(),
                                            lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(),
                                        lr=self.lr)
        pathname = "lr {} batch_size {} seed {}".format(
            self.lr, self.batch_size, self.seed)
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        self.steps = 0
        self.ratio = 1. / action_dim
        self.all_actions = []
        for a in range(self.action_dim):
            action = torch.tensor([float(a)])  # 1-element tensor holding the action index
            self.all_actions.append(action.to(self.device))

    def act(self, state):
        dis, action, log_probs, ent = self.policy.sample_action(
            torch.Tensor(state).unsqueeze(0))
        return dis, action, log_probs, ent

    def learn(self, memory):
        states, next_states, actions = memory.expert_policy(self.batch_size)
        # actions = actions[0]
        # print("states ",  states)
        self.state_action_frq(states, actions)
        self.get_action_prob(states, actions)
        self.compute_r_function(states, actions)
        return  # NOTE: everything below this early return is currently unreachable
        # compute difference between Q_shift and y_sh
        q_sh_value = self.q_shift_local(next_states, actions)
        y_sh = np.empty((self.batch_size, 1), dtype=np.float32)
        for idx, s in enumerate(next_states):
            q = []
            for action in self.all_actions:
                q.append(self.Q_target(s.unsqueeze(0), action.unsqueeze(0)))
            q_max = max(q)
            np.copyto(y_sh[idx], q_max.detach().numpy())

        y_sh = torch.Tensor(y_sh)
        y_sh *= self.gamma
        q_shift_loss = F.mse_loss(y_sh, q_sh_value)
        # Minimize the loss
        self.optimizer_q_shift.zero_grad()
        q_shift_loss.backward()
        self.optimizer_q_shift.step()

        # minimize MSE between the predicted Q and y = r'(s,a) + gamma * max Q'(s',a)
        q_current = self.Q_local(states, actions)
        r_hat = self.R_target(states, actions)
        # use y_sh as target
        y_q = r_hat + y_sh

        q_loss = F.mse_loss(q_current, y_q)
        # Minimize the loss
        self.optimizer_q.zero_grad()
        q_loss.backward()
        self.optimizer_q.step()

        #  get predicted reward
        r = self.R_local(states, actions)

    def state_action_frq(self, states, action):
        """ Train classifer to compute state action freq
        """
        self.steps += 1
        output = self.predicter(states)
        # create one hot encode y from actions
        y = action.type(torch.long)
        y = y.squeeze(1)
        loss = nn.CrossEntropyLoss()(output, y)
        self.optimizer_pre.zero_grad()
        loss.backward()
        self.optimizer_pre.step()
        self.writer.add_scalar('Predict_loss', loss, self.steps)

    def get_action_prob(self, states, actions, dim=False):
        """

        """
        if dim:
            output = self.predicter(states)
            action_prob = output.gather(1, actions.type(torch.long))
            action_prob = torch.log(action_prob)
            return action_prob
        output = self.predicter(states)
        print("Output prob ", output)
        action_prob = output.gather(1, actions.type(torch.long))
        print("action prob ", action_prob)
        action_prob = torch.log(action_prob)
        print("action prob ", action_prob)
        return action_prob

    def compute_r_function(self, states, actions):
        """
        
        """
        actions = actions.type(torch.float)
        y = self.R_local(states, actions)
        y_shift = self.q_shift_target(states, actions)
        y_r_part1 = self.get_action_prob(states, actions) - y_shift
        print("ratio ", self.ratio)
        # sum all other actions
        y_r_part2 = torch.empty((self.batch_size, 1), dtype=torch.float32)
        idx = 0
        for a, s in zip(actions, states):
            y_h = 0
            for b in self.all_actions:
                if torch.eq(a, b):
                    continue
                print("diff ac ", b)
                r_hat = self.R_target(s.unsqueeze(0), b.unsqueeze(0))
                n_b = self.get_action_prob(s.unsqueeze(0), b.unsqueeze(0),
                                           True) - self.q_shift_target(
                                               s.unsqueeze(0), b.unsqueeze(0))
                y_h += (r_hat - n_b)
            y_h = self.ratio * y_h
            y_r_part2[idx] = y_h
            idx += 1
        print("shape of r y ", y.shape)
        print("y r part 1 ", y_r_part1.shape)
        print("y r part 2 ", y_r_part2.shape)
def run_selfcritic_baseline(stochasticity, n_runs, n_episodes):
    # self-critic baseline
    dir_path = os.path.dirname(os.path.realpath(__file__))
    best_settings_file = dir_path + f'/cart_pole_parameter_search/s{stochasticity}_SC_baseline_best_settings.pkl'
    eval_file = f'cart_evals/s{stochasticity}_SC_baseline.pkl'

    with open(best_settings_file, 'rb') as pickle_file:
        best_settings = pkl.load(pickle_file)
    discount_factor = best_settings['discount_factor']
    learn_rate = best_settings['learn_rate']
    hidden_dim = best_settings['hidden_dim']
    init_temp = best_settings['init_temp']

    st = time()

    # change this for learned baseline
    print(
        f'Run settings: baseline=run_episodes_with_SC_baseline, discount_factor={discount_factor}, learn_rate={learn_rate}, hidden_dim={hidden_dim}, init_temp={init_temp}'
    )

    # initialize the environment
    env = gym.make('CartPole-v1')

    episode_durations_list = []
    reinforce_loss_list = []

    for i in range(n_runs):
        start_time = time()

        policy_model = PolicyNetwork(
            input_dim=4, hidden_dim=hidden_dim,
            output_dim=2)  # change input_ and output_dim for gridworld env
        seed = 40 + i
        set_seeds(env, seed)

        episode_durations, reinforce_loss = run_episodes_with_SC_baseline(
            policy_model, env, n_episodes, discount_factor, learn_rate,
            init_temp, stochasticity)

        episode_durations_list.append(episode_durations)
        reinforce_loss_list.append(reinforce_loss)

        del policy_model

        end_time = time()
        h, m, s = get_running_time(end_time - start_time)

        print(
            f'Done with run {i+1}/{n_runs} in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds'
        )

    env.close()

    et = time()
    h, m, s = get_running_time(et - st)

    evals = {}
    evals['episode_durations'] = episode_durations_list
    evals['reinforce_loss'] = reinforce_loss_list

    pkl.dump(evals, open(eval_file, 'wb'))

    print(
        f'Done with runs in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds'
    )
Example #20
class DRTRPOAgent():
    """
    DR TRPO 
    """
    def __init__(self, env, gamma, lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_adv_mc(self, trajectory):
        """
        Compute the advantage of all (st,at) in trajectory.
        The advantage is estimated using MC: i.e. discounted reward sum (from trajectory) - value (from NN)
        """
        states = torch.FloatTensor([sars[0]
                                    for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory
                                    ]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2]
                                     for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory
                                         ]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory
                                   ]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\
             * rewards[j:]) for j in range(rewards.size(0))]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1).to(
            self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        advantages = value_targets - values
        return advantages, value_loss

    def compute_adv_td(self, state, next_state, reward):
        """
        Compute the advantage of a single (s,a) using TD: i.e. r + v(s') - v(s) - depends highly on the accuracy of NN
        """
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        reward = torch.as_tensor(reward)
        state_value = self.value_network.forward(state)
        next_state_value = self.value_network.forward(next_state)
        value_target = reward + next_state_value
        advantage = value_target - state_value
        value_loss = F.mse_loss(state_value, value_target)
        return advantage, value_loss

    def compute_policy_loss_kl(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (KL Constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits
        state_adv = torch.FloatTensor(state_adv).to(self.device)
        denom = torch.sum(torch.exp(state_adv / beta) * pi_dist)
        new_pi_dist = torch.exp(state_adv / beta) * pi_dist / denom
        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_policy_loss_wass(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (Wasserstein Constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits
        state_adv = torch.FloatTensor(state_adv).to(self.device)
        """Find argmax_j {A(s,aj) - β*d(aj,ai)}."""
        best_j = []
        for i in range(self.action_dim):
            opt_j = 0
            opt_val = state_adv[opt_j] - beta * self.compute_distance(opt_j, i)
            for j in range(self.action_dim):
                cur_val = state_adv[j] - beta * self.compute_distance(j, i)
                if cur_val > opt_val:
                    opt_j = j
                    opt_val = cur_val
            best_j.append(opt_j)

        new_pi_dist = torch.zeros(self.action_dim)
        for j in range(self.action_dim):
            for i in range(self.action_dim):
                if j == best_j[i]:
                    new_pi_dist[j] += pi_dist[i]

        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_distance(self, a1, a2):
        if a1 == a2:
            return 0
        else:
            return 1

    def update(self, value_loss, policy_loss):
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
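A quick toy check of the closed-form KL-constrained update used in compute_policy_loss_kl above, where the target distribution is pi_new proportional to pi_old * exp(A / beta) (illustrative values only):

import torch

pi_old = torch.tensor([0.25, 0.25, 0.50])
adv = torch.tensor([1.0, 0.0, -1.0])
beta = 1.0

weights = torch.exp(adv / beta) * pi_old
pi_new = weights / weights.sum()  # renormalize into a valid distribution
print(pi_new)  # probability mass shifts toward the high-advantage action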
class DecoupledWorker(mp.Process):
    def __init__(self, id, env, gamma, global_value_network,
                 global_policy_network, global_value_optimizer,
                 global_policy_optimizer, global_episode, GLOBAL_MAX_EPISODE):
        super(DecoupledWorker, self).__init__()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.name = "w%i" % id

        self.env = env
        self.env.seed(id)
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.local_value_network = ValueNetwork(self.obs_dim, 1)
        self.local_policy_network = PolicyNetwork(self.obs_dim,
                                                  self.action_dim)

        self.global_value_network = global_value_network
        self.global_policy_network = global_policy_network
        self.global_episode = global_episode
        self.global_value_optimizer = global_value_optimizer
        self.global_policy_optimizer = global_policy_optimizer
        self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE

        # sync local networks with global networks
        self.sync_with_global()

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.local_policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0]
                                    for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory
                                    ]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2]
                                     for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory
                                         ]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory
                                   ]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\
             * rewards[j:]) for j in range(rewards.size(0))]  # sorry, not the most readable code.
        value_targets = rewards.view(
            -1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(
                self.device)

        # compute value loss
        values = self.local_value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.local_policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))  # H(p) = -sum_i p_i log p_i
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(
            -1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update_global(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.global_value_optimizer.zero_grad()
        value_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(
                self.local_value_network.parameters(),
                self.global_value_network.parameters()):
            global_params._grad = local_params._grad
        self.global_value_optimizer.step()

        self.global_policy_optimizer.zero_grad()
        policy_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(
                self.local_policy_network.parameters(),
                self.global_policy_network.parameters()):
            global_params._grad = local_params._grad
            #print(global_params._grad)
        self.global_policy_optimizer.step()

    def sync_with_global(self):
        self.local_value_network.load_state_dict(
            self.global_value_network.state_dict())
        self.local_policy_network.load_state_dict(
            self.global_policy_network.state_dict())

    def run(self):
        state = self.env.reset()
        trajectory = []  # [[s, a, r, s', done], [], ...]
        episode_reward = 0

        while self.global_episode.value < self.GLOBAL_MAX_EPISODE:
            action = self.get_action(state)
            next_state, reward, done, _ = self.env.step(action)
            trajectory.append([state, action, reward, next_state, done])
            episode_reward += reward

            if done:
                with self.global_episode.get_lock():
                    self.global_episode.value += 1
                print(self.name + " | episode: " +
                      str(self.global_episode.value) + " " +
                      str(episode_reward))

                self.update_global(trajectory)
                self.sync_with_global()

                trajectory = []
                episode_reward = 0
                state = self.env.reset()
            else:
                state = next_state
class TRPOAgent:

    TRAJECTORY_SIZE = 1024

    VF_BATCHSIZE = 64

    MAX_KL = 0.01

    GAMMA = 0.99

    GAE_LAMBDA = 0.98

    ENV_ID = "Pendulum-v0"

    OBS_SPACE = 3

    ACTION_SPACE = 1

    def __init__(self):

        self.policy = PolicyNetwork(action_space=self.ACTION_SPACE)

        self.value_network = ValueNetwork()

        self.env = gym.make(self.ENV_ID)

        self.global_steps = 0

        self.history = []

        self.hiscore = None

    def play(self, n_iters):

        self.epi_reward = 0

        self.epi_steps = 0

        self.state = self.env.reset()

        for _ in range(n_iters):

            trajectory = self.generate_trajectory()

            trajectory = self.compute_advantage(trajectory)

            self.update_policy(trajectory)

            self.update_vf(trajectory)

        return self.history

    def generate_trajectory(self):
        """generate trajectory on current policy
        """

        trajectory = {
            "s":
            np.zeros((self.TRAJECTORY_SIZE, self.OBS_SPACE), dtype=np.float32),
            "a":
            np.zeros((self.TRAJECTORY_SIZE, self.ACTION_SPACE),
                     dtype=np.float32),
            "r":
            np.zeros((self.TRAJECTORY_SIZE, 1), dtype=np.float32),
            "s2":
            np.zeros((self.TRAJECTORY_SIZE, self.OBS_SPACE), dtype=np.float32),
            "done":
            np.zeros((self.TRAJECTORY_SIZE, 1), dtype=np.float32)
        }

        state = self.state

        for i in range(self.TRAJECTORY_SIZE):

            action = self.policy.sample_action(state)

            next_state, reward, done, _ = self.env.step(action)

            trajectory["s"][i] = state

            trajectory["a"][i] = action

            trajectory["r"][i] = reward

            trajectory["s2"][i] = next_state

            trajectory["done"][i] = done

            self.epi_reward += reward

            self.epi_steps += 1

            self.global_steps += 1

            if done:
                state = self.env.reset()

                self.history.append(self.epi_reward)

                recent_score = sum(self.history[-10:]) / 10

                print("====" * 5)
                print("Episode:", len(self.history))
                print("Episode reward:", self.epi_reward)
                print("Global steps:", self.global_steps)

                if len(self.history) > 100 and (self.hiscore is None or
                                                recent_score > self.hiscore):
                    print("*HISCORE UPDATED:", recent_score)
                    self.save_model()
                    self.hiscore = recent_score

                self.epi_reward = 0

                self.epi_steps = 0

            else:
                state = next_state

        self.state = state

        return trajectory

    def compute_advantage(self, trajectory):
        """Compute

        Args:
            trajectory ([type]): [description]
        """

        trajectory["vpred"] = self.value_network(trajectory["s"]).numpy()

        trajectory["vpred_next"] = self.value_network(trajectory["s2"]).numpy()

        is_nonterminals = 1 - trajectory["done"]

        deltas = trajectory["r"] + self.GAMMA * is_nonterminals * trajectory[
            "vpred_next"] - trajectory["vpred"]

        advantages = np.zeros_like(deltas, dtype=np.float32)

        lastgae = 0
        for i in reversed(range(len(deltas))):
            lastgae = deltas[
                i] + self.GAMMA * self.GAE_LAMBDA * is_nonterminals[i] * lastgae
            advantages[i] = lastgae
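
        # Illustrative note (not in the original): the loop above is the standard
        # GAE recursion,
        #   delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
        #   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1},
        # which for lambda = 1 telescopes to the bootstrapped discounted return
        # minus V(s_t).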

        trajectory["adv"] = (advantages -
                             advantages.mean()) / (advantages.std() + 1e-8)
        #trajectory["adv"] = advantages

        trajectory["vftarget"] = trajectory["adv"] + trajectory["vpred"]

        return trajectory

    def update_policy(self, trajectory):
        def flattengrads(grads):
            flatgrads_list = [
                tf.reshape(grad, shape=[1, -1]) for grad in grads
            ]
            flatgrads = tf.concat(flatgrads_list, axis=1)
            return flatgrads

        actions = tf.convert_to_tensor(trajectory["a"], dtype=tf.float32)
        states = tf.convert_to_tensor(trajectory["s"], dtype=tf.float32)
        advantages = tf.convert_to_tensor(trajectory["adv"], dtype=tf.float32)

        old_means, old_stdevs = self.policy(states)
        old_logp = compute_logprob(old_means, old_stdevs, actions)

        with tf.GradientTape() as tape:
            new_means, new_stdevs = self.policy(states)
            new_logp = compute_logprob(new_means, new_stdevs, actions)

            loss = tf.exp(new_logp - old_logp) * advantages
            loss = tf.reduce_mean(loss)

        g = tape.gradient(loss, self.policy.trainable_variables)
        g = tf.transpose(flattengrads(g))

        @tf.function
        def hvp_func(vector):
            """Compute hessian-vector product
            """
            with tf.GradientTape() as t2:
                with tf.GradientTape() as t1:
                    new_means, new_stdevs = self.policy(states)
                    kl = compute_kl(old_means, old_stdevs, new_means,
                                    new_stdevs)
                    meankl = tf.reduce_mean(kl)

                kl_grads = t1.gradient(meankl, self.policy.trainable_variables)
                kl_grads = flattengrads(kl_grads)
                grads_vector_product = tf.matmul(kl_grads, vector)

            hvp = t2.gradient(grads_vector_product,
                              self.policy.trainable_variables)
            hvp = tf.transpose(flattengrads(hvp))

            return hvp + vector * 1e-2  #: add a small damping term to stabilize conjugate gradient

        step_direction = cg(hvp_func, g)

        shs = tf.matmul(tf.transpose(step_direction), hvp_func(step_direction))
        lm = tf.sqrt(2 * self.MAX_KL / shs)
        fullstep = lm * step_direction

        expected_improve = tf.matmul(tf.transpose(g), fullstep)
        fullstep = restore_shape(fullstep, self.policy.trainable_variables)

        params_old = [var.numpy() for var in self.policy.trainable_variables]
        old_loss = loss

        for stepsize in [0.5**i for i in range(10)]:
            params_new = [
                p + step * stepsize for p, step in zip(params_old, fullstep)
            ]
            self.policy.set_weights(params_new)

            new_means, new_stdevs = self.policy(states)
            new_logp = compute_logprob(new_means, new_stdevs, actions)

            new_loss = tf.reduce_mean(tf.exp(new_logp - old_logp) * advantages)
            improve = new_loss - old_loss

            kl = compute_kl(old_means, old_stdevs, new_means, new_stdevs)
            mean_kl = tf.reduce_mean(kl)

            print(f"Expected: {expected_improve} Actual: {improve}")
            print(f"KL {mean_kl}")

            if mean_kl > self.MAX_KL * 1.5:
                print("violated KL constraint. shrinking step.")
            elif improve < 0:
                print("surrogate didn't improve. shrinking step.")
            else:
                print("Stepsize OK!")
                break
        else:
            print("更新に失敗")
            self.policy.set_weights(params_old)

    def update_vf(self, trajectory):

        for _ in range(self.TRAJECTORY_SIZE // self.VF_BATCHSIZE):

            indx = np.random.choice(self.TRAJECTORY_SIZE,
                                    self.VF_BATCHSIZE,
                                    replace=True)

            with tf.GradientTape() as tape:
                vpred = self.value_network(trajectory["s"][indx])
                vtarget = trajectory["vftarget"][indx]
                loss = tf.reduce_mean(tf.square(vtarget - vpred))

            variables = self.value_network.trainable_variables
            grads = tape.gradient(loss, variables)
            self.value_network.optimizer.apply_gradients(zip(grads, variables))

    def save_model(self):

        self.policy.save_weights("checkpoints/actor")

        self.value_network.save_weights("checkpoints/critic")

        print()
        print("Model Saved")
        print()

    def load_model(self):

        self.policy.load_weights("checkpoints/actor")

        self.value_network.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):

        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir,
                                   force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):

            total_reward = 0

            steps = 0

            done = False

            state = env.reset()

            while not done:

                action = self.policy.sample_action(state)

                next_state, reward, done, _ = env.step(action)

                state = next_state

                total_reward += reward

                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print(f"Steps:", steps)
            print()
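
The cg() call in update_policy above, like compute_logprob, compute_kl, and restore_shape, refers to a helper defined elsewhere in the project; the sketch below is only a minimal, compatible conjugate-gradient implementation under the assumed signature cg(hvp_func, b) with column-vector TensorFlow tensors:

import tensorflow as tf


def cg(hvp_func, b, iters=10, residual_tol=1e-10):
    """Approximately solve H x = b given a Hessian-vector product function."""
    x = tf.zeros_like(b)
    r = tf.identity(b)                     # residual b - H x, with x = 0
    p = tf.identity(r)                     # search direction
    rdotr = tf.reduce_sum(r * r)
    for _ in range(iters):
        Hp = hvp_func(p)
        alpha = rdotr / (tf.reduce_sum(p * Hp) + 1e-8)
        x = x + alpha * p
        r = r - alpha * Hp
        new_rdotr = tf.reduce_sum(r * r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x
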
Exemple #23
class SACAgent:
    def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr,
                 buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [0, 250]
        self.obs_dim = env.state_dim
        self.action_dim = env.action_dim

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        # entropy temperature
        self.alpha = alpha
        self.target_entropy = -torch.prod(
            torch.Tensor([self.action_dim, 1]).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0
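        # e.g., with action_range = [0, 250]: a tanh output of -1 maps to 0,
        # 0 maps to 125, and 1 maps to 250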

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        next_q_target = torch.min(next_q1, next_q2) - self.alpha * next_log_pi
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update q networks
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy network and target q networks
        new_actions, log_pi = self.policy_net.sample(states)
        if self.update_step % self.delay_step == 0:
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (self.alpha * log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_q_net1.parameters(),
                                           self.q_net1.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

            for target_param, param in zip(self.target_q_net2.parameters(),
                                           self.q_net2.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        # update temperature
        alpha_loss = (self.log_alpha *
                      (-log_pi - self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp()

        self.update_step += 1
class SACAgent():
    def __init__(self, env: object, gamma: float, tau: float,
                 buffer_maxlen: int, critic_lr: float, actor_lr: float,
                 reward_scale: int):

        # Selecting the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Creating the Gym environments for training and evaluation
        self.env = env
        # Get max and min values of the action of this environment
        self.action_range = [
            self.env.action_space.low, self.env.action_space.high
        ]
        # Get the dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.reward_scale = reward_scale

        # Scaling and bias factors for the actions: each environment has its own
        # action bounds, so the policy's tanh output in [-1, 1] must be rescaled
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy = PolicyNetwork(self.obs_dim,
                                    self.action_dim).to(self.device)

        # copy weight parameters to the target Q networks
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(),
                                       lr=self.critic_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(),
                                       lr=self.critic_lr)
        self.policy_optimizer = optim.Adam(self.policy.parameters(),
                                           lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int):
        # Sampling experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = self.reward_scale * torch.FloatTensor(rewards).to(
            self.device)  # in SAC we do reward scaling for the sampled rewards
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Please refer to equation (6) in the paper for details
        # Sample actions for the next states (s_t+1) using the current policy
        next_actions, next_log_pi, _, _ = self.policy.sample(
            next_states, self.scale)
        next_actions = self.rescale_action(next_actions)

        # Compute Q(s_t+1,a_t+1) by giving the states and actions to the Q network and choose the minimum from 2 target Q networks
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        min_q = torch.min(next_q1,
                          next_q2)  # find minimum between next_q1 and next_q2

        # Compute the soft target value: min Q(s_t+1, a_t+1) - log_pi
        # (the entropy temperature alpha is folded into the reward scaling here)
        next_q_target = (min_q - next_log_pi)

        # Compute the Q(s_t,a_t) using s_t and a_t from the replay buffer
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)

        # Find expected Q, i.e., r(t) + gamma*next_q_target
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # Compute loss between Q network and expected Q
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update Q network parameters
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # Policy update (computing the loss)
        # Sample new actions for the current states (s_t) using the current policy
        new_actions, log_pi, _, _ = self.policy.sample(states, self.scale)
        new_actions = self.rescale_action(new_actions)

        # Compute Q(s_t,a_t) and choose the minimum from 2 Q networks
        new_q1 = self.q_net1.forward(states, new_actions)
        new_q2 = self.q_net2.forward(states, new_actions)
        min_q = torch.min(new_q1, new_q2)

        # Compute the policy loss, i.e., log_pi - min Q(s_t, a_t), see eq. (7)
        # (again with the temperature folded into the reward scaling)
        policy_loss = (log_pi - min_q).mean()

        # Backpropagate the losses and update policy network parameters
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Updating target networks with soft update using update rate tau
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(self.tau * param +
                                    (1 - self.tau) * target_param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(self.tau * param +
                                    (1 - self.tau) * target_param)

    def get_action(
            self, state: np.ndarray,
            stochastic: bool) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor]:
        # state: the state input to the pi network
        # stochastic: True -> sample a noisy action; False -> use the deterministic (mean) action
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)

        # Get mean and sigma from the policy network
        mean, log_std = self.policy.forward(state)
        std = log_std.exp()

        # Stochastic mode is used for training, non-stochastic mode is used for evaluation
        if stochastic:
            normal = Normal(mean, std)
            z = normal.sample()
            action = torch.tanh(z)
            action = action.cpu().detach().squeeze(0).numpy()
        else:
            # deterministic evaluation: squash the mean of the policy distribution
            # (Normal(mean, 0) is invalid, since the scale must be strictly positive)
            action = torch.tanh(mean)
            action = action.cpu().detach().squeeze(0).numpy()

        # return the rescaled action together with the mean and standard deviation
        # of the action distribution; rescaling is needed because the policy outputs
        # values in [-1, 1] while the environment's action range may be [-n, n]
        return self.rescale_action(action), mean, std

    def rescale_action(self, action: np.ndarray) -> np.ndarray:
        # rescale from the policy's [-1, 1] output to the environment's action range:
        # scale -> elementwise multiplication, bias -> elementwise offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # save the policy model for each node
        print("Save the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"
        torch.save(self.policy.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # load the policy model for each node
        print("load the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"  # Best
        self.policy = PolicyNetwork(self.obs_dim,
                                    self.action_dim).to(self.device)
        self.policy.load_state_dict(torch.load(savePath))
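
A minimal interaction-loop sketch for the agent above (illustrative only: the environment id, hyperparameters, and the replay-buffer push signature are assumptions, not part of the original listing):

# env = gym.make("Hopper-v2")
# agent = SACAgent(env, gamma=0.99, tau=0.005, buffer_maxlen=100_000,
#                  critic_lr=3e-4, actor_lr=3e-4, reward_scale=5)
# state = env.reset()
# for step in range(100_000):
#     action, _, _ = agent.get_action(state, stochastic=True)
#     next_state, reward, done, _ = env.step(action)
#     agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed API
#     state = env.reset() if done else next_state
#     if step > 1_000:
#         agent.update(batch_size=64)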
Exemple #25
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value Loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        #delayed update for policy net and target value nets
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
class PPOAgent:

    GAMMA = 0.99

    GAE_LAMBDA = 0.95

    CLIPRANGE = 0.2

    OPT_ITER = 20

    BATCH_SIZE = 2048

    def __init__(self,
                 env_id,
                 action_space,
                 trajectory_size=256,
                 n_envs=1,
                 max_timesteps=1500):

        self.env_id = env_id

        self.n_envs = n_envs

        self.trajectory_size = trajectory_size

        self.vecenv = VecEnv(env_id=self.env_id,
                             n_envs=self.n_envs,
                             max_timesteps=max_timesteps)

        self.policy = PolicyNetwork(action_space=action_space)

        self.old_policy = PolicyNetwork(action_space=action_space)

        self.critic = CriticNetwork()

        self.r_running_stats = util.RunningStats(shape=(action_space, ))

        self._init_network()

    def _init_network(self):

        env = gym.make(self.env_id)

        state = np.atleast_2d(env.reset())

        self.policy(state)

        self.old_policy(state)

    def run(self, n_updates, logdir):

        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        history = {"steps": [], "scores": []}

        states = self.vecenv.reset()

        hiscore = None

        for epoch in range(n_updates):

            for _ in range(self.trajectory_size):

                actions = self.policy.sample_action(states)

                next_states = self.vecenv.step(actions)

                states = next_states

            trajectories = self.vecenv.get_trajectories()

            for trajectory in trajectories:
                self.r_running_stats.update(trajectory["r"])

            trajectories = self.compute_advantage(trajectories)

            states, actions, advantages, vtargs = self.create_minibatch(
                trajectories)

            vloss = self.update_critic(states, vtargs)

            self.update_policy(states, actions, advantages)

            global_steps = (epoch + 1) * self.trajectory_size * self.n_envs
            train_scores = np.array([traj["r"].sum() for traj in trajectories])

            if epoch % 1 == 0:
                test_scores, total_steps = self.play(n=1)
                test_scores, total_steps = np.array(test_scores), np.array(
                    total_steps)
                history["steps"].append(global_steps)
                history["scores"].append(test_scores.mean())
                ma_score = sum(history["scores"][-10:]) / 10
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score",
                                      test_scores.mean(),
                                      step=epoch)
                    tf.summary.scalar("test_steps",
                                      total_steps.mean(),
                                      step=epoch)
                print(
                    f"Epoch {epoch}, {global_steps//1000}K, {test_scores.mean()}"
                )

            if epoch // 10 > 10 and (hiscore is None or ma_score > hiscore):
                self.save_model()
                hiscore = ma_score
                print("Model Saved")

            with self.summary_writer.as_default():
                tf.summary.scalar("value_loss", vloss, step=epoch)
                tf.summary.scalar("train_score",
                                  train_scores.mean(),
                                  step=epoch)

        return history

    def compute_advantage(self, trajectories):
        """
            Generalized Advantage Estimation (GAE, 2016)
        """

        for trajectory in trajectories:

            trajectory["v_pred"] = self.critic(trajectory["s"]).numpy()

            trajectory["v_pred_next"] = self.critic(trajectory["s2"]).numpy()

            is_nonterminals = 1 - trajectory["done"]

            normed_rewards = (trajectory["r"] /
                              (np.sqrt(self.r_running_stats.var) + 1e-4))

            deltas = normed_rewards + self.GAMMA * is_nonterminals * trajectory[
                "v_pred_next"] - trajectory["v_pred"]

            advantages = np.zeros_like(deltas, dtype=np.float32)

            lastgae = 0
            for i in reversed(range(len(deltas))):
                lastgae = deltas[
                    i] + self.GAMMA * self.GAE_LAMBDA * is_nonterminals[
                        i] * lastgae
                advantages[i] = lastgae

            trajectory["advantage"] = advantages

            trajectory["R"] = advantages + trajectory["v_pred"]

        return trajectories

    def update_policy(self, states, actions, advantages):

        self.old_policy.set_weights(self.policy.get_weights())

        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))

        for i in range(self.OPT_ITER):

            idx = indices[i]

            old_means, old_stdevs = self.old_policy(states[idx])

            old_logprob = self.compute_logprob(old_means, old_stdevs,
                                               actions[idx])

            with tf.GradientTape() as tape:

                new_means, new_stdevs = self.policy(states[idx])

                new_logprob = self.compute_logprob(new_means, new_stdevs,
                                                   actions[idx])

                ratio = tf.exp(new_logprob - old_logprob)

                ratio_clipped = tf.clip_by_value(ratio, 1 - self.CLIPRANGE,
                                                 1 + self.CLIPRANGE)

                loss_unclipped = ratio * advantages[idx]

                loss_clipped = ratio_clipped * advantages[idx]

                loss = tf.minimum(loss_unclipped, loss_clipped)

                loss = -1 * tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.policy.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.policy.optimizer.apply_gradients(
                zip(grads, self.policy.trainable_variables))

    def update_critic(self, states, v_targs):

        losses = []

        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))

        for i in range(self.OPT_ITER):

            idx = indices[i]

            old_vpred = self.critic(states[idx])

            with tf.GradientTape() as tape:

                vpred = self.critic(states[idx])

                vpred_clipped = old_vpred + tf.clip_by_value(
                    vpred - old_vpred, -self.CLIPRANGE, self.CLIPRANGE)

                loss = tf.maximum(tf.square(v_targs[idx] - vpred),
                                  tf.square(v_targs[idx] - vpred_clipped))

                loss = tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.critic.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.critic.optimizer.apply_gradients(
                zip(grads, self.critic.trainable_variables))

            losses.append(loss)

        return np.array(losses).mean()

    @tf.function
    def compute_logprob(self, means, stdevs, actions):
        """ガウス分布の確率密度関数よりlogp(x)を計算
            logp(x) = -0.5 log(2π) - log(std)  -0.5 * ((x - mean) / std )^2
        """
        logprob = -0.5 * np.log(2 * np.pi)
        logprob += -tf.math.log(stdevs)
        logprob += -0.5 * tf.square((actions - means) / stdevs)
        logprob = tf.reduce_sum(logprob, axis=1, keepdims=True)
        return logprob

    def create_minibatch(self, trajectories):

        states = np.vstack([traj["s"] for traj in trajectories])
        actions = np.vstack([traj["a"] for traj in trajectories])

        advantages = np.vstack([traj["advantage"] for traj in trajectories])

        v_targs = np.vstack([traj["R"] for traj in trajectories])

        return states, actions, advantages, v_targs

    def save_model(self):

        self.policy.save_weights("checkpoints/policy")

        self.critic.save_weights("checkpoints/critic")

    def load_model(self):

        self.policy.load_weights("checkpoints/policy")

        self.critic.load_weights("checkpoints/critic")

    def play(self, n=1, monitordir=None, verbose=False):

        if monitordir:
            env = wrappers.Monitor(gym.make(self.env_id),
                                   monitordir,
                                   force=True,
                                   video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_id)

        total_rewards = []
        total_steps = []

        for _ in range(n):

            state = env.reset()

            done = False

            total_reward = 0

            steps = 0

            while not done:

                steps += 1

                action = self.policy.sample_action(state)

                next_state, reward, done, _ = env.step(action[0])

                if verbose:
                    mean, sd = self.policy(np.atleast_2d(state))
                    print(mean, sd)
                    print(reward)

                total_reward += reward

                if done:
                    break
                else:
                    state = next_state

            total_rewards.append(total_reward)
            total_steps.append(steps)
            print()
            print(total_reward, steps)
            print()

        return total_rewards, total_steps
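
A minimal usage sketch for the PPO agent above (illustrative values; VecEnv, PolicyNetwork, CriticNetwork and util.RunningStats come from the surrounding project):

# agent = PPOAgent(env_id="Pendulum-v0", action_space=1,
#                  trajectory_size=256, n_envs=4)
# history = agent.run(n_updates=500, logdir="logs/ppo")
# agent.play(n=3, monitordir="videos")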
Exemple #28
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        #self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  #1

        self.conv_channels = 4
        self.kernel_size = (3, 3)

        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        #print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2],
                                            self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net init'd successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size,
                                             1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size,
                                        self.action_dim).to(self.device)

        print("Finished initing all nets")

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        print("Finished initing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")

    def get_action(self, state):
        if state.shape != self.img_size:
            print(
                f"Invalid size, expected shape {self.img_size}, got {state.shape}"
            )
            return None

        inp = torch.from_numpy(state).float().permute(2, 0, 1).unsqueeze(0).to(
            self.device)
        features = self.feature_net(inp)
        features = features.view(-1, self.input_size)

        mean, log_std = self.policy_net.forward(features)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)

        # states and next states are lists of ndarrays, np.stack converts them to
        # ndarrays of shape (batch_size, height, width, num_channels)
        states = np.stack(states)
        next_states = np.stack(next_states)

        states = torch.FloatTensor(states).permute(0, 3, 1, 2).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).permute(0, 3, 1,
                                                             2).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Process images
        features = self.feature_net(
            states)  #.contiguous() # Properly shaped due to batching
        next_features = self.feature_net(next_states)  #.contiguous()

        # flatten the convolutional features to (batch_size, input_size)
        features = torch.reshape(features, (states.size(0), self.input_size))
        next_features = torch.reshape(next_features,
                                      (next_states.size(0), self.input_size))

        next_actions, next_log_pi = self.policy_net.sample(next_features)
        next_q1 = self.q_net1(next_features, next_actions)
        next_q2 = self.q_net2(next_features, next_actions)
        next_v = self.target_value_net(next_features)

        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(features)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        curr_q1 = self.q_net1.forward(features, actions)
        curr_q2 = self.q_net2.forward(features, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward(retain_graph=True)
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward(retain_graph=True)
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward(retain_graph=True)
        self.q2_optimizer.step()

        # delayed update for policy network and target q networks
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(features)
            min_q = torch.min(self.q_net1.forward(features, new_actions),
                              self.q_net2.forward(features, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward(retain_graph=True)
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
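
A minimal usage sketch for the image-based agent above (illustrative only; the environment, the replay-buffer API, and the batch handling are assumptions):

# frame = env.reset()                              # ndarray of shape (500, 500, 3)
# action = agent.get_action(frame)                 # returns None on a shape mismatch
# next_frame, reward, done, _ = env.step(action)
# agent.replay_buffer.push(frame, action, reward, next_frame, done)  # assumed API
# if len(agent.replay_buffer) > batch_size:        # assumed __len__ support
#     agent.update(batch_size)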