Code example #1
import numpy as np  # used below for reshaping and stacking experience arrays

# Actor, Critic, OUNoise and ReplayBuffer are assumed to be defined in the
# accompanying project modules.
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.name = "DDPG"
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 'actor_local')
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  'actor_target')

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   'critic_local')
        self.critic_target = Critic(self.state_size, self.action_size,
                                    'critic_target')

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Reward and step counters
        self.total_reward = 0
        self.n_steps = 0

    def load(self):
        self.actor_local.load()
        self.actor_target.load()
        self.critic_local.load()
        self.critic_target.load()
        print("Agent's weights loaded from disk.")

    def save(self):
        self.actor_local.save()
        self.actor_target.save()
        self.critic_local.save()
        self.critic_target.save()
        print("Agent's weights saved to disk.")

    def reset_episode(self):
        self.total_reward = 0
        self.n_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Add reward to total
        self.total_reward += reward
        self.n_steps += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

    def act(self, state, add_noise=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Disabled hack: rescale rotor revs to roughly a +-50 band around their mean
        # rev_mean = np.mean(action)
        # action = (action-450)/450
        # action *= 50
        # action += rev_mean

        if add_noise:
            action += self.noise.sample()  # additive noise for exploration
        return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
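
Usage sketch (not part of the original listing): a minimal episode loop for the DDPG agent above. It assumes a hypothetical task object that exposes the attributes read in __init__ (state_size, action_size, action_low, action_high), a reset() method, and a step(action) method returning (next_state, reward, done).

num_episodes = 500  # hypothetical training budget
agent = DDPG(task)  # `task` is the assumed environment wrapper described above

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # resets the OU noise process and the environment
    done = False
    while not done:
        action = agent.act(state)                     # policy action plus exploration noise
        next_state, reward, done = task.step(action)  # assumed environment interface
        agent.step(action, reward, next_state, done)  # store experience and learn
        state = next_state
    print("Episode {:4d} | reward: {:8.3f} | steps: {}".format(
        i_episode, agent.total_reward, agent.n_steps))

agent.save()  # persist all four networks via save() above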
Code example #2
import os
from collections import deque

import torch
import torch.nn as nn

# Actor, LyapunovCritic, set_seed, hard_update, soft_update, start_grad,
# stop_grad, SCALE_lambda_MIN_MAX and SCALE_beta_MIN_MAX are assumed to come
# from the accompanying project modules.
class CAC(object):
    def __init__(self,
                 a_dim,
                 s_dim,
                 variant,
                 action_prior='uniform',
                 max_global_steps=100000):
        """
        a_dim : dimension of action space
        s_dim: state space dimension
        variant: dictionary containing parameters for the algorithms
        """
        ###############################  Model parameters  ####################################
        set_seed(variant['seed'])
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.actor = Actor(input_dim=s_dim,
                           output_dim=a_dim,
                           n_layers=3,
                           layer_sizes=[256, 256, 256],
                           hidden_activation="leakyrelu").to(self.device)
        self.actor_target = Actor(input_dim=s_dim,
                                  output_dim=a_dim,
                                  n_layers=3,
                                  layer_sizes=[256, 256, 256],
                                  hidden_activation="leakyrelu").to(
                                      self.device).eval()
        self.critic = LyapunovCritic(state_dim=s_dim,
                                     action_dim=a_dim,
                                     output_dim=None,
                                     n_layers=2,
                                     layer_sizes=[256, 256],
                                     hidden_activation="leakyrelu").to(
                                         self.device)
        self.critic_target = LyapunovCritic(state_dim=s_dim,
                                            action_dim=a_dim,
                                            output_dim=None,
                                            n_layers=2,
                                            layer_sizes=[256, 256],
                                            hidden_activation="leakyrelu").to(
                                                self.device).eval()

        # copy parameters of the learning network to the target network
        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)
        # disable gradient calculations of the target network
        stop_grad(self.critic_target)
        stop_grad(self.actor_target)
        # self.memory_capacity = variant['memory_capacity']

        ################################ parameters for training ###############################
        self.batch_size = variant[
            'batch_size']  # batch size for learning the actor
        self.gamma = variant['gamma']  # discount factor
        self.tau = variant['tau']  # smoothing parameter for the weight updates
        self.approx_value = variant.get('approx_value', True)
        self._action_prior = action_prior  # prior over action space
        s_dim = s_dim * (variant['history_horizon'] + 1)
        self.a_dim, self.s_dim = a_dim, s_dim
        self.history_horizon = variant[
            'history_horizon']  # horizon to consider for the history
        self.working_memory = deque(maxlen=variant['history_horizon'] +
                                    1)  # memory to store history
        target_entropy = variant['target_entropy']
        if target_entropy is None:
            self.target_entropy = -self.a_dim  #lower bound of the policy entropy
        else:
            self.target_entropy = target_entropy
        self.target_variance = 0.0
        self.finite_horizon = variant['finite_horizon']
        self.soft_predict_horizon = variant['soft_predict_horizon']
        self.use_lyapunov = variant['use_lyapunov']
        self.adaptive_alpha = variant['adaptive_alpha']
        self.adaptive_beta = variant.get('adaptive_beta', False)
        self.time_near = variant['Time_near']
        self.max_global_steps = max_global_steps
        self.LR_A = variant['lr_a']
        self.LR_L = variant['lr_l']
        self.LR_lag = self.LR_A / 10
        self.alpha3 = variant['alpha3']

        labda = variant['labda']  # formula (12) in the paper
        alpha = variant['alpha']  # entropy temperature (beta in the paper)
        beta = variant['beta']  # constraint error weight

        self.log_labda = torch.log(torch.tensor([labda], device=self.device))
        self.log_alpha = torch.log(torch.tensor(
            [alpha], device=self.device))  # Entropy Temperature
        self.log_beta = torch.log(torch.tensor([beta], device=self.device))
        self.log_alpha.requires_grad = True
        self.log_beta.requires_grad = True
        self.log_labda.requires_grad = True
        # The update is in log space
        self.labda = torch.clamp(torch.exp(self.log_labda),
                                 min=SCALE_lambda_MIN_MAX[0],
                                 max=SCALE_lambda_MIN_MAX[1])
        self.alpha = torch.exp(self.log_alpha)
        self.beta = torch.clamp(torch.exp(self.log_beta),
                                min=SCALE_beta_MIN_MAX[0],
                                max=SCALE_beta_MIN_MAX[1])

        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.LR_A)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.LR_L)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.LR_A)
        self.labda_optim = torch.optim.Adam([self.log_labda], lr=self.LR_lag)
        self.beta_optim = torch.optim.Adam([self.log_beta], lr=0.01)

        # step_fn = lambda i : 1.0 - (i - 1.)/self.max_global_steps
        # self.actor_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.actor_optim, lr_lambda = step_fn)
        # self.critic_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.critic_optim, lr_lambda = step_fn)
        # self.alpha_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.alpha_optim, lr_lambda = step_fn)
        # self.labda_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.labda_optim, lr_lambda = step_fn)
        # self.beta_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.beta_optim, lr_lambda = step_fn)

        self.actor.float()
        self.critic.float()

    def act(self, s, evaluation=False):
        a, deterministic_a, _, _ = self.actor(s)
        if evaluation:
            return deterministic_a
        else:
            return a

    def learn(self, batch):

        bs = torch.tensor(batch['s'],
                          dtype=torch.float).to(self.device)  # state
        ba = torch.tensor(batch['a'],
                          dtype=torch.float).to(self.device)  # action
        br = torch.tensor(batch['r'],
                          dtype=torch.float).to(self.device)  # reward
        bterminal = torch.tensor(batch['terminal'],
                                 dtype=torch.float).to(self.device)
        bs_ = torch.tensor(batch['s_'],
                           dtype=torch.float).to(self.device)  # next state
        b_s = torch.tensor(batch['_s'],
                           dtype=torch.float).to(self.device)  # prev state
        bv = None
        b_r_ = None
        # print(bs)
        alpha_loss = None
        beta_loss = None

        # # beta learning
        # self.beta_optim.zero_grad()
        # beta_loss = self.get_beta_loss(b_s)
        # if self.adaptive_beta:
        #     beta_loss.backward(retain_graph = False)
        #     self.beta_optim.step()
        # else:
        #     self.beta_optim.zero_grad()

        # lyapunov learning
        start_grad(self.critic)
        if self.finite_horizon:
            bv = torch.tensor(batch['value'])
            b_r_ = torch.tensor(batch['r_N_'])

        self.critic_optim.zero_grad()
        critic_loss = self.get_lyapunov_loss(bs, bs_, ba, br, b_r_, bv,
                                             bterminal)
        critic_loss.backward()
        self.critic_optim.step()

        # actor learning
        stop_grad(self.critic)
        self.actor_optim.zero_grad()
        actor_loss = self.get_actor_loss(bs, bs_, ba, br)
        actor_loss.backward(retain_graph=False)
        self.actor_optim.step()

        # alpha learning
        if self.adaptive_alpha:
            self.alpha_optim.zero_grad()
            alpha_loss = self.get_alpha_loss(bs, self.target_entropy)
            alpha_loss.backward(retain_graph=False)
            self.alpha_optim.step()
            self.alpha = torch.exp(self.log_alpha)
        # labda learning
        self.labda_optim.zero_grad()
        labda_loss = self.get_labda_loss(br, bs, bs_, ba)
        # print("labda loss = ", labda_loss)
        labda_loss.backward(retain_graph=False)
        self.labda_optim.step()
        self.labda = torch.clamp(torch.exp(self.log_labda),
                                 min=SCALE_lambda_MIN_MAX[0],
                                 max=SCALE_lambda_MIN_MAX[1])

        # update target networks
        soft_update(self.critic_target, self.critic, self.tau)
        soft_update(self.actor_target, self.actor, self.tau)
        return alpha_loss, beta_loss, labda_loss, actor_loss, critic_loss

    def get_alpha_loss(self, s, target_entropy):

        # with torch.no_grad():
        #     _, self.deterministic_a,self.log_pis, _ = self.actor_target(s)
        intermediate = (self.log_pis + target_entropy).detach()
        # self.a, self.deterministic_a, self.log_pis, _ = self.actor(s)
        # print(self.a)

        return -torch.mean(self.log_alpha * intermediate)

    def get_labda_loss(self, r, s, s_, a):
        # with torch.no_grad():
        #     l = self.critic(s, a)
        #     lya_a_, _, _, _ = self.actor_target(s_)
        #     self.l_ = self.critic_target(s_, lya_a_)
        l = self.l.detach()
        lyapunov_loss = torch.mean(self.l_ - l + self.alpha3 * r)
        return -torch.mean(self.log_labda * lyapunov_loss)

    def get_beta_loss(self, _s):
        with torch.no_grad():
            _, _deterministic_a, _, _ = self.actor_target(_s)
        self.l_action = torch.mean(
            torch.norm(_deterministic_a.detach() - self.deterministic_a,
                       dim=1))
        with torch.no_grad():
            intermediate = (self.l_action - 0.02).detach()
        return -torch.mean(self.log_beta * intermediate)

    def get_actor_loss(self, s, s_, a, r):
        if self._action_prior == 'normal':
            policy_prior = torch.distributions.MultivariateNormal(
                loc=torch.zeros(self.a_dim),
                covariance_matrix=torch.diag(torch.ones(self.a_dim)))
            policy_prior_log_probs = policy_prior.log_prob(self.a)
        elif self._action_prior == 'uniform':
            policy_prior_log_probs = 0.0

        # only actor weights are updated!
        _, self.deterministic_a, self.log_pis, _ = self.actor(s)
        # self.l = self.critic(s, a)
        with torch.no_grad():
            # self.l = self.critic(s, a)
            lya_a_, _, _, _ = self.actor(s_)
            self.l_ = self.critic(s_, lya_a_)
        l = self.l.detach()
        self.lyapunov_loss = torch.mean(self.l_ - l + self.alpha3 * r)
        labda = self.labda.detach()
        alpha = self.alpha.detach()
        a_loss = labda * self.lyapunov_loss + alpha * torch.mean(
            self.log_pis) - policy_prior_log_probs
        return a_loss

    def get_lyapunov_loss(self, s, s_, a, r, r_n_=None, v=None, terminal=0.):
        with torch.no_grad():
            a_, _, _, _ = self.actor_target(s_)
            l_ = self.critic_target(s_, a_)
        self.l = self.critic(s, a)
        if self.approx_value:
            if self.finite_horizon:
                if self.soft_predict_horizon:
                    l_target = r - r_n_ + l_
                else:
                    l_target = v
            else:
                l_target = r + self.gamma * (
                    1 - terminal
                ) * l_  # Lyapunov critic - self.alpha * next_log_pis
        else:
            l_target = r
        mse_loss = nn.MSELoss()
        l_loss = mse_loss(self.l, l_target)

        return l_loss

    def save_result(self, path):
        if not os.path.exists(path + "/policy/"):
            os.mkdir(path + "/policy/")
        self.actor_target.save(path + "/policy/actor_target.pth")
        self.critic_target.save(path + "/policy/critic_target.pth")
        self.actor.save(path + "/policy/actor.pth")
        self.critic.save(path + "/policy/critic.pth")
        print("Save to path: ", path + "/policy/")

    def restore(self, path):
        result_path = path
        if not os.path.exists(result_path):
            raise IOError("Results path " + result_path +
                          " does not contain anything to load")
        self.actor_target.load(result_path + "/actor_target.pth")
        self.critic_target.load(result_path + "/critic_target.pth")
        self.actor.load(result_path + "/actor.pth")
        self.critic.load(result_path + "/critic.pth")
        success_load = True
        print("Load successful, model file from ", result_path)
        print("#########################################################")
        return success_load

    def scheduler_step(self):
        # Note: requires the LR schedulers that are commented out in __init__;
        # without them these attributes do not exist and this raises AttributeError.
        self.alpha_scheduler.step()
        self.beta_scheduler.step()
        self.labda_scheduler.step()
        self.actor_scheduler.step()
        self.critic_scheduler.step()
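
Usage sketch (not part of the original listing): the batch layout that learn() above expects, illustrated with random NumPy arrays. agent is assumed to be an already constructed CAC instance (building the variant dictionary with every key read in __init__ is omitted here), and batch_size, s_dim and a_dim are hypothetical placeholders. If finite_horizon is enabled, the batch additionally needs 'value' and 'r_N_' entries.

import numpy as np

batch_size, s_dim, a_dim = 256, 8, 2  # hypothetical sizes for illustration

batch = {
    's': np.random.randn(batch_size, s_dim).astype(np.float32),   # states
    'a': np.random.randn(batch_size, a_dim).astype(np.float32),   # actions
    'r': np.random.randn(batch_size, 1).astype(np.float32),       # rewards
    'terminal': np.zeros((batch_size, 1), dtype=np.float32),      # done flags
    's_': np.random.randn(batch_size, s_dim).astype(np.float32),  # next states
    '_s': np.random.randn(batch_size, s_dim).astype(np.float32),  # previous states
}

# learn() converts every entry with torch.tensor(..., dtype=torch.float),
# so NumPy float arrays of these shapes are all it needs.
alpha_loss, beta_loss, labda_loss, actor_loss, critic_loss = agent.learn(batch)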