Code example #1
import tensorflow as tf

# Actor, Critic and the Model base class are assumed to be defined elsewhere
# in the project.

class A2C(Model):
    def __init__(self,
                 scope_name,
                 env,
                 states_shape,
                 n_actions,
                 channel_min,
                 channel_rate=2):
        super(A2C, self).__init__(scope_name)
        self.scope_name = scope_name
        self.channel_min = channel_min
        self.channel_rate = channel_rate
        self.states_shape = states_shape
        self.n_actions = n_actions
        self.env = env

    def build(self, action_lr=1e-5, critic_lr=1e-5):
        self.actor = Actor('Actor',
                           env=self.env,
                           states_shape=self.states_shape,
                           n_actions=self.n_actions)
        self.actor.build(lr=action_lr)

        self.critic = Critic('Critic',
                             env=self.env,
                             states_shape=self.states_shape,
                             n_features=self.n_actions)
        self.critic.build(lr=critic_lr)

    def predict(self, states):
        # Predict actions for the given states via the actor network
        # (assumes the Actor class exposes a predict() method).
        return self.actor.predict(states)

    def train(self, datas):
        session = tf.get_default_session()
        td_error = self.critic.train(
            datas.states, datas.rewards,
            datas.conditions)  # gradient = grad[r + gamma * V(s_) - V(s)]
        self.actor.train(
            datas.states, datas.actions, td_error,
            datas.conditions)  # true_gradient = grad[logPi(s,a) * td_error]
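A minimal wiring sketch for the A2C class above. The Gym environment, the hyperparameter values, and the collect_batch() helper (anything that returns an object exposing the states / actions / rewards / conditions fields that train() reads) are illustrative assumptions, not part of the class.

import gym
import tensorflow as tf

env = gym.make("CartPole-v1")
model = A2C("A2C",
            env=env,
            states_shape=env.observation_space.shape,
            n_actions=env.action_space.n,
            channel_min=32)  # channel_min value chosen arbitrarily for the sketch
model.build(action_lr=1e-5, critic_lr=1e-5)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for episode in range(100):
        batch = collect_batch(env, model)  # hypothetical rollout collector
        model.train(batch)
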
Code example #2
import os

import tensorflow as tf
import tensorflow.contrib.eager as tfe

# Actor, Critic, ReplayMemory and update_target_weights are assumed to be
# defined elsewhere in the project.

class DDPG(tf.keras.Model):
    """ DDPG model - continuous action space case
    Args:
        input_dim: shape of input
        action_dim: shape of action
        action_scale: (minimum value of action, maximum value of action)
        memory_size: size of replay memory
        gamma: discount rate
        tau: parameter for soft update
        learning_rate_actor: learning rate for actor network
        learning_rate_critic: learning rate for critic network
        device_name: name of device (normally cpu:0 or gpu:0)
    """
    def __init__(self,
                 input_dim,
                 action_dim,
                 action_scale,
                 memory_size,
                 gamma,
                 tau,
                 learning_rate_actor=1e-3,
                 learning_rate_critic=1e-3,
                 device_name="cpu:0",
                 checkpoint_directory="ckpt/"):
        super(DDPG, self).__init__()
        self.input_dim = input_dim
        self.action_dim = action_dim
        self.action_scale = action_scale
        self.memory_size = memory_size
        self.replay_memory = ReplayMemory(memory_size)
        self.gamma = gamma
        self.tau = tau
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.device_name = device_name

        self.checkpoint_directory = checkpoint_directory
        if not os.path.exists(self.checkpoint_directory):
            os.makedirs(self.checkpoint_directory)

        # actor
        self.actor_active = Actor(self.input_dim,
                                  self.action_dim,
                                  self.action_scale,
                                  name="actor_active")
        self.actor_target = Actor(self.input_dim,
                                  self.action_dim,
                                  self.action_scale,
                                  name="actor_target")
        self.actor_target.trainable = False

        # critic
        self.critic_active = Critic(self.input_dim,
                                    self.action_dim,
                                    name="critic_active")
        self.critic_target = Critic(self.input_dim,
                                    self.action_dim,
                                    name="critic_target")
        self.critic_target.trainable = False

        # optimizer
        self.optimizer_actor = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate_actor)
        self.optimizer_critic = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate_critic)

        # logging
        self.global_step = 0

    def build(self):
        self.actor_active.build()
        self.actor_target.build()
        self.critic_active.build()
        self.critic_target.build()
        self.built = True

    def get_action(self, x):
        """ get action from features
        Args:
            x : input (state) features, shape of input_dim (without batch dimension)
        Returns:
            best action from the actor network
        """
        return self.actor_active.get_action(x)

    def loss_critic(self, X, action, reward, X_next, done):
        """ get critic loss of training batch
        Args:
            X : input features batch, shape of (batch_size, input_shape)
            action : actions batch, shape of (batch_size, action_dim)
            reward : reward batch, shape of (batch_size, 1)
            X_next : next_state features, shape of (batch_size, input_shape)
            done : done signal batch, shape of (batch_size, 1)
        Returns:
            mean squared error for critic q networks
        """
        # calculate target y-value: y = r + gamma * Q_target(s', mu_target(s')) * (1 - done)
        done_0 = 1 - done  # flip done (0.0/1.0 -> 1.0/0.0) so terminal transitions get no bootstrap term
        next_action = self.actor_target(X_next)
        q_targets_next = self.critic_target(X_next, next_action)
        expected_next_return = q_targets_next * done_0
        y = reward + (self.gamma * expected_next_return)
        # calculate active q-value
        q_active = self.critic_active(X, action)

        loss_val = tf.losses.mean_squared_error(labels=y, predictions=q_active)

        return loss_val

    def grad_critic(self, X, action, reward, X_next, done):
        """ get gradient of training batch
        Args:
            X : input features batch, shape of (batch_size, input_shape)
            action : actions batch, shape of (batch_size, action_dim)
            reward : reward batch, shape of (batch_size, 1)
            X_next : next_state features, shape of (batch_size, input_shape)
            done : done signal batch, shape of (batch_size, 1)
        Returns:
            (gradient of critic variables, loss of batch)
        """
        with tfe.GradientTape() as tape:
            loss_val = self.loss_critic(X, action, reward, X_next, done)

        return tape.gradient(loss_val, self.critic_active.variables), loss_val

    def loss_actor(self, X):
        """ get actor loss of training batch
        Args:
            X : input features batch, shape of (batch_size, input_shape)
        Returns:
            -1 * mean q value of policy
        """
        q_active = self.critic_active(X, self.actor_active(X))
        loss_val = -1 * tf.reduce_mean(q_active, axis=0)

        return loss_val

    def grad_actor(self, X):
        """ get gradient of training batch
        Args:
            X : input features batch, shape of (batch_size, input_shape)
        Returns:
            (gradient of actor variables, loss of batch)
        """
        with tfe.GradientTape() as tape:
            loss_val = self.loss_actor(X)

        return tape.gradient(loss_val, self.actor_active.variables), loss_val

    def train(self, X, action, reward, X_next, done):
        """ train mini-batch one step
        Args:
            X : input features batch, shape of (batch_size, input_shape)
            action : actions batch, shape of (batch_size, action_dim)
            reward : reward batch, shape of (batch_size, 1)
            X_next : next_state features, shape of (batch_size, input_shape)
            done : done signal batch, shape of (batch_size, 1)
        """
        with tf.device(self.device_name):
            self.global_step += 1
            grads_critic, loss_critic = self.grad_critic(
                tf.convert_to_tensor(X), tf.convert_to_tensor(action),
                tf.convert_to_tensor(reward), tf.convert_to_tensor(X_next),
                tf.convert_to_tensor(done))
            self.optimizer_critic.apply_gradients(
                zip(grads_critic, self.critic_active.variables))

            grads_actor, loss_actor = self.grad_actor(tf.convert_to_tensor(X))
            self.optimizer_actor.apply_gradients(
                zip(grads_actor, self.actor_active.variables))

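            # Soft-update both target networks toward their active counterparts;
            # the interpolation is controlled by tau (see the class docstring).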
            update_target_weights(self.critic_target.variables,
                                  self.critic_active.variables, self.tau)
            update_target_weights(self.actor_target.variables,
                                  self.actor_active.variables, self.tau)
            return loss_critic, loss_actor

    def save(self):
        """ save current weight of layers
        """
        tfe.Saver(self.variables).save(self.checkpoint_directory,
                                       global_step=self.global_step)
        print("saved step %d in %s" %
              (self.global_step, self.checkpoint_directory))

    def load(self, global_step="latest"):
        """ load saved weights
        Args:
            global_step : specific step to load; if "latest", load the most recent checkpoint
        """
        self.build()
        saver = tfe.Saver(self.variables)
        if global_step == "latest":
            saver.restore(tf.train.latest_checkpoint(
                self.checkpoint_directory))
            self.global_step = int(
                tf.train.latest_checkpoint(
                    self.checkpoint_directory).split('/')[-1][1:])
        else:
            saver.restore(self.checkpoint_directory + "-" + str(global_step))
            self.global_step = global_step
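
A hedged end-to-end sketch of driving the DDPG class above in eager mode. The Pendulum environment, the hyperparameter values, and the ReplayMemory interface used here (push(), sample(), len()) are illustrative assumptions; only build(), get_action(), train() and save() come from the class itself.

import gym
import numpy as np
import tensorflow as tf

tf.enable_eager_execution()  # the class relies on tfe.GradientTape / tfe.Saver

env = gym.make("Pendulum-v0")
agent = DDPG(input_dim=env.observation_space.shape[0],
             action_dim=env.action_space.shape[0],
             action_scale=(env.action_space.low[0], env.action_space.high[0]),
             memory_size=100000,
             gamma=0.99,
             tau=0.005)
agent.build()

state = env.reset()
for step in range(10000):
    action = agent.get_action(state.astype(np.float32))
    next_state, reward, done, _ = env.step(action)
    # push(), sample() and len() are assumed ReplayMemory methods; sample() is
    # expected to return float32 batches with the shapes train() documents.
    agent.replay_memory.push(state, action, reward, next_state, float(done))
    if len(agent.replay_memory) > 64:
        X, a, r, X_next, d = agent.replay_memory.sample(64)
        agent.train(X, a, r, X_next, d)
    state = env.reset() if done else next_state

agent.save()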