Example #1
    def __init__(self, actor, critic, target_estimator, discount_factor,
                 actor_weight, actor_mean):
        """

        :param actor:
        :type actor network.NetworkFunction
        :param critic:
        :type critic network.NetworkFunction
        :param target_estimator:
        :type target_estimator: target_estimate.TargetEstimator
        """
        super(NoisyDPGUpdater, self).__init__(actor, critic, target_estimator,
                                              discount_factor, actor_weight)
        self._actor_mean = actor_mean
        with tf.name_scope("NoisyDPGUpdater"):
            with tf.name_scope("action_mean"):
                self._input_action_mean_gradient = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, self._dim_action],
                    name="input_action_mean_gradient")
                self._actor_mean_loss = tf.reduce_sum(
                    actor_mean.output().op * self._input_action_mean_gradient,
                    axis=1)
                self._actor_mean_loss = -tf.reduce_mean(self._actor_mean_loss)
            self._op_loss = (self._actor_loss + self._actor_mean_loss
                             ) * actor_weight + self._critic_loss
            # self._op_loss = self._actor_mean_loss * actor_weight + self._critic_loss

        self._update_operation = network.MinimizeLoss(
            self._op_loss,
            var_list=self._actor.variables + self._critic.variables +
            self._actor_mean.variables)
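For the extra actor-mean head, the surrogate loss -mean(sum(actor_mean_output * input_action_mean_gradient)) reproduces the deterministic policy gradient: its gradient with respect to the mean output is just the negated, batch-averaged action gradient fed in at update time. A minimal numpy sketch (hypothetical shapes, not part of the library):

import numpy as np

# Hypothetical batch of mean actions and externally supplied critic gradients dQ/da.
batch, dim_action = 4, 2
a_mean = np.random.randn(batch, dim_action)
dq_da = np.random.randn(batch, dim_action)

# Surrogate loss from the snippet: -mean_over_batch(sum_over_dims(a_mean * dQ/da)).
loss = -np.mean(np.sum(a_mean * dq_da, axis=1))

# Its gradient with respect to a_mean is -dq_da / batch, so minimizing the loss
# moves the mean action along the critic's action gradient (ascent on Q).
grad_wrt_a_mean = -dq_da / batch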
Example #2
    def __init__(self,
                 rollout_dist,
                 rollout_action_function,
                 pi_function,
                 entropy=1e-3):
        """
        Policy Net updater
        calculate the loss between action derived from A3C and the policy net

        :param rollout_action:
        :param pi_function:
        :param entropy:
        """
        super(PolicyNetUpdater, self).__init__()
        self._rollout_dist, self._pi_function, self._rollout_action_function = rollout_dist, pi_function, rollout_action_function
        self._entropy = entropy
        with tf.name_scope("PolicyNetUpdater"):
            with tf.name_scope("input"):
                self._input_action = self._rollout_dist.input_sample()

            op_pi = self._pi_function.output().op
            op_mimic_pi = self._rollout_action_function.output().op

            with tf.name_scope("rollout"):
                self._rollout_loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        labels=op_pi, logits=op_mimic_pi),
                    name="rollout_loss")
                # NOTE: self._entropy is stored above but no entropy term is added to the loss below.
                self._entropy_loss = self._rollout_action_function
            self._op_loss = self._rollout_loss

        self._update_operation = network.MinimizeLoss(
            self._op_loss, var_list=self._rollout_action_function.variables)
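The rollout loss is a softmax cross-entropy between the A3C policy output, used as soft labels, and the rollout net's logits. A minimal numpy sketch of the same quantity (hypothetical shapes; assumes pi_function outputs probabilities and rollout_action_function outputs logits, as the labels/logits arguments suggest):

import numpy as np

def log_softmax(logits):
    m = logits.max(axis=-1, keepdims=True)
    return logits - (m + np.log(np.exp(logits - m).sum(axis=-1, keepdims=True)))

# Hypothetical teacher probabilities (A3C policy output) and student (rollout net) logits.
pi_probs = np.exp(log_softmax(np.random.randn(5, 3)))
mimic_logits = np.random.randn(5, 3)

# Same quantity as tf.nn.softmax_cross_entropy_with_logits(labels=pi_probs, logits=mimic_logits),
# averaged over the batch.
rollout_loss = np.mean(-(pi_probs * log_softmax(mimic_logits)).sum(axis=-1))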
Example #3
    def __init__(self, net_se, func, stddev=1.0, stddev_weight=1e-3):
        super(CenterDisentangleUpdater, self).__init__()
        self._stddev = stddev
        state_shape = net_se.inputs[0].shape.as_list()
        se_dimension = net_se["se"].op.shape.as_list()[-1]
        noise_shape = func.inputs[1].shape.as_list()
        with tf.name_scope("input"):
            self._input_state = tf.placeholder(dtype=tf.float32,
                                               shape=state_shape,
                                               name="St")
            self._input_noise = tf.placeholder(dtype=tf.float32,
                                               shape=noise_shape,
                                               name="Nt")
            self._input_stddev = tf.placeholder(dtype=tf.float32,
                                                name="stddev")
        with tf.name_scope("disentangle"):
            net_se_off = net_se([self._input_state], "off_se")
            net_noise_off = func(
                [tf.stop_gradient(net_se_off["se"].op), self._input_noise],
                "off_noise")
            self._noise_op = net_noise_off["noise"].op

            mean = tf.reduce_mean(self._noise_op, axis=0, keep_dims=True)
            mean_loss = tf.reduce_sum(Utils.clipped_square(mean))
            stddev = tf.reduce_mean(
                tf.sqrt(
                    tf.reduce_sum(tf.square(self._noise_op - mean), axis=-1)))
            stddev_loss = Utils.clipped_square(stddev - self._input_stddev *
                                               np.sqrt(se_dimension))
            self._op_loss = mean_loss + stddev_loss * stddev_weight
            self._mean_op, self._stddev_op, self._mean_loss, self._stddev_loss = \
                mean, stddev, mean_loss, stddev_loss
        self._update_operation = network.MinimizeLoss(self._op_loss,
                                                      var_list=func.variables)
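The updater penalizes the batch of noise outputs for drifting away from zero mean and for deviating from a target spread of stddev * sqrt(se_dimension). A minimal numpy sketch of the two penalties (plain squares stand in for clipped_square, whose exact form is not shown here; shapes are hypothetical):

import numpy as np

batch, se_dim = 32, 16
noise = np.random.randn(batch, se_dim)      # hypothetical f_noise outputs
target_stddev = 1.0

# Zero-mean penalty: per-dimension batch mean should be close to 0.
mean = noise.mean(axis=0, keepdims=True)
mean_loss = np.sum(mean ** 2)               # the library applies clipped_square here

# Spread penalty: batch-average L2 norm of (noise - mean) should match
# target_stddev * sqrt(se_dimension).
stddev = np.mean(np.sqrt(np.sum((noise - mean) ** 2, axis=-1)))
stddev_loss = (stddev - target_stddev * np.sqrt(se_dim)) ** 2

loss = mean_loss + 1e-3 * stddev_loss       # stddev_weight default from the snippet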
Example #4
    def __init__(self, inverse_function, policy_dist):
        super(InverseUpdater, self).__init__()
        self._inverse_function, self._policy_dist = inverse_function, policy_dist

        with tf.name_scope("InverseUpdater"):
            with tf.name_scope("input"):
                self._input_action = policy_dist.input_sample()

            op_action_hat = inverse_function.output().op

            # inverse loss calculation
            with tf.name_scope("inverse"):
                # one-hot targets must be float to match the float logits
                depth = op_action_hat.shape.as_list()[1]
                inverse_loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        labels=tf.one_hot(indices=self._input_action,
                                          depth=depth,
                                          on_value=1.0,
                                          off_value=0.0,
                                          axis=-1),
                        logits=op_action_hat))
                # inverse_loss = tf.reduce_mean(
                #     tf.square(
                #         op_action_hat -
                #         tf.one_hot(indices=self._input_action, depth=depth, on_value=1.0, off_value=0.0, axis=-1))
                #     )
                self._inverse_loss = inverse_loss

            self._op_loss = self._inverse_loss

        self._update_operation = network.MinimizeLoss(
            self._op_loss,
            var_list=self._inverse_function.variables +
            self._policy_dist._dist_function.variables)
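The inverse model predicts which action was taken and is trained with softmax cross-entropy against one-hot targets built from the sampled actions. A minimal numpy sketch (hypothetical action count and batch size, not part of the library):

import numpy as np

num_actions, batch = 4, 8
action_logits = np.random.randn(batch, num_actions)        # inverse model output (hypothetical size)
actions = np.random.randint(0, num_actions, size=batch)    # actions actually taken

one_hot = np.eye(num_actions)[actions]                     # float one-hot targets
m = action_logits.max(axis=-1, keepdims=True)
log_p = action_logits - (m + np.log(np.exp(action_logits - m).sum(axis=-1, keepdims=True)))
inverse_loss = np.mean(-(one_hot * log_p).sum(axis=-1))    # softmax cross-entropy, batch mean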
Example #5
    def __init__(self,
                 policy_dist,
                 v_function,
                 target_estimator,
                 entropy=1e-3,
                 actor_weight=1.0):
        """
        Actor Critic methods, for both continuous and discrete action spaces.

        :param policy_dist:
        :type policy_dist: distribution.NNDistribution
        :param v_function: Function calculating state value
        :type v_function: network.NetworkFunction
        :param target_estimator:
        :type target_estimator:
        :param entropy: entropy weight
        """
        super(ActorCriticUpdater, self).__init__()
        self._policy_dist, self._v_function = policy_dist, v_function
        self._target_estimator = target_estimator
        self._entropy = entropy
        with tf.name_scope("ActorCriticUpdater"):
            with tf.name_scope("input"):
                self._input_target_v = tf.placeholder(dtype=tf.float32,
                                                      shape=[None],
                                                      name="input_target_v")
                self._input_action = policy_dist.input_sample()
                self._input_entropy = tf.placeholder(dtype=tf.float32,
                                                     shape=[],
                                                     name="input_entropy")
            op_v = v_function.output().op
            with tf.name_scope("value"):
                td = self._input_target_v - op_v
                self._q_loss = tf.reduce_mean(network.Utils.clipped_square(td))
            with tf.name_scope("policy"):
                advantage = self._input_target_v - op_v
                self._advantage = advantage
                _mean, _var = tf.nn.moments(advantage, axes=[0])
                self._std_advantage = advantage / (tf.sqrt(_var) + 1.0)
                # self._std_advantage = self._advantage
                pi_loss = tf.reduce_mean(self._policy_dist.log_prob() *
                                         tf.stop_gradient(self._std_advantage))
                entropy_loss = tf.reduce_mean(self._input_entropy *
                                              self._policy_dist.entropy())
                self._pi_loss = pi_loss
            # self._op_loss = self._q_loss - (self._pi_loss + entropy_loss)
            self._op_loss = self._q_loss
            print "advantage, self._policy_dist.entropy(), self._policy_dist.log_prob()", advantage, self._policy_dist.entropy(
            ), self._policy_dist.log_prob()
        self._update_operation = network.MinimizeLoss(
            self._op_loss,
            var_list=self._v_function.variables +
            self._policy_dist._dist_function.variables)
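The updater follows the usual advantage actor-critic recipe: a (clipped) squared TD error for the value head, and the log-probability of the sampled action weighted by a standardized, stop-gradient advantage, plus an entropy bonus. Note that in this snippet only the value loss is actually minimized; the combined objective is left in the commented-out line. A minimal numpy sketch of that combined objective (hypothetical batch values; plain squares stand in for clipped_square):

import numpy as np

batch, entropy_weight = 16, 1e-3
target_v = np.random.randn(batch)    # targets from target_estimator (hypothetical values)
v = np.random.randn(batch)           # v_function outputs
log_prob = np.random.randn(batch)    # log pi(a_t | s_t) of the sampled actions
entropy = np.random.rand(batch)      # per-sample policy entropy

td = target_v - v
q_loss = np.mean(td ** 2)            # the library uses clipped_square instead of a plain square

advantage = target_v - v
std_advantage = advantage / (np.sqrt(advantage.var()) + 1.0)   # treated as a constant (stop_gradient)
pi_loss = np.mean(log_prob * std_advantage)
entropy_loss = np.mean(entropy_weight * entropy)

# The combined objective described by the commented-out line in the snippet:
op_loss = q_loss - (pi_loss + entropy_loss)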
Example #6
 def __init__(self,
              policy_dist,
              v_function,
              target_estimator,
              entropy=1e-3,
              max_advantage=10.0):
     """
     :param policy_dist:
     :type policy_dist: distribution.DiscreteDistribution
     :param v_function:
     :type v_function: network.NetworkFunction
     :param target_estimator:
     :type target_estimator:
     """
     super(DiscretePGUpdater, self).__init__()
     self._policy_dist, self._v_function = policy_dist, v_function
     self._target_estimator = target_estimator
     self._entropy = entropy
     self._num_actions = v_function.output().op.shape.as_list()[-1]
     with tf.name_scope("DiscreteActorCriticUpdate"):
         with tf.name_scope("input"):
             self._input_target_v = tf.placeholder(dtype=tf.float32,
                                                   shape=[None],
                                                   name="input_target_q")
             self._input_action = policy_dist.input_sample()
             self._input_entropy = tf.placeholder(dtype=tf.float32,
                                                  shape=[],
                                                  name="input_entropy")
         op_v = v_function.output().op
         with tf.name_scope("policy"):
             advantage = self._input_target_v - op_v
             advantage = tf.clip_by_value(advantage,
                                          -max_advantage,
                                          max_advantage,
                                          name="advantage")
             self._pi_loss = tf.reduce_mean(self._policy_dist.log_prob() *
                                            tf.stop_gradient(advantage))
             entropy_loss = self._input_entropy * tf.reduce_mean(
                 self._policy_dist.entropy())
         self._op_loss = -(self._pi_loss + entropy_loss)
     self._update_operation = network.MinimizeLoss(
         self._op_loss,
         var_list=self._policy_dist.dist_function().variables)
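DiscretePGUpdater differs from ActorCriticUpdater above mainly in clipping the advantage to [-max_advantage, max_advantage] and in minimizing only the negated policy-plus-entropy term (no value loss). A minimal numpy sketch of that clipping and sign convention (hypothetical values, not part of the library):

import numpy as np

max_advantage, entropy_weight = 10.0, 1e-3
advantage = np.array([-25.0, 3.0, 12.0, -0.5])     # hypothetical target_v - v values
log_prob = np.array([-1.2, -0.3, -2.1, -0.9])      # log pi(a_t | s_t)
entropy = np.array([1.1, 0.9, 1.0, 1.2])

clipped = np.clip(advantage, -max_advantage, max_advantage)    # [-10., 3., 10., -0.5]
pi_loss = np.mean(log_prob * clipped)                          # advantage treated as a constant
op_loss = -(pi_loss + entropy_weight * np.mean(entropy))       # negated: minimizing maximizes both terms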
Example #7
    def __init__(self, forward_function, feature_function, policy_dist):
        super(ForwardUpdater, self).__init__()
        self._forward_function, self._feature_function, self._policy_dist = \
            forward_function, feature_function, policy_dist

        with tf.name_scope("ForwardUpdater"):
            op_phi_next_state_hat = forward_function.output().op
            op_phi_next_state = feature_function.output().op

            # forward loss calculation
            with tf.name_scope("forward"):
                forward_loss = 0.05 * tf.reduce_mean(tf.square(
                    tf.subtract(op_phi_next_state_hat, op_phi_next_state)),
                                                     name="forward_loss")
                self._forward_loss = forward_loss
            self._op_loss = self._forward_loss

        self._update_operation = network.MinimizeLoss(
            self._op_loss,
            var_list=self._forward_function.variables +
            self._feature_function.variables)
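The forward model predicts next-state features and is trained on a scaled mean squared error between predicted and encoded features, as in curiosity-style forward models. A minimal numpy sketch (hypothetical feature size, not part of the library):

import numpy as np

phi_next_hat = np.random.randn(8, 64)   # forward_function output (hypothetical feature size)
phi_next = np.random.randn(8, 64)       # feature_function output for the observed next state

# Same form as the snippet: scaled mean squared feature-prediction error.
forward_loss = 0.05 * np.mean((phi_next_hat - phi_next) ** 2)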
Example #8
    def __init__(self, actor, critic, target_estimator, discount_factor, actor_weight):
        """

        :param actor:
        :type actor network.NetworkFunction
        :param critic:
        :type critic network.NetworkFunction
        :param target_estimator:
        :type target_estimator: target_estimate.TargetEstimator
        """
        super(DPGUpdater, self).__init__()
        self._actor, self._critic, self._target_estimator = \
            actor, critic, target_estimator
        self._dim_action = actor.output().op.shape.as_list()[-1]
        op_q = critic.output().op
        with tf.name_scope("DPGUpdater"):
            with tf.name_scope("input"):
                self._input_target_q = tf.placeholder(dtype=tf.float32, shape=[None], name="input_target_q")
                self._input_action_gradient = tf.placeholder(dtype=tf.float32,
                                                             shape=[None, self._dim_action],
                                                             name="input_action_gradient")
            with tf.name_scope("critic"):
                self._critic_loss = tf.reduce_mean(network.Utils.clipped_square(
                    self._input_target_q - op_q
                ))
            with tf.name_scope("actor"):
                # critic.inputs[1] is input_action
                self._action_gradient = tf.gradients(critic.output().op, critic.inputs[1])[0]
                self._gradient_func = network.NetworkFunction(
                    outputs=network.NetworkSymbol(self._action_gradient, "gradient", critic.network),
                    inputs=critic.inputs
                )
                self._actor_loss = tf.reduce_sum(actor.output().op * self._input_action_gradient, axis=1)
                self._actor_loss = -tf.reduce_mean(self._actor_loss)

            self._op_loss = self._actor_loss * actor_weight + self._critic_loss
        self._update_operation = network.MinimizeLoss(self._op_loss,
                                                      var_list=self._actor.variables +
                                                               self._critic.variables)
Example #9
 def __init__(self,
              actor,
              critic,
              f_noise,
              target_estimator,
              discount_factor,
              actor_weight,
              actor_mean,
              zero_mean_weight=1e-2,
              stddev_weight=1e-4):
     super(DisentangleNoisyDPGUpdater,
           self).__init__(actor, critic, target_estimator, discount_factor,
                          actor_weight, actor_mean)
     self._f_noise = f_noise
     self._zero_mean_weight, self._stddev_weight = zero_mean_weight, stddev_weight
     with tf.name_scope("disentangle"):
         self._input_weight_mean = tf.placeholder(dtype=tf.float32,
                                                  name="weight_mean")
         self._input_weight_stddev = tf.placeholder(dtype=tf.float32,
                                                    name="weight_stddev")
          op_a = self._actor.output().op
          op_a_mean = self._actor_mean.output().op
         # pull action mean close to noisy action
         self._zero_mean_loss = network.Utils.clipped_square(
             tf.stop_gradient(op_a) - op_a_mean)
         # push noisy action away from action mean
         self._stddev_loss = -network.Utils.clipped_square(
             f_noise.output().op)
         self._disentangle_loss = tf.reduce_mean(self._zero_mean_loss) * self._input_weight_mean \
                                  + tf.reduce_mean(self._stddev_loss) * self._input_weight_stddev
     self._op_loss = (self._actor_loss + self._actor_mean_loss) * actor_weight + self._critic_loss \
                     + self._disentangle_loss
     self._update_operation = network.MinimizeLoss(
         self._op_loss,
         var_list=self._actor.variables + self._critic.variables +
         self._actor_mean.variables)
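The disentangling terms add two opposing pressures: the zero-mean loss pulls the deterministic mean head toward the (stop-gradient) noisy action, while the stddev loss is the negated clipped square of the noise output, rewarding larger exploration noise. A minimal numpy sketch of the weighting (plain squares stand in for clipped_square, and the noise output is assumed here to be the difference between the two heads, which is only a guess):

import numpy as np

a_noisy = np.random.randn(8, 2)        # actor (noisy) output, treated as constant for this term
a_mean = np.random.randn(8, 2)         # actor_mean output
noise = a_noisy - a_mean               # hypothetical f_noise output

weight_mean, weight_stddev = 1e-2, 1e-4
zero_mean_loss = np.mean((a_noisy - a_mean) ** 2)   # pull the mean head toward the noisy action
stddev_loss = -np.mean(noise ** 2)                  # negative sign: reward larger noise magnitude

disentangle_loss = zero_mean_loss * weight_mean + stddev_loss * weight_stddev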
Example #10
    def __init__(self,
                 net_se,
                 net_transition,
                 net_decoder,
                 state_shape,
                 dim_action,
                 curriculum=None,
                 skip_step=None,
                 transition_weight=0.0,
                 with_momentum=True,
                 compute_with_diff=False,
                 save_image_interval=1000,
                 detailed_decoder=False,
                 with_ob=False,
                 with_goal=True):
        super(EnvModelUpdater, self).__init__()
        if curriculum is None:
            # keep the local names in sync with the attributes so the
            # curriculum loop further down also works with the defaults
            curriculum = [1, 3, 5]
            skip_step = [5000, 15000]
        self._curriculum = curriculum
        self._skip_step = skip_step

        self._depth = self._curriculum[-1]
        self.save_image_interval = save_image_interval
        self._detailed_decoder = detailed_decoder

        if with_ob:
            with_momentum = False

        with tf.name_scope("EnvModelUpdater"):
            with tf.name_scope("input"):
                self._input_action = tf.placeholder(dtype=tf.uint8,
                                                    shape=[None],
                                                    name="input_action")
                self._input_state = tf.placeholder(dtype=tf.float32,
                                                   shape=[None] +
                                                   list(state_shape),
                                                   name="input_state")
                self._input_reward = tf.placeholder(dtype=tf.float32,
                                                    shape=[None],
                                                    name="input_reward")
                self._count = tf.placeholder(dtype=tf.int32, name="count")

            with tf.name_scope("inputs"):
                s0 = self._input_state[:-1]
                state_shape = tf.shape(self._input_state)[1:]

                f0 = s0[:, :, :, -3:]
                logging.warning("s0:%s, f0:%s", s0.shape, f0.shape)
                sn, an, rn, fn = [], [], [], []
                cur_ob = self._input_state[0:-1]
                for i in range(self._depth):
                    sn.append(self._input_state[i + 1:])
                    an.append(self._input_action[i:])
                    rn.append(self._input_reward[i:])
                    fn.append(sn[-1][:, :, :, -3:])

            with tf.name_scope("rollout"):
                ses_predict = []
                goalfrom0_predict = []
                momfrom0_predict = []
                action_relatedfrom0_predict = []
                r_predict = []
                r_predict_loss = []
                f_predict = []
                image_channel = None
                f_predict_loss = []
                transition_loss = []
                relative_transition_loss = []
                momentum_loss = []
                mom_decoder_predict = []
                action_related_decoder_predict = []
                if compute_with_diff:
                    diff_ob = []
                    for i in range(self._input_state.shape[-1] / 3 - 1):
                        diff_ob.append(
                            self._input_state[:, :, :,
                                              (i + 1) * 3:(i + 1) * 3 + 3] -
                            self._input_state[:, :, :, i * 3:i * 3 + 3])
                    ses = net_se([tf.concat(diff_ob[:], axis=3)])["se"].op
                else:
                    ses = net_se([self._input_state])["se"].op
                se0 = ses[:-1]
                sen = []
                for i in range(self._depth):
                    sen.append(ses[i + 1:])
                cur_se = se0
                cur_goal = None
                cur_mom = None
                cur_action_related = None
                se0_truncate, f0_truncate = se0, f0
                flows = []
                flow_regulations = []
                for i in range(self._depth):
                    logging.warning("[%s]: state:%s, action:%s", i,
                                    cur_se.shape, an[i].shape)
                    input_action = tf.one_hot(indices=an[i],
                                              depth=dim_action,
                                              on_value=1.0,
                                              off_value=0.0,
                                              axis=-1)
                    if not with_ob:
                        net_trans = net_transition([cur_se, input_action],
                                                   name_scope="transition_%d" %
                                                   i)
                    else:
                        net_trans = net_transition([cur_ob, input_action],
                                                   name_scope="transition_%d" %
                                                   i)

                    if with_momentum:
                        TM_goal = net_trans["momentum"].op
                        action_related = net_trans["action_related"].op
                        cur_mom = TM_goal if cur_goal is None else cur_goal + TM_goal
                        momfrom0_predict.append(cur_mom)
                        cur_action_related = action_related if cur_goal is None else cur_goal + action_related
                        action_relatedfrom0_predict.append(cur_action_related)
                        cur_se_mom = se0_truncate + cur_mom
                        cur_se_action_related = se0_truncate + cur_action_related
                        momentum_loss.append(
                            tf.reduce_mean(
                                network.Utils.clipped_square(cur_se_mom -
                                                             sen[i])))

                    goal = net_trans["next_state"].op
                    if not with_ob and with_goal:
                        # socalled_state = net_trans["action_related"].op
                        cur_goal = goal if cur_goal is None else tf.stop_gradient(
                            cur_goal) + goal
                        goalfrom0_predict.append(cur_goal)
                        cur_se = se0_truncate + cur_goal
                        # cur_se = socalled_state
                    elif not with_ob and not with_goal:
                        cur_goal = goal
                        cur_se = cur_goal
                    else:
                        cur_se = goal
                        cur_ob = tf.concat([cur_ob[:, :, :, 3:], goal],
                                           axis=-1)

                    ses_predict.append(cur_se)
                    r_predict.append(net_trans["reward"].op)
                    r_predict_loss.append(
                        tf.reduce_mean(
                            network.Utils.clipped_square(r_predict[-1] -
                                                         rn[i])))
                    # f_predict.append(net_decoder([tf.concat([se0, cur_goal], axis=1), f0],
                    #                              name_scope="frame_decoder%d" % i)["next_frame"].op)
                    if detailed_decoder:
                        mom_decoder_predict.append(
                            net_decoder([
                                tf.concat([se0_truncate, cur_se_mom], axis=1),
                                f0_truncate
                            ],
                                        name_scope="mom_decoder%d" %
                                        i)["next_frame"].op)
                        action_related_decoder_predict.append(
                            net_decoder([
                                tf.concat(
                                    [se0_truncate, cur_se_action_related],
                                    axis=1), f0_truncate
                            ],
                                        name_scope="action_related_decoder%d" %
                                        i)["next_frame"].op)

                    if not with_ob:
                        net_decoded = net_decoder(
                            [
                                tf.concat([se0_truncate, cur_goal], axis=1),
                                f0_truncate
                            ],
                            name_scope="frame_decoder%d" % i)
                    else:
                        net_decoded = net_decoder(
                            [cur_se], name_scope="frame_decoder%d" % i)
                    f_predict.append(net_decoded["next_frame"].op)
                    predicted_channel = net_decoded["image_channel"]
                    if predicted_channel is not None and image_channel is None:
                        image_channel = predicted_channel.op
                    frame_2 = net_decoded["frame_2"]
                    frame_losses = []
                    if frame_2 is not None:
                        sub_i = 1
                        while True:
                            sub = "frame_%d" % (2**sub_i)
                            sub_frame = net_decoded[sub]
                            if sub_frame is None:
                                break
                            sub_frame = sub_frame.op
                            frame_losses.append(
                                tf.reduce_mean(
                                    network.Utils.clipped_square(
                                        sub_frame - tf.image.resize_images(
                                            fn[i],
                                            sub_frame.shape.as_list()[1:3]))))
                            sub_i = sub_i + 1
                    flow = net_decoded["flow"]
                    if flow is not None:
                        flow = flow.op
                        flows.append(flow)
                        o1_y = flow[:, :-1, :, :] - flow[:, 1:, :, :]
                        o2_y = o1_y[:, :-1, :, :] - o1_y[:, 1:, :, :]
                        o1_x = flow[:, :, :-1, :] - flow[:, :, 1:, :]
                        o2_x = o1_x[:, :, :-1, :] - o1_x[:, :, 1:, :]
                        l1_y = tf.reduce_mean(tf.abs(o2_y))
                        l1_x = tf.reduce_mean(tf.abs(o2_x))
                        flow_regulations.append(l1_x + l1_y)

                    frame_losses.append(
                        tf.reduce_mean(
                            network.Utils.clipped_square(f_predict[-1] -
                                                         fn[i])))
                    f_predict_loss.append(frame_losses)
                    if not with_ob:
                        mean_se = tf.reduce_mean(sen[i], axis=0)
                        self._se_norm = tf.sqrt(
                            tf.reduce_sum(tf.square(mean_se)))
                        transition_loss.append(
                            tf.reduce_mean(
                                network.Utils.clipped_square(ses_predict[-1] -
                                                             sen[i])))
                        relative_transition_loss.append(
                            tf.reduce_mean(
                                tf.sqrt(
                                    tf.reduce_sum(
                                        tf.square(ses_predict[-1] - sen[i]),
                                        axis=-1)) /
                                # tf.sqrt(tf.reduce_sum(tf.square(sen[i] - mean_se), axis=-1))))
                                tf.sqrt(
                                    tf.reduce_sum(tf.square(sen[i]), axis=-1)
                                )))
                        cur_goal = cur_goal[:-1]
                        cur_se = cur_se[:-1]
                    else:
                        cur_ob = cur_ob[:-1]

                    f0_truncate = f0_truncate[:-1]
                    se0_truncate = se0_truncate[:-1]

                self._reward_loss = []
                self._env_loss = []
                self._transition_loss = []
                self._relative_transition_loss = []
                self._momentum_loss = []
                self._flow_regulation_loss = []
                for i in range(len(curriculum)):
                    self._reward_loss.append(
                        tf.reduce_mean(
                            tf.add_n(r_predict_loss[0:curriculum[i]]) /
                            float(curriculum[i]),
                            name="reward_loss%d" % curriculum[i]) / 2.0)

                    self._env_loss.append(
                        tf.reduce_mean(tf.add_n(
                            reduce(operator.add,
                                   f_predict_loss[0:curriculum[i]], [])) /
                                       float(curriculum[i]),
                                       name="env_loss%d" % curriculum[i]) /
                        2.0 * 255.0)

                    if not with_ob:
                        self._transition_loss.append(
                            tf.reduce_mean(
                                tf.add_n(transition_loss[0:curriculum[i]]) /
                                float(curriculum[i]),
                                name="transition_loss%d" % curriculum[i]))
                        self._relative_transition_loss.append(
                            tf.reduce_mean(tf.add_n(
                                relative_transition_loss[0:curriculum[i]]) /
                                           float(curriculum[i]),
                                           name="transition_loss%d" %
                                           curriculum[i]))
                    else:
                        self._transition_loss.append(0.0)
                        self._relative_transition_loss.append(0.0)

                    if with_momentum:
                        self._momentum_loss.append(
                            tf.reduce_mean(
                                tf.add_n(momentum_loss[0:curriculum[i]]) /
                                float(curriculum[i]),
                                name="momentum_loss%d" % curriculum[i]))
                    else:
                        self._momentum_loss.append(0.0)

                    if len(flow_regulations) > 0:
                        self._flow_regulation_loss.append(
                            tf.reduce_mean(
                                tf.add_n(flow_regulations[0:curriculum[i]]) /
                                float(curriculum[i]),
                                name="flow_loss%d" % curriculum[i]) * 1e-1)
                    else:
                        self._flow_regulation_loss.append(0.0)

                def loss_assign(index):
                    return tf.gather(self._env_loss, index), \
                           tf.gather(self._reward_loss, index), \
                           tf.gather(self._transition_loss, index), \
                           tf.gather(self._relative_transition_loss, index), \
                           tf.gather(self._momentum_loss, index), \
                           tf.gather(self._flow_regulation_loss, index), \
                           self._count

                self._env_loss, self._reward_loss, self._transition_loss, self._relative_transition_loss, \
                self._momentum_loss, self._flow_regulation_loss, self._num = \
                    loss_assign(tf.where(tf.equal(self._curriculum, self._count)))

                self._op_loss = self._env_loss \
                                + self._reward_loss \
                                + self._transition_loss \
                                + self._momentum_loss \
                                + self._flow_regulation_loss

            self._s0, self._f0, self._fn, self._f_predict = s0, f0, fn, f_predict
            self._mom_decoder_predict, self._action_related_decoder_predict = \
                mom_decoder_predict, action_related_decoder_predict

            self._flows = flows
            self._image_channel = image_channel
        self._update_operation = network.MinimizeLoss(
            self._op_loss,
            var_list=net_transition.variables + net_se.variables +
            net_decoder.variables)
        self.imshow_count = 0
        self.num = 1
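EnvModelUpdater builds one averaged loss per curriculum stage and then selects the stage whose rollout depth equals the fed count, via tf.gather over tf.where(tf.equal(curriculum, count)). A minimal numpy sketch of that curriculum selection (hypothetical per-step losses, not part of the library):

import numpy as np

curriculum = [1, 3, 5]                                   # default rollout depths from the snippet
per_step_loss = np.array([0.8, 0.5, 0.4, 0.35, 0.3])     # hypothetical 1..5-step frame losses

# For each curriculum stage, average the first c per-step losses.
env_loss = np.array([per_step_loss[:c].mean() for c in curriculum])

# At run time, the fed count selects the active stage, mirroring
# tf.gather(env_loss, tf.where(tf.equal(curriculum, count))).
count = 3
active = env_loss[curriculum.index(count)]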
Example #11
 def __init__(self,
              policy_dist,
              old_dist,
              v_function,
              old_v_function,
              target_estimator,
              entropy=1e-1,
              clip_epsilon=0.1,
              value_weight=1.0):
     """
     :param policy_dist:
     :type policy_dist: distribution.NNDistribution
     :param old_dist:
     :type old_dist: distribution.NNDistribution
     :param v_function: Function calculating state value
     :type v_function: network.NetworkFunction
      :param old_v_function: Function calculating old state value
     :type old_v_function: network.NetworkFunction
     :param target_estimator:
     :type target_estimator:
     :param entropy: entropy weight, c2 in paper
     :param value_weight: value function loss weight, c1 in paper
     :param clip_epsilon: clipped value of prob ratio
     """
     super(PPOUpdater, self).__init__()
     self._policy_dist, self._old_dist = policy_dist, old_dist
     self._v_function, self._old_v_function = v_function, old_v_function
     self._target_estimator = target_estimator
     self._entropy = entropy
     with tf.name_scope("PPOUpdater"):
         with tf.name_scope("input"):
             self._input_target_v = tf.placeholder(dtype=tf.float32,
                                                   shape=[None],
                                                   name="input_target_v")
             self._input_action = policy_dist.input_sample()
             self._input_entropy = tf.placeholder(dtype=tf.float32,
                                                  shape=[],
                                                  name="input_entropy")
         op_v = v_function.output().op
         old_op_v = tf.stop_gradient(old_v_function.output().op)
         with tf.name_scope("value"):
             td = self._input_target_v - op_v
             org_v_loss = network.Utils.clipped_square(td)
             clipped_v = old_op_v + tf.clip_by_value(
                 op_v - old_op_v, -clip_epsilon, clip_epsilon)
             clip_v_loss = network.Utils.clipped_square(
                 self._input_target_v - clipped_v)
             self._v_loss = tf.reduce_mean(
                 tf.maximum(org_v_loss, clip_v_loss))
             self._org_v_loss, self._clip_v_loss = org_v_loss, clip_v_loss
         with tf.name_scope("policy"):
             advantage = self._input_target_v - op_v
             self._advantage = advantage
             _mean, _var = tf.nn.moments(advantage, axes=[0])
             self._std_advantage = tf.stop_gradient(advantage /
                                                    (tf.sqrt(_var) + 1.0))
             ratio = tf.exp(policy_dist.log_prob() -
                            tf.stop_gradient(old_dist.log_prob()))
             clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip_epsilon,
                                              1.0 + clip_epsilon)
             pi_loss = tf.reduce_mean(
                 tf.minimum(ratio * self._std_advantage,
                            clipped_ratio * self._std_advantage))
             entropy_loss = tf.reduce_mean(self._policy_dist.entropy())
             self._pi_loss = pi_loss
             self._ratio, self._clipped_ratio = ratio, clipped_ratio
         self._op_loss = value_weight * self._v_loss - (
             self._pi_loss + self._input_entropy * entropy_loss)
     self._update_operation = network.MinimizeLoss(
         self._op_loss,
         var_list=self._v_function.variables +
         self._policy_dist._dist_function.variables)
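PPOUpdater combines the clipped value loss and the clipped surrogate policy objective from the PPO paper, with entropy as a bonus. A minimal numpy sketch of the same arithmetic (hypothetical batch values; plain squares stand in for clipped_square):

import numpy as np

clip_epsilon, value_weight, entropy_weight = 0.1, 1.0, 1e-1
target_v = np.random.randn(16)
v, old_v = np.random.randn(16), np.random.randn(16)
log_prob, old_log_prob = np.random.randn(16), np.random.randn(16)
entropy = np.random.rand(16)

# Clipped value loss: pessimistic maximum of the plain and clipped TD errors.
clipped_v = old_v + np.clip(v - old_v, -clip_epsilon, clip_epsilon)
v_loss = np.mean(np.maximum((target_v - v) ** 2, (target_v - clipped_v) ** 2))

# Clipped surrogate: ratio of new to old action probabilities, standardized advantage.
advantage = target_v - v
std_adv = advantage / (np.sqrt(advantage.var()) + 1.0)    # treated as a constant (stop_gradient)
ratio = np.exp(log_prob - old_log_prob)
pi_loss = np.mean(np.minimum(ratio * std_adv,
                             np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * std_adv))

op_loss = value_weight * v_loss - (pi_loss + entropy_weight * np.mean(entropy))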