Code Example #1
    def __init__(self, act_space, lstm, gamma, scope="agent", **kwargs):
        self.act_space = act_space
        self.scope = scope

        self.s_t = kwargs.get("s")
        self.previous_actions = kwargs.get("prev_a")
        self.state_in = kwargs.get("state_in")
        self.slots = tf.cast(kwargs.get("slots"), tf.float32)

        prev_a = tf.one_hot(self.previous_actions,
                            depth=act_space,
                            dtype=tf.float32)

        s_feature, self.state_out = self.feature_net(self.s_t, lstm, prev_a,
                                                     self.state_in,
                                                     scope + "_feature")

        self.current_act_logits = self.a_net(s_feature, scope + "_a")
        self.current_act = tf.squeeze(categorical(self.current_act_logits),
                                      axis=-1)

        self.vf = self.v_net(s_feature, scope + "_value") * self.slots

        self.bootstrap_s = kwargs.get("bootstrap_s")
        if self.bootstrap_s is not None:
            self.bootstrap_slots = tf.cast(kwargs.get("bootstrap_slots"),
                                           tf.float32)
            self.r_t = kwargs.get("r")
            self.old_vf = kwargs.get("v_cur")
            self.old_act_logits = kwargs.get("a_logits")
            self.a_t = kwargs.get("a")
            a_onehot = tf.one_hot(self.a_t, depth=act_space, dtype=tf.float32)

            # Run the feature net one extra step on the bootstrap observation,
            # starting from the final recurrent state, to obtain the bootstrap value.
            bootstrap_feature, _ = self.feature_net(
                self.bootstrap_s[:, None, :, :, :], lstm,
                a_onehot[:, -2:-1, :], self.state_out, scope + "_feature")

            bootstrap_feature = bootstrap_feature[:, -1, :]
            bootstrap_value = self.v_net(
                bootstrap_feature, scope + "_value") * self.bootstrap_slots

            vtrace = vtrace_from_logits(
                self.old_act_logits, self.current_act_logits, self.a_t,
                gamma * tf.ones_like(self.a_t, tf.float32), self.r_t, self.vf,
                bootstrap_value)

            self.vs = vtrace.vs
            self.adv = vtrace.advantages
            self.pg_adv = vtrace.pg_advantages
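
Example #1 stops right after computing the V-trace outputs; the excerpt never shows how `vs` and `pg_adv` are consumed. Below is a minimal, hypothetical sketch of IMPALA-style losses built from those outputs. The function name `vtrace_losses` and the coefficient defaults are assumptions for illustration, not part of hybug/test_ppo.

import tensorflow as tf

def vtrace_losses(act_logits, actions, pg_adv, vs, vf,
                  vf_coef=0.5, ent_coef=0.01):
    # Policy gradient weighted by the V-trace advantages (held fixed).
    log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=act_logits)
    policy_loss = -tf.reduce_mean(log_prob * tf.stop_gradient(pg_adv))
    # Regress the value head toward the V-trace targets vs.
    value_loss = 0.5 * tf.reduce_mean(tf.square(tf.stop_gradient(vs) - vf))
    # Entropy bonus to keep the policy stochastic.
    probs = tf.nn.softmax(act_logits)
    entropy = -tf.reduce_mean(
        tf.reduce_sum(probs * tf.nn.log_softmax(act_logits), axis=-1))
    return policy_loss + vf_coef * value_loss - ent_coef * entropy
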
Code Example #2
File: policy_graph.py  Project: hybug/test_ppo
        def __init__(self,
                     act_space,
                     rnn,
                     use_rmc,
                     use_hrnn,
                     use_reward_prediction,
                     after_rnn,
                     use_pixel_control,
                     use_pixel_reconstruction,
                     scope="agent",
                     **kwargs):
            self.act_space = act_space
            self.scope = scope
            self.use_rmc = use_rmc
            self.use_hrnn = use_hrnn

            self.s_t = kwargs.get("s")
            self.previous_actions = kwargs.get("prev_a")
            self.prev_r = kwargs.get("prev_r")
            self.state_in = kwargs.get("state_in")

            prev_a = tf.one_hot(self.previous_actions,
                                depth=act_space,
                                dtype=tf.float32)

            self.feature, self.cnn_feature, self.image_feature, self.state_out = self.feature_net(
                self.s_t, rnn, prev_a, self.prev_r, self.state_in,
                scope + "_current_feature")

            if self.use_hrnn:
                self.p_zs = self.feature["p_zs"]
                self.p_mus = self.feature["p_mus"]
                self.p_sigmas = self.feature["p_sigmas"]
                self.q_mus = self.feature["q_mus"]
                self.q_sigmas = self.feature["q_sigmas"]
                self.feature = self.feature["q_zs"]

            self.current_act_logits = self.a_net(self.feature,
                                                 scope + "_acurrent")
            self.current_act = tf.squeeze(categorical(self.current_act_logits),
                                          axis=-1)

            self.current_value = self.v_net(self.feature, scope + "_ccurrent")

            advantage = kwargs.get("adv", None)
            if advantage is not None:
                self.old_current_value = kwargs.get("v_cur")
                self.ret = advantage + self.old_current_value

                self.a_t = kwargs.get("a")
                self.behavior_logits = kwargs.get("a_logits")
                self.r_t = kwargs.get("r")

                self.adv_mean = tf.reduce_mean(advantage, axis=[0, 1])
                advantage -= self.adv_mean
                self.adv_std = tf.math.sqrt(
                    tf.reduce_mean(advantage**2, axis=[0, 1]))
                self.advantage = advantage / tf.maximum(self.adv_std, 1e-12)

                self.slots = tf.cast(kwargs.get("slots"), tf.float32)

                if use_reward_prediction:
                    if after_rnn:
                        self.reward_prediction = self.r_net(
                            self.feature, "r_net")
                    else:
                        self.reward_prediction = self.r_net(
                            self.cnn_feature, "r_net")

                if use_pixel_reconstruction:
                    self.pixel_reconstruction = self.reconstruct_net(
                        self.feature)

                if use_pixel_control:
                    self.pixel_control = self.control_net(self.feature)
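
Every example here samples actions with a `categorical` helper that is not shown in these excerpts. The call pattern `tf.squeeze(categorical(logits), axis=-1)` suggests it returns one sampled action id per time step with a trailing dimension of 1. A plausible implementation is sketched below; this is an assumption, and the actual helper in hybug/test_ppo may differ.

import tensorflow as tf

def categorical(logits):
    # logits: [batch, time, act_space] -> sampled action ids [batch, time, 1]
    shape = tf.shape(logits)
    flat = tf.reshape(logits, [-1, shape[-1]])
    samples = tf.random.categorical(flat, num_samples=1)   # [batch*time, 1]
    return tf.reshape(samples, [shape[0], shape[1], 1])
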
Code Example #3
File: policy_graph.py  Project: hybug/test_ppo
        def __init__(self,
                     act_space,
                     gamma,
                     n_step,
                     rnn,
                     use_hrnn,
                     use_rmc,
                     use_amc,
                     use_beta,
                     use_reward_prediction,
                     after_rnn,
                     use_pixel_control,
                     is_training=False,
                     **kwargs):
            self.act_space = act_space
            self.n_step = n_step
            self.use_hrnn = use_hrnn
            self.use_rmc = use_rmc
            self.use_amc = use_amc

            self.s = kwargs.get("s")
            self.a = kwargs.get("a")
            self.r = kwargs.get("r")
            self.state_in = kwargs.get("state_in")

            feature, self.state_out = self.feature_net(self.s, rnn, self.a,
                                                       self.r, self.state_in)

            if self.use_hrnn:
                self.p_zs = feature["p_zs"]
                self.p_mus = feature["p_mus"]
                self.p_sigmas = feature["p_sigmas"]
                self.q_mus = feature["q_mus"]
                self.q_sigmas = feature["q_sigmas"]
                feature = feature["q_zs"]

            with tf.variable_scope("alpha", reuse=tf.AUTO_REUSE):
                alpha = tf.get_variable(name="alpha",
                                        shape=(1, 1, 1),
                                        dtype=tf.float32,
                                        initializer=tf.zeros_initializer())
            tf.summary.scalar("alpha", tf.reduce_mean(alpha))
            alpha = tf.log(1.0 + tf.exp(alpha))

            self.qf, self.current_value, self.current_act_logits = self.q_fn(
                feature, alpha, use_beta, "q")

            self.current_act = tf.squeeze(categorical(self.current_act_logits),
                                          axis=-1)

            if is_training:
                self.mask = tf.cast(kwargs.get("mask"), tf.float32)
                self.behavior_logits = kwargs.get("a_logits")
                self.old_vf = kwargs.get("v_cur")
                self.current_value = self.current_value * self.mask
                # get qa & qa1 & n_step_rewards
                self.qa = tf.reduce_sum(
                    tf.one_hot(self.a[:, 1:1 - self.n_step],
                               depth=self.act_space,
                               dtype=tf.float32) * self.qf[:, :-n_step],
                    axis=-1) * self.mask[:, :-n_step]

                self.qf1, _, _ = self.q_fn(feature, alpha, use_beta,
                                           "q_target")
                q1f = self.qf[:, n_step:, :]
                q1f1 = self.qf1[:, n_step:, :]
                self.qa1 = doubleQ(q1f1, q1f) * self.mask[:, n_step:]
                # self.q1f = self.qf[:, n_step:, :]
                # self.qa1 = tf.reduce_max(self.q1f, axis=-1) * self.mask[:, n_step:]

                # N-step discounted rewards via a cumulative-sum trick:
                # returns[t] = sum_{k<=t} gamma^k * r[k], so
                # (returns[t+n] - returns[t]) * gamma^{-t}
                #     = sum_{j=1}^{n} gamma^j * r[t+j].
                gammas = tf.pow(
                    gamma, tf.range(0, get_shape(self.r)[1], dtype=tf.float32))
                gammas_1 = 1.0 / gammas
                returns = tf.cumsum(self.r * gammas[None, :], axis=1)
                discount_n_step_rewards = (returns[:, n_step:]
                                           - returns[:, :-n_step])
                self.n_step_rewards = (discount_n_step_rewards
                                       * gammas_1[None, :-n_step])

                self.n_step_qs = tf.stop_gradient(self.n_step_rewards +
                                                  gamma**n_step * self.qa1)

                # target_values = tf.reduce_sum(
                #     tf.one_hot(
                #         self.a[:, 1: 1 - self.n_step],
                #         depth=self.act_space, dtype=tf.float32
                #     ) * self.qf1[:, :-n_step], axis=-1) * self.mask[:, :-n_step]

                retrace = retrace_from_logits(
                    self.behavior_logits[:, :-n_step, :],
                    self.current_act_logits[:, :-n_step, :],
                    self.a[:, 1:1 - n_step],
                    gamma * tf.ones_like(self.a[:, 1:1 - n_step], tf.float32),
                    tf.ones_like(self.a[:, 1:1 - n_step], tf.float32),
                    self.r[:, 1:1 - n_step],
                    self.qa,
                    self.qa,
                    self.qa1[:, -n_step])

                self.retrace_qs = retrace.qs
                # get vtrace
                vtrace = vtrace_from_logits(
                    self.behavior_logits[:, :-n_step, :],
                    self.current_act_logits[:, :-n_step, :],
                    self.a[:, 1:1 - n_step],
                    gamma * tf.ones_like(self.a[:, 1:1 - n_step], tf.float32),
                    self.r[:, 1:1 - n_step], self.current_value[:, :-n_step],
                    self.current_value[:, -n_step])

                self.vs = vtrace.vs
                self.adv = vtrace.advantages
                self.pg_adv = vtrace.pg_advantages

                self.adv_mean = tf.reduce_mean(self.adv)
                advantages = self.adv - self.adv_mean
                self.adv_std = tf.math.sqrt(tf.reduce_mean(advantages**2))

                if use_reward_prediction:
                    if after_rnn:
                        self.reward_prediction = self.r_net(
                            feature[:, :-n_step, :])
                    else:
                        raise ValueError("only after rnn")

                if use_pixel_control:
                    self.pixel_control = self.control_net(
                        feature[:, :-n_step, :])
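
`doubleQ(q1f1, q1f)` is called with the target-network Q first and the online-network Q second, but the helper is defined elsewhere in the project. The sketch below is consistent with standard double Q-learning (evaluate the target network at the online network's greedy action); it is an assumption about the helper's behaviour, not its actual source.

import tensorflow as tf

def doubleQ(q_target, q_online):
    # q_target, q_online: [batch, time, act_space]
    greedy_a = tf.argmax(q_online, axis=-1)                 # [batch, time]
    onehot = tf.one_hot(greedy_a, depth=tf.shape(q_online)[-1],
                        dtype=tf.float32)
    # Target-network Q evaluated at the online network's argmax action.
    return tf.reduce_sum(q_target * onehot, axis=-1)        # [batch, time]
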
Code Example #4
        def __init__(self,
                     act_space,
                     gamma,
                     n_step,
                     use_soft,
                     rnn,
                     use_hrnn,
                     use_reward_prediction,
                     after_rnn,
                     use_pixel_control,
                     is_training=False,
                     **kwargs):
            self.act_space = act_space
            self.n_step = n_step
            self.use_hrnn = use_hrnn

            self.s = kwargs.get("s")
            self.a = kwargs.get("a")
            self.r = kwargs.get("r")
            self.state_in = kwargs.get("state_in")

            feature, self.state_out = self.feature_net(
                self.s, rnn, self.a, self.r, self.state_in)

            if self.use_hrnn:
                self.p_zs = feature["p_zs"]
                self.p_mus = feature["p_mus"]
                self.p_sigmas = feature["p_sigmas"]
                self.q_mus = feature["q_mus"]
                self.q_sigmas = feature["q_sigmas"]
                feature = feature["q_zs"]

            self.qf = self.q_fn(feature, "q")

            if use_soft:
                with tf.variable_scope("temperature", reuse=tf.AUTO_REUSE):
                    temperature = tf.get_variable(
                        name="temperature",
                        shape=(1, 1, 1),
                        dtype=tf.float32,
                        initializer=tf.ones_initializer())
                temperature = tf.log(1.0 + tf.exp(temperature))
                self.qf_logits = temperature * self.qf
                self.current_act = tf.squeeze(
                    categorical(self.qf_logits), axis=-1)
            else:
                self.current_act = tf.argmax(self.qf, axis=-1)

            if is_training:
                self.mask = tf.cast(kwargs.get("mask"), tf.float32)

                self.qa = tf.reduce_sum(
                    tf.one_hot(
                        self.a[:, 1: 1 - self.n_step],
                        depth=self.act_space, dtype=tf.float32
                    ) * self.qf[:, :-self.n_step], axis=-1) * self.mask[:, :-n_step]

                feature1 = feature[:, n_step:, :]

                self.q1f1 = self.q_fn(feature1, "q_target")

                self.q1f = self.q_fn(feature1, "q")

                self.qa1 = doubleQ(self.q1f1, self.q1f) * self.mask[:, n_step:]

                gammas = tf.pow(
                    gamma, tf.range(0, get_shape(self.r)[1], dtype=tf.float32))
                gammas_1 = 1.0 / gammas

                returns = tf.cumsum(self.r * gammas[None, :], axis=1)
                discount_n_step_rewards = returns[:, n_step:] - returns[:, :-n_step]

                self.n_step_rewards = discount_n_step_rewards * gammas_1[None, :-n_step]

                if use_reward_prediction:
                    if after_rnn:
                        self.reward_prediction = self.r_net(feature[:, :-n_step, :])
                    else:
                        raise ValueError("only after rnn")

                if use_pixel_control:
                    self.pixel_control = self.control_net(feature[:, :-n_step, :])
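
Example #4 computes `qa`, `qa1`, and `n_step_rewards` but the excerpt ends before they are combined. The sketch below shows the n-step double-Q target they would typically feed, mirroring the `n_step_qs` line in Example #3; the squared TD loss and the function name are assumptions added here for illustration.

import tensorflow as tf

def n_step_q_loss(qa, qa1, n_step_rewards, gamma, n_step):
    # n-step target bootstrapped with the double-Q estimate, as in
    # Example #3: n_step_qs = stop_gradient(n_step_rewards + gamma**n * qa1).
    target = tf.stop_gradient(n_step_rewards + gamma ** n_step * qa1)
    # Plain squared TD error; the real training graph may weight or clip it.
    return 0.5 * tf.reduce_mean(tf.square(target - qa))
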
Code Example #5
    def __init__(self,
                 act_space,
                 lstm,
                 gamma,
                 use_double,
                 scope="agent",
                 **kwargs):
        self.act_space = act_space
        self.scope = scope

        self.s = kwargs.get("s")
        self.prev_a = kwargs.get("prev_a")
        self.state_in = kwargs.get("state_in")
        self.slots = tf.cast(kwargs.get("slots"), tf.float32)

        feature, self.state_out = self.feature_net(self.s, lstm, self.prev_a,
                                                   self.state_in)

        self.qf, self.vf, self.act_logits = self.head_fn(
            feature, self.slots, "current")

        self.act = tf.squeeze(categorical(self.act_logits), axis=-1)

        self.bootstrap_s = kwargs.get("bootstrap_s")
        if self.bootstrap_s is not None:
            self.bootstrap_prev_a = kwargs.get("bootstrap_prev_a")
            self.bootstrap_slots = tf.cast(kwargs.get("bootstrap_slots"),
                                           tf.float32)
            self.a = kwargs.get("a")
            self.r = kwargs.get("r")

            self.old_act_logits = kwargs.get("a_logits")
            self.n_step_r = kwargs.get("n_step_r")
            self.v_cur = kwargs.get("v_cur")
            self.advantage = kwargs.get("adv")
            self.v_tar = kwargs.get("v_tar")

            self.qa = tf.reduce_sum(
                tf.one_hot(self.a, depth=self.act_space, dtype=tf.float32) *
                self.qf,
                axis=-1)

            bootstrap_feature, _ = self.feature_net(self.bootstrap_s, lstm,
                                                    self.bootstrap_prev_a,
                                                    self.state_out)

            n_step = get_shape(bootstrap_feature)[1]

            feature1 = tf.concat([feature[:, n_step:, :], bootstrap_feature],
                                 axis=1)
            slots1 = tf.concat([self.slots[:, n_step:], self.bootstrap_slots],
                               axis=1)

            self.q1f, self.v1f, _ = self.head_fn(feature1, slots1, "current")

            if use_double:
                self.q1f1, self.v1f1, _ = self.head_fn(feature1, slots1,
                                                       "target")
                self.qa1 = doubleQ(self.q1f1, self.q1f)
            else:
                self.qa1 = tf.reduce_max(self.q1f, axis=-1)

            # V-trace is computed on h_inv-transformed values; the resulting
            # targets are mapped back through h below.
            vtrace = vtrace_from_logits(
                self.old_act_logits, self.act_logits, self.a,
                gamma * tf.ones_like(self.a, tf.float32), self.r,
                h_inv(self.vf), h_inv(self.v1f[:, 0]))

            self.vtrace_advantage = vtrace.advantages
            self.vtrace_vf = h(vtrace.vs)
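
Example #5 runs V-trace on `h_inv(self.vf)` and maps the targets back with `h(vtrace.vs)`, but `h` and `h_inv` are defined outside this excerpt. The sketch below assumes they follow the invertible value rescaling from Pohlen et al. (2018), also used in R2D2; the repository's own definitions may differ.

import tensorflow as tf

EPS = 1e-2  # assumed rescaling coefficient

def h(x):
    # Squashes large returns: sign(x) * (sqrt(|x| + 1) - 1) + eps * x
    return tf.sign(x) * (tf.sqrt(tf.abs(x) + 1.0) - 1.0) + EPS * x

def h_inv(x):
    # Exact inverse of h above.
    return tf.sign(x) * (tf.square(
        (tf.sqrt(1.0 + 4.0 * EPS * (tf.abs(x) + 1.0 + EPS)) - 1.0)
        / (2.0 * EPS)) - 1.0)
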