Code example #1
File: encoder.py Project: jiameij/vae-for-IL
    def _init(self, obs_space, batch_size, time_steps, LSTM_size, laten_size, gaussian_fixed_var=True): ## check later whether var actually gets updated
        self.pdtype = pdtype = make_pdtype(laten_size)
        obs = U.get_placeholder("en_ob", dtype=tf.float32, shape = [batch_size, time_steps, obs_space.shape[0]])
        # normalization
        with tf.variable_scope("obfilter"): ## check whether this actually takes effect; I think that's what it's here for
            self.obs_rms = RunningMeanStd(shape=obs_space.shape)

        obz = tf.clip_by_value((obs - self.obs_rms.mean) / self.obs_rms.std, -5.0, 5.0)

        lstm_fw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
        lstm_bw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
        outputs, output_state = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, obz, dtype=tf.float32)
        outputs_average = tf.reduce_mean(outputs[0], axis=1)
        if gaussian_fixed_var and isinstance(laten_size, int):
            self.mean = U.dense(outputs_average, pdtype.param_shape()[0] // 2, "dblstmfin", U.normc_initializer(1.0))
            self.logstd = U.dense(outputs_average,  pdtype.param_shape()[0] // 2, "dblstm_logstd", U.normc_initializer(1.0))
            # self.logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
            #                          initializer=tf.constant_initializer(0.1)) ##这个地方是不是也是有问题的
            pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)

        else:
            pdparam = U.dense(outputs_average, pdtype.param_shape()[0], "dblstmfin", U.normc_initializer(0.1))

        self.pd = pdtype.pdfromflat(pdparam)
        self._encode = U.function([obs], self.pd.sample())
        self._get_mean = U.function([obs], self.mean)
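The pdparam concatenation above packs the Gaussian parameters into one flat vector that pdtype.pdfromflat unpacks. A minimal NumPy sketch of those semantics, assuming make_pdtype(laten_size) yields a baselines-style DiagGaussianPdType whose flat parameters are [mean, logstd] joined on the last axis:

import numpy as np

def param_shape(size):
    # one mean and one logstd entry per latent dimension
    return (2 * size,)

def sample_from_flat(pdparam, rng):
    # split the flat parameters back into mean and logstd, then reparameterize
    mean, logstd = np.split(pdparam, 2, axis=-1)
    return mean + np.exp(logstd) * rng.standard_normal(mean.shape)

laten_size = 4
mean = np.zeros((1, laten_size))
logstd = np.full((1, laten_size), -1.0)
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
print(sample_from_flat(pdparam, np.random.default_rng()).shape)  # (1, 4)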
Code example #2
File: policies.py Project: mjbigdel/tf2_grf
    def _build_actor_head(self):
        pdtypes = []
        input_shape = self.policy_network.output_shape
        for a in self.agent_ids:
            pdtypes.append(
                make_pdtype(input_shape, self.ac_space, init_scale=0.01))
        return pdtypes
Code example #3
    def _init(self,
              obs_space,
              ac_space,
              embedding_shape,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        self.pdtype = pdtype = make_pdtype(ac_space.shape[0])
        batch_size = None

        ob = U.get_placeholder(name="ac_de_ob",
                               dtype=tf.float32,
                               shape=[batch_size, obs_space.shape[0]])
        embedding = U.get_placeholder(
            name="ac_de_embedding",
            dtype=tf.float32,
            shape=[batch_size, embedding_shape
                   ])  ## I think this is one embedding value expanded to sequence_len; skip it for now and handle it when we actually get there
        # normalize
        last_out = U.concatenate([ob, embedding], axis=1)
        with tf.variable_scope("ac_de_filter"):
            self.ac_rms = RunningMeanStd(shape=obs_space.shape[0] +
                                         embedding_shape)

        last_out = tf.clip_by_value(
            (last_out - self.ac_rms.mean) / self.ac_rms.std, -5.0, 5.0)

        for i in range(num_hid_layers):
            last_out = tf.nn.relu(
                U.dense(last_out,
                        hid_size[i],
                        "ac_de%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space.shape[0], int):
            self.mean = U.dense(last_out,
                                pdtype.param_shape()[0] // 2, "ac_de_final",
                                U.normc_initializer(1.0))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([self.mean, self.mean * 0.0 + logstd],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "ac_de_final",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob, embedding], ac)
        self._get_pol_mean = U.function([ob, embedding], self.mean)
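RunningMeanStd itself is not shown in these excerpts; each _init normalizes its input by running statistics and clips to [-5, 5]. A minimal NumPy sketch, assuming it follows the baselines parallel-moments update:

import numpy as np

class RunningMeanStd:
    def __init__(self, shape):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4  # avoids division by zero before the first update

    def update(self, x):
        batch_mean, batch_var, n = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + n
        self.mean = self.mean + delta * n / total
        # combine the two variances (parallel algorithm for moments)
        m_a = self.var * self.count
        m_b = batch_var * n
        self.var = (m_a + m_b + np.square(delta) * self.count * n / total) / total
        self.count = total

    @property
    def std(self):
        return np.sqrt(self.var)

rms = RunningMeanStd(shape=(3,))
rms.update(np.random.randn(32, 3) * 2.0 + 1.0)
obz = np.clip((np.random.randn(5, 3) - rms.mean) / rms.std, -5.0, 5.0)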
Code example #4
File: maddpg.py Project: david-spc/Flight_maddpg
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  #+ 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
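make_update_exp is not shown above; in the MADDPG reference code it builds a soft (Polyak) update that slowly tracks the online network. A sketch of that idea, assuming polyak = 1 - 1e-2 and returning the grouped assign op directly rather than wrapping it in U.function:

import tensorflow as tf

def make_update_exp(vals, target_vals, polyak=1.0 - 1e-2):
    expression = []
    # pair variables by name so q_func/w matches target_q_func/w, etc.
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        expression.append(
            var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    return tf.group(*expression)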
Code example #5
    def _init(self,
              obs_space,
              embedding_shape,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        self.pdtype = pdtype = make_pdtype(obs_space.shape[0])
        batch_size = None

        ob_input = U.get_placeholder(name="ob",
                                     dtype=tf.float32,
                                     shape=[batch_size, obs_space.shape[0]])
        embedding = U.get_placeholder(
            name="embedding",
            dtype=tf.float32,
            shape=[
                batch_size, embedding_shape
            ])  ## I think this is one embedding value expanded to sequence_len; skip it for now and handle it when we actually get there

        last_out = U.concatenate(
            [ob_input, embedding],
            axis=1)  ## policy only here, no value function; also need to check whether this concatenate is right
        # normalization
        with tf.variable_scope("state_de_filter"):
            self.state_rms = RunningMeanStd(shape=obs_space.shape[0] +
                                            embedding_shape)

        input_z = tf.clip_by_value(
            (last_out - self.state_rms.mean) / self.state_rms.std, -5.0, 5.0)

        for i in range(num_hid_layers):
            input_z = tf.nn.tanh(
                U.dense(input_z,
                        hid_size[i],
                        "state_de%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(obs_space.shape[0], int):
            self.mean = U.dense(input_z,
                                pdtype.param_shape()[0] // 2, "state_de_final",
                                U.normc_initializer(0.01))
            self.logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd],
                                    axis=1)
        else:
            # note: the original fed last_out (the raw concat) here, which
            # bypasses the hidden layers and normalization; input_z looks intended
            pdparam = U.dense(input_z,
                              pdtype.param_shape()[0], "state_de_final",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        self._act = U.function([ob_input, embedding], self.pd.sample())
        self.get_mean = U.function([ob_input, embedding], self.mean)
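One detail worth spelling out: logstd above is a [1, k] variable while self.mean has a leading batch dimension, so the expression self.mean * 0.0 + self.logstd exists only to broadcast the single learned logstd row across the batch before concatenation. A quick NumPy illustration:

import numpy as np

mean = np.random.randn(4, 2)   # [batch, k]
logstd = np.zeros((1, 2))      # [1, k], one learned row
tiled = mean * 0.0 + logstd    # [batch, k]: logstd repeated for every row
pdparam = np.concatenate([mean, tiled], axis=1)
print(pdparam.shape)           # (4, 4)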
Code example #6
    def __init__(self,
                 env,
                 observations,
                 latent,
                 estimate_q=False,
                 vf_latent=None,
                 sess=None,
                 **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        # Based on the action space, will select what probability distribution type
        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.get_default_session()

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:, 0]

        # Lyapunov critic head (the original reused scope 'vf' here, which
        # would collide with the value head created above; give it its own scope)
        self.Lyapunov = fc(vf_latent, 'lyapunov', 1)
        self.Lyapunov = self.Lyapunov[:, 0]
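self.neglogp above is the negative log-density of the sampled action under the policy distribution. For the diagonal Gaussian case it has the closed form below (a NumPy sketch, assuming baselines' DiagGaussianPd semantics):

import numpy as np

def neglogp(x, mean, logstd):
    std = np.exp(logstd)
    return (0.5 * np.sum(np.square((x - mean) / std), axis=-1)
            + 0.5 * np.log(2.0 * np.pi) * x.shape[-1]
            + np.sum(logstd, axis=-1))

mean, logstd = np.zeros((1, 3)), np.zeros((1, 3))
print(neglogp(np.zeros((1, 3)), mean, logstd))  # densest point of N(0, I)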
Code example #7
File: policies.py Project: mjbigdel/tf2_grf
    def __init__(self,
                 ac_space,
                 policy_network,
                 value_network=None,
                 estimate_q=False):
        """
        Parameters:
        ----------
        ac_space        action space

        policy_network  keras network for policy

        value_network   keras network for value

        estimate_q      q value or v value

        """

        self.policy_network = policy_network
        self.value_network = value_network or policy_network
        self.estimate_q = estimate_q
        self.initial_state = None
        # Based on the action space, will select what probability distribution type
        self.pdtype = make_pdtype(self.policy_network.output_shape,
                                  ac_space,
                                  init_scale=0.01)

        if estimate_q:
            self.value_fc = fc_build(self.value_network.output_shape, 'q',
                                     ac_space.n)
        else:
            self.value_fc = fc_build(self.value_network.output_shape, 'vf', 1)

        # # to get just dense size and avoid batch size
        # print(f'self.value_network.output_shape for agent_0 {self.value_network.output_shape[-1]}')
        # value_model_inputes = tf.keras.layers.Input(self.value_network.output_shape[-1])  # agent 0 output
        #
        # if estimate_q:
        #     # value_fc = fc(scope='q', nh=ac_space.n)(policy_network.output)
        #     value_fc = tf.keras.layers.Dense(units=ac_space.n, kernel_initializer=ortho_init(init_scale),
        #                                      bias_initializer=tf.keras.initializers.Constant(init_bias),
        #                                      name=f'q')(value_model_inputes)
        # else:
        #     # value_fc = fc(scope='vf', nh=1)(policy_network.output)
        #     value_fc = tf.keras.layers.Dense(units=1, kernel_initializer=ortho_init(init_scale),
        #                                      bias_initializer=tf.keras.initializers.Constant(init_bias),
        #                                      name=f'vf')(value_model_inputes)

        # self.value_model = tf.keras.Model(inputs=value_model_inputes, outputs=value_fc, name='Value_Network')
        # self.value_model.summary()
        # tf.keras.utils.plot_model(self.value_model, to_file='./value_model.png')

        self.value_network.summary()
        self.policy_network.summary()
        tf.keras.utils.plot_model(self.policy_network,
                                  to_file='./policy_network.png')
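fc_build is project-specific, but judging from the commented-out block above it wraps a Dense layer with an orthogonal kernel initializer. A minimal sketch under that assumption (the init_scale and init_bias defaults are illustrative):

import tensorflow as tf

def fc_build(input_shape, name, units, init_scale=1.0, init_bias=0.0):
    # input_shape is a keras output_shape tuple; drop the batch dimension
    inputs = tf.keras.layers.Input(shape=(input_shape[-1],))
    outputs = tf.keras.layers.Dense(
        units=units,
        kernel_initializer=tf.keras.initializers.Orthogonal(gain=init_scale),
        bias_initializer=tf.keras.initializers.Constant(init_bias),
        name=name)(inputs)
    return tf.keras.Model(inputs=inputs, outputs=outputs, name=name)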
Code example #8
File: matd3.py Project: yyywanng/MATD3implementation
def p_train(make_obs_ph_n,
            act_space_n,
            agent_idx,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """

    :param make_obs_ph_n:
    :param act_space_n:
    :param agent_idx:
    :param p_func: in base maddpg code = mlp_model
    :param q_func: in base maddpg code = mlp_model
    :param optimizer:
    :param grad_norm_clipping:
    :param local_q_func:
    :param num_units:
    :param scope:
    :param reuse:
    :return:
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = [tf.layers.flatten(obs_ph) for obs_ph in make_obs_ph_n]
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[agent_idx]

        p = p_func(p_input,
                   int(act_pdtype_n[agent_idx].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[agent_idx].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[agent_idx] = act_pd.sample()  #act_pd.mode() #
        q_input = tf.concat(obs_ph_n + act_input_n, 1)

        q = q_func(q_input,
                   1,
                   scope="q_func" + str(1),
                   reuse=True,
                   num_units=num_units)[:, 0]

        loss = -tf.reduce_mean(q) + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=make_obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[make_obs_ph_n[agent_idx]], outputs=act_sample)
        p_values = U.function([make_obs_ph_n[agent_idx]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[agent_idx].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[agent_idx].pdfromflat(
            target_p).sample()
        target_act = U.function(inputs=[make_obs_ph_n[agent_idx]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
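minimize_and_clip comes from the MADDPG tf_util helpers: it computes the gradients of the loss, clips each one by norm, and returns the apply op. A sketch of that behavior, assuming per-variable tf.clip_by_norm as in the reference implementation:

import tensorflow as tf

def minimize_and_clip(optimizer, objective, var_list, clip_val=0.5):
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    for i, (grad, var) in enumerate(gradients):
        if grad is not None:  # leave variables without gradients alone
            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
    return optimizer.apply_gradients(gradients)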
Code example #9
File: ddpg_model.py Project: hcch0912/Pong
    def __init__(self, input_space, act_space, scope, args):
        self.input_shape = input_space
        self.act_space = act_space
        self.scope = scope
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        self.optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
        self.grad_norm_clipping = 0.5
        with tf.variable_scope(self.scope):
            act_pdtype = make_pdtype(act_space)

            # act_ph = act_pdtype.sample_placeholder([None], name= "action")
            act_ph = tf.placeholder(tf.float32, shape=(None, 1))
            if args.game == "RoboschoolPong-v1":
                obs_ph = tf.placeholder(tf.float32,
                                        shape=(None, input_space.shape[0]))
            elif args.game == "Pong-2p-v0":
                obs_ph = tf.placeholder(tf.float32,
                                        shape=(None, input_space.shape[0],
                                               input_space.shape[1],
                                               input_space.shape[2]))
            q_target = tf.placeholder(tf.float32, shape=(None, ))

            #build the world representation z
            z = conv_model(obs_ph, 20, scope="world_model")
            p_input = z

            p = mlp_model(p_input, 2, scope="p_func")
            p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

            act_pd = act_pdtype.pdfromflat(p)
            act_sample = act_pd.sample()

            p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

            q_input = tf.concat([z, act_sample], -1)
            q = mlp_model(q_input, 1, scope="q_func")
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
            pg_loss = -tf.reduce_mean(q)

            q_loss = tf.reduce_mean(tf.square(q - q_target))
            # q_reg = tf.reduce_mean(tf.square(q))
            q_optimize_expr = U.minimize_and_clip(self.optimizer, q_loss,
                                                  q_func_vars,
                                                  self.grad_norm_clipping)

            p_loss = pg_loss + p_reg * 1e-3

            p_optimize_expr = U.minimize_and_clip(self.optimizer, p_loss,
                                                  p_func_vars,
                                                  self.grad_norm_clipping)

            p_values = U.function([obs_ph], p)

            target_p = mlp_model(z, 2, scope="target_p_func")
            target_p_func_vars = U.scope_vars(
                U.absolute_scope_name("target_p_func"))

            target_q = mlp_model(q_input, 1, scope="target_q_func")
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func"))
            target_act_sample = act_pdtype.pdfromflat(target_p).sample()

            self.update_target_p = make_update_exp(p_func_vars,
                                                   target_p_func_vars)
            self.update_target_q = make_update_exp(q_func_vars,
                                                   target_q_func_vars)

            self.act = U.function(inputs=[obs_ph], outputs=act_sample)
            self.target_act = U.function(inputs=[obs_ph],
                                         outputs=target_act_sample)
            self.p_train = U.function(inputs=[obs_ph] + [act_ph],
                                      outputs=p_loss,
                                      updates=[p_optimize_expr])
            self.q_train = U.function(inputs=[obs_ph] + [act_ph] + [q_target],
                                      outputs=q_loss,
                                      updates=[q_optimize_expr])
            self.q_values = U.function([obs_ph] + [act_ph], q)
            self.target_q_values = U.function([obs_ph] + [act_ph], target_q)
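ReplayBuffer(1e6) above is the usual DDPG ring buffer. A minimal sketch of the interface this class appears to rely on (add transitions, sample a batch), with eviction of the oldest entries once the buffer is full:

import random

class ReplayBuffer:
    def __init__(self, size):
        self._storage = []
        self._maxsize = int(size)
        self._next_idx = 0

    def add(self, obs, action, reward, obs_next, done):
        data = (obs, action, reward, obs_next, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data  # overwrite the oldest entry
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def sample(self, batch_size):
        return [random.choice(self._storage) for _ in range(batch_size)]

    def __len__(self):
        return len(self._storage)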
Code example #10
File: mlp_policy.py Project: jiameij/vae-for-IL
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              vae_pol_mean,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size[i],
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size[i],
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01)) + vae_pol_mean
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.constant_initializer(0.1))
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        #stochastic = tf.placeholder(dtype=tf.bool, shape=())
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])
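U.switch selects between an exploratory sample and the deterministic mode (the mean, for a Gaussian) based on the stochastic placeholder. A sketch of its semantics, assuming it is a thin wrapper over tf.cond as in baselines' tf_util:

import tensorflow as tf

def switch(condition, then_expression, else_expression):
    # both branches are already-built tensors of the same shape and dtype
    return tf.cond(tf.cast(condition, tf.bool),
                   lambda: then_expression,
                   lambda: else_expression)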
Code example #11
def learn(encoder,
          action_decorder,
          state_decorder,
          embedding_shape,
          *,
          dataset,
          logdir,
          batch_size,
          time_steps,
          epsilon=0.001,
          lr_rate=1e-3):
    lstm_encoder = encoder("lstm_encoder")
    ac_decoder = action_decorder("ac_decoder")
    state_decoder = state_decorder("state_decoder")  # switched to an MLP
    obs = U.get_placeholder_cached(name="obs")  ## for encoder

    ob = U.get_placeholder_cached(name="ob")
    embedding = U.get_placeholder_cached(name="embedding")

    # obss = U.get_placeholder_cached(name="obss")  ## for the action decoder; could the state decoder use this too? should it be renamed obs?
    #   ## the state decoder should be able to use this as well
    # embeddingss = U.get_placeholder_cached(name="embeddingss")
    ac = ac_decoder.pdtype.sample_placeholder([None])
    obs_out = state_decoder.pdtype.sample_placeholder([None])

    # p(z): standard normal. Is this the right prior for the state? Should it be the standard normal of the demonstrations instead? Worth thinking about.
    from common.distributions import make_pdtype

    p_z_pdtype = make_pdtype(embedding_shape)
    p_z_params = U.concatenate([
        tf.zeros(shape=[embedding_shape], name="mean"),
        tf.zeros(shape=[embedding_shape], name="logstd")
    ],
                               axis=-1)
    p_z = p_z_pdtype.pdfromflat(p_z_params)

    recon_loss = -tf.reduce_mean(
        tf.reduce_sum(ac_decoder.pd.logp(ac) + state_decoder.pd.logp(obs_out),
                      axis=0))  ## this still needs revising
    kl_loss = lstm_encoder.pd.kl(p_z)  ## p(z): standard normal; this also looks possibly wrong!!!!
    vae_loss = recon_loss + kl_loss  ### vae_loss should be per batch

    ep_stats = stats(["recon_loss", "kl_loss", "vae_loss"])
    losses = [recon_loss, kl_loss, vae_loss]

    ## var_list
    var_list = []
    en_var_list = lstm_encoder.get_trainable_variables()
    var_list.extend(en_var_list)
    # ac_de_var_list = ac_decoder.get_trainable_variables()
    # var_list.extend(ac_de_var_list)
    state_de_var_list = state_decoder.get_trainable_variables()
    var_list.extend(state_de_var_list)
    # compute_recon_loss = U.function([ob, obs, embedding, obss, embeddingss, ac, obs_out], recon_loss)
    compute_losses = U.function([obs, ob, embedding, ac, obs_out], losses)
    compute_grad = U.function([obs, ob, embedding, ac, obs_out],
                              U.flatgrad(vae_loss,
                                         var_list))  ### haven't thought this through; it may be wrong!!
    adam = MpiAdam(var_list, epsilon=epsilon)

    U.initialize()
    adam.sync()

    writer = U.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    # =========================== TRAINING ===================== #
    iters_so_far = 0
    saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=100)
    saver_encoder = tf.train.Saver(var_list=en_var_list, max_to_keep=100)
    # saver_pol = tf.train.Saver(var_list=ac_de_var_list, max_to_keep=100)  ## keep the policy parameters too, though it seems they're never used

    while True:
        logger.log("********** Iteration %i ************" % iters_so_far)

        recon_loss_buffer = deque(maxlen=100)
        kl_loss_buffer = deque(maxlen=100)
        vae_loss_buffer = deque(maxlen=100)

        for observations in dataset.get_next_batch(batch_size=time_steps):
            observations = observations.transpose((1, 0))
            embedding_now = lstm_encoder.get_laten_vector(observations)
            embeddings = np.array([embedding_now for _ in range(time_steps)])
            embeddings_reshape = embeddings.reshape((time_steps, -1))
            actions = ac_decoder.act(stochastic=True,
                                     ob=observations,
                                     embedding=embeddings_reshape)
            state_outputs = state_decoder.get_outputs(
                observations.reshape(time_steps, -1, 1),
                embeddings)  ## the Gaussian mixture wasn't in yet... hacked it in, and it's done now
            recon_loss, kl_loss, vae_loss = compute_losses(
                observations, observations.reshape(batch_size, time_steps,
                                                   -1), embeddings_reshape,
                observations.reshape(time_steps, -1, 1), embeddings, actions,
                state_outputs)

            g = compute_grad(observations,
                             observations.reshape(batch_size, time_steps,
                                                  -1), embeddings_reshape,
                             observations.reshape(time_steps, -1, 1),
                             embeddings, actions, state_outputs)
            adam.update(g, lr_rate)
            recon_loss_buffer.append(recon_loss)
            kl_loss_buffer.append(kl_loss)
            vae_loss_buffer.append(vae_loss)

        ep_stats.add_all_summary(writer, [
            np.mean(recon_loss_buffer),
            np.mean(kl_loss_buffer),
            np.mean(vae_loss_buffer)
        ], iters_so_far)
        logger.record_tabular("recon_loss", recon_loss)
        logger.record_tabular("kl_loss", kl_loss)
        logger.record_tabular("vae_loss", vae_loss)
        logger.dump_tabular()
        if (iters_so_far % 10 == 0 and iters_so_far != 0):
            save(saver=saver,
                 sess=tf.get_default_session(),
                 logdir=logdir,
                 step=iters_so_far)
            save(saver=saver_encoder,
                 sess=tf.get_default_session(),
                 logdir="./vae_saver",
                 step=iters_so_far)
            # save(saver=saver_pol, sess=tf.get_default_session(), logdir="pol_saver", step=iters_so_far)
        iters_so_far += 1
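lstm_encoder.pd.kl(p_z) above is the KL divergence between the encoder's diagonal Gaussian q(z|x) and the standard-normal prior built from p_z_params. A NumPy sketch of the closed form KL(q || p) for two diagonal Gaussians:

import numpy as np

def kl_diag_gauss(mean_q, logstd_q, mean_p, logstd_p):
    return np.sum(
        logstd_p - logstd_q
        + (np.exp(2.0 * logstd_q) + np.square(mean_q - mean_p))
        / (2.0 * np.exp(2.0 * logstd_p))
        - 0.5,
        axis=-1)

# KL against a standard normal prior (zero mean, zero logstd), as above
mean_q, logstd_q = np.full((1, 8), 0.3), np.full((1, 8), -0.5)
print(kl_diag_gauss(mean_q, logstd_q, np.zeros((1, 8)), np.zeros((1, 8))))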
Code example #12
def learn(env,
          encoder,
          action_decorder,
          state_decorder,
          embedding_shape,
          *,
          dataset,
          optimizer,
          logdir,
          batch_size,
          time_steps,
          adam_epsilon=0.001,
          lr_rate=1e-4,
          vae_beta=8):
    lstm_encoder = encoder("lstm_encoder")
    ac_decoder = action_decorder("ac_decoder")
    state_decoder = state_decorder("state_decoder")  # something is wrong here
    ac_de_ob = U.get_placeholder_cached(name="ac_de_ob")
    en_ob = U.get_placeholder_cached(name="en_ob")  ##for encoder
    state_de_ob = U.get_placeholder_cached(name="state_de_ob")  ## for the action decoder; could the state decoder use this too? should it be renamed obs?
    ac_de_embedding = U.get_placeholder_cached(name="ac_de_embedding")  ## the state decoder should be able to use this as well
    state_de_embedding = U.get_placeholder_cached(name="state_de_embedding")
    # ac = ac_decoder.pdtype.sample_placeholder([None])
    ob_next = tf.placeholder(name="ob_next", shape=[None, ob_shape], dtype=tf.float32)  # ob_shape is assumed to be defined at module scope
    # ob_next_ac = tf.placeholder(name="ob_next_ac", shape=[ob_shape], dtype=tf.float32)
    # obs_out = state_decoder.pdtype.sample_placeholder([None])

    # p(z): standard normal distribution
    from common.distributions import make_pdtype

    p_z_pdtype = make_pdtype(embedding_shape)
    p_z_params = U.concatenate([tf.zeros(shape=[embedding_shape], name="mean"), tf.zeros(shape=[embedding_shape], name="logstd")], axis=-1)
    p_z = p_z_pdtype.pdfromflat(p_z_params)

    # add another reconstruction term to recon_loss, for the action

    recon_loss = -tf.reduce_sum(state_decoder.pd.logp(ob_next))
    # kl_loss = lstm_encoder.pd.kl(p_z)[0]  ## p(z): standard normal; this also looks possibly wrong!!!!
    # kl_loss = tf.maximum(lstm_encoder.pd.kl(p_z)[0], tf.constant(5.00))  ## p(z): standard normal; this also looks possibly wrong!!!!
    kl_loss = lstm_encoder.pd.kl(p_z)[0]
    vae_loss = tf.reduce_mean(recon_loss + vae_beta * kl_loss)  ### vae_loss should be per batch

    ep_stats = stats(["recon_loss", "kl_loss", "vae_loss"])
    losses = [recon_loss, kl_loss, vae_loss]
    # train the action with an MSE loss: step the predicted action to get x(t+1), then apply MSE; cross-entropy might also be worth trying


    ## var_list
    var_list = []
    en_var_list = lstm_encoder.get_trainable_variables()
    var_list.extend(en_var_list)
    # ac_de_var_list = ac_decoder.get_trainable_variables()
    # var_list.extend(ac_de_var_list)
    state_de_var_list = state_decoder.get_trainable_variables()
    var_list.extend(state_de_var_list)
    # compute_recon_loss = U.function([ob, obs, embedding, obss, embeddingss, ac, obs_out], recon_loss)
    compute_losses = U.function([en_ob, ac_de_ob, state_de_ob, ac_de_embedding, state_de_embedding, ob_next], losses)
    compute_grad = U.function([en_ob, ac_de_ob, state_de_ob, ac_de_embedding, state_de_embedding, ob_next], U.flatgrad(vae_loss, var_list))  ### haven't thought this through; it may be wrong!!
    adam = MpiAdam(var_list, epsilon=adam_epsilon)


    U.initialize()
    adam.sync()

    writer = U.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    # =========================== TRAINING ===================== #
    iters_so_far = 0
    saver = tf.train.Saver(var_list=var_list, max_to_keep=100)
    saver_encoder = tf.train.Saver(var_list = en_var_list, max_to_keep=100)
    # saver_pol = tf.train.Saver(var_list=ac_de_var_list, max_to_keep=100)  ## keep the policy parameters too, though it seems they're never used

    while iters_so_far < 50:
        ## run multiple epochs
        logger.log("********** Iteration %i ************" % iters_so_far)
        ## should batch_size be adjusted each epoch?
        recon_loss_buffer = deque(maxlen=100)
        # recon_loss2_buffer = deque(maxlen=100)
        kl_loss_buffer = deque(maxlen=100)
        vae_loss_buffer = deque(maxlen=100)
        # i = 0
        for obs_and_next in dataset.get_next_batch(batch_size=time_steps):
            # print(i)
            # i += 1
            observations = obs_and_next[0].transpose((1, 0))[:-1]
            ob_next = obs_and_next[0].transpose(1, 0)[state_decoder.receptive_field:, :]
            embedding_now = lstm_encoder.get_laten_vector(obs_and_next[0].transpose((1, 0)))
            embeddings = np.array([embedding_now for _ in range(time_steps - 1)])
            embeddings_reshape = embeddings.reshape((time_steps-1, -1))
            actions = ac_decoder.act(stochastic=True, ob=observations, embedding=embeddings_reshape)
            ob_next_ac = get_ob_next_ac(env, observations[-1], actions[0])  ## this still needs revising
            # state_outputs = state_decoder.get_outputs(observations.reshape(1, time_steps, -1), embedding_now.reshape((1, 1, -1)))  ## the Gaussian mixture wasn't in yet... hacked it in, and it's done now
            # recon_loss = state_decoder.recon_loss(observations.reshape(1, time_steps, -1), embedding_now.reshape((1, 1, -1)))
            recon_loss,  kl_loss, vae_loss = compute_losses(obs_and_next[0].transpose((1, 0)).reshape(1, time_steps, -1), observations.reshape(time_steps-1,-1),
                              observations.reshape(1, time_steps-1, -1), embeddings_reshape, embedding_now.reshape((1,1, -1)), ob_next)

            g = compute_grad(obs_and_next[0].transpose((1, 0)).reshape(1, time_steps, -1), observations.reshape(time_steps-1,-1),
                              observations.reshape(1, time_steps-1, -1), embeddings_reshape, embedding_now.reshape((1,1, -1)), ob_next)
            # logger.record_tabular("recon_loss", recon_loss)
            # logger.record_tabular("recon_loss2", recon_loss2)
            # logger.record_tabular("kl_loss", kl_loss)
            # logger.record_tabular("vae_loss", vae_loss)
            # logger.dump_tabular()
            adam.update(g, lr_rate)
            recon_loss_buffer.append(recon_loss)
            # recon_loss2_buffer.append(recon_loss2)
            kl_loss_buffer.append(kl_loss)
            vae_loss_buffer.append(vae_loss)
        ep_stats.add_all_summary(writer, [np.mean(recon_loss_buffer),  np.mean(kl_loss_buffer),
                                          np.mean(vae_loss_buffer)], iters_so_far)
        logger.record_tabular("recon_loss", recon_loss)
        # logger.record_tabular("recon_loss2", recon_loss2)
        logger.record_tabular("kl_loss", kl_loss)
        logger.record_tabular("vae_loss", vae_loss)
        logger.dump_tabular()
        if iters_so_far % 10 == 0 and iters_so_far != 0:
            save(saver=saver, sess=tf.get_default_session(), logdir=logdir, step=iters_so_far)
            save(saver=saver_encoder, sess=tf.get_default_session(), logdir="./vae_saver", step=iters_so_far)
            # save(saver=saver_pol, sess=tf.get_default_session(), logdir="pol_saver", step=iters_so_far)
        iters_so_far += 1
        if iters_so_far < 6:
            lr_rate /= 2
Code example #13
    def __init__(
            self,
            name,
            obs_shape,
            embedding_shape,
            batch_size,
            dilations,
            filter_width,
            residual_channels,  ##32
            dilation_channels,  ## 32
            skip_channels,
            quantization_channels=2**8,
            use_biases=False,
            scalar_input=False,
            initial_filter_width=obs_shape,  # note: a default value cannot see the obs_shape parameter above; this resolves to a module-level obs_shape
            histograms=False,
            global_condition_channels=None,  #None
            global_condition_cardinality=None):
        '''Initializes the WaveNet model.

        Args:
            batch_size: How many audio files are supplied per batch
                (recommended: 1).
            dilations: A list with the dilation factor for each layer.
            filter_width: The samples that are included in each convolution,
                after dilating. ???????
            residual_channels: How many filters to learn for the residual.
            dilation_channels: How many filters to learn for the dilated
                convolution.
            skip_channels: How many filters to learn that contribute to the
                quantized softmax output.
            quantization_channels: How many amplitude values to use for audio
                quantization and the corresponding one-hot encoding.
                Default: 256 (8-bit quantization).
            use_biases: Whether to add a bias layer to each convolution.
                Default: False.
            scalar_input: Whether to use the quantized waveform directly as
                input to the network instead of one-hot encoding it.
                Default: False.
            initial_filter_width: The width of the initial filter of the
                convolution applied to the scalar input. This is only relevant
                if scalar_input=True.
            histograms: Whether to store histograms in the summary.
                Default: False.
            global_condition_channels: Number of channels in (embedding
                size) of global conditioning vector. None indicates there is
                no global conditioning.
            global_condition_cardinality: Number of mutually exclusive
                categories to be embedded in global condition embedding. If
                not None, then this implies that global_condition tensor
                specifies an integer selecting which of the N global condition
                categories, where N = global_condition_cardinality. If None,
                then the global_condition tensor is regarded as a vector which
                must have dimension global_condition_channels.

        '''
        with tf.variable_scope(name):  # this line and the next were added by me
            self.scope = tf.get_variable_scope().name

            self.time_steps = batch_size
            self.dilations = dilations
            self.filter_width = filter_width
            self.residual_channels = residual_channels
            self.dilation_channels = dilation_channels
            self.quantization_channels = quantization_channels
            self.use_biases = use_biases
            self.skip_channels = skip_channels  ## this should probably be set to True
            self.scalar_input = scalar_input
            self.initial_filter_width = initial_filter_width
            self.histograms = histograms
            self.global_condition_channels = global_condition_channels
            self.global_condition_cardinality = global_condition_cardinality

            self.receptive_field = WaveNetModel.calculate_receptive_field(
                self.filter_width, self.dilations, self.scalar_input,
                self.initial_filter_width)
            self.pdtype = pdtype = make_pdtype(obs_shape.shape[0])
            self.pd = None
            self.variables = self._create_variables()

            # sequence_length = None
            # #ob =
            # #embedding =

            self._create_network(obs_shape=obs_shape,
                                 embedding_shape=embedding_shape)
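calculate_receptive_field is inherited from the WaveNet reference code. For a stack of dilated causal convolutions the usual formula is the one below (a sketch, assuming the ibab/tensorflow-wavenet implementation):

def calculate_receptive_field(filter_width, dilations, scalar_input,
                              initial_filter_width):
    # each dilated layer widens the field by (filter_width - 1) * dilation
    receptive_field = (filter_width - 1) * sum(dilations) + 1
    # plus the initial (causal or scalar-input) convolution
    if scalar_input:
        receptive_field += initial_filter_width - 1
    else:
        receptive_field += filter_width - 1
    return receptive_field

# e.g. filter_width=2 over dilations 1, 2, 4, ..., 512 gives 1025 samples
print(calculate_receptive_field(2, [2 ** i for i in range(10)], False, 32))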