def _init(self, obs_space, batch_size, time_steps, LSTM_size, laten_size,
          gaussian_fixed_var=True):
    # TODO: check carefully later whether the variance is actually updated.
    self.pdtype = pdtype = make_pdtype(laten_size)
    obs = U.get_placeholder("en_ob", dtype=tf.float32,
                            shape=[batch_size, time_steps, obs_space.shape[0]])

    # Observation normalization.
    with tf.variable_scope("obfilter"):
        # TODO: check whether this filter actually helps.
        self.obs_rms = RunningMeanStd(shape=obs_space.shape)
        obz = tf.clip_by_value((obs - self.obs_rms.mean) / self.obs_rms.std,
                               -5.0, 5.0)

    lstm_fw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
    lstm_bw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
    outputs, output_state = tf.nn.bidirectional_dynamic_rnn(
        lstm_fw_cell, lstm_bw_cell, obz, dtype=tf.float32)
    outputs_average = tf.reduce_mean(outputs[0], axis=1)

    if gaussian_fixed_var and isinstance(laten_size, int):
        # NOTE: despite the flag name, logstd here is a state-dependent dense
        # head, not a fixed variable.
        self.mean = U.dense(outputs_average, pdtype.param_shape()[0] // 2,
                            "dblstmfin", U.normc_initializer(1.0))
        self.logstd = U.dense(outputs_average, pdtype.param_shape()[0] // 2,
                              "dblstm_logstd", U.normc_initializer(1.0))
        # self.logstd = tf.get_variable(
        #     name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
        #     initializer=tf.constant_initializer(0.1))
        #     # TODO: this fixed-logstd alternative may also be problematic
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd],
                                axis=1)
    else:
        pdparam = U.dense(outputs_average, pdtype.param_shape()[0],
                          "dblstmfin", U.normc_initializer(0.1))

    self.pd = pdtype.pdfromflat(pdparam)
    self._encode = U.function([obs], self.pd.sample())
    self._get_mean = U.function([obs], self.mean)
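# --- Example (not part of the original module) -------------------------------
# The encoder above parameterizes a diagonal Gaussian over the latent and
# samples from it via self.pd.sample(). A minimal, self-contained NumPy
# sketch of that sampling pattern (all names here are illustrative):
import numpy as np

def sample_diag_gaussian(mean, logstd, rng=np.random):
    """Reparameterized sample: z = mean + exp(logstd) * eps, eps ~ N(0, I)."""
    eps = rng.standard_normal(mean.shape)
    return mean + np.exp(logstd) * eps

_mean = np.zeros((4, 8))           # (batch, laten_size)
_logstd = np.full((4, 8), -1.0)    # per-state log std, constant here
print(sample_diag_gaussian(_mean, _logstd).shape)  # (4, 8)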
def _build_actor_head(self):
    pdtypes = []
    input_shape = self.policy_network.output_shape
    for _ in self.agent_ids:  # one distribution head per agent
        pdtypes.append(
            make_pdtype(input_shape, self.ac_space, init_scale=0.01))
    return pdtypes
def _init(self, obs_space, ac_space, embedding_shape, hid_size,
          num_hid_layers, gaussian_fixed_var=True):
    self.pdtype = pdtype = make_pdtype(ac_space.shape[0])
    batch_size = None
    ob = U.get_placeholder(name="ac_de_ob", dtype=tf.float32,
                           shape=[batch_size, obs_space.shape[0]])
    # The embedding is presumably one latent value broadcast to sequence
    # length; set that aside for now until it becomes relevant.
    embedding = U.get_placeholder(name="ac_de_embedding", dtype=tf.float32,
                                  shape=[batch_size, embedding_shape])

    # Normalize the concatenated (observation, embedding) input.
    last_out = U.concatenate([ob, embedding], axis=1)
    with tf.variable_scope("ac_de_filter"):
        self.ac_rms = RunningMeanStd(shape=obs_space.shape[0] + embedding_shape)
        last_out = tf.clip_by_value(
            (last_out - self.ac_rms.mean) / self.ac_rms.std, -5.0, 5.0)

    for i in range(num_hid_layers):
        last_out = tf.nn.relu(
            U.dense(last_out, hid_size[i], "ac_de%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space.shape[0], int):
        self.mean = U.dense(last_out, pdtype.param_shape()[0] // 2,
                            "ac_de_final", U.normc_initializer(1.0))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "ac_de_final",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob, embedding], ac)
    self._get_pol_mean = U.function([ob, embedding], self.mean)
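# --- Example (not part of the original module) -------------------------------
# Illustrative NumPy analogue of the U.switch(stochastic, sample, mode)
# selection built in _act above: sample when exploring, return the
# distribution mode (the mean, for a Gaussian) when acting deterministically.
# Names below are hypothetical, not this module's API.
import numpy as np

def select_action(stochastic, mean, logstd, rng=np.random):
    if stochastic:
        return mean + np.exp(logstd) * rng.standard_normal(mean.shape)
    return mean  # the mode of a Gaussian is its mean

print(select_action(False, np.zeros(2), np.zeros(2)))  # [0. 0.]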
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # Create action distributions.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Set up placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Viscosity solution to the Bellman differential equation in place of
        # an initial condition (regularizer, currently disabled).
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions.
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Target network.
        target_q = q_func(q_input, 1, scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
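# --- Example (not part of the original module) -------------------------------
# make_update_exp is assumed to build the usual Polyak (soft) target update,
# target <- polyak * target + (1 - polyak) * online, with polyak = 1 - 1e-2
# in the base maddpg code. A standalone NumPy sketch of that rule:
import numpy as np

def soft_update(online_vars, target_vars, polyak=1.0 - 1e-2):
    return [polyak * t + (1.0 - polyak) * o
            for o, t in zip(online_vars, target_vars)]

_target = soft_update([np.ones(3)], [np.zeros(3)])
print(_target[0])  # each weight moves 1% of the way toward the online value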
def _init(self, obs_space, embedding_shape, hid_size, num_hid_layers,
          gaussian_fixed_var=True):
    self.pdtype = pdtype = make_pdtype(obs_space.shape[0])
    batch_size = None
    ob_input = U.get_placeholder(name="ob", dtype=tf.float32,
                                 shape=[batch_size, obs_space.shape[0]])
    # The embedding is presumably one latent value broadcast to sequence
    # length; set that aside until it becomes relevant.
    embedding = U.get_placeholder(name="embedding", dtype=tf.float32,
                                  shape=[batch_size, embedding_shape])

    # Policy only, no value function; double-check that this concatenation
    # is right.
    last_out = U.concatenate([ob_input, embedding], axis=1)

    # Normalization.
    with tf.variable_scope("state_de_filter"):
        self.state_rms = RunningMeanStd(shape=obs_space.shape[0] +
                                        embedding_shape)
        input_z = tf.clip_by_value(
            (last_out - self.state_rms.mean) / self.state_rms.std, -5.0, 5.0)

    for i in range(num_hid_layers):
        input_z = tf.nn.tanh(
            U.dense(input_z, hid_size[i], "state_de%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(obs_space.shape[0], int):
        self.mean = U.dense(input_z, pdtype.param_shape()[0] // 2,
                            "state_de_final", U.normc_initializer(0.01))
        self.logstd = tf.get_variable(
            name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd],
                                axis=1)
    else:
        # NOTE: the original fed the raw `last_out` here, bypassing the
        # normalized hidden stack; using `input_z` keeps both branches
        # consistent.
        pdparam = U.dense(input_z, pdtype.param_shape()[0], "state_de_final",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    self._act = U.function([ob_input, embedding], self.pd.sample())
    self.get_mean = U.function([ob_input, embedding], self.mean)
def __init__(self, env, observations, latent, estimate_q=False,
             vf_latent=None, sess=None, **tensors):
    """
    Parameters:
    ----------
    env             RL environment

    observations    tensorflow placeholder in which the observations will be fed

    latent          latent state from which policy distribution parameters
                    should be inferred

    vf_latent       latent state from which the value function should be
                    inferred (if None, then latent is used)

    sess            tensorflow session to run calculations in (if None, the
                    default session is used)

    **tensors       tensorflow tensors for additional attributes such as
                    state or mask
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    vf_latent = vf_latent if vf_latent is not None else latent
    vf_latent = tf.layers.flatten(vf_latent)
    latent = tf.layers.flatten(latent)

    # Based on the action space, select the probability distribution type.
    self.pdtype = make_pdtype(env.action_space)
    self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

    # Take an action.
    self.action = self.pd.sample()

    # Calculate the negative log-probability of that action.
    self.neglogp = self.pd.neglogp(self.action)
    self.sess = sess or tf.get_default_session()

    if estimate_q:
        assert isinstance(env.action_space, gym.spaces.Discrete)
        self.q = fc(vf_latent, 'q', env.action_space.n)
        self.vf = self.q
    else:
        self.vf = fc(vf_latent, 'vf', 1)
        self.vf = self.vf[:, 0]

    # Lyapunov head. NOTE: the original reused the 'vf' scope here, which
    # collides with the value head built above; a distinct scope avoids the
    # variable clash.
    self.Lyapunov = fc(vf_latent, 'lyapunov', 1)
    self.Lyapunov = self.Lyapunov[:, 0]
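# --- Example (not part of the original module) -------------------------------
# Standalone check of the Gaussian negative log-likelihood that
# self.pd.neglogp computes for a Box action space, assuming the usual
# diagonal-Gaussian parameterization (NumPy sketch, illustrative only):
import numpy as np

def diag_gaussian_neglogp(x, mean, logstd):
    std = np.exp(logstd)
    return (0.5 * np.sum(((x - mean) / std) ** 2, axis=-1)
            + 0.5 * np.log(2.0 * np.pi) * x.shape[-1]
            + np.sum(logstd, axis=-1))

# At the mean with unit variance this is 0.5 * d * log(2*pi):
print(diag_gaussian_neglogp(np.zeros(2), np.zeros(2), np.zeros(2)))  # ~1.8379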
def __init__(self, ac_space, policy_network, value_network=None,
             estimate_q=False):
    """
    Parameters:
    ----------
    ac_space        action space

    policy_network  keras network for the policy

    value_network   keras network for the value

    estimate_q      whether to estimate a q value instead of a v value
    """
    self.policy_network = policy_network
    self.value_network = value_network or policy_network
    self.estimate_q = estimate_q
    self.initial_state = None

    # Based on the action space, select the probability distribution type.
    self.pdtype = make_pdtype(self.policy_network.output_shape, ac_space,
                              init_scale=0.01)

    if estimate_q:
        self.value_fc = fc_build(self.value_network.output_shape, 'q',
                                 ac_space.n)
    else:
        self.value_fc = fc_build(self.value_network.output_shape, 'vf', 1)

    # Earlier attempt at building the value head as a standalone keras model,
    # kept for reference:
    # # to get just the dense size and avoid the batch size
    # print(f'self.value_network.output_shape for agent_0 '
    #       f'{self.value_network.output_shape[-1]}')
    # value_model_inputes = tf.keras.layers.Input(
    #     self.value_network.output_shape[-1])  # agent 0 output
    # if estimate_q:
    #     value_fc = tf.keras.layers.Dense(
    #         units=ac_space.n, kernel_initializer=ortho_init(init_scale),
    #         bias_initializer=tf.keras.initializers.Constant(init_bias),
    #         name='q')(value_model_inputes)
    # else:
    #     value_fc = tf.keras.layers.Dense(
    #         units=1, kernel_initializer=ortho_init(init_scale),
    #         bias_initializer=tf.keras.initializers.Constant(init_bias),
    #         name='vf')(value_model_inputes)
    # self.value_model = tf.keras.Model(inputs=value_model_inputes,
    #                                   outputs=value_fc, name='Value_Network')
    # self.value_model.summary()
    # tf.keras.utils.plot_model(self.value_model, to_file='./value_model.png')

    self.value_network.summary()
    self.policy_network.summary()
    tf.keras.utils.plot_model(self.policy_network,
                              to_file='./policy_network.png')
def p_train(make_obs_ph_n, act_space_n, agent_idx, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None):
    """
    :param make_obs_ph_n:
    :param act_space_n:
    :param agent_idx:
    :param p_func: in base maddpg code = mlp_model
    :param q_func: in base maddpg code = mlp_model
    :param optimizer:
    :param grad_norm_clipping:
    :param local_q_func:
    :param num_units:
    :param scope:
    :param reuse:
    :return:
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Create action distributions.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Set up placeholders.
        obs_ph_n = [tf.layers.flatten(obs_ph) for obs_ph in make_obs_ph_n]
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[agent_idx]
        p = p_func(p_input, int(act_pdtype_n[agent_idx].param_shape()[0]),
                   scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap parameters in a distribution.
        act_pd = act_pdtype_n[agent_idx].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[agent_idx] = act_pd.sample()  # act_pd.mode()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        # Reuse the critic built in q_train; the scope must match the one
        # used there ("q_func" in the base maddpg code — the original
        # 'scope="q_func" + str(1)' would not resolve to existing variables).
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]
        loss = -tf.reduce_mean(q) + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions.
        train = U.function(inputs=make_obs_ph_n + act_ph_n, outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[make_obs_ph_n[agent_idx]],
                         outputs=act_sample)
        p_values = U.function([make_obs_ph_n[agent_idx]], p)

        # Target network.
        target_p = p_func(p_input,
                          int(act_pdtype_n[agent_idx].param_shape()[0]),
                          scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[agent_idx].pdfromflat(
            target_p).sample()
        target_act = U.function(inputs=[make_obs_ph_n[agent_idx]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
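# --- Example (not part of the original module) -------------------------------
# Numeric sketch of the actor objective assembled above: minimize
# -mean(Q(s, pi(s))) plus a small penalty on the raw distribution parameters
# (illustrative NumPy, not the graph code):
import numpy as np

def actor_loss(q_values, act_flatparam, reg_coef=1e-3):
    return -np.mean(q_values) + reg_coef * np.mean(np.square(act_flatparam))

print(actor_loss(np.array([1.0, 2.0]), np.array([0.5, -0.5])))  # -1.49975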
def __init__(self, input_space, act_space, scope, args):
    self.input_shape = input_space
    self.act_space = act_space
    self.scope = scope
    self.replay_buffer = ReplayBuffer(int(1e6))
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
    self.optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
    self.grad_norm_clipping = 0.5

    with tf.variable_scope(self.scope):
        act_pdtype = make_pdtype(act_space)
        # act_ph = act_pdtype.sample_placeholder([None], name="action")
        act_ph = tf.placeholder(tf.float32, shape=(None, 1))
        # Assumes args.game is one of the two supported titles.
        if args.game == "RoboschoolPong-v1":
            obs_ph = tf.placeholder(tf.float32,
                                    shape=(None, input_space.shape[0]))
        elif args.game == "Pong-2p-v0":
            obs_ph = tf.placeholder(tf.float32,
                                    shape=(None, input_space.shape[0],
                                           input_space.shape[1],
                                           input_space.shape[2]))
        q_target = tf.placeholder(tf.float32, shape=(None,))

        # Build the world representation z.
        z = conv_model(obs_ph, 20, scope="world_model")

        p_input = z
        p = mlp_model(p_input, 2, scope="p_func")
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        act_pd = act_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        q_input = tf.concat([z, act_sample], -1)
        q = mlp_model(q_input, 1, scope="q_func")
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        pg_loss = -tf.reduce_mean(q)
        q_loss = tf.reduce_mean(tf.square(q - q_target))
        # q_reg = tf.reduce_mean(tf.square(q))
        q_optimize_expr = U.minimize_and_clip(self.optimizer, q_loss,
                                              q_func_vars,
                                              self.grad_norm_clipping)
        p_loss = pg_loss + p_reg * 1e-3
        p_optimize_expr = U.minimize_and_clip(self.optimizer, p_loss,
                                              p_func_vars,
                                              self.grad_norm_clipping)

        p_values = U.function([obs_ph], p)

        target_p = mlp_model(z, 2, scope="target_p_func")
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        target_q = mlp_model(q_input, 1, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        target_act_sample = act_pdtype.pdfromflat(target_p).sample()

        self.update_target_p = make_update_exp(p_func_vars,
                                               target_p_func_vars)
        self.update_target_q = make_update_exp(q_func_vars,
                                               target_q_func_vars)

        self.act = U.function(inputs=[obs_ph], outputs=act_sample)
        self.target_act = U.function(inputs=[obs_ph],
                                     outputs=target_act_sample)
        self.p_train = U.function(inputs=[obs_ph] + [act_ph], outputs=p_loss,
                                  updates=[p_optimize_expr])
        self.q_train = U.function(inputs=[obs_ph] + [act_ph] + [q_target],
                                  outputs=q_loss, updates=[q_optimize_expr])
        self.q_values = U.function([obs_ph] + [act_ph], q)
        self.target_q_values = U.function([obs_ph] + [act_ph], target_q)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, vae_pol_mean,
          gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                           -5.0, 5.0)

    # Value function.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size[i], "vffc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy, with the VAE policy mean added as a residual offset.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size[i], "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01)) + vae_pol_mean
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.constant_initializer(0.1))
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # Changed for BC: use a cached placeholder instead of a fresh one.
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def learn(encoder, action_decorder, state_decorder, embedding_shape, *,
          dataset, logdir, batch_size, time_steps, epsilon=0.001,
          lr_rate=1e-3):
    lstm_encoder = encoder("lstm_encoder")
    ac_decoder = action_decorder("ac_decoder")
    state_decoder = state_decorder("state_decoder")  # switched to an MLP

    obs = U.get_placeholder_cached(name="obs")  # for the encoder
    ob = U.get_placeholder_cached(name="ob")
    embedding = U.get_placeholder_cached(name="embedding")
    # obss = U.get_placeholder_cached(name="obss")
    #     # for the action decoder; the state decoder may be able to share
    #     # it, and it should perhaps be renamed to obs
    # embeddingss = U.get_placeholder_cached(name="embeddingss")
    #     # for the action decoder; the state decoder should also be able to
    #     # share it

    ac = ac_decoder.pdtype.sample_placeholder([None])
    obs_out = state_decoder.pdtype.sample_placeholder([None])

    # p(z): standard normal prior over the latent. TODO: consider whether it
    # should instead match the demonstration distribution.
    from common.distributions import make_pdtype
    p_z_pdtype = make_pdtype(embedding_shape)
    p_z_params = U.concatenate([
        tf.zeros(shape=[embedding_shape], name="mean"),
        tf.zeros(shape=[embedding_shape], name="logstd")
    ], axis=-1)
    p_z = p_z_pdtype.pdfromflat(p_z_params)

    # Reconstruction term. TODO: this still needs revisiting.
    recon_loss = -tf.reduce_mean(
        tf.reduce_sum(ac_decoder.pd.logp(ac) + state_decoder.pd.logp(obs_out),
                      axis=0))
    # KL to the standard normal prior. TODO: double-check this term (a
    # standalone numeric check follows this function).
    kl_loss = lstm_encoder.pd.kl(p_z)
    # vae_loss should be computed per batch.
    vae_loss = recon_loss + kl_loss

    ep_stats = stats(["recon_loss", "kl_loss", "vae_loss"])
    losses = [recon_loss, kl_loss, vae_loss]

    # Trainable variables.
    var_list = []
    en_var_list = lstm_encoder.get_trainable_variables()
    var_list.extend(en_var_list)
    # ac_de_var_list = ac_decoder.get_trainable_variables()
    # var_list.extend(ac_de_var_list)
    state_de_var_list = state_decoder.get_trainable_variables()
    var_list.extend(state_de_var_list)

    # compute_recon_loss = U.function(
    #     [ob, obs, embedding, obss, embeddingss, ac, obs_out], recon_loss)
    compute_losses = U.function([obs, ob, embedding, ac, obs_out], losses)
    # TODO: the gradient wiring below is not fully thought through and may be
    # wrong.
    compute_grad = U.function([obs, ob, embedding, ac, obs_out],
                              U.flatgrad(vae_loss, var_list))
    adam = MpiAdam(var_list, epsilon=epsilon)

    U.initialize()
    adam.sync()

    writer = U.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())

    # =========================== TRAINING ===================== #
    iters_so_far = 0
    saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=100)
    saver_encoder = tf.train.Saver(var_list=en_var_list, max_to_keep=100)
    # saver_pol = tf.train.Saver(var_list=ac_de_var_list, max_to_keep=100)
    #     # keep the policy parameters around, though they seem to be unused

    while True:
        logger.log("********** Iteration %i ************" % iters_so_far)
        recon_loss_buffer = deque(maxlen=100)
        kl_loss_buffer = deque(maxlen=100)
        vae_loss_buffer = deque(maxlen=100)

        for observations in dataset.get_next_batch(batch_size=time_steps):
            observations = observations.transpose((1, 0))
            embedding_now = lstm_encoder.get_laten_vector(observations)
            embeddings = np.array([embedding_now for _ in range(time_steps)])
            embeddings_reshape = embeddings.reshape((time_steps, -1))
            actions = ac_decoder.act(stochastic=True, ob=observations,
                                     embedding=embeddings_reshape)
            # The mixture-of-Gaussians output has now been wired in.
            state_outputs = state_decoder.get_outputs(
                observations.reshape(time_steps, -1, 1), embeddings)
            # NOTE: compute_losses/compute_grad are built above with five
            # placeholders but called here with seven arguments; one of the
            # two needs to be brought back in sync.
            recon_loss, kl_loss, vae_loss = compute_losses(
                observations,
                observations.reshape(batch_size, time_steps, -1),
                embeddings_reshape,
                observations.reshape(time_steps, -1, 1), embeddings,
                actions, state_outputs)
            g = compute_grad(
                observations,
                observations.reshape(batch_size, time_steps, -1),
                embeddings_reshape,
                observations.reshape(time_steps, -1, 1), embeddings,
                actions, state_outputs)
            adam.update(g, lr_rate)
            recon_loss_buffer.append(recon_loss)
            kl_loss_buffer.append(kl_loss)
            vae_loss_buffer.append(vae_loss)

        ep_stats.add_all_summary(writer, [
            np.mean(recon_loss_buffer),
            np.mean(kl_loss_buffer),
            np.mean(vae_loss_buffer)
        ], iters_so_far)
        logger.record_tabular("recon_loss", recon_loss)
        logger.record_tabular("kl_loss", kl_loss)
        logger.record_tabular("vae_loss", vae_loss)
        logger.dump_tabular()

        if iters_so_far % 10 == 0 and iters_so_far != 0:
            save(saver=saver, sess=tf.get_default_session(), logdir=logdir,
                 step=iters_so_far)
            save(saver=saver_encoder, sess=tf.get_default_session(),
                 logdir="./vae_saver", step=iters_so_far)
            # save(saver=saver_pol, sess=tf.get_default_session(),
            #      logdir="pol_saver", step=iters_so_far)
        iters_so_far += 1
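# --- Example (not part of the original module) -------------------------------
# Standalone check of the KL term used in the VAE loss above: the closed-form
# KL divergence from a diagonal Gaussian q = N(mean, std^2) to the standard
# normal prior p = N(0, I) (NumPy sketch; lstm_encoder.pd.kl(p_z) is assumed
# to compute the same quantity symbolically):
import numpy as np

def kl_to_standard_normal(mean, logstd):
    var = np.exp(2.0 * logstd)
    return 0.5 * np.sum(var + mean ** 2 - 1.0 - 2.0 * logstd, axis=-1)

print(kl_to_standard_normal(np.zeros(8), np.zeros(8)))  # 0.0 at the prior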
def learn(env, encoder, action_decorder, state_decorder, embedding_shape, *,
          dataset, optimizer, logdir, batch_size, time_steps,
          adam_epsilon=0.001, lr_rate=1e-4, vae_beta=8):
    lstm_encoder = encoder("lstm_encoder")
    ac_decoder = action_decorder("ac_decoder")
    state_decoder = state_decorder("state_decoder")  # TODO: something is off here

    ac_de_ob = U.get_placeholder_cached(name="ac_de_ob")
    en_ob = U.get_placeholder_cached(name="en_ob")  # for the encoder
    # For the action decoder; the state decoder may be able to share it, and
    # it should perhaps be renamed to obs.
    state_de_ob = U.get_placeholder_cached(name="state_de_ob")
    # For the action decoder; the state decoder should also be able to share it.
    ac_de_embedding = U.get_placeholder_cached(name="ac_de_embedding")
    state_de_embedding = U.get_placeholder_cached(name="state_de_embedding")
    # ac = ac_decoder.pdtype.sample_placeholder([None])
    # NOTE: ob_shape was used below without being defined in the original;
    # deriving it from the env's observation space is one plausible fix.
    ob_shape = env.observation_space.shape[0]
    ob_next = tf.placeholder(name="ob_next", shape=[None, ob_shape],
                             dtype=tf.float32)
    # ob_next_ac = tf.placeholder(name="ob_next_ac", shape=[ob_shape],
    #                             dtype=tf.float32)
    # obs_out = state_decoder.pdtype.sample_placeholder([None])

    # p(z): standard normal prior over the latent.
    from common.distributions import make_pdtype
    p_z_pdtype = make_pdtype(embedding_shape)
    p_z_params = U.concatenate(
        [tf.zeros(shape=[embedding_shape], name="mean"),
         tf.zeros(shape=[embedding_shape], name="logstd")], axis=-1)
    p_z = p_z_pdtype.pdfromflat(p_z_params)

    # TODO: add a term for the action to recon_loss as well.
    recon_loss = -tf.reduce_sum(state_decoder.pd.logp(ob_next))
    # kl_loss = lstm_encoder.pd.kl(p_z)[0]
    #     # KL to the standard normal prior; may still be wrong
    # kl_loss = tf.maximum(lstm_encoder.pd.kl(p_z)[0], tf.constant(5.00))
    kl_loss = lstm_encoder.pd.kl(p_z)[0]
    # vae_loss should be computed per batch (a numeric check of this
    # objective follows this function).
    vae_loss = tf.reduce_mean(recon_loss + vae_beta * kl_loss)

    ep_stats = stats(["recon_loss", "kl_loss", "vae_loss"])
    losses = [recon_loss, kl_loss, vae_loss]
    # Idea: train the action with a squared error instead: step the predicted
    # action to get x(t+1), then use an MSE loss, or perhaps cross-entropy.

    # Trainable variables.
    var_list = []
    en_var_list = lstm_encoder.get_trainable_variables()
    var_list.extend(en_var_list)
    # ac_de_var_list = ac_decoder.get_trainable_variables()
    # var_list.extend(ac_de_var_list)
    state_de_var_list = state_decoder.get_trainable_variables()
    var_list.extend(state_de_var_list)

    # compute_recon_loss = U.function(
    #     [ob, obs, embedding, obss, embeddingss, ac, obs_out], recon_loss)
    compute_losses = U.function(
        [en_ob, ac_de_ob, state_de_ob, ac_de_embedding, state_de_embedding,
         ob_next], losses)
    # TODO: the gradient wiring below is not fully thought through and may be
    # wrong.
    compute_grad = U.function(
        [en_ob, ac_de_ob, state_de_ob, ac_de_embedding, state_de_embedding,
         ob_next], U.flatgrad(vae_loss, var_list))
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    U.initialize()
    adam.sync()

    writer = U.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())

    # =========================== TRAINING ===================== #
    iters_so_far = 0
    saver = tf.train.Saver(var_list=var_list, max_to_keep=100)
    saver_encoder = tf.train.Saver(var_list=en_var_list, max_to_keep=100)
    # saver_pol = tf.train.Saver(var_list=ac_de_var_list, max_to_keep=100)
    #     # keep the policy parameters around, though they seem to be unused

    while iters_so_far < 50:  # TODO: consider adding more epochs
        logger.log("********** Iteration %i ************" % iters_so_far)
        # TODO: consider adjusting batch_size each epoch.
        recon_loss_buffer = deque(maxlen=100)
        # recon_loss2_buffer = deque(maxlen=100)
        kl_loss_buffer = deque(maxlen=100)
        vae_loss_buffer = deque(maxlen=100)
        for obs_and_next in dataset.get_next_batch(batch_size=time_steps):
            observations = obs_and_next[0].transpose((1, 0))[:-1]
            ob_next = obs_and_next[0].transpose(
                (1, 0))[state_decoder.receptive_field:, :]
            embedding_now = lstm_encoder.get_laten_vector(
                obs_and_next[0].transpose((1, 0)))
            embeddings = np.array(
                [embedding_now for _ in range(time_steps - 1)])
            embeddings_reshape = embeddings.reshape((time_steps - 1, -1))
            actions = ac_decoder.act(stochastic=True, ob=observations,
                                     embedding=embeddings_reshape)
            # TODO: this next-state lookup still needs revising.
            ob_next_ac = get_ob_next_ac(env, observations[-1], actions[0])
            # state_outputs = state_decoder.get_outputs(
            #     observations.reshape(1, time_steps, -1),
            #     embedding_now.reshape((1, 1, -1)))
            #     # (the mixture-of-Gaussians output has since been added)
            # recon_loss = state_decoder.recon_loss(
            #     observations.reshape(1, time_steps, -1),
            #     embedding_now.reshape((1, 1, -1)))
            recon_loss, kl_loss, vae_loss = compute_losses(
                obs_and_next[0].transpose((1, 0)).reshape(1, time_steps, -1),
                observations.reshape(time_steps - 1, -1),
                observations.reshape(1, time_steps - 1, -1),
                embeddings_reshape, embedding_now.reshape((1, 1, -1)),
                ob_next)
            g = compute_grad(
                obs_and_next[0].transpose((1, 0)).reshape(1, time_steps, -1),
                observations.reshape(time_steps - 1, -1),
                observations.reshape(1, time_steps - 1, -1),
                embeddings_reshape, embedding_now.reshape((1, 1, -1)),
                ob_next)
            adam.update(g, lr_rate)
            recon_loss_buffer.append(recon_loss)
            # recon_loss2_buffer.append(recon_loss2)
            kl_loss_buffer.append(kl_loss)
            vae_loss_buffer.append(vae_loss)

        ep_stats.add_all_summary(writer, [
            np.mean(recon_loss_buffer),
            np.mean(kl_loss_buffer),
            np.mean(vae_loss_buffer)
        ], iters_so_far)
        logger.record_tabular("recon_loss", recon_loss)
        # logger.record_tabular("recon_loss2", recon_loss2)
        logger.record_tabular("kl_loss", kl_loss)
        logger.record_tabular("vae_loss", vae_loss)
        logger.dump_tabular()

        if iters_so_far % 10 == 0 and iters_so_far != 0:
            save(saver=saver, sess=tf.get_default_session(), logdir=logdir,
                 step=iters_so_far)
            save(saver=saver_encoder, sess=tf.get_default_session(),
                 logdir="./vae_saver", step=iters_so_far)
            # save(saver=saver_pol, sess=tf.get_default_session(),
            #      logdir="pol_saver", step=iters_so_far)
        iters_so_far += 1
        if iters_so_far < 6:
            lr_rate /= 2
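# --- Example (not part of the original module) -------------------------------
# Quick numeric check of the beta-VAE objective assembled above,
# vae_loss = recon_loss + vae_beta * kl_loss, with the default vae_beta = 8
# (illustrative values only):
def beta_vae_loss(recon_loss, kl_loss, vae_beta=8.0):
    return recon_loss + vae_beta * kl_loss

print(beta_vae_loss(10.0, 0.5))  # 14.0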
def __init__(self,
             name,
             obs_shape,
             embedding_shape,
             batch_size,
             dilations,
             filter_width,
             residual_channels,   # e.g. 32
             dilation_channels,   # e.g. 32
             skip_channels,
             quantization_channels=2**8,
             use_biases=False,
             scalar_input=False,
             initial_filter_width=None,
             histograms=False,
             global_condition_channels=None,
             global_condition_cardinality=None):
    '''Initializes the WaveNet model.

    Args:
        batch_size: How many audio files are supplied per batch
            (recommended: 1).
        dilations: A list with the dilation factor for each layer.
        filter_width: The samples that are included in each convolution,
            after dilating.
        residual_channels: How many filters to learn for the residual.
        dilation_channels: How many filters to learn for the dilated
            convolution.
        skip_channels: How many filters to learn that contribute to the
            quantized softmax output.
        quantization_channels: How many amplitude values to use for audio
            quantization and the corresponding one-hot encoding.
            Default: 256 (8-bit quantization).
        use_biases: Whether to add a bias layer to each convolution.
            Default: False.
        scalar_input: Whether to use the quantized waveform directly as
            input to the network instead of one-hot encoding it.
            Default: False.
        initial_filter_width: The width of the initial filter of the
            convolution applied to the scalar input. Only relevant if
            scalar_input=True. Defaults to obs_shape; the original signature
            used obs_shape directly as the default, which is not valid
            Python, since a default cannot reference another parameter.
        histograms: Whether to store histograms in the summary.
            Default: False.
        global_condition_channels: Number of channels in (embedding size of)
            the global conditioning vector. None indicates there is no
            global conditioning.
        global_condition_cardinality: Number of mutually exclusive
            categories to be embedded in the global condition embedding. If
            not None, the global_condition tensor specifies an integer
            selecting one of N global condition categories, where
            N = global_condition_cardinality. If None, the global_condition
            tensor is regarded as a vector which must have dimension
            global_condition_channels.
    '''
    with tf.variable_scope(name):
        # This line and the next were added on top of the reference
        # implementation.
        self.scope = tf.get_variable_scope().name
        self.time_steps = batch_size
        self.dilations = dilations
        self.filter_width = filter_width
        self.residual_channels = residual_channels
        self.dilation_channels = dilation_channels
        self.quantization_channels = quantization_channels
        self.use_biases = use_biases
        self.skip_channels = skip_channels  # TODO: this one should be set to True
        self.scalar_input = scalar_input
        if initial_filter_width is None:
            initial_filter_width = obs_shape
        self.initial_filter_width = initial_filter_width
        self.histograms = histograms
        self.global_condition_channels = global_condition_channels
        self.global_condition_cardinality = global_condition_cardinality

        self.receptive_field = WaveNetModel.calculate_receptive_field(
            self.filter_width, self.dilations, self.scalar_input,
            self.initial_filter_width)
        self.pdtype = pdtype = make_pdtype(obs_shape.shape[0])
        self.pd = None
        self.variables = self._create_variables()
        # sequence_length = None
        # ob =
        # embedding =
        self._create_network(obs_shape=obs_shape,
                             embedding_shape=embedding_shape)
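# --- Example (not part of the original module) -------------------------------
# Standalone sketch of the receptive-field arithmetic that
# WaveNetModel.calculate_receptive_field is assumed to perform (matching the
# reference tensorflow-wavenet implementation): the dilated stack contributes
# (filter_width - 1) * sum(dilations) + 1 samples, plus the initial causal
# convolution.
def receptive_field(filter_width, dilations, scalar_input=False,
                    initial_filter_width=None):
    field = (filter_width - 1) * sum(dilations) + 1
    if scalar_input:
        field += initial_filter_width - 1
    else:
        field += filter_width - 1
    return field

print(receptive_field(2, [1, 2, 4, 8]))  # 17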