def _init(self, obs_space, ac_space, embedding_shape, hid_size, num_hid_layers,
          gaussian_fixed_var=True):
    self.pdtype = pdtype = make_pdtype(ac_space.shape[0])
    batch_size = None
    ob = U.get_placeholder(name="ac_de_ob", dtype=tf.float32,
                           shape=[batch_size, obs_space.shape[0]])
    # NOTE: this is a single embedding that may eventually need to be tiled to
    # sequence_len; leave it for now and handle it when we actually get there.
    embedding = U.get_placeholder(name="ac_de_embedding", dtype=tf.float32,
                                  shape=[batch_size, embedding_shape])

    # Normalize the concatenated (observation, embedding) input.
    last_out = U.concatenate([ob, embedding], axis=1)
    with tf.variable_scope("ac_de_filter"):
        self.ac_rms = RunningMeanStd(shape=obs_space.shape[0] + embedding_shape)
    last_out = tf.clip_by_value(
        (last_out - self.ac_rms.mean) / self.ac_rms.std, -5.0, 5.0)

    for i in range(num_hid_layers):
        last_out = tf.nn.relu(
            U.dense(last_out, hid_size[i], "ac_de%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space.shape[0], int):
        self.mean = U.dense(last_out, pdtype.param_shape()[0] // 2,
                            "ac_de_final", U.normc_initializer(1.0))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + logstd], axis=1)
    else:
        # NOTE: self.mean is only defined in the fixed-variance branch, so
        # _get_pol_mean below is only usable when gaussian_fixed_var is True.
        pdparam = U.dense(last_out, pdtype.param_shape()[0],
                          "ac_de_final", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob, embedding], ac)
    self._get_pol_mean = U.function([ob, embedding], self.mean)
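# --- Hedged sketch (not part of the original code): the fixed-variance head
# above uses the `mean * 0.0 + logstd` idiom to broadcast one trainable
# [1, ac_dim] log-std row across the whole batch while keeping it
# state-independent. A NumPy stand-in of the same broadcast, with
# illustrative shapes only:
def _demo_fixed_logstd_broadcast():
    import numpy as np
    mean = np.random.randn(4, 3)        # [batch, ac_dim] network output
    logstd = np.zeros((1, 3))           # [1, ac_dim] trainable variable
    pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
    assert pdparam.shape == (4, 6)      # flat params: means, then log-stds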
def _init(self, obs_space, embedding_shape, hid_size, num_hid_layers,
          gaussian_fixed_var=True):
    self.pdtype = pdtype = make_pdtype(obs_space.shape[0])
    batch_size = None
    ob_input = U.get_placeholder(name="ob", dtype=tf.float32,
                                 shape=[batch_size, obs_space.shape[0]])
    # NOTE: this is a single embedding that may eventually need to be tiled to
    # sequence_len; leave it for now and handle it when we actually get there.
    embedding = U.get_placeholder(name="embedding", dtype=tf.float32,
                                  shape=[batch_size, embedding_shape])
    # Policy only, no value function here; also double-check that this
    # concatenation is correct.
    last_out = U.concatenate([ob_input, embedding], axis=1)

    # Normalize the concatenated input.
    with tf.variable_scope("state_de_filter"):
        self.state_rms = RunningMeanStd(shape=obs_space.shape[0] + embedding_shape)
    input_z = tf.clip_by_value(
        (last_out - self.state_rms.mean) / self.state_rms.std, -5.0, 5.0)

    for i in range(num_hid_layers):
        input_z = tf.nn.tanh(
            U.dense(input_z, hid_size[i], "state_de%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(obs_space.shape[0], int):
        self.mean = U.dense(input_z, pdtype.param_shape()[0] // 2,
                            "state_de_final", U.normc_initializer(0.01))
        self.logstd = tf.get_variable(name="logstd",
                                      shape=[1, pdtype.param_shape()[0] // 2],
                                      initializer=tf.zeros_initializer())
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)
    else:
        # NOTE: self.mean is only defined in the fixed-variance branch, so
        # get_mean below is only usable when gaussian_fixed_var is True.
        pdparam = U.dense(input_z, pdtype.param_shape()[0],
                          "state_de_final", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    self._act = U.function([ob_input, embedding], self.pd.sample())
    self.get_mean = U.function([ob_input, embedding], self.mean)
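# --- Hedged sketch (not part of the original code): the state filter above
# standardizes the (ob, embedding) concatenation with running statistics and
# clips to [-5, 5] to bound outliers. NumPy stand-in (the real RunningMeanStd
# is updated incrementally elsewhere; values here are illustrative):
def _demo_input_filter():
    import numpy as np
    x = np.random.randn(8, 10) * 3.0 + 1.0   # raw [batch, ob_dim + emb_dim]
    mean, std = x.mean(axis=0), x.std(axis=0) + 1e-8
    xz = np.clip((x - mean) / std, -5.0, 5.0)
    return xz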
def _init(self, obs_space, batch_size, time_steps, LSTM_size, laten_size,
          gaussian_fixed_var=True):
    # TODO: check later whether the variance actually gets updated.
    self.pdtype = pdtype = make_pdtype(laten_size)
    obs = U.get_placeholder("en_ob", dtype=tf.float32,
                            shape=[batch_size, time_steps, obs_space.shape[0]])

    # Normalize observations.
    with tf.variable_scope("obfilter"):
        # TODO: verify this filter actually takes effect.
        self.obs_rms = RunningMeanStd(shape=obs_space.shape)
    obz = tf.clip_by_value((obs - self.obs_rms.mean) / self.obs_rms.std, -5.0, 5.0)

    lstm_fw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
    lstm_bw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
    outputs, output_state = tf.nn.bidirectional_dynamic_rnn(
        lstm_fw_cell, lstm_bw_cell, obz, dtype=tf.float32)
    # outputs is a (forward, backward) tuple; mean-pool the forward outputs
    # over the time axis.
    outputs_average = tf.reduce_mean(outputs[0], axis=1)

    if gaussian_fixed_var and isinstance(laten_size, int):
        self.mean = U.dense(outputs_average, pdtype.param_shape()[0] // 2,
                            "dblstmfin", U.normc_initializer(1.0))
        self.logstd = U.dense(outputs_average, pdtype.param_shape()[0] // 2,
                              "dblstm_logstd", U.normc_initializer(1.0))
        # self.logstd = tf.get_variable(name="logstd",
        #                               shape=[1, pdtype.param_shape()[0] // 2],
        #                               initializer=tf.constant_initializer(0.1))
        # TODO: this part might also be problematic.
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)
    else:
        pdparam = U.dense(outputs_average, pdtype.param_shape()[0],
                          "dblstmfin", U.normc_initializer(0.1))
    self.pd = pdtype.pdfromflat(pdparam)

    self._encode = U.function([obs], self.pd.sample())
    self._get_mean = U.function([obs], self.mean)
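# --- Hedged sketch (not part of the original code): bidirectional_dynamic_rnn
# returns a (forward, backward) tuple of outputs; the encoder above mean-pools
# only the forward outputs over the time axis to get one fixed-size code per
# trajectory. Shape-level NumPy stand-in with illustrative sizes:
def _demo_encoder_pooling():
    import numpy as np
    fw_outputs = np.random.randn(2, 5, 16)   # [batch, time_steps, LSTM_size]
    code = fw_outputs.mean(axis=1)           # [batch, LSTM_size]
    assert code.shape == (2, 16)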
def _init(self, ob_space, ac_space, hid_size, num_hid_layers,
          gaussian_fixed_var=True, num_options=2, dc=0, w_intfc=True, k=0.):
    assert isinstance(ob_space, gym.spaces.Box)
    self.k = k
    self.w_intfc = w_intfc
    self.state_in = []
    self.state_out = []
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Per-option value function.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="vffc%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]

    # Termination head (gradients stopped at the shared features).
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="termfc%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    termination_sample = tf.greater(
        self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))

    # Intra-option policy.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="polfc%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                        option, num_options=num_options,
                        weight_init=U.normc_initializer(0.5))
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=U.normc_initializer(0.1), trainable=True)
        pdparam = tf.concat([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    self.pd = pdtype.pdfromflat(pdparam)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # Interest function.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="intfc%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    self.intfc = tf.sigmoid(tf.layers.dense(
        last_out, num_options, name="intfcfinal",
        kernel_initializer=U.normc_initializer(1.0)))

    # Policy over options.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(
            last_out, hid_size, name="OP%i" % (i + 1),
            kernel_initializer=U.normc_initializer(1.0)))
    self.op_pi = tf.nn.softmax(tf.layers.dense(
        last_out, num_options, name="OPfinal",
        kernel_initializer=U.normc_initializer(1.0)))

    self._act = U.function([stochastic, ob, option], [ac])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
    self._get_intfc = U.function([ob], [self.intfc])
    self._get_op = U.function([ob], [self.op_pi])
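# --- Hedged sketch (not part of the original code): termination_sample above
# draws a Bernoulli with success probability tpred by comparing it against
# uniform noise, which is what tf.greater(tpred, uniform) computes. NumPy
# stand-in (illustrative probabilities):
def _demo_termination_sampling():
    import numpy as np
    tpred = np.array([0.1, 0.9, 0.5])                 # per-state term. prob.
    terminate = tpred > np.random.uniform(size=tpred.shape)
    return terminate                                  # boolean per state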
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, vae_pol_mean,
          gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Value function.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size[i], "vffc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy: the mean is a learned residual on top of the VAE policy mean.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size[i], "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01)) + vae_pol_mean
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.constant_initializer(0.1))
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
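# --- Hedged sketch (not part of the original code): because "polfinal" is
# initialized at the small 0.01 scale, the policy mean above starts out as a
# near-zero residual added to vae_pol_mean, so early actions stay close to
# the VAE decoder's output. NumPy stand-in with illustrative numbers:
def _demo_residual_mean():
    import numpy as np
    vae_pol_mean = np.array([[0.3, -0.2]])        # decoder's action mean
    residual = 0.01 * np.random.randn(1, 2)       # tiny at initialization
    mean = residual + vae_pol_mean                # ~= vae_pol_mean early on
    return mean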
def _create_network(self):
    self.sess = U.get_session()
    self.inp_src = tf.placeholder(shape=[None, 1, self.inp_dim],
                                  dtype=tf.float32, name='input_src')
    self.inp_dest = tf.placeholder(shape=[None, 1, self.out_dim],
                                   dtype=tf.float32, name='input_dest')
    self.labels = tf.placeholder(shape=[None, self.seq_len, self.out_dim],
                                 dtype=tf.float32, name='label')
    self.src_seq_len = tf.placeholder(tf.int32, (None,),
                                      name='source_sequence_length')
    self.tar_seq_len = tf.placeholder(tf.int32, (None,),
                                      name='target_sequence_length')

    # Running averages.
    # with tf.variable_scope('goal_stats_src'):
    #     self.goal_stats_src = Normalizer(self.inp_dim, self.norm_eps,
    #                                      self.norm_clip, sess=self.sess)
    with tf.variable_scope('goal_stats_dest'):
        self.goal_stats_dest = Normalizer(self.out_dim, self.norm_eps,
                                          self.norm_clip, sess=self.sess,
                                          PLN=True)

    # Normalize inp_src, inp_dest, and the goal labels.
    inp_src = self.goal_stats_dest.normalize(self.inp_src)
    inp_dest = self.goal_stats_dest.normalize(self.inp_dest)
    goal_labels = self.goal_stats_dest.normalize(self.labels)

    with tf.variable_scope('goal_gen'):
        encoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size)
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            encoder_cell, inp_src, sequence_length=self.src_seq_len,
            dtype=tf.float32)

        decoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size)
        project_layer = tf.layers.Dense(self.out_dim)

        with tf.variable_scope("decode"):
            # Teacher forcing: feed the destination followed by the shifted labels.
            train_inp = tf.concat([inp_dest, goal_labels[:, :-1, :]], axis=-2)
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                train_inp, sequence_length=self.tar_seq_len)
            train_decoder = tf.contrib.seq2seq.BasicDecoder(
                decoder_cell, train_helper, encoder_state,
                output_layer=project_layer)
            train_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode(
                train_decoder, maximum_iterations=self.seq_len)
            self.train_outputs = train_outputs.rnn_output

        with tf.variable_scope("decode", reuse=True):
            infer_helper = ContinousInferHelper(inp_dest[:, 0, :],
                                                self.tar_seq_len)
            infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                decoder_cell, infer_helper, encoder_state,
                output_layer=project_layer)
            infer_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode(
                infer_decoder, maximum_iterations=self.seq_len)
            self.infer_outputs = self.goal_stats_dest.denormalize(
                infer_outputs.rnn_output)

    log_sigma = tf.get_variable(name="logstd", shape=[1, self.out_dim],
                                initializer=U.normc_initializer(0.1))
    goals = train_outputs.rnn_output
    # Negative log-likelihood of the labels under a diagonal Gaussian
    # N(goals, exp(log_sigma)^2).
    loss = 0.5 * tf.reduce_sum(tf.square((goal_labels - goals) / tf.exp(log_sigma)), axis=-1) \
        + 0.5 * np.log(2 * np.pi) * tf.to_float(tf.shape(self.labels)[-1]) \
        + tf.reduce_sum(log_sigma, axis=-1)
    self.loss = tf.reduce_mean(loss)
    # Denormalized training outputs, kept just to inspect training correctness.
    self.tr_outputs = self.goal_stats_dest.denormalize(self.train_outputs)

    var_list = self._vars('')
    self.grads = U.flatgrad(self.loss, var_list)
    self.adam = MpiAdam(var_list, epsilon=self.adamepsilon)

    tf.variables_initializer(self._global_vars('')).run()
    self.adam.sync()
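# --- Hedged sketch (not part of the original code): the loss above is the
# negative log-likelihood of the normalized labels under a diagonal Gaussian
# N(goals, exp(log_sigma)^2). The same formula in NumPy, with illustrative
# values, for checking the three terms:
def _demo_gaussian_nll():
    import numpy as np
    y = np.array([0.5, -1.0])            # normalized label
    mu = np.array([0.4, -0.8])           # decoder output
    log_sigma = np.zeros(2)              # so sigma = 1
    nll = (0.5 * np.sum(((y - mu) / np.exp(log_sigma)) ** 2)
           + 0.5 * np.log(2 * np.pi) * y.shape[-1]
           + np.sum(log_sigma))
    return nll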
def _init(self, ob_space, ac_space, hid_size, num_hid_layers,
          gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # Critic network.
    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        # obz = (ob - self.ob_rms.mean) / self.ob_rms.std
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(
                last_out, hid_size, name="fc%i" % (i + 1),
                kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(
            last_out, 1, name='final',
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # Actor network.
    with tf.variable_scope('pol'):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(
                last_out, hid_size, name='fc%i' % (i + 1),
                kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(
                last_out, pdtype.param_shape()[0] // 2, name='final',
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out, pdtype.param_shape()[0], name='final',
                kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
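# --- Hedged sketch (not part of the original code): U.switch above selects
# pd.sample() when stochastic=True (exploration) and pd.mode() otherwise
# (deterministic evaluation). For a diagonal Gaussian the two branches reduce
# to the following plain NumPy (illustrative):
def _demo_act(mean, std, stochastic):
    import numpy as np
    if stochastic:
        return mean + std * np.random.randn(*np.shape(mean))  # sample
    return mean                                               # mode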
def _init(self, ob_space, ac_space, model, hid_size, num_hid_layers,
          num_options=2, term_prob=0.5, eps=0.0005):
    assert isinstance(ob_space, gym.spaces.Box)
    self.state_in = []
    self.state_out = []
    self.term_prob = term_prob
    self.num_options = num_options

    # Create the policy network.
    sequence_length = None
    self.ac_dim = ac_space.shape[0]
    self.model = model
    self.eps = eps
    self.trained_options = []

    ob = U.get_placeholder(name="ob", dtype=tf1.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf1.int32, shape=[None])
    self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])

    with tf1.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf1.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Value function.
    last_out = obz
    for i in range(num_hid_layers[0]):
        last_out = tf1.nn.tanh(
            tf1.layers.dense(last_out, hid_size[0], name="vffc%i" % (i + 1),
                             kernel_initializer=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]

    # Intra-option policy (note: fed the raw observation, not obz).
    last_out = ob
    for i in range(num_hid_layers[1]):
        last_out = tf1.nn.tanh(
            tf1.layers.dense(last_out, hid_size[1], name="polfc%i" % (i + 1),
                             kernel_initializer=U.normc_initializer(1.0)))
    mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                    option, num_options=num_options,
                    weight_init=U.normc_initializer(-0.2))
    logstd = tf1.get_variable(
        name="logstd", shape=[num_options, 1, pdtype.param_shape()[0] // 2],
        initializer=U.normc_initializer(0.1), trainable=True)
    pdparam = tf1.concat([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    # pdparam = dense3D2(last_out, pdtype.param_shape()[0], "polfinal", option,
    #                    num_options=num_options,
    #                    weight_init=U.normc_initializer(-0.6))
    self.pd = pdtype.pdfromflat(pdparam)

    stochastic = tf1.placeholder(dtype=tf1.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    self._act = U.function([stochastic, ob, option], [ac])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self.action_pd = U.function([ob, option],
                                [self.pd.mode(), self.pd.variance()])
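# --- Hedged sketch (not part of the original code): dense3D2 is defined
# elsewhere in this repo; conceptually it is a per-option dense head whose
# weights have shape [num_options, in_dim, out_dim], with the slice selected
# by each sample's option id. A minimal NumPy stand-in under that assumption
# (no biases, illustrative only; the real implementation may differ):
def _demo_per_option_dense(x, w, option):
    import numpy as np
    # x: [batch, in_dim], w: [num_options, in_dim, out_dim], option: [batch]
    return np.einsum('bi,bio->bo', x, w[option])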
def _create_network(self, obs_shape, embedding_shape):  ## , input_batch, global_condition_batch
    '''Construct the WaveNet network.'''
    import common.tf_util as U
    outputs = []
    sequence_length = 1
    batch_size = None
    # input_batch is 3-D: [batch, time, obs_dim].
    input_batch = U.get_placeholder(
        name="state_de_ob", dtype=tf.float32,
        shape=[batch_size, self.time_steps - 1, obs_shape.shape[0]])
    global_condition_batch = U.get_placeholder(
        name="state_de_embedding", dtype=tf.float32,
        shape=[batch_size, 1, embedding_shape])
    current_layer = input_batch

    # Pre-process the input with a regular convolution.
    current_layer = self._create_causal_layer(current_layer)  # FIXME: this does not work here.
    # output_width = tf.shape(input_batch)[1] - self.receptive_field + 1
    output_width = input_batch.shape[1] - self.receptive_field + 1

    # Add all defined dilation layers.
    with tf.name_scope('dilated_stack'):
        for layer_index, dilation in enumerate(self.dilations):
            with tf.name_scope('layer{}'.format(layer_index)):
                output, current_layer = self._create_dilation_layer(
                    current_layer, layer_index, dilation,
                    global_condition_batch, output_width)
                outputs.append(output)

    with tf.name_scope('postprocessing'):
        # Perform (+) -> ReLU -> 1x1 conv -> ReLU -> 1x1 conv to
        # postprocess the output.
        w1 = self.variables['postprocessing']['postprocess1']
        w2 = self.variables['postprocessing']['postprocess2']
        if self.use_biases:
            b1 = self.variables['postprocessing']['postprocess1_bias']
            b2 = self.variables['postprocessing']['postprocess2_bias']

        if self.histograms:
            tf.summary.histogram('postprocess1_weights', w1)
            tf.summary.histogram('postprocess2_weights', w2)
            if self.use_biases:
                tf.summary.histogram('postprocess1_biases', b1)
                tf.summary.histogram('postprocess2_biases', b2)

        # We have skip connections from the outputs of each layer; add them
        # all up here.
        total = sum(outputs)
        transformed1 = tf.nn.relu(total)
        conv1 = tf.nn.conv1d(transformed1, w1, stride=1, padding="SAME")
        if self.use_biases:
            conv1 = tf.add(conv1, b1)
        transformed2 = tf.nn.relu(conv1)
        conv2 = tf.nn.conv1d(transformed2, w2, stride=1, padding="SAME")
        if self.use_biases:
            conv2 = tf.add(conv2, b2)

    # ========= added by myself =============== #
    # self.mean = tf.reduce_mean(conv2, axis=1)  # mean over time, per dimension
    # self.logstd = tf.get_variable(name="wave_logstd",
    #                               shape=[1, self.pdtype.param_shape()[0] // 2],
    #                               initializer=tf.zeros_initializer())
    # pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)
    # self.pd = self.pdtype.pdfromflat(pdparam)
    # self._act = U.function([input_batch, global_condition_batch], [self.pd.sample()])
    # # for debugging
    # self.get_mean = U.function([input_batch, global_condition_batch], self.mean)

    conv2 = tf.reshape(conv2, [-1, self.quantization_channels])
    self.mean = U.dense(conv2, 63, "wave_mean", U.normc_initializer(1.0))  # 48 * 63
    self.logstd = U.dense(conv2, 63, "wave_logstd",
                          weight_init=U.normc_initializer(1.0))  # 48 * 63
    # self.logstd = tf.get_variable(name="wave_logstd",
    #                               shape=[1, self.pdtype.param_shape()[0] // 2],
    #                               initializer=tf.zeros_initializer())  # the size here is debatable
    pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)
    self.pd = self.pdtype.pdfromflat(pdparam)
    # target_output = tf.slice(input_batch, [0, self.receptive_field, 0], [-1, -1, -1])
    self._act = U.function([input_batch, global_condition_batch], [self.pd.sample()])
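# --- Hedged sketch (not part of the original code): with valid causal
# convolutions, a dilated stack whose receptive field is R maps T input steps
# to T - R + 1 output steps, which is the output_width computed above.
# Tiny check with illustrative numbers:
def _demo_output_width():
    time_steps, receptive_field = 48, 16
    t_in = time_steps - 1                 # input_batch has time_steps - 1 steps
    output_width = t_in - receptive_field + 1
    assert output_width == 32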