def _build_qnet(self):
    """Build the Q-network."""
    with tf.variable_scope(self.scope):
        self.state_input = tf.placeholder(tf.float32, [None, self.state_size])
        self.action = tf.placeholder(tf.int32, [None])
        self.target_q = tf.placeholder(tf.float32, [None])

        fc1 = tf_utils.fc(self.state_input, n_output=self.n_hidden_1, activation_fn=tf.nn.relu)
        fc2 = tf_utils.fc(fc1, n_output=self.n_hidden_2, activation_fn=tf.nn.relu)
        self.q_values = tf_utils.fc(fc2, self.action_size, activation_fn=None)

        # Select the Q value of the action actually taken via a one-hot mask
        action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0)
        q_value_pred = tf.reduce_sum(self.q_values * action_mask, 1)

        # Mean squared error against the bootstrapped target
        self.loss = tf.reduce_mean(tf.square(tf.subtract(self.target_q, q_value_pred)))
        self.optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = self.optimizer.minimize(
            self.loss, global_step=tf.contrib.framework.get_global_step())
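# NOTE: these snippets all rely on a tf_utils.fc helper whose definition is not
# included here. The function below is only a minimal sketch of what such a
# TF1-style fully connected wrapper commonly looks like; its name, signature, and
# defaults are assumptions, not the project's actual implementation.
import tensorflow as tf  # the surrounding snippets assume this same import

def _fc_sketch(inputs, n_output, activation_fn=None, scope="fc", initializer=None):
    """Fully connected layer: inputs @ W + b, optionally followed by activation_fn."""
    with tf.variable_scope(scope):
        n_input = inputs.get_shape().as_list()[-1]
        if initializer is None:
            initializer = tf.contrib.layers.xavier_initializer()
        W = tf.get_variable("W", [n_input, n_output], initializer=initializer)
        b = tf.get_variable("b", [n_output], initializer=tf.zeros_initializer())
        out = tf.matmul(inputs, W) + b
        return activation_fn(out) if activation_fn is not None else out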
def _build_network(self, name, conv):
    if conv:
        input_s = tf.placeholder(tf.float32, [None, self.width, self.height, 1])
        with tf.variable_scope(name):
            conv1 = tf_utils.conv2d(input_s, 64, (3, 3), 1)
            conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1)
            conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1)
            reward = tf_utils.conv2d(conv3, 1, (1, 1), 1)
        theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
        return input_s, tf.squeeze(tf.reshape(reward, (-1, self.n_input))), theta
    else:
        input_s = tf.placeholder(tf.float32, [None, self.n_input])
        with tf.variable_scope(name):
            fc1 = tf_utils.fc(
                input_s, self.n_h1, scope="fc1", activation_fn=tf.nn.elu,
                initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
            fc2 = tf_utils.fc(
                fc1, self.n_h2, scope="fc2", activation_fn=tf.nn.elu,
                initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
            reward = tf_utils.fc(fc2, self.n_input, scope="reward")
        theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
        return input_s, tf.squeeze(reward), theta
def qnetwork(self):
    """Build the Q-network."""
    with tf.variable_scope(self.name_scope):
        self.state_input = tf.placeholder(tf.float32, [None, self.state_size])  # state input
        self.action = tf.placeholder(tf.int32, [None])  # action input
        self.target_q = tf.placeholder(tf.float32, [None])  # target Q value

        fc1 = tf_utils.fc(self.state_input, n_output=16, activation_fn=tf.nn.relu)
        fc2 = tf_utils.fc(fc1, n_output=32, activation_fn=tf.nn.relu)
        fc3 = tf_utils.fc(fc2, n_output=16, activation_fn=tf.nn.relu)
        self.q_values = tf_utils.fc(fc3, self.action_size, activation_fn=None)

        # One-hot encode the chosen actions
        action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0)
        # Predicted Q value for the chosen actions
        q_value_pred = tf.reduce_sum(self.q_values * action_mask, 1)

        # Q-network loss
        self.loss = tf.reduce_mean(tf.square(tf.subtract(self.target_q, q_value_pred)))
        self.optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = self.optimizer.minimize(self.loss)
def _build_network(self, name):
    input_s = tf.placeholder(tf.float32, [None, self.state_size])
    action = tf.placeholder(tf.float32, [None, self.action_size])
    with tf.variable_scope(name):
        layer_1 = tf_utils.fc(
            input_s, self.n_h1, scope="fc1", activation_fn=tf.nn.relu,
            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        # The action is concatenated with the first hidden layer before the second layer
        layer_2 = tf_utils.fc(
            tf.concat((layer_1, action), 1), self.n_h2, scope="fc2", activation_fn=tf.nn.relu,
            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        q_value = tf_utils.fc(
            layer_2, 1, scope="out",
            initializer=tf.random_uniform_initializer(-3e-3, 3e-3))
    critic_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
    return input_s, action, critic_variables, tf.squeeze(q_value)
def _build_network(self, name):
    input_s = tf.placeholder(tf.float32, [None, self.n_input])
    with tf.variable_scope(name):
        fc1 = tf_utils.fc(input_s, self.n_h1, scope="fc1", activation_fn=tf.nn.elu,
                          initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        fc2 = tf_utils.fc(fc1, self.n_h2, scope="fc2", activation_fn=tf.nn.elu,
                          initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        reward = tf_utils.fc(fc2, 1, scope="reward")
    theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
    return input_s, reward, theta
def load_detection_model(ws):
    assert len(ws) == 8
    data_blob = tf.placeholder(tf.float32)
    rois_blob = tf.placeholder(tf.float32, shape=[None, 5])
    # ROI pooling to a fixed 7x7 grid; 0.0625 = 1/16 feature-map stride
    pool5, _ = roi_pooling_op.roi_pool(data_blob, rois_blob, 7, 7, 0.0625)
    flat_pool5 = tf.reshape(pool5, [-1, 25088])
    fc6 = fc(flat_pool5, ws[0], ws[1])
    fc7 = fc(fc6, ws[2], ws[3])
    cls_prob = fc(fc7, ws[4], ws[5], 'softmax')
    bbox_pred = fc(fc7, ws[6], ws[7], 'linear')
    return data_blob, rois_blob, cls_prob, bbox_pred
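# The load_detection_model snippet above calls a bare fc(x, w, b, activation) helper
# that builds layers from pre-loaded weight/bias tensors. The function below is a
# minimal sketch of such a helper (an assumption; the real implementation is not shown):
def _fc_from_weights_sketch(x, w, b, activation='relu'):
    """Affine layer from existing weight/bias tensors with a named activation."""
    out = tf.nn.bias_add(tf.matmul(x, w), b)
    if activation == 'relu':
        return tf.nn.relu(out)
    if activation == 'softmax':
        return tf.nn.softmax(out)
    return out  # 'linear': no nonlinearity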
def _build_network(self, name):
    input_s = tf.placeholder(tf.float32, [None, self.n_input])
    # Reshape the 4-dimensional state into a 1x4 "image" so it can be convolved
    img_in = tf.reshape(input_s, shape=[-1, 1, 4, 1])
    with tf.variable_scope(name):
        cnv1 = tf_utils.conv2d(img_in, 2, (2, 2))
        fltn_conv = tf_utils.flatten(cnv1)
        # fc1 = tf_utils.fc(input_s, self.n_h1, scope="fc1", activation_fn=tf.nn.elu,
        #                   initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        fc2 = tf_utils.fc(fltn_conv, self.n_h2, scope="fc2", activation_fn=tf.nn.elu,
                          initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        reward = tf_utils.fc(fc2, 1, scope="reward")
    theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
    return input_s, reward, theta
def _build_network(self):
    with tf.variable_scope(self.scope):
        self.state_input = tf.placeholder(tf.float32, [None, self.state_size], name="state_input")
        self.action = tf.placeholder(tf.int32, [None], name="action")

        self.fc1 = tf_utils.fc(
            self.state_input, self.n_h1, scope="fc1", activation_fn=tf.nn.relu,
            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        self.fc1_softmax = tf.nn.softmax(self.fc1, name="fc1_softmax")
        self.fc2 = tf_utils.fc(
            self.fc1, self.n_h2, scope="fc2", activation_fn=tf.nn.relu,
            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        self.fc2_softmax = tf.nn.softmax(self.fc2, name="fc2_softmax")

        self.q_value = tf_utils.fc(self.fc2, self.action_size, scope="q_value", activation_fn=None)
        self.action_pred = tf.nn.softmax(self.q_value, name="action_prediction")
        self.action_target = tf.one_hot(
            self.action, self.action_size, on_value=1.0, off_value=0.0, name="action_target")
        # Cross-entropy expects unnormalized logits, so feed q_value rather than
        # the already-softmaxed action_pred.
        self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.action_target, logits=self.q_value, name="loss")
        # self.loss = tf.reduce_mean(tf.square(tf.subtract(self.action_pred, self.action_target)))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate, name="optimizer")
        self.train_op = self.optimizer.minimize(
            self.loss, global_step=tf.train.get_global_step(), name="train_op")
    new_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope)
    return new_variables
def _build_network(self, name):
    input_s = tf.placeholder(tf.float32, [None, self.n_input])
    input_inv = tf.placeholder(tf.float32, [None, self.n_input])
    img_in = tf.reshape(input_s, shape=[-1, 1, 4, 1])
    img_inv = tf.reshape(input_inv, shape=[-1, 1, 4, 1])
    with tf.variable_scope(name):
        # Embed the primary input
        cnv1 = tf_utils.conv2d(img_in, 2, (2, 2))
        fltn_conv = tf_utils.flatten(cnv1)
        fc1 = tf_utils.fc(
            fltn_conv, self.n_h2, scope="fc1", activation_fn=tf.nn.elu,
            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))

        # Embed the inverse input with a parallel branch
        cnv1_inv = tf_utils.conv2d(img_inv, 2, (2, 2))
        fltn_conv_inv = tf_utils.flatten(cnv1_inv)
        fc1_inv = tf_utils.fc(
            fltn_conv_inv, self.n_h2, scope="fc1_inv", activation_fn=tf.nn.elu,
            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))

        # The difference of the two embeddings feeds the reward head
        subt = tf.subtract(fc1, fc1_inv)
        fc_p1 = tf_utils.fc(
            subt, 2 * self.n_h1, scope="fc_p1", activation_fn=tf.nn.elu,
            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        reward = tf_utils.fc(fc_p1, 1, scope="reward")
    theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
    return input_s, input_inv, reward, theta
def _build_network(self, name):
    input_s = tf.placeholder(tf.float32, [None, self.state_size])
    input_a = tf.placeholder(tf.int32, [None])
    advantage = tf.placeholder(tf.float32, [None])
    target_v = tf.placeholder(tf.float32, [None])
    with tf.variable_scope(name):
        layer_1 = tf_utils.fc(
            input_s, self.n_h1, scope="fc1", activation_fn=tf.nn.relu,
            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        layer_2 = tf_utils.fc(
            layer_1, self.n_h2, scope="fc2", activation_fn=tf.nn.relu,
            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
        # Actor head: action probabilities
        policy = tf_utils.fc(
            layer_2, self.action_size, activation_fn=tf.nn.softmax, scope="policy",
            initializer=tf_utils.normalized_columns_initializer(0.01))
        # Critic head: state value
        value = tf_utils.fc(
            layer_2, 1, activation_fn=None, scope="value",
            initializer=tf_utils.normalized_columns_initializer(1.0))
        action_mask = tf.one_hot(input_a, self.action_size, 1.0, 0.0)
        # Probability the policy assigns to the action actually taken
        action_est = tf.reduce_sum(policy * action_mask, 1)
    model_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
    return input_s, input_a, advantage, target_v, policy, value, action_est, model_variables
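# The actor-critic builder above returns policy, value, action_est, advantage, and
# target_v but leaves the loss to the caller. The function below sketches one common
# way to combine them (standard A2C conventions; the coefficients and exact form are
# assumptions, not this project's verified loss):
def _a2c_loss_sketch(policy, value, action_est, advantage, target_v,
                     value_coef=0.5, entropy_coef=0.01):
    """Combine actor-critic tensors into a single scalar training loss."""
    value_loss = tf.reduce_sum(tf.square(target_v - tf.reshape(value, [-1])))
    entropy = -tf.reduce_sum(policy * tf.log(policy + 1e-8))
    policy_loss = -tf.reduce_sum(tf.log(action_est + 1e-8) * advantage)
    return policy_loss + value_coef * value_loss - entropy_coef * entropy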
def _build_policy_net(self):
    """Build the policy network."""
    with tf.variable_scope(self.scope):
        self.state_input = tf.placeholder(tf.float32, [None, self.state_size])
        self.action = tf.placeholder(tf.int32, [None])
        self.target = tf.placeholder(tf.float32, [None])

        layer_1 = tf_utils.fc(self.state_input, self.n_hidden_1, tf.nn.relu)
        layer_2 = tf_utils.fc(layer_1, self.n_hidden_2, tf.nn.relu)
        self.action_values = tf_utils.fc(layer_2, self.action_size)

        action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0)
        self.action_prob = tf.nn.softmax(self.action_values)
        self.action_value_pred = tf.reduce_sum(self.action_prob * action_mask, 1)

        # L2 regularization over all trainable variables
        self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        # Policy-gradient loss: negative log-probability of the taken action, weighted by the target
        self.pg_loss = tf.reduce_mean(-tf.log(self.action_value_pred) * self.target)

        self.loss = self.pg_loss + 0.002 * self.l2_loss
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.train_op = self.optimizer.minimize(
            self.loss, global_step=tf.contrib.framework.get_global_step())
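# The policy network above is trained with REINFORCE-style targets fed into
# self.target. The helper below sketches the usual computation of discounted returns
# for one episode (an assumed convention; the actual rollout code is not shown here):
def _discounted_returns_sketch(rewards, gamma=0.99):
    """Compute discounted returns G_t = r_t + gamma * G_{t+1} over one episode."""
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns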
def _build_model(self):
    self._observation = tf.placeholder(tf.float32, [None, self.state_size])
    self._action_target = tf.placeholder(tf.int32, [None], name='action_target')
    self._q_target = tf.placeholder(tf.float32, [None], name='q_value_target')

    with tf.variable_scope('deepq_model'):
        _hidden = fc(self._observation, h_size=24, name='fc_input', act=tf.nn.relu)
        for idx in range(2):
            _hidden = fc(_hidden, h_size=24, name='fc' + str(idx), act=tf.nn.relu)
        self._q_hat = fc(_hidden, h_size=self.action_size, name='fc', act=None)

        # Turn the action index (0..action_size-1) into a one-hot encoding
        _action_one_hot = tf.one_hot(self._action_target, self.action_size, 1.0, 0.0,
                                     name='action_target_one_hot')
        # Q values collected for the actions actually taken
        _q_acted = tf.reduce_sum(self._q_hat * _action_one_hot, reduction_indices=1, name='q_hat')
        _delta = self._q_target - _q_acted

        self._loss = tf.reduce_mean(tf.square(_delta))
        self._train_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self._loss)
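# A sketch of how the _build_model graph above would typically be driven during
# training. It assumes a tf.Session and a sampled batch of transitions; the attribute
# names follow the builder above, while the function itself is hypothetical:
def _train_step_sketch(model, sess, states, actions, q_targets):
    """Run one gradient step and return the batch loss."""
    loss, _ = sess.run(
        [model._loss, model._train_op],
        feed_dict={
            model._observation: states,      # shape [batch, state_size]
            model._action_target: actions,   # shape [batch], integer action indices
            model._q_target: q_targets,      # shape [batch], bootstrapped Q targets
        })
    return loss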
def RelationNetwork(encoder, hidden_size, trainable=True):
    x = encoder
    with tf.variable_scope('RelationNetwork') as scope:
        with tf.variable_scope('layer1'):
            x = utils.conv2d(x, name='conv1', shape=[3, 3, 128, 64], padding='SAME',
                             activation_func=tf.nn.relu, trainable=trainable, use_bn=True)
            x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        with tf.variable_scope('layer2'):
            x = utils.conv2d(x, name='conv1', shape=[3, 3, 64, 64], padding='SAME',
                             activation_func=tf.nn.relu, trainable=trainable, use_bn=True)
            x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        with tf.variable_scope('fc1'):
            x = utils.fc(x, num_out=hidden_size, name='fc1', activation_func=tf.nn.relu)
        with tf.variable_scope('fc2'):
            x = utils.fc(x, num_out=1, name='fc2', activation_func=None)
    return x
def build_model(net):
    # Construct the model
    X = tf.placeholder("float", [BATCH, T_in, IMG_H, IMG_W, IMG_CH])
    Y = tf.placeholder("float", [BATCH, T_pred, IMG_H, IMG_W, IMG_CH])

    # Flatten the incoming images to (BATCH * T_in, height, width, channels)
    X_flat = tf.reshape(X, [BATCH * T_in, IMG_H, IMG_W, IMG_CH])
    conv1 = conv2d(X_flat, net.weights['wc1'], net.biases['bc1'], 2)
    conv2 = conv2d(conv1, net.weights['wc2'], net.biases['bc2'], 2)
    conv3 = conv2d(conv2, net.weights['wc3'], net.biases['bc3'], 2)

    # Flatten everything for the LSTM: (BATCH, T_in, features)
    res = tf.reshape(conv3, [BATCH, T_in, -1])
    prediction = net.EncoderDecoder(res)

    # Inference over BATCH * T_pred steps
    fc_out = fc(prediction, net.weights['wfc1'], net.biases['bfc1'])
    # Reshape to (BATCH, T_pred, IMG_H * IMG_W * IMG_CH)
    fc_out = tf.reshape(fc_out, [BATCH, T_pred, IMG_H * IMG_W * IMG_CH])
    sig_out = tf.sigmoid(fc_out)

    # Difference between prediction and target
    Y_flat = tf.reshape(Y, [BATCH, T_pred, IMG_H * IMG_W * IMG_CH])
    diff = fc_out - Y_flat

    # Loss: summed squared error plus L2 weight decay (biases excluded)
    vs = tf.trainable_variables()
    lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in vs if 'bias' not in v.name]) * 0.001
    loss_op = tf.reduce_sum(tf.reduce_sum(diff * diff, axis=2), axis=1) + lossL2
    loss_op = tf.reduce_mean(loss_op)
    train_op = tf.train.AdamOptimizer(learning_rate=LR).minimize(loss_op)
    return fc_out, sig_out, X, Y, loss_op, train_op
def _build_network(self, name, agent_class):
    # The state placeholder is provided by the agent graph (init_state_pl) below,
    # so only the action, advantage, and value-target placeholders are created here.
    input_a = tf.placeholder(tf.int32, [None])
    advantage = tf.placeholder(tf.float32, [None])
    target_v = tf.placeholder(tf.float32, [None])
    with tf.variable_scope(name):
        # The earlier fully connected policy/value network is replaced by an agent
        # graph that exposes the policy and value nodes directly.
        self.agent = agent_class(
            simulate_steps=self.SIM_STEPS,
            max_bp_steps=self.BP_STEPS,
            mult_fac=self.MULT_FAC,
            discount_factor=1,
            scope=self.net_scope_name,
            # goal_position=self.env_args['goal_position'],
            # disappearance_probability=self.env_args['disappearance_probability'],
            # sequential=self.sequential,
        )
        # Extract the policy and value nodes
        input_s = self.agent.init_state_pl
        final_action_belief = self.agent.final_action_belief * Temperature
        self.final_state = self.agent.final_state
        if LAYER_OVER_POLICY:
            policy = tf_utils.fc(
                final_action_belief, self.action_size, activation_fn=tf.nn.softmax,
                scope="policy",
                initializer=tf_utils.normalized_columns_initializer(0.01)) + 1e-8
        else:
            policy = tf.nn.softmax(final_action_belief)[0] + 1e-8

        # The value network operates on the exponentiated, flattened state beliefs
        input_s_new = tcl.flatten(tf.exp(input_s))
        value = self._create_value_network(input_s_new)

        action_mask = tf.one_hot(input_a, self.action_size, 1.0, 0.0)
        action_est = tf.reduce_sum(policy * action_mask, 1)
    model_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
    return input_s, input_a, advantage, target_v, policy, value, action_est, model_variables