def build_net(minimap, screen, info, msize, ssize, num_action):
    # Extract features
    screen_filters1 = tf.get_variable(name='sf1', shape=(5, 5, U.screen_channel(), 16))  # hwio
    screen_filters2 = tf.get_variable(name='sf2', shape=(3, 3, 16, 32))
    minimap_filters1 = tf.get_variable(name='mmf1', shape=(5, 5, U.minimap_channel(), 16))
    minimap_filters2 = tf.get_variable(name='mmf2', shape=(3, 3, 16, 32))
    mconv1 = tf.nn.conv2d(tf.transpose(minimap, [0, 2, 3, 1]), minimap_filters1,
                          strides=[1, 1, 1, 1], padding='SAME', name='mconv1')
    mconv2 = tf.nn.conv2d(mconv1, minimap_filters2,
                          strides=[1, 1, 1, 1], padding='SAME', name='mconv2')
    sconv1 = tf.nn.conv2d(tf.transpose(screen, [0, 2, 3, 1]), screen_filters1,
                          strides=[1, 1, 1, 1], padding='SAME', name='sconv1')
    sconv2 = tf.nn.conv2d(sconv1, screen_filters2,
                          strides=[1, 1, 1, 1], padding='SAME', name='sconv2')
    info_fc = layers.fully_connected(layers.flatten(info), num_outputs=256,
                                     activation_fn=tf.tanh, scope='info_fc')

    # Compute spatial actions
    feat_conv = tf.concat([mconv2, sconv2], axis=3)
    spatial_weights = tf.get_variable(name='spatial_weights',
                                      shape=(1, 1, feat_conv.get_shape()[-1], 1))
    spatial_action = tf.nn.conv2d(feat_conv, spatial_weights,
                                  strides=[1, 1, 1, 1], padding='SAME', name='spatial_action')
    spatial_action = tf.nn.softmax(layers.flatten(spatial_action))

    # Compute non-spatial actions and value
    feat_fc = tf.concat([layers.flatten(mconv2), layers.flatten(sconv2), info_fc], axis=1)
    feat_fc = layers.fully_connected(feat_fc, num_outputs=256,
                                     activation_fn=tf.nn.relu, scope='feat_fc')
    non_spatial_action = layers.fully_connected(feat_fc, num_outputs=num_action,
                                                activation_fn=tf.nn.softmax,
                                                scope='non_spatial_action')
    value = tf.reshape(layers.fully_connected(feat_fc, num_outputs=1,
                                              activation_fn=None, scope='value'), [-1])

    return spatial_action, non_spatial_action, value
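# A minimal usage sketch (hypothetical placeholder names and sizes, assuming U.screen_channel()
# and U.minimap_channel() return the per-pixel channel counts): the inputs are NCHW and
# build_net transposes them to NHWC before the convolutions.
minimap_ph = tf.placeholder(tf.float32, [None, U.minimap_channel(), 64, 64])
screen_ph = tf.placeholder(tf.float32, [None, U.screen_channel(), 64, 64])
info_ph = tf.placeholder(tf.float32, [None, 543])  # 543 matches the one-hot action size used in the last snippet
spatial, non_spatial, value = build_net(minimap_ph, screen_ph, info_ph, 64, 64, 543)
# spatial:     (batch, 64*64) softmax over screen coordinates
# non_spatial: (batch, 543)   softmax over function ids
# value:       (batch,)       state-value estimate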
def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        self.minimap = tf.placeholder(tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Build networks
        net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)
        self.spatial_action, self.non_spatial_action, self.value = net

        # Set targets and masks
        self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

        # Compute log probability
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

        # Compute losses, more details in https://arxiv.org/abs/1602.01783
        # Policy loss and value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))
        # TODO: policy penalty
        loss = policy_loss + value_loss

        # Build the optimizer
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(loss)
        clipped_grads = []
        for grad, var in grads:
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grads.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grads)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
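# Why the selected-action probability is divided by valid_non_spatial_action_prob above: the
# softmax spans every function id, but only a subset is available at each step, so the
# probability of the chosen action is renormalised over the valid subset before taking the log.
# A tiny self-contained NumPy sketch with made-up numbers (not part of the model):
import numpy as np

policy = np.array([0.5, 0.3, 0.2])      # softmax over 3 hypothetical actions
valid = np.array([1., 0., 1.])          # only actions 0 and 2 are available
selected = np.array([0., 0., 1.])       # action 2 was taken
p_selected = np.sum(policy * selected)                        # 0.2
p_valid = np.clip(np.sum(policy * valid), 1e-10, 1.)          # 0.7
log_prob = np.log(np.clip(p_selected / p_valid, 1e-10, 1.))   # log(0.2857...) ~ -1.25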
def main(unused_argv):
    # config = tf.ConfigProto(allow_soft_placement=True)
    # print(type(config))
    # print(config.gpu_options)
    print(type(actions.FUNCTIONS))
    print(len(actions.FUNCTIONS))
    print(len(features.MINIMAP_FEATURES))
    print('minimap channel is', U.minimap_channel())
    print('screen channel is', U.screen_channel())
    print('player id index in minimap features is', features.MINIMAP_FEATURES.player_id.index)
def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        self.minimap = tf.placeholder(tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Build networks
        net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)
        self.spatial_action, self.non_spatial_action, self.value = net

        # Set targets and masks
        self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

        # Compute log probability
        spatial_action_prob = tf.clip_by_value(
            tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1), 1e-10, 1.)
        non_spatial_action_prob = tf.clip_by_value(
            tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected * self.valid_non_spatial_action, axis=1),
            1e-10, 1.)

        # self.ispatial is assumed to be defined elsewhere on the agent
        q_value = spatial_action_prob * self.valid_spatial_action * self.ispatial + non_spatial_action_prob
        self.delta = self.value_target - q_value
        # self.clipped_error = tf.where(tf.abs(self.delta) < 1.0, 0.5 * tf.square(self.delta),
        #                               tf.abs(self.delta) - 0.5, name='clipped_error')
        # value_loss = tf.reduce_mean(self.clipped_error, name='value_loss')
        value_loss = tf.reduce_mean(tf.square(self.delta))

        self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))

        # Build the optimizer
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(value_loss)
        clipped_grads = []
        for grad, var in grads:
            self.summary.append(tf.summary.histogram(var.op.name, var))
            grad = grad if grad is not None else tf.zeros_like(var)
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grads.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grads)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
def buildNetwork(self):
    self.minimap = tf.placeholder(
        tf.float32, [None, minimap_channel(), self.minimap_size, self.minimap_size], name="minimap")
    self.screen = tf.placeholder(
        tf.float32, [None, screen_channel(), self.screen_size, self.screen_size], name="screen")
    self.info = tf.placeholder(tf.float32, [None, self.action_size], name="info")

    self.spatial_action, self.non_spatial_action, self.value = network(
        self.minimap, self.screen, self.info, self.minimap_size, self.screen_size, self.action_size)

    # Set targets and masks
    self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
    self.spatial_action_selected = tf.placeholder(
        tf.float32, [None, self.screen_size**2], name='spatial_action_selected')
    self.valid_non_spatial_action = tf.placeholder(
        tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
    self.non_spatial_action_selected = tf.placeholder(
        tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
    self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

    # Compute log probability
    spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
    spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
    non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
    valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
    valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
    non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
    non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
    tf.summary.histogram('spatial_action_prob', spatial_action_prob)
    tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob)

    # Compute losses, more details in https://arxiv.org/abs/1602.01783
    # Policy loss and value loss
    action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
    advantage = tf.stop_gradient(self.value_target - self.value)
    policy_loss = -tf.reduce_mean(action_log_prob * advantage)
    value_loss = -tf.reduce_mean(self.value * advantage)
    tf.summary.scalar('policy_loss', policy_loss)
    tf.summary.scalar('value_loss', value_loss)

    # TODO: policy penalty
    self.reg = tf.squeeze(self.non_spatial_action)
    self.reg = tf.reduce_sum(self.reg * tf.log(self.reg))
    loss = policy_loss + value_loss + self.reg

    # Build the optimizer
    self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
    rmsprop = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, decay=0.99, epsilon=1e-10)
    self.train_op = rmsprop.minimize(loss)

    self.merged = tf.summary.merge_all()
    self.saver = tf.train.Saver(max_to_keep=100)
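# Note on the regulariser above: reg = sum(p * log(p)) is the negative entropy of the
# non-spatial policy, so minimising the loss pushes the policy toward higher entropy (an
# entropy bonus). A quick check with made-up numbers (assumes numpy is available as np):
#   uniform = np.full(4, 0.25); np.sum(uniform * np.log(uniform))     # ~ -1.386 (lower loss term)
#   peaked = np.array([0.97, 0.01, 0.01, 0.01])
#   np.sum(peaked * np.log(peaked))                                   # ~ -0.168 (higher loss term)
# Clipping the probabilities (e.g. tf.clip_by_value(p, 1e-10, 1.)) before the log would avoid
# NaNs when any probability is exactly zero.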
def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        self.minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Build A3C base networks
        net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize,
                        len(actions.FUNCTIONS), ntype, reuse=False)
        self.spatial_action, self.non_spatial_action, self.value = net

        # Set targets and masks
        self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

        # Compute log probability
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

        # Compute A3C losses, more details in https://arxiv.org/abs/1602.01783
        # Policy loss and value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))
        # TODO: policy penalty
        a3c_loss = policy_loss + value_loss

        # Pixel-control (auxiliary task) part
        self.pc_minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='pc_minimap')
        self.pc_screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='pc_screen')
        self.pc_info = tf.placeholder(tf.float32, [None, self.isize], name='pc_info')
        self.pc_valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='pc_valid_non_spatial_action')
        pc_net = build_pc_net(self.pc_minimap, self.pc_screen, self.pc_info, self.msize, self.ssize,
                              len(actions.FUNCTIONS), self.pc_valid_non_spatial_action)
        self.pc_q, self.pc_q_max = pc_net

        self.pc_a = tf.placeholder("float", [None, len(actions.FUNCTIONS)])
        pc_a_reshaped = tf.reshape(self.pc_a, [-1, 1, 1, len(actions.FUNCTIONS)])

        # Extract Q for taken action
        pc_qa_ = tf.multiply(self.pc_q, pc_a_reshaped)
        pc_qa = tf.reduce_sum(pc_qa_, reduction_indices=3, keep_dims=False)  # (-1, 20, 20)

        # TD target for Q
        self.pc_r = tf.placeholder("float", [None, 20, 20])
        pc_loss = self._pixel_change_lambda * tf.nn.l2_loss(self.pc_r - pc_qa)

        # Build the optimizer
        loss = pc_loss + a3c_loss
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(loss)
        clipped_grads = []
        for grad, var in grads:
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grads.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grads)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        self.minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Create master and subpolicies
        self.subpolicy_Q = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize,
                                     num_units + 2, 'master_policy')

        # Set targets and masks for master policy update
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        self.action_input = tf.placeholder("float", [None, num_units + 2])
        self.y_input = tf.placeholder("float", [None])
        self.Q_action = tf.reduce_sum(tf.multiply(self.subpolicy_Q, self.action_input), reduction_indices=1)
        self.cost = tf.reduce_mean(tf.square(self.y_input - self.Q_action))
        self.master_train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)

        # Set targets and masks for subpolicies update
        self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action_')
        self.spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action_')
        self.non_spatial_action_selected = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected_')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target_')

        # Build the optimizer
        opt = tf.train.AdamOptimizer(self.learning_rate)

        self.subpolicy = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize,
                                   len(actions.FUNCTIONS), 'fcn')
        self.spatial_action, self.non_spatial_action, self.value = self.subpolicy

        # Compute log probability
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        self.summary.append(tf.summary.histogram('spatial_action_prob_', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob_', non_spatial_action_prob))

        # Compute losses, more details in https://arxiv.org/abs/1602.01783
        # Policy loss and value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        self.summary.append(tf.summary.scalar('policy_loss_', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss_', value_loss))
        # TODO: policy penalty
        loss = policy_loss + value_loss

        grads = opt.compute_gradients(loss)
        clipped_grads = []
        for grad, var in grads:
            # Skip master-policy variables, which receive no gradient from the subpolicy loss
            if grad is None:
                continue
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grads.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grads)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            # e.g. with 4 training threads, reuse is False only for the first build_model call
            # and True for all the others (main file, line 124)
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks: the three placeholders below
        self.minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        # TODO:
        self.info = tf.placeholder(tf.float32, [None, self.isize + self.info_plus_size], name='info')
        self.dir_high_usedToFeedLowNet = tf.placeholder(tf.float32, [1, 1], name='dir_high_usedToFeedLowNet')
        self.act_id = tf.placeholder(tf.float32, [1, 1], name='act_id')

        # Build networks
        # net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)  # build_net is imported from network.py
        # self.spatial_action, self.non_spatial_action, self.value = net
        # Note: the networks built in build_model all share the same structure.
        # DHN add:
        self.dir_high, self.value_high, self.a_params_high, self.c_params_high = build_high_net(
            self.minimap, self.screen, self.info, num_macro_action)
        self.spatial_action_low, self.value_low, self.a_params_low, self.c_params_low = build_low_net(
            self.minimap, self.screen, self.info, self.dir_high_usedToFeedLowNet, self.act_id)

        # Set targets and masks
        # self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        # self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        # self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
        # self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
        # self.value_target = tf.placeholder(tf.float32, [None], name='value_target')
        # value_target is the realized value target, computed outside and fed in (line 219),
        # consistent with Morvan's A3C (lines 56 and 154 there).
        # DHN add:
        self.valid_spatial_action_low = tf.placeholder(tf.float32, [None], name='valid_spatial_action_low')
        self.spatial_action_selected_low = tf.placeholder(
            tf.float32, [None, self.ssize**2], name='spatial_action_selected_low')
        self.value_target_low = tf.placeholder(tf.float32, [None], name='value_target_low')
        self.value_target_high = tf.placeholder(tf.float32, [None], name='value_target_high')
        self.dir_high_selected = tf.placeholder(tf.float32, [None, num_macro_action], name='dir_high_selected')

        # Compute log probability
        # spatial_action_low is the network's coordinate output, shape (steps in this update) x (ssize**2).
        # spatial_action_selected_low marks whether each step used a coordinate argument (first dim)
        # and which coordinate was chosen (second dim); same shape. See Matrix_dot-multiply.py for
        # how the element-wise multiply works.
        spatial_action_prob_low = tf.reduce_sum(
            self.spatial_action_low * self.spatial_action_selected_low, axis=1)
        spatial_action_log_prob_low = tf.log(
            tf.clip_by_value(spatial_action_prob_low, 1e-10, 1.))  # shape: (steps in this update,)
        dir_prob_high = tf.reduce_sum(self.dir_high * self.dir_high_selected, axis=1)
        dir_log_prob_high = tf.log(tf.clip_by_value(dir_prob_high, 1e-10, 1.))
        # non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        # valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        # valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        # non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        # non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        self.summary_low.append(tf.summary.histogram('spatial_action_prob_low', spatial_action_prob_low))
        self.summary_high.append(tf.summary.histogram('dir_prob_high', dir_prob_high))
        # self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

        # Compute losses, more details in https://arxiv.org/abs/1602.01783
        # Actor and critic losses (mainly following Morvan's discrete-action A3C example)
        # Low-level network:
        td_low = tf.subtract(self.value_target_low, self.value_low, name='TD_error_low')
        self.c_loss_low = tf.reduce_mean(tf.square(td_low))
        # valid_spatial_action_low marks whether each step needed a coordinate argument;
        # shape: (steps in this update,)
        log_prob_low = self.valid_spatial_action_low * spatial_action_log_prob_low
        self.exp_v_low = log_prob_low * tf.stop_gradient(td_low)
        # No extra exploration term is added here (unlike Morvan's version) because
        # exploration is already handled in step_low.
        self.a_loss_low = -tf.reduce_mean(self.exp_v_low)

        # High-level network:
        td_high = tf.subtract(self.value_target_high, self.value_high, name='TD_error_high')
        self.c_loss_high = tf.reduce_mean(tf.square(td_high))
        self.exp_v_high = dir_log_prob_high * tf.stop_gradient(td_high)
        # Likewise, no epsilon-greedy exploration term is needed here; exploration is handled in step_low.
        self.a_loss_high = -tf.reduce_mean(self.exp_v_high)

        # Add summaries:
        self.summary_low.append(tf.summary.scalar('a_loss_low', self.a_loss_low))
        self.summary_low.append(tf.summary.scalar('c_loss_low', self.c_loss_low))
        self.summary_high.append(tf.summary.scalar('a_loss_high', self.a_loss_high))
        self.summary_high.append(tf.summary.scalar('c_loss_high', self.c_loss_high))

        # TODO: policy penalty
        # loss = policy_loss + value_loss

        # Build the optimizer
        # self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        # opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        # grads = opt.compute_gradients(loss)
        # cliped_grad = []
        # for grad, var in grads:
        #     self.summary.append(tf.summary.histogram(var.op.name, var))
        #     self.summary.append(tf.summary.histogram(var.op.name+'/grad', grad))
        #     grad = tf.clip_by_norm(grad, 10.0)
        #     cliped_grad.append([grad, var])
        # self.train_op = opt.apply_gradients(cliped_grad)

        # Apply gradient updates (mainly following Morvan's continuous-action A3C example)
        # Low-level network:
        self.learning_rate_a_low = tf.placeholder(tf.float32, None, name='learning_rate_a_low')
        opt_a_low = tf.train.RMSPropOptimizer(self.learning_rate_a_low, decay=0.99, epsilon=1e-10)
        self.a_grads_low = tf.gradients(self.a_loss_low, self.a_params_low)
        self.update_a_low = opt_a_low.apply_gradients(zip(self.a_grads_low, self.a_params_low))
        self.learning_rate_c_low = tf.placeholder(tf.float32, None, name='learning_rate_c_low')
        opt_c_low = tf.train.RMSPropOptimizer(self.learning_rate_c_low, decay=0.99, epsilon=1e-10)
        self.c_grads_low = tf.gradients(self.c_loss_low, self.c_params_low)
        self.update_c_low = opt_c_low.apply_gradients(zip(self.c_grads_low, self.c_params_low))

        # High-level network:
        self.learning_rate_a_high = tf.placeholder(tf.float32, None, name='learning_rate_a_high')
        opt_a_high = tf.train.RMSPropOptimizer(self.learning_rate_a_high, decay=0.99, epsilon=1e-10)
        self.a_grads_high = tf.gradients(self.a_loss_high, self.a_params_high)
        self.update_a_high = opt_a_high.apply_gradients(zip(self.a_grads_high, self.a_params_high))
        self.learning_rate_c_high = tf.placeholder(tf.float32, None, name='learning_rate_c_high')
        opt_c_high = tf.train.RMSPropOptimizer(self.learning_rate_c_high, decay=0.99, epsilon=1e-10)
        self.c_grads_high = tf.gradients(self.c_loss_high, self.c_params_high)
        self.update_c_high = opt_c_high.apply_gradients(zip(self.c_grads_high, self.c_params_high))

        self.summary_op_low = tf.summary.merge(self.summary_low)
        self.summary_op_high = tf.summary.merge(self.summary_high)

        # self.saver is the tf.train.Saver used by save_model and load_model
        self.saver = tf.train.Saver(max_to_keep=100)
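# A hedged sketch (hypothetical session and batch variable names) of how one low-level update
# could be run with the ops above: the actor and critic use separate optimisers, so both update
# ops are evaluated on the same batch.
feed = {
    self.minimap: minimap_batch,
    self.screen: screen_batch,
    self.info: info_batch,
    self.dir_high_usedToFeedLowNet: dir_high_value,
    self.act_id: act_id_value,
    self.valid_spatial_action_low: valid_spatial_batch,
    self.spatial_action_selected_low: spatial_selected_batch,
    self.value_target_low: value_target_batch,
    self.learning_rate_a_low: lr,
    self.learning_rate_c_low: lr,
}
_, _, summary = sess.run([self.update_a_low, self.update_c_low, self.summary_op_low], feed_dict=feed)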
def build_model(self, reuse, device):
    with tf.variable_scope(self.name), tf.device(device):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        # placeholder for inputs of network
        self.screen_ph = tf.placeholder(
            tf.float32,
            [None, U.screen_channel(), self.screen_dimensions, self.screen_dimensions],
            name='screen')
        self.minimap_ph = tf.placeholder(
            tf.float32,
            [None, U.minimap_channel(), self.minimap_dimensions, self.minimap_dimensions],
            name='minimap')
        self.structured_ph = tf.placeholder(
            tf.float32, [None, self.structured_dimensions], name='structured')

        # build network
        network = build_network(self.structured_ph, self.screen_ph, self.minimap_ph, len(actions.FUNCTIONS))
        self.non_spatial_action, self.spatial_action, self.value = network

        # placeholder for targets and masks
        self.valid_non_spatial_action_ph = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
        self.sample_non_spatial_action_ph = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='sample_non_spatial_action')
        self.valid_spatial_action_ph = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        self.sample_spatial_action_ph = tf.placeholder(
            tf.float32, [None, self.minimap_dimensions**2], name='sample_spatial_action')
        self.target_value_ph = tf.placeholder(tf.float32, [None], name='target_value')

        # compute log probability
        valid_non_spatial_action_prob = tf.reduce_sum(
            self.non_spatial_action * self.valid_non_spatial_action_ph, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = tf.reduce_sum(
            self.non_spatial_action * self.sample_non_spatial_action_ph, axis=1)
        non_spatial_action_prob /= valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.sample_spatial_action_ph, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))
        self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))

        # compute loss
        action_log_prob = self.valid_spatial_action_ph * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.target_value_ph - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        loss = policy_loss + value_loss
        self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))

        # optimizer
        self.learning_rate_ph = tf.placeholder(tf.float32, None, name='learning_rate')
        optimizer = tf.train.RMSPropOptimizer(self.learning_rate_ph, decay=0.99, epsilon=1e-10)
        grads = optimizer.compute_gradients(loss)
        clipped_grads = []
        for grad, var in grads:
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grads.append([grad, var])
        self.train_op = optimizer.apply_gradients(clipped_grads)
        self.summary_op = tf.summary.merge(self.summary)
        self.saver = tf.train.Saver(max_to_keep=None)
def build(self, reuse, dev):
    # changing this around for now; need a2c to work first
    with tf.variable_scope(self.name), tf.device(dev):  # A3C/A3CAgent/var_name
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        self.score = tf.placeholder(tf.int32, [], name='score')
        self.minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Build networks
        net = networks.build_net(self.minimap, self.screen, self.info, self.msize, self.ssize,
                                 len(actions.FUNCTIONS))
        # will the line below give a NoneType error too? if so, it is because of arguments and/or argument_policy
        self.spatial_policy, self.non_spatial_policy, self.state_representation, self.value = net

        # Set targets and masks
        self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, self.isize], name='valid_non_spatial_action')  # these match with the actions above
        self.non_spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.isize], name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

        # Compute log probability -- what do these look like exactly?
        spatial_action_prob = tf.reduce_sum(self.spatial_policy * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        non_spatial_action_prob = tf.reduce_sum(self.non_spatial_policy * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_policy * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))
        self.summary.append(tf.summary.histogram('spatial_action_log_prob', spatial_action_log_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_log_prob', non_spatial_action_log_prob))
        # self.logger.info(f"non_spatial_action_log_prob: {non_spatial_action_log_prob}")
        # self.logger.info(f"spatial_action_selected: {self.spatial_action_selected}")  # how does spatial_action_selected look, probs?

        # Compute losses, more details in https://arxiv.org/abs/1602.01783
        # Policy loss and value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = tf.reduce_mean(self.value * advantage)
        # entropy_loss = tf.reduce_mean(entropy) * self.entropy_regularisation
        # Note: this is the negative entropy (sum of p * log p); subtracting it in total_loss below
        # therefore pushes the policy toward lower entropy -- check that this is the intended sign.
        entropy_loss = tf.reduce_sum(self.non_spatial_policy * tf.log(self.non_spatial_policy),
                                     name='entropy')  # reduce_sum or mean?
        self.summary.append(tf.summary.scalar("entropy_loss", entropy_loss))
        self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))

        # TODO: policy penalty/entropy
        # loss = policy_loss + value_loss
        total_loss = policy_loss + (value_loss * self.value_regularisation) - (entropy_loss * self.entropy_regularisation)
        self.summary.append(tf.summary.scalar("total_loss", total_loss))

        # Build the optimizer
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(total_loss)
        clipped_grads = []
        for grad, var in grads:
            self.logger.info(f"CHECK1: {var}")
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)  # is this an ideal value to clip with?
            clipped_grads.append([grad, var])
            self.logger.info(f"CHECK2: {var}")
        self.train_op = opt.apply_gradients(clipped_grads)
        self.summary_op = tf.summary.merge(self.summary)
        self.saver = tf.train.Saver(max_to_keep=10)

        self.argument_policy = dict()
        self.arguments = dict()
        for arg_type in actions.TYPES:
            # for spatial actions, represent each dimension independently
            # what if instead of making the output units, i make it a smaller number,
            # then do something similar to what eps greedy is doing now?
            if len(arg_type.sizes) > 1:  # if spatial ***
                if arg_type in SCREEN_TYPES:
                    units = self.ssize
                elif arg_type in MINIMAP_TYPES:
                    units = self.msize
                arg_policy_x = layers.fully_connected(self.state_representation, num_outputs=units,
                                                      activation_fn=tf.nn.softmax)
                arg_policy_y = layers.fully_connected(self.state_representation, num_outputs=units,
                                                      activation_fn=tf.nn.softmax)
                self.argument_policy[str(arg_type) + "x"] = arg_policy_x
                self.argument_policy[str(arg_type) + "y"] = arg_policy_y
                arg_placeholder_x = tf.placeholder(tf.float32, shape=[None, units])
                arg_placeholder_y = tf.placeholder(tf.float32, shape=[None, units])
                self.arguments[str(arg_type) + "x"] = arg_placeholder_x
                self.arguments[str(arg_type) + "y"] = arg_placeholder_y
            else:  # if non-spatial
                arg_policy = layers.fully_connected(self.state_representation,
                                                    num_outputs=arg_type.sizes[0],
                                                    activation_fn=tf.nn.softmax)
                self.argument_policy[str(arg_type)] = arg_policy
                arg_placeholder = tf.placeholder(tf.float32, shape=[None, arg_type.sizes[0]])
                self.arguments[str(arg_type)] = arg_placeholder
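# A hedged sketch (hypothetical sess, feed_dict and act_id) of how the per-argument heads built
# above could be used at action-selection time: for the chosen pysc2 function id, evaluate the
# matching entries of self.argument_policy and sample each required argument.
import numpy as np

def sample(probs):
    probs = probs / probs.sum()  # guard against float32 rounding in the softmax output
    return np.random.choice(np.arange(len(probs)), p=probs)

act_id = 331  # e.g. Move_screen; illustrative only
act_args = []
for arg_type in actions.FUNCTIONS[act_id].args:
    if len(arg_type.sizes) > 1:  # spatial argument: sample x and y independently
        x_probs = sess.run(self.argument_policy[str(arg_type) + "x"], feed_dict=feed_dict)[0]
        y_probs = sess.run(self.argument_policy[str(arg_type) + "y"], feed_dict=feed_dict)[0]
        act_args.append([sample(x_probs), sample(y_probs)])
    else:  # non-spatial argument, e.g. queued
        probs = sess.run(self.argument_policy[str(arg_type)], feed_dict=feed_dict)[0]
        act_args.append([sample(probs)])
# actions.FunctionCall(act_id, act_args) is what would then be sent to the environment.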
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import tensorflow.contrib.layers as layers

import utils as U


screen_filters1 = tf.get_variable(name='sf1', shape=(5, 5, U.screen_channel(), 16))  # hwio
screen_filters2 = tf.get_variable(name='sf2', shape=(3, 3, 16, 32))
minimap_filters1 = tf.get_variable(name='mmf1', shape=(5, 5, U.minimap_channel(), 16))
minimap_filters2 = tf.get_variable(name='mmf2', shape=(3, 3, 16, 32))


def build_net(minimap, screen, info, msize, ssize, num_action):
    # Extract features
    mconv1 = tf.nn.conv2d(tf.transpose(minimap, [0, 2, 3, 1]), minimap_filters1,
                          strides=[1, 1, 1, 1], padding='SAME', name='mconv1')
    mconv2 = tf.nn.conv2d(mconv1, minimap_filters2,
                          strides=[1, 1, 1, 1], padding='SAME', name='mconv2')
    sconv1 = tf.nn.conv2d(tf.transpose(screen, [0, 2, 3, 1]), screen_filters1,
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import tensorflow.contrib.layers as layers

import utils as U


screen_filters1 = tf.get_variable(name='screen_f1', shape=(5, 5, U.screen_channel(), 16))  # hwio
screen_filters2 = tf.get_variable(name='screen_f2', shape=(3, 3, 16, 32))
minimap_filters1 = tf.get_variable(name='minimap_f1', shape=(5, 5, U.minimap_channel(), 16))
minimap_filters2 = tf.get_variable(name='minimap_f2', shape=(3, 3, 16, 32))


def build_net(minimap, screen, info, msize, ssize, num_action):
    # Extract features
    mconv1 = tf.nn.conv2d(tf.transpose(minimap, [0, 2, 3, 1]), minimap_filters1,
                          strides=[1, 1, 1, 1], padding='SAME', name='mconv1')
    mconv2 = tf.nn.conv2d(mconv1, minimap_filters2,
                          strides=[1, 1, 1, 1], padding='SAME', name='mconv2')
    sconv1 = tf.nn.conv2d(tf.transpose(screen, [0, 2, 3, 1]), screen_filters1,
                          strides=[1, 1, 1, 1], padding='SAME', name='sconv1')
    sconv2 = tf.nn.conv2d(sconv1, screen_filters2,
                          strides=[1, 1, 1, 1], padding='SAME', name='sconv2')
    info_fc = layers.fully_connected(layers.flatten(info), num_outputs=256,
                                     activation_fn=tf.tanh, scope='info_fc')

    # Compute spatial actions
    feat_conv = tf.concat([mconv2, sconv2], axis=3)
    spatial_weights = tf.get_variable(name='spatial_weights', shape=(1, 1, feat_conv.get_shape()[-1], 1))
    spatial_action = tf.nn.conv2d(feat_conv, spatial_weights,
                                  strides=[1, 1, 1, 1], padding='SAME', name='spatial_action')
    spatial_action = tf.nn.softmax(layers.flatten(spatial_action))
def build_model(self, reuse, dev, ntype):
    """
    Build the TensorFlow model for the MLSH agent on this thread.

    - valid_spatial_action: shape (len(rbs),) = whether the agent took a spatial action at each
      step of the replay buffer
    - spatial_action_selected: shape (len(rbs), screensize**2) = one-hot encoding of the (x, y)
      arguments of the action at each step of the replay buffer
    - valid_non_spatial_action: shape (len(rbs), len(actions.FUNCTIONS)) = one-hot encoding of the
      available actions at each step of the replay buffer
    - non_spatial_action_selected: shape (len(rbs), len(actions.FUNCTIONS)) = one-hot encoding of
      the action taken at each step of the replay buffer
    """
    with tf.variable_scope(self.name), tf.device(dev):
        # Set inputs of networks
        self.minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Build networks
        net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize,
                        len(actions.FUNCTIONS), ntype, self.num_subpol, reuse, self.num_thread)
        (self.spatial_actions, self.non_spatial_actions, self.value, self.master_value,
         self.subpol_choice, self.master_vars) = net

        # Create training operations for the subpolicies:
        # Set targets and masks
        self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')
        self.subpol_train_ops = []

        # Variables kept by training operations (such as gradients) are given scope 'train_vars'
        with tf.variable_scope('train_vars'):
            # Build the optimizer
            self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
            opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
            for pol_id in range(self.num_subpol):
                self.build_subpolicy(opt, pol_id, reuse)

            # Create the training operation for the master policy:
            self.build_master_policy(opt)

        # Log scores and decisions to tensorboard:
        self.summary_op = tf.summary.merge(self.summary)
        self.subpol_summary_op = tf.summary.merge(self.subpol_summary)
        self.train_score = tf.placeholder(tf.float32, name='train_score')
        self.train_score_summary_op = tf.summary.scalar(
            'train_score_thread_' + str(self.num_subpol), self.train_score)
        self.test_score = tf.placeholder(tf.float32, name='test_score')
        self.test_score_summary_op = tf.summary.scalar(
            'test_score_thread_' + str(self.num_subpol), self.test_score)
        self.ep_subpol_choices_ph = tf.placeholder(tf.float32, [None], name='subpol_choices')
        self.ep_subpol_choices_op = tf.summary.histogram('subpol_choices', self.ep_subpol_choices_ph)

        self.saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=1)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
file_handler = logging.FileHandler("C:/Users/lbianculli/action_and_id_log", mode="w")
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

isize = 11
msize = 64
ssize = 64

score = tf.placeholder(tf.int32, [], name='score')
minimap = tf.placeholder(tf.float32, [None, U.minimap_channel(), msize, msize], name='minimap')  # 17, 64, 64
screen = tf.placeholder(tf.float32, [None, U.screen_channel(), ssize, ssize], name='screen')
info = tf.placeholder(tf.float32, [None, isize], name='info')
# minimap_placeholder = tf.placeholder(tf.float32, [None, 64, 64, 5])
# screen_placeholder = tf.placeholder(tf.float32, [None, 64, 64, 10])
# user_info_placeholder = tf.placeholder(tf.float32, [None, isize])
action_output = tf.placeholder(tf.float32, [None, 543])  # one hot

# set up network
screen_filters1 = tf.get_variable(name='sf1', shape=(5, 5, U.screen_channel(), 16))  # hwio
screen_filters2 = tf.get_variable(name='sf2', shape=(3, 3, 16, 32))