def build_model(self, reuse, dev, ntype):
    # A3C variant: policy + value heads trained with policy-gradient and value losses.
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        self.minimap = tf.placeholder(tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Build networks
        net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)
        self.spatial_action, self.non_spatial_action, self.value = net

        # Set targets and masks
        self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

        # Compute log probability of the selected actions
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

        # Compute losses, more details in https://arxiv.org/abs/1602.01783
        # Policy loss and value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))

        # TODO: policy penalty
        loss = policy_loss + value_loss

        # Build the optimizer
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(loss)
        clipped_grad = []
        for grad, var in grads:
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grad.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grad)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
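
# For reference: a minimal sketch of how one A3C update might be driven against the
# graph built above. Only the placeholder and op names come from the code; `sess`,
# the `batch` dict layout, and the learning-rate schedule are assumptions.
def a3c_update_step(agent, sess, batch, lr):
    feed = {
        agent.minimap: batch['minimap'],                       # [B, C_m, msize, msize]
        agent.screen: batch['screen'],                         # [B, C_s, ssize, ssize]
        agent.info: batch['info'],                             # [B, isize]
        agent.valid_spatial_action: batch['valid_spatial'],    # [B], 1 if the action has a spatial arg
        agent.spatial_action_selected: batch['spatial_sel'],   # [B, ssize**2], one-hot target pixel
        agent.valid_non_spatial_action: batch['valid_actions'],
        agent.non_spatial_action_selected: batch['action_sel'],
        agent.value_target: batch['returns'],                  # bootstrapped n-step returns
        agent.learning_rate: lr,
    }
    _, summary = sess.run([agent.train_op, agent.summary_op], feed_dict=feed)
    return summary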
def build_model(self, reuse, dev, ntype):
    # Value-regression variant: only a squared TD error against the value target is minimized.
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        self.minimap = tf.placeholder(tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
        self.screen = tf.placeholder(tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Build networks
        net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)
        self.spatial_action, self.non_spatial_action, self.value = net

        # Set targets and masks
        self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

        # Compute the probability mass on the selected (and valid) actions
        spatial_action_prob = tf.clip_by_value(tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1), 1e-10, 1.)
        non_spatial_action_prob = tf.clip_by_value(tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected * self.valid_non_spatial_action, axis=1), 1e-10, 1.)
        q_value = spatial_action_prob * self.valid_spatial_action * self.ispatial + non_spatial_action_prob

        # TD error against the value target
        self.delta = self.value_target - q_value
        # self.clipped_error = tf.where(tf.abs(self.delta) < 1.0, 0.5 * tf.square(self.delta), tf.abs(self.delta) - 0.5, name='clipped_error')
        # value_loss = tf.reduce_mean(self.clipped_error, name='value_loss')
        value_loss = tf.reduce_mean(tf.square(self.delta))
        self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))

        # Build the optimizer
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(value_loss)
        clipped_grad = []
        for grad, var in grads:
            self.summary.append(tf.summary.histogram(var.op.name, var))
            # Variables that do not contribute to the loss get a None gradient
            grad = grad if grad is not None else tf.zeros_like(var)
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grad.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grad)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
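
# For context: a minimal sketch of how the `value_target` fed above might be computed
# from a rollout as bootstrapped n-step discounted returns. The helper name, `gamma`,
# and the rollout layout are assumptions, not taken from the original code.
import numpy as np

def n_step_returns(rewards, bootstrap_value, gamma=0.99):
    """rewards: per-step rewards of one rollout; bootstrap_value: V(s_T) from the critic."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns  # fed through the `value_target` placeholder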
def build_model(self, reuse, dev, ntype):
    # A3C variant with a pixel-control auxiliary task added to the loss.
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        self.minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize],
            name='minimap')
        self.screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize],
            name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Build the A3C base networks
        net = build_net(self.minimap, self.screen, self.info, self.msize,
                        self.ssize, len(actions.FUNCTIONS), ntype, reuse=False)
        self.spatial_action, self.non_spatial_action, self.value = net

        # Set targets and masks
        self.valid_spatial_action = tf.placeholder(
            tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)],
            name='valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)],
            name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

        # Compute log probability of the selected actions
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

        # Compute the A3C losses, more details in https://arxiv.org/abs/1602.01783
        # Policy loss and value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss', value_loss))

        # TODO: policy penalty
        a3c_loss = policy_loss + value_loss

        # Pixel-control auxiliary task
        self.pc_minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize],
            name='pc_minimap')
        self.pc_screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize],
            name='pc_screen')
        self.pc_info = tf.placeholder(tf.float32, [None, self.isize], name='pc_info')
        self.pc_valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)],
            name='pc_valid_non_spatial_action')
        pc_net = build_pc_net(self.pc_minimap, self.pc_screen, self.pc_info,
                              self.msize, self.ssize, len(actions.FUNCTIONS),
                              self.pc_valid_non_spatial_action)
        pc_q, pc_q_max = pc_net

        # One-hot of the action taken at each pixel-control step
        self.pc_a = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='pc_a')
        pc_a_reshaped = tf.reshape(self.pc_a, [-1, 1, 1, len(actions.FUNCTIONS)])

        # Extract Q for the taken action
        pc_qa_ = tf.multiply(pc_q, pc_a_reshaped)
        pc_qa = tf.reduce_sum(pc_qa_, reduction_indices=3, keep_dims=False)  # (-1, 20, 20)

        # TD target for Q
        self.pc_r = tf.placeholder(tf.float32, [None, 20, 20], name='pc_r')
        pc_loss = self._pixel_change_lambda * tf.nn.l2_loss(self.pc_r - pc_qa)

        # Build the optimizer on the combined loss
        loss = pc_loss + a3c_loss
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(loss)
        clipped_grad = []
        for grad, var in grads:
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grad.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grad)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
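
# `build_pc_net` is not defined in this file. Below is a hypothetical sketch of a
# pixel-control Q head in the spirit of the UNREAL auxiliary task
# (https://arxiv.org/abs/1611.05397): shared conv features, a dueling deconvolutional
# head producing a 20x20 Q map per function id, with unavailable functions zeroed out.
# The layer sizes and names are illustrative assumptions, not the original network.
def build_pc_net_sketch(minimap, screen, info, msize, ssize, num_action,
                        valid_non_spatial_action):
    # NCHW inputs -> NHWC for the conv layers
    mconv = tf.layers.conv2d(tf.transpose(minimap, [0, 2, 3, 1]), 16, 5,
                             padding='same', activation=tf.nn.relu, name='pc_mconv')
    sconv = tf.layers.conv2d(tf.transpose(screen, [0, 2, 3, 1]), 16, 5,
                             padding='same', activation=tf.nn.relu, name='pc_sconv')
    flat = tf.concat([tf.layers.flatten(mconv), tf.layers.flatten(sconv), info], axis=1)
    fc = tf.layers.dense(flat, 5 * 5 * 32, activation=tf.nn.relu, name='pc_fc')
    fc_map = tf.reshape(fc, [-1, 5, 5, 32])
    # Stride-2 deconvolutions: 5x5 -> 10x10 -> 20x20, matching the [None, 20, 20] pc_r target
    deconv = tf.layers.conv2d_transpose(fc_map, 32, 4, strides=2, padding='same',
                                        activation=tf.nn.relu, name='pc_deconv1')
    # Dueling decomposition: one value map plus an advantage map per function id
    value = tf.layers.conv2d_transpose(deconv, 1, 4, strides=2, padding='same',
                                       name='pc_value')              # (-1, 20, 20, 1)
    adv = tf.layers.conv2d_transpose(deconv, num_action, 4, strides=2, padding='same',
                                     name='pc_adv')                  # (-1, 20, 20, A)
    adv_mean = tf.reduce_mean(adv, axis=3, keep_dims=True)
    pc_q = value + adv - adv_mean
    # Zero out unavailable functions before taking the max over the action dimension
    mask = tf.reshape(valid_non_spatial_action, [-1, 1, 1, num_action])
    pc_q = pc_q * mask
    pc_q_max = tf.reduce_max(pc_q, axis=3)                            # (-1, 20, 20)
    return pc_q, pc_q_max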
def build_model(self, reuse, dev, ntype):
    # Hierarchical variant: a master policy trained by Q-learning selects among
    # subpolicies, which are trained with the A3C losses.
    with tf.variable_scope(self.name), tf.device(dev):
        if reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        # Set inputs of networks
        self.minimap = tf.placeholder(
            tf.float32, [None, U.minimap_channel(), self.msize, self.msize],
            name='minimap')
        self.screen = tf.placeholder(
            tf.float32, [None, U.screen_channel(), self.ssize, self.ssize],
            name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

        # Create the master policy and the subpolicies
        self.subpolicy_Q = build_net(self.minimap, self.screen, self.info,
                                     self.msize, self.ssize, num_units + 2,
                                     'master_policy')

        # Set targets and masks for the master policy update
        self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
        self.action_input = tf.placeholder(tf.float32, [None, num_units + 2])
        self.y_input = tf.placeholder(tf.float32, [None])
        self.Q_action = tf.reduce_sum(tf.multiply(self.subpolicy_Q, self.action_input), reduction_indices=1)
        self.cost = tf.reduce_mean(tf.square(self.y_input - self.Q_action))
        self.master_train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)

        # Set targets and masks for the subpolicy update
        self.valid_spatial_action = tf.placeholder(
            tf.float32, [None], name='valid_spatial_action_')
        self.spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.ssize**2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)],
            name='valid_non_spatial_action_')
        self.non_spatial_action_selected = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)],
            name='non_spatial_action_selected_')
        self.value_target = tf.placeholder(tf.float32, [None], name='value_target_')

        # Build the optimizer
        opt = tf.train.AdamOptimizer(self.learning_rate)
        self.subpolicy = build_net(self.minimap, self.screen, self.info,
                                   self.msize, self.ssize,
                                   len(actions.FUNCTIONS), 'fcn')
        self.spatial_action, self.non_spatial_action, self.value = self.subpolicy

        # Compute log probability of the selected actions
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        self.summary.append(tf.summary.histogram('spatial_action_prob_', spatial_action_prob))
        self.summary.append(tf.summary.histogram('non_spatial_action_prob_', non_spatial_action_prob))

        # Compute losses, more details in https://arxiv.org/abs/1602.01783
        # Policy loss and value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        self.summary.append(tf.summary.scalar('policy_loss_', policy_loss))
        self.summary.append(tf.summary.scalar('value_loss_', value_loss))

        # TODO: policy penalty
        loss = policy_loss + value_loss

        grads = opt.compute_gradients(loss)
        clipped_grad = []
        for grad, var in grads:
            # Skip master-policy variables, which get no gradient from the subpolicy loss
            if grad is None:
                continue
            self.summary.append(tf.summary.histogram(var.op.name, var))
            self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            clipped_grad.append([grad, var])
        self.train_op = opt.apply_gradients(clipped_grad)
        self.summary_op = tf.summary.merge(self.summary)

        self.saver = tf.train.Saver(max_to_keep=100)
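
# A minimal sketch of one master-policy Q-learning update against the head defined
# above. Only the placeholder and op names come from the code; `sess`, `gamma`, and
# the transition batch layout are assumptions.
import numpy as np

def master_policy_update(agent, sess, batch, gamma=0.99, lr=1e-4):
    # Bootstrap target: y = r + gamma * max_a' Q(s', a'), cut off at terminal states.
    q_next = sess.run(agent.subpolicy_Q, feed_dict={
        agent.minimap: batch['next_minimap'],
        agent.screen: batch['next_screen'],
        agent.info: batch['next_info'],
    })
    y = batch['reward'] + gamma * (1.0 - batch['done']) * np.max(q_next, axis=1)
    _, cost = sess.run([agent.master_train_op, agent.cost], feed_dict={
        agent.minimap: batch['minimap'],
        agent.screen: batch['screen'],
        agent.info: batch['info'],
        agent.action_input: batch['action_one_hot'],  # one-hot over the num_units + 2 options
        agent.y_input: y,
        agent.learning_rate: lr,
    })
    return cost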