def build_network(self, images, phase_train=True, nclass=1001, image_depth=3,
                  data_type=tf.float32, data_format='NCHW',
                  use_tf_layers=True, fp16_vars=False):
  """Builds the model graph and returns `(logits, aux_logits)`.

  Args:
    images: Input image tensor. Transposed with permutation [0, 3, 1, 2]
      when `data_format` is 'NCHW' (presumably it arrives as NHWC --
      confirm against the caller).
    phase_train: Forwarded unchanged to the ConvNetBuilder.
    nclass: Output size of the final affine (class) layer.
    image_depth: Forwarded unchanged to the ConvNetBuilder.
    data_type: Dtype handed to the ConvNetBuilder. When it is tf.float16 the
      returned logits are cast to tf.float32.
    data_format: 'NCHW' triggers the input transpose; the value is also
      forwarded to the ConvNetBuilder.
    use_tf_layers: Forwarded unchanged to the ConvNetBuilder.
    fp16_vars: When true and `data_type` is tf.float16, the variable dtype
      given to the ConvNetBuilder is tf.float16; otherwise it is tf.float32.

  Returns:
    A `(logits, aux_logits)` tuple. `aux_logits` is None when the network has
    no auxiliary top layer.
  """
  if data_format == 'NCHW':
    images = tf.transpose(images, [0, 3, 1, 2])

  # Variables stay in fp32 unless fp16 variables were explicitly requested.
  use_fp16_vars = data_type == tf.float16 and fp16_vars
  var_type = tf.float16 if use_fp16_vars else tf.float32

  network = convnet_builder.ConvNetBuilder(
      images, image_depth, phase_train, use_tf_layers, data_format,
      data_type, var_type)

  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
    self.add_inference(network)

    # Add the final fully-connected class layer, unless the model opts out.
    if self.skip_final_affine_layer():
      logits = network.top_layer
    else:
      logits = network.affine(nclass, activation='linear')

    aux_logits = None
    if network.aux_top_layer is not None:
      with network.switch_to_aux_top_layer():
        aux_logits = network.affine(
            nclass, activation='linear', stddev=0.001)

  if data_type == tf.float16:
    # TODO(reedwm): Determine if we should do this cast here.
    logits = tf.cast(logits, tf.float32)
    if aux_logits is not None:
      aux_logits = tf.cast(aux_logits, tf.float32)

  # Report the element count summed over every trainable variable.
  variable_sizes = [
      np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()
  ]
  print('Total trainable variables per GPU:{:,}'.format(
      np.sum(variable_sizes)))
  return logits, aux_logits
def build_network(self, inputs, phase_train=True, nclass=1001):
  """Returns logits computed from the input images.

  The compute dtype and the data layout are read from `self.data_type` and
  `self.data_format`; the variable dtype is tf.float16 only when
  `self.data_type` is tf.float16 and `self.fp16_vars` is set.

  Args:
    inputs: A sequence whose first element is the image tensor.
    phase_train: True during training. False during evaluation.
    nclass: Number of classes that the images can belong to.

  Returns:
    A BuildNetworkResult which contains the logits (cast to tf.float32 when
    running in tf.float16) and `extra_info=None`.
  """
  images = inputs[0]
  if self.data_format == 'NCHW':
    images = tf.transpose(images, [0, 3, 1, 2])

  wants_fp16_vars = self.data_type == tf.float16 and self.fp16_vars
  var_type = tf.float16 if wants_fp16_vars else tf.float32

  # The builder is only used here for its custom variable getter; the layers
  # themselves are created by self.add_inference below.
  network = convnet_builder.ConvNetBuilder(
      images, self.depth, phase_train, self.use_tf_layers, self.data_format,
      self.data_type, var_type)
  custom_getter = network.get_custom_getter()
  with tf.variable_scope('cg', custom_getter=custom_getter):
    logits = self.add_inference(images, phase_train, nclass)

  if self.data_type == tf.float16:
    logits = tf.cast(logits, tf.float32)
  return BuildNetworkResult(logits=logits, extra_info=None)
def build_network(self, inputs, phase_train=True, nclass=1001):
  """Returns logits from input images, with debug probes attached.

  Args:
    inputs: The input images and labels; only `inputs[0]` (the images) is
      read here.
    phase_train: True during training. False during evaluation.
    nclass: Number of classes that the images can belong to.

  Returns:
    A BuildNetworkResult which contains the logits, and the auxiliary logits
    (or None when the network has no auxiliary top layer) as `extra_info`.
  """
  images = inputs[0]
  if self.data_format == 'NCHW':
    images = tf.transpose(images, [0, 3, 1, 2])
  # NOTE(review): the probe and shape print are assumed to run for every data
  # format, not only NCHW -- confirm against the original layout.
  images = debug.add_prob(images, name='input_image')
  print("input_image shape: {}".format(images.get_shape()))

  wants_fp16_vars = self.data_type == tf.float16 and self.fp16_vars
  var_type = tf.float16 if wants_fp16_vars else tf.float32

  network = convnet_builder.ConvNetBuilder(
      images, self.depth, phase_train, self.use_tf_layers, self.data_format,
      self.data_type, var_type)

  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
    self.add_inference(network)

    # Add the final fully-connected class layer, unless the model opts out.
    if self.skip_final_affine_layer():
      logits = network.top_layer
    else:
      logits = network.affine(nclass, activation='linear', name='fc_final')
    logits = debug.add_prob(logits, name='fc_final')
    print("fc_final shape: {}".format(logits.get_shape()))

    aux_logits = None
    if network.aux_top_layer is not None:
      with network.switch_to_aux_top_layer():
        aux_logits = network.affine(
            nclass, activation='linear', stddev=0.001)

  if self.data_type == tf.float16:
    # TODO(reedwm): Determine if we should do this cast here.
    logits = tf.cast(logits, tf.float32)
    if aux_logits is not None:
      aux_logits = tf.cast(aux_logits, tf.float32)

  # aux_logits is already None when there is no auxiliary head.
  return BuildNetworkResult(logits=logits, extra_info=aux_logits)
# Builds a VGG16 forward pass on synthetic inputs and runs it once.
#
# Expects `image_shape`, `labels_shape`, `data_type`, `nclass`, `phase_train`,
# `use_tf_layers` and `data_format` to be defined before this fragment.
#
# NOTE: explicit device pinning (`with tf.device(0):`) is left disabled.

# Random images cached in a local variable so they are generated once.
images = tf.truncated_normal(
    image_shape,
    dtype=data_type,
    mean=127,
    stddev=60,
    name='synthetic_images')
images = tf.contrib.framework.local_variable(
    images, name='gpu_cached_images')
labels = tf.random_uniform(
    labels_shape,
    minval=0,
    maxval=nclass - 1,
    dtype=tf.int32,
    name='synthetic_labels')

# The same dtype is used for both computation and variables.
network = convnet_builder.ConvNetBuilder(
    images, 3, phase_train, use_tf_layers, data_format, data_type, data_type)
model = vgg_model.Vgg16Model()
model.add_inference(network)
logits = network.affine(nclass, activation='linear')

# tf.initialize_all_variables / tf.initialize_local_variables are deprecated;
# these are their documented replacements.
init_op = tf.global_variables_initializer()
init_local_op = tf.local_variables_initializer()

with tf.Session() as sess:
  sess.run(init_op)
  # Local variables (the cached synthetic images) need their own initializer.
  sess.run(init_local_op)
  sess.run(logits)
def add_forward_pass_and_gradients(self, phase_train, image_producer_stage):
  """Adds the forward-pass ops and, when training, the gradient ops.

  Args:
    phase_train: When false, only evaluation outputs are built and returned.
    image_producer_stage: Object whose `get()` yields `(images, labels)`;
      only used when synthetic GPU images are disabled.

  Returns:
    A dict. It holds 'top_1_op' and 'top_5_op' when evaluating or when
    `self.params.print_training_accuracy` is set. When evaluating it also
    holds 'logits' and nothing else is added. When training it holds 'loss'
    and 'gradvars', a list of (gradient, variable) pairs.
  """
  nclass = self.dataset.num_classes + 1
  input_data_type = get_data_type(self.params)
  data_type = get_data_type(self.params)

  # NOTE(review): the original indentation was lost; the device scope is
  # assumed to cover the whole forward/backward pass -- confirm.
  with tf.device('/gpu:0'):
    if self.use_synthetic_gpu_images:
      # Minor hack to avoid H2D copy when using synthetic data
      image_size = self.model.get_image_size()
      image_shape = [
          self.batch_size, image_size, image_size, self.dataset.depth
      ]
      labels_shape = [self.batch_size]
      # Synthetic image should be within [0, 255].
      images = tf.truncated_normal(
          image_shape,
          dtype=input_data_type,
          mean=127,
          stddev=60,
          name='synthetic_images')
      images = tf.contrib.framework.local_variable(
          images, name='gpu_cached_images')
      labels = tf.random_uniform(
          labels_shape,
          minval=0,
          maxval=nclass - 1,
          dtype=tf.int32,
          name='synthetic_labels')
    else:
      images, labels = image_producer_stage.get()

    # Rescale from [0, 255] to [0, 2]
    images = tf.multiply(images, 1. / 127.5)
    # Rescale to [-1, 1]
    images = tf.subtract(images, 1.0)

    if self.data_format == 'NCHW':
      images = tf.transpose(images, [0, 3, 1, 2])
    if input_data_type != data_type:
      images = tf.cast(images, data_type)

    # Variables are always kept in fp32 here.
    var_type = tf.float32
    network = convnet_builder.ConvNetBuilder(
        images, self.dataset.depth, phase_train, self.params.use_tf_layers,
        self.data_format, data_type, var_type)
    with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
      self.model.add_inference(network)
      # Add the final fully-connected class layer
      logits = network.affine(nclass, activation='linear')
      aux_logits = None
      if network.aux_top_layer is not None:
        with network.switch_to_aux_top_layer():
          aux_logits = network.affine(
              nclass, activation='linear', stddev=0.001)

    if data_type == tf.float16:
      # TODO(reedwm): Determine if we should do this cast here.
      logits = tf.cast(logits, tf.float32)
      if aux_logits is not None:
        aux_logits = tf.cast(aux_logits, tf.float32)

    results = {}  # The return value

    if not phase_train or self.params.print_training_accuracy:
      # Number of examples in the batch whose label is in the top 1 / top 5.
      in_top_1 = tf.nn.in_top_k(logits, labels, 1)
      results['top_1_op'] = tf.reduce_sum(tf.cast(in_top_1, data_type))
      in_top_5 = tf.nn.in_top_k(logits, labels, 5)
      results['top_5_op'] = tf.reduce_sum(tf.cast(in_top_5, data_type))

    if not phase_train:
      # Evaluation stops here: no loss or gradients are built.
      results['logits'] = logits
      return results

    loss = loss_function(logits, labels, aux_logits=aux_logits)
    params = tf.trainable_variables()
    l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in params])
    weight_decay = self.params.weight_decay
    if weight_decay is not None and weight_decay != 0.:
      loss += weight_decay * l2_loss

    # Gradients are taken of the scaled loss, then scaled back down below.
    if self.loss_scale is None:
      scaled_loss = loss
    else:
      scaled_loss = loss * self.loss_scale
    grads = tf.gradients(
        scaled_loss, params,
        aggregation_method=tf.AggregationMethod.DEFAULT)
    if self.loss_scale is not None:
      # TODO(reedwm): If automatic loss scaling is not used, we could avoid
      # these multiplications by directly modifying the learning rate instead.
      # If this is done, care must be taken to ensure that this scaling method
      # is correct, as some optimizers square gradients and do other
      # operations which might not be compatible with modifying both the
      # gradients and the learning rate.
      grads = [
          grad * tf.cast(1. / self.loss_scale, grad.dtype) for grad in grads
      ]

    param_refs = tf.trainable_variables()
    results['loss'] = loss
    results['gradvars'] = list(zip(grads, param_refs))
    return results
def add_forward_pass_and_gradients(
    self, host_images, host_labels, nclass, phase_train, device_num,
    input_data_type, data_type, input_nchan, use_synthetic_gpu_images,
    gpu_copy_stage_ops, gpu_compute_stage_ops, gpu_grad_stage_ops):
  """Adds the forward-pass ops and, when training, the gradient ops.

  Args:
    host_images: Image tensor; its static shape sizes the staging areas and
      the synthetic images.
    host_labels: Label tensor, used directly when images are synthetic.
    nclass: Output size of the final affine (class) layer.
    phase_train: When false, only evaluation outputs are built and returned.
    device_num: Index into `self.raw_devices`, `self.devices` and the
      variable manager's per-device variables.
    input_data_type: Dtype of the synthetic images; images are cast to
      `data_type` when the two differ.
    data_type: Dtype forwarded to the ConvNetBuilder and used for the
      accuracy sums.
    input_nchan: Forwarded unchanged to the ConvNetBuilder.
    use_synthetic_gpu_images: When true, skips the staging areas and
      generates random images on the device instead.
    gpu_copy_stage_ops: List that receives the CPU staging `put` op.
    gpu_compute_stage_ops: List that receives the device staging `put` op.
    gpu_grad_stage_ops: List that receives the gradient staging `put` op
      when `FLAGS.staged_vars` is set.

  Returns:
    A dict. It holds 'top_1_op' and 'top_5_op' when evaluating or when
    `FLAGS.print_training_accuracy` is set. When evaluating it also holds
    'logits' and nothing else is added. When training it holds 'loss' and
    'gradvars', a list of (gradient, variable) pairs.
  """
  use_real_images = not use_synthetic_gpu_images

  if use_real_images:
    with tf.device(self.cpu_device):
      images_shape = host_images.get_shape()
      labels_shape = host_labels.get_shape()
      gpu_copy_stage = data_flow_ops.StagingArea(
          [tf.float32, tf.int32], shapes=[images_shape, labels_shape])
      copy_put_op = gpu_copy_stage.put([host_images, host_labels])
      gpu_copy_stage_ops.append(copy_put_op)
      host_images, host_labels = gpu_copy_stage.get()

  with tf.device(self.raw_devices[device_num]):
    if use_real_images:
      gpu_compute_stage = data_flow_ops.StagingArea(
          [tf.float32, tf.int32], shapes=[images_shape, labels_shape])
      # The CPU-to-GPU copy is triggered here.
      compute_put_op = gpu_compute_stage.put([host_images, host_labels])
      images, labels = gpu_compute_stage.get()
      images = tf.reshape(images, shape=images_shape)
      gpu_compute_stage_ops.append(compute_put_op)
    else:
      # Minor hack to avoid H2D copy when using synthetic data
      images = tf.truncated_normal(
          host_images.get_shape(),
          dtype=input_data_type,
          stddev=1e-1,
          name='synthetic_images')
      images = tf.contrib.framework.local_variable(
          images, name='gpu_cached_images')
      labels = host_labels

  with tf.device(self.devices[device_num]):
    # Rescale from [0, 255] to [0, 2]
    images = tf.multiply(images, 1./127.5)
    # Rescale to [-1, 1]
    images = tf.subtract(images, 1.0)

    if self.data_format == 'NCHW':
      images = tf.transpose(images, [0, 3, 1, 2])
    if input_data_type != data_type:
      images = tf.cast(images, data_type)

    network = convnet_builder.ConvNetBuilder(
        images, input_nchan, phase_train, self.data_format, data_type)
    self.model.add_inference(network)
    # Add the final fully-connected class layer
    logits = network.affine(nclass, activation='linear')
    aux_logits = None
    if network.aux_top_layer is not None:
      with network.switch_to_aux_top_layer():
        aux_logits = network.affine(
            nclass, activation='linear', stddev=0.001)

    results = {}  # The return value

    if not phase_train or FLAGS.print_training_accuracy:
      # Number of examples in the batch whose label is in the top 1 / top 5.
      in_top_1 = tf.nn.in_top_k(logits, labels, 1)
      results['top_1_op'] = tf.reduce_sum(tf.cast(in_top_1, data_type))
      in_top_5 = tf.nn.in_top_k(logits, labels, 5)
      results['top_5_op'] = tf.reduce_sum(tf.cast(in_top_5, data_type))

    if not phase_train:
      # Evaluation stops here: no loss or gradients are built.
      results['logits'] = logits
      return results

    loss = loss_function(logits, labels, aux_logits=aux_logits)
    params = self.variable_mgr.trainable_variables_on_device(device_num)
    l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in params])
    weight_decay = FLAGS.weight_decay
    if weight_decay is not None and weight_decay != 0.:
      loss += weight_decay * l2_loss

    grads = tf.gradients(
        loss, params, aggregation_method=tf.AggregationMethod.DEFAULT)

    if FLAGS.staged_vars:
      grad_stage = data_flow_ops.StagingArea(
          [grad.dtype for grad in grads], [grad.shape for grad in grads])
      grad_put_op = grad_stage.put(grads)
      # In general, this decouples the computation of the gradients and
      # the updates of the weights.
      # During the pipeline warm up, this runs enough training to produce
      # the first set of gradients.
      gpu_grad_stage_ops.append(grad_put_op)
      grads = grad_stage.get()

    param_refs = self.variable_mgr.trainable_variables_on_device(
        device_num, writable=True)
    results['loss'] = loss
    results['gradvars'] = list(zip(grads, param_refs))
    return results