def apply_pruning(
        pruning_obj,  # pylint:disable=invalid-name
        pruning_hparams,
        weight_params_fn,
        weight_init_obj,
        layerobj,
        wm_pc,
        dtype):
    """Register pruning mask/threshold variables on a lingvo layer.

    When `pruning_hparams.prune_option` is 'weight', 'first_order_gradient'
    or 'second_order_gradient', two non-trainable variables are created on
    `layerobj`: `mask` (constant 1.0, shaped like `wm_pc`, of type `dtype`)
    and `threshold` (a float32 scalar, constant 0.0). Unless the mask is
    already present in `pruning.MASK_COLLECTION`, the layer's weight, mask and
    threshold are appended to the matching pruning collections. For any other
    option nothing is created.

    Args:
      pruning_obj: a Pruning object; returned untouched.
      pruning_hparams: a Pruning hparams object; only `prune_option` is read.
      weight_params_fn: callable `(shape, init, dtype)` creating variable params.
      weight_init_obj: weight initialization object providing `Constant(value)`.
      layerobj: a layer object in the lingvo package.
      wm_pc: params of the weight matrix; only `shape` is read.
      dtype: data type used for the mask.

    Returns:
      `pruning_obj`, exactly as passed in (for every `prune_option`).
    """
    # Pruning options that correspond to the pruning operations in
    # model_pruning.
    mask_based_options = (
        'weight', 'first_order_gradient', 'second_order_gradient')
    if pruning_hparams.prune_option not in mask_based_options:
        # TODO(wanxin): add model_compression options.
        return pruning_obj

    mask_params = weight_params_fn(
        wm_pc.shape, weight_init_obj.Constant(1.0), dtype)
    threshold_params = weight_params_fn(
        [], weight_init_obj.Constant(0.0), tf.float32)
    layerobj.CreateVariable(
        'mask', mask_params, theta_fn=None, trainable=False)
    layerobj.CreateVariable(
        'threshold', threshold_params, theta_fn=None, trainable=False)

    # Register each layer's variables only once.
    already_registered = layerobj.vars.mask in tf.get_collection(
        pruning.MASK_COLLECTION)
    if not already_registered:
        tf.add_to_collection(pruning.WEIGHT_COLLECTION, layerobj.vars.wm)
        tf.add_to_collection(pruning.MASK_COLLECTION, layerobj.vars.mask)
        tf.add_to_collection(
            pruning.THRESHOLD_COLLECTION, layerobj.vars.threshold)
    return pruning_obj
def recurrent(self, node, current_level, postfix, is_training):
    """Run the recurrent cell of one level on `node` and record its states.

    The input node, the cell output and the cell output state are all added to
    the 'checkpoints' graph collection. The input state is either taken from
    `self.lstm_input_states[current_level]` (when `self.use_lstm_input_state`
    is set) or created as a float32 zero state, which is then stored there.
    The output state is stored in `self.lstm_output_states[current_level]`.

    Args:
      node: input tensor for this level.
      current_level: level index, used for the filter count and state slots.
      postfix: forwarded to `self.recurrent_cell`.
      is_training: forwarded to `self.recurrent_cell`.

    Returns:
      The output tensor of the recurrent cell.
    """
    checkpoint_key = 'checkpoints'
    tf.add_to_collection(checkpoint_key, node)

    feature_count = self.num_filters(current_level)
    batch_size, _, image_size = get_batch_channel_image_size(
        node, data_format=self.data_format)
    cell = self.recurrent_cell(image_size, feature_count, postfix, is_training)

    if not self.use_lstm_input_state:
        # No external state supplied: start from zeros and remember it.
        state_in = cell.zero_state(batch_size, tf.float32)
        self.lstm_input_states[current_level] = state_in
    else:
        state_in = self.lstm_input_states[current_level]

    node, state_out = cell(node, state_in)
    for item in (node, state_out):
        tf.add_to_collection(checkpoint_key, item)
    self.lstm_output_states[current_level] = state_out
    return node
def __init__(self, hps, net, output_layer, experiment_proto, input_paths):
    """Build the training graph: input pipeline, loss, optimizer, summaries.

    Args:
      hps: hyperparameters. Reads `mbsz`, `train_on_array`, `optimizer`
        ('momentum' or 'adam'), `learn_rate`, `momentum` and `loss_name`.
      net: network object providing `fprop(inputs, mode=...)` and `params`.
      output_layer: output layer providing `average_loss_per_target(...)` and
        `params`.
      experiment_proto: forwarded to `data.input_pipeline`.
      input_paths: forwarded to `data.input_pipeline`.

    Raises:
      ValueError: if `hps.optimizer` is neither 'momentum' nor 'adam'.
    """
    inputs, outputs = data.input_pipeline(
        input_paths, experiment_proto, hps.mbsz, hps=hps, num_threads=8)
    with tf.name_scope('neural_net'):
        logits = net.fprop(inputs, mode='train')
    with tf.name_scope('output_layer'):
        loss_per_target = output_layer.average_loss_per_target(
            logits, outputs, include_array=hps.train_on_array)
        loss = utils.reduce_nanmean(loss_per_target)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    # Honor the configured optimizer. The selection used to be overwritten by
    # an unconditional MomentumOptimizer right after this chain, so 'adam' was
    # silently ignored.
    if hps.optimizer == 'momentum':
        optimizer = tf.MomentumOptimizer(hps.learn_rate, hps.momentum)
    elif hps.optimizer == 'adam':
        optimizer = tf.AdamOptimizer(hps.learn_rate)
    else:
        raise ValueError('invalid optimizer: %s' % hps.optimizer)

    grads = optimizer.compute_gradients(
        loss, net.params + output_layer.params)
    opt_op = optimizer.apply_gradients(grads, global_step=self.global_step)
    # Evaluating train_op yields the loss and forces the parameter update.
    self.train_op = tf.with_dependencies([opt_op], loss)

    contrib_deprecated.scalar_summary('loss/mean', loss)
    for target in loss_per_target.axes['target'].labels:
        contrib_deprecated.scalar_summary(
            'loss/' + six.ensure_str(target),
            lt.select(loss_per_target, {'target': target}))
    with tf.name_scope('summarize_grads'):
        slim.learning.add_gradients_summaries(grads)

    tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, self.global_step)
    tf.add_to_collection('train_op', self.train_op)
    tf.add_to_collection('loss', loss)

    self.mbsz = hps.mbsz
    # The log Poisson loss implemented in TensorFlow may sometimes be negative.
    if (hps.loss_name == output_layers.LOSS_POISSON_LOSS or
            hps.loss_name == output_layers.LOSS_ZERO_TRUNCATED_POISSON_LOSS):
        self.min_cost = -float('inf')
        self.min_is_inclusive = False
    else:
        self.min_cost = 0
        self.min_is_inclusive = True
def _add_loss_graph(self):
    """Build the class, confidence and bounding-box losses and their total.

    Each partial loss is stored on `self` (`class_loss`, `conf_loss`,
    `bbox_loss`) and added to the 'losses' graph collection. `self.loss` is the
    sum of everything found in that collection, so it also includes any loss
    (e.g. weight decay) that other code put there before this call.
    """
    cfg = self.mc

    with tf.variable_scope('class_regression'):
        # Cross-entropy: q * -log(p) + (1-q) * -log(1-p).
        # A small epsilon inside the log keeps it from blowing up.
        positive_term = self.labels * (
            -tf.log(self.pred_class_probs + cfg.EPSILON))
        negative_term = (1 - self.labels) * (
            -tf.log(1 - self.pred_class_probs + cfg.EPSILON))
        weighted_entropy = (
            (positive_term + negative_term)
            * self.input_mask * cfg.LOSS_COEF_CLASS)
        self.class_loss = tf.truediv(
            tf.reduce_sum(weighted_entropy),
            self.num_objects,
            name='class_loss')
        tf.add_to_collection('losses', self.class_loss)

    with tf.variable_scope('confidence_score_regression'):
        anchor_mask = tf.reshape(
            self.input_mask, [cfg.BATCH_SIZE, cfg.ANCHORS])
        squared_error = tf.square((self.ious - self.pred_conf))
        # Masked anchors and unmasked anchors get separate coefficients, each
        # normalized by the size of its own group.
        positive_weight = (
            anchor_mask * cfg.LOSS_COEF_CONF_POS / self.num_objects)
        negative_weight = (
            (1 - anchor_mask) * cfg.LOSS_COEF_CONF_NEG
            / (cfg.ANCHORS - self.num_objects))
        weighted_sum = tf.reduce_sum(
            squared_error * (positive_weight + negative_weight),
            reduction_indices=[1])
        self.conf_loss = tf.reduce_mean(
            tf.abs(weighted_sum), name='confidence_loss')
        tf.add_to_collection('losses', self.conf_loss)
        tf.summary.scalar(
            'mean iou', tf.reduce_sum(self.ious) / self.num_objects)

    with tf.variable_scope('bounding_box_regression'):
        masked_delta_error = self.input_mask * (
            self.pred_box_delta - self.box_delta_input)
        self.bbox_loss = tf.truediv(
            tf.reduce_sum(cfg.LOSS_COEF_BBOX * tf.square(masked_delta_error)),
            self.num_objects,
            name='bbox_loss')
        tf.add_to_collection('losses', self.bbox_loss)

    # Total loss: the losses above plus whatever else is in the collection.
    self.loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
def build(self):
    """Build the GAN graph: placeholders, generator, discriminator, losses.

    Creates a latent-vector placeholder of shape (batch_size, z_dim) that feeds
    the generator and a real-image placeholder of shape
    (batch_size, img_size, img_size, 1) that feeds the discriminator. The
    generator loss and the two discriminator losses are added to the
    'g_losses' / 'd_losses' graph collections and summed from there.

    Returns:
      A tuple `(z_placeholder, img_placeholder, generated_imgs, loss)` where
      `loss` is a dict with the total generator loss under 'g' and the total
      discriminator loss under 'd'.
    """
    batch = self._batch_size

    def mean_cross_entropy(labels, logits):
        # Mean sparse softmax cross-entropy of `logits` against integer labels.
        return tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits))

    # Latent vectors used to train the generator G; each row yields one image.
    self._z_placeholder = tf.placeholder(tf.float32, (batch, self._z_dim))
    # Real images used to train the discriminator D:
    # [image index, height, width, channels].
    self._img_placeholder = tf.placeholder(
        tf.float32, (batch, self._img_size, self._img_size, 1))

    # Fake images produced by G.
    generated_imgs = self._generator(self._z_placeholder, training=True)
    # Discriminator verdict on the fake images.
    fake_img_logits = self._discriminator(generated_imgs, training=True)
    # Discriminator verdict on the real images.
    real_img_logits = self._discriminator(self._img_placeholder, training=True)

    # Two separately trained objectives, with label 1 meaning "real":
    #   * generator: get D to classify its fake images as real;
    #   * discriminator: classify fakes as fake and reals as real.

    # Generator loss: fake images judged as real.
    loss_on_fake_to_real = mean_cross_entropy(
        tf.ones([batch], dtype=tf.int64), fake_img_logits)
    # Discriminator losses: fake judged as fake, real judged as real.
    loss_on_fake_to_fake = mean_cross_entropy(
        tf.zeros([batch], dtype=tf.int64), fake_img_logits)
    loss_on_real_to_real = mean_cross_entropy(
        tf.ones([batch], dtype=tf.int64), real_img_logits)

    # Collections act like a keyed store: register every partial loss under
    # its key, then sum per key. This also keeps the parts easy to look up.
    for collection_key, partial_loss in (
            ('g_losses', loss_on_fake_to_real),
            ('d_losses', loss_on_fake_to_fake),
            ('d_losses', loss_on_real_to_real)):
        tf.add_to_collection(collection_key, partial_loss)

    total_g = tf.add_n(tf.get_collection('g_losses'), name='total_g_loss')
    total_d = tf.add_n(tf.get_collection('d_losses'), name='total_d_loss')
    loss = {'g': total_g, 'd': total_d}
    return self._z_placeholder, self._img_placeholder, generated_imgs, loss
def init_training_mode():
    """ init_training_mode.

    Creates the `is_training` variable and its update ops if they have not
    been created yet.

    This is required when layers such as dropout or batch normalization are
    used independently of TFLearn models (DNN or Trainer class).

    The boolean variable (initialized to False, non-trainable) is stored in
    the 'is_training' collection. Two assign ops, setting it to True and to
    False in that order, are stored in the 'is_training_ops' collection.
    """
    # The 'is_training' collection holds the training mode variable; a
    # non-empty collection means the setup already happened.
    if tf.get_collection('is_training'):
        return

    tr_var = variable(
        "is_training",
        dtype=tf.bool,
        shape=[],
        initializer=tf.constant_initializer(False),
        trainable=False)
    tf.add_to_collection('is_training', tr_var)

    # 'is_training_ops' stores the ops that update the training mode variable:
    # index 0 switches training on, index 1 switches it off.
    for training_flag in (True, False):
        tf.add_to_collection('is_training_ops', tf.assign(tr_var, training_flag))
def apply_mask(x, scope='', prune_option='weight'):
    """Multiply a weight tensor by its pruning mask and register the pieces.

    Args:
      x: Input weight tensor.
      scope: The current variable scope. Defaults to "".
      prune_option: Pruning option. Defaults to 'weight'. With
        'first_order_gradient' (|weight| * |first order gradient|) or
        'second_order_gradient' (|weight| * |second order gradient|) the
        gradient and the two old-weight variables are created as well.

    Returns:
      Tensor representing masked_weights.
    """
    uses_gradient = prune_option in (
        'first_order_gradient', 'second_order_gradient')

    mask = pruning_utils.weight_mask_variable(x, scope)
    threshold = pruning_utils.weight_threshold_variable(x, scope)
    # masked_weights lives in the weights name scope so that the quantization
    # library can add quant ops more easily.
    masked_weights = tf.multiply(mask, x, MASKED_WEIGHT_NAME)

    if uses_gradient:
        # Absolute value of gradients for gradient based pruning.
        gradient = pruning_utils.weight_gradient_variable(x, scope)
        old_weight = pruning_utils.old_weight_variable(x, scope)
        old_old_weight = pruning_utils.old_old_weight_variable(x, scope)

    # The mask of a given variable must not be added to the collections more
    # than once. This matters in particular when masking RNN weight variables.
    if mask in tf.get_collection_ref(MASK_COLLECTION):
        return masked_weights

    registrations = [
        (THRESHOLD_COLLECTION, threshold),
        (MASK_COLLECTION, mask),
        (MASKED_WEIGHT_COLLECTION, masked_weights),
        (WEIGHT_COLLECTION, x),
    ]
    if uses_gradient:
        registrations += [
            (WEIGHT_GRADIENT_COLLECTION, gradient),
            (OLD_WEIGHT_COLLECTION, old_weight),
            (OLD_OLD_WEIGHT_COLLECTION, old_old_weight),
        ]
    for collection_name, value in registrations:
        tf.add_to_collection(collection_name, value)
    return masked_weights
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds a Mesh TensorFlow BERT pretraining graph (masked LM plus next
    sentence prediction), lowers it to TensorFlow and returns a
    `TPUEstimatorSpec` for TRAIN or EVAL.

    `bert_config`, `learning_rate`, `num_train_steps` and `num_warmup_steps`
    are not defined in this function; presumably they are captured from an
    enclosing scope -- TODO confirm.

    Args:
      features: dict of input tensors. Reads "input_ids", "input_mask",
        "segment_ids", "masked_lm_positions", "masked_lm_ids",
        "masked_lm_weights" and "next_sentence_labels".
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value.
      params: dict; `params["context"]` supplies host/replica counts, the host
        placement function and the device assignment.

    Returns:
      A `TPUEstimatorSpec` for TRAIN and EVAL. For any other mode no branch
      returns, so the result is implicitly `None`.
    """
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

    # MTF setup.
    graph = mtf.Graph()
    mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
    layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)

    ctx = params["context"]
    num_hosts = ctx.num_hosts
    host_placement_fn = ctx.tpu_host_placement_function
    # One placement device per host.
    device_list = [host_placement_fn(host_id=t) for t in range(num_hosts)]
    tf.logging.info("device_list = %s" % device_list, )
    replica_cache_size = 300 * 1000000  # 300M per replica
    # Worker 0 caches all the TPU binaries.
    worker0_mem = replica_cache_size * ctx.num_replicas
    # Initial per-device usage handed to the placer: worker 0 starts with the
    # binary cache accounted for, every other host starts at zero.
    devices_memeory_usage = [worker0_mem] + [0] * (num_hosts - 1)
    var_placer = mtf.utils.BalancedVariablePlacer(device_list,
                                                  devices_memeory_usage)
    mesh_devices = [""] * mesh_shape.size
    physical_shape = list(ctx.device_assignment.topology.mesh_shape)
    logical_to_physical = mtf.simd_mesh_impl.auto_logical_to_physical_tpu(
        mesh_shape.to_integer_list, physical_shape)
    mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
        mesh_shape, layout_rules, mesh_devices, ctx.device_assignment,
        logical_to_physical=logical_to_physical)
    mesh = mtf.Mesh(graph, "bert_mesh", var_placer)

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]
    # Drops axis 1 of the labels so they can be imported with [batch_dim].
    next_sentence_labels = tf.squeeze(features["next_sentence_labels"], 1)

    # MTF dimensions are taken from the static shapes of the input tensors.
    batch_size = input_ids.get_shape()[0].value
    batch_dim = mtf.Dimension("batch", batch_size)

    seq_length = input_ids.get_shape()[1].value
    seq_dim = mtf.Dimension("seq", seq_length)
    max_predictions_per_seq = masked_lm_positions.get_shape()[1].value
    max_predictions_per_seq_dim = mtf.Dimension("max_pred_seq",
                                                max_predictions_per_seq)

    # Import the TF input tensors into the mesh.
    mtf_input_ids = mtf.import_tf_tensor(mesh, input_ids, [batch_dim, seq_dim])
    mtf_input_mask = mtf.import_tf_tensor(mesh, input_mask,
                                          [batch_dim, seq_dim])
    mtf_segment_ids = mtf.import_tf_tensor(mesh, segment_ids,
                                           [batch_dim, seq_dim])
    mtf_masked_lm_positions = mtf.import_tf_tensor(
        mesh, masked_lm_positions, [batch_dim, max_predictions_per_seq_dim])
    mtf_masked_lm_ids = mtf.import_tf_tensor(
        mesh, masked_lm_ids, [batch_dim, max_predictions_per_seq_dim])

    mtf_masked_lm_weights = mtf.import_tf_tensor(
        mesh, masked_lm_weights, [batch_dim, max_predictions_per_seq_dim])
    mtf_next_sentence_labels = mtf.import_tf_tensor(
        mesh, next_sentence_labels, [batch_dim])

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    model = bert_lib.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=mtf_input_ids,
                               input_mask=mtf_input_mask,
                               token_type_ids=mtf_segment_ids,
                               layout=layout_rules,
                               mesh_shape=mesh_shape)

    (masked_lm_loss, masked_lm_example_loss,
     masked_lm_logits) = model.get_masked_lm_output(
         mtf_masked_lm_positions, mtf_masked_lm_ids, mtf_masked_lm_weights)

    (next_sentence_loss, next_sentence_example_loss, next_sentence_logits
    ) = model.get_next_sentence_output(mtf_next_sentence_labels)

    extra_loss = model.get_extra_loss()

    # The reported loss is the sum of the two task losses; extra_loss is only
    # added to the objective handed to the optimizer below.
    total_loss = masked_lm_loss + next_sentence_loss
    total_loss = mtf.anonymize(total_loss)
    masked_lm_example_loss = mtf.anonymize(masked_lm_example_loss)
    masked_lm_logits = mtf.anonymize(masked_lm_logits)
    next_sentence_example_loss = mtf.anonymize(next_sentence_example_loss)
    next_sentence_logits = mtf.anonymize(next_sentence_logits)

    # TRAIN mode
    if mode == tf.estimator.ModeKeys.TRAIN:
        _, update_ops = optimization_lib.create_optimizer(
            total_loss + extra_loss,
            learning_rate,
            num_train_steps,
            num_warmup_steps,
            optimizer=FLAGS.optimizer,
            clip_gradients=FLAGS.clip_gradients)

    # Lower the MTF graph to TF. The optimizer ops above are created before
    # this point so that they are part of the lowered graph.
    lowering = mtf.Lowering(graph, {mesh: mesh_impl})

    tf_loss = tf.to_float(lowering.export_to_tf_tensor(total_loss))

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_global_step()
        tf_update_ops = [
            lowering.lowered_operation(op) for op in update_ops
        ]
        # The global step is incremented here, alongside the lowered updates.
        tf_update_ops.append(tf.assign_add(global_step, 1))
        tf.logging.info("tf_update_ops: {}".format(tf_update_ops))
        train_op = tf.group(tf_update_ops)
    elif mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(masked_lm_example_loss, masked_lm_logits, masked_lm_ids,
                      masked_lm_weights, next_sentence_example_loss,
                      next_sentence_logits, next_sentence_labels):
            """Computes the loss and accuracy of the model.

            All inputs are flattened so that every prediction position is one
            row. The masked LM metrics are weighted by `masked_lm_weights`;
            the next sentence metrics are unweighted.

            Returns:
              A dict of `tf.metrics` results keyed by metric name.
            """
            masked_lm_logits = tf.reshape(masked_lm_logits,
                                          [-1, masked_lm_logits.shape[-1]])
            masked_lm_predictions = tf.argmax(masked_lm_logits,
                                              axis=-1,
                                              output_type=tf.int32)
            masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
            masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
            masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
            masked_lm_accuracy = tf.metrics.accuracy(
                labels=masked_lm_ids,
                predictions=masked_lm_predictions,
                weights=masked_lm_weights)
            masked_lm_mean_loss = tf.metrics.mean(
                values=masked_lm_example_loss, weights=masked_lm_weights)

            next_sentence_logits = tf.reshape(
                next_sentence_logits, [-1, next_sentence_logits.shape[-1]])
            next_sentence_predictions = tf.argmax(next_sentence_logits,
                                                  axis=-1,
                                                  output_type=tf.int32)
            next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
            next_sentence_accuracy = tf.metrics.accuracy(
                labels=next_sentence_labels,
                predictions=next_sentence_predictions)
            next_sentence_mean_loss = tf.metrics.mean(
                values=next_sentence_example_loss)

            return {
                "masked_lm_accuracy": masked_lm_accuracy,
                "masked_lm_loss": masked_lm_mean_loss,
                "next_sentence_accuracy": next_sentence_accuracy,
                "next_sentence_loss": next_sentence_mean_loss,
            }

        # The tensor list must stay in the same order as metric_fn's
        # positional parameters.
        eval_metrics = (metric_fn, [
            lowering.export_to_tf_tensor(masked_lm_example_loss),
            lowering.export_to_tf_tensor(masked_lm_logits), masked_lm_ids,
            masked_lm_weights,
            lowering.export_to_tf_tensor(next_sentence_example_loss),
            lowering.export_to_tf_tensor(next_sentence_logits),
            next_sentence_labels
        ])

    with mtf.utils.outside_all_rewrites():
        # Copy master variables to slices. Must be called first.
        restore_hook = mtf.MtfRestoreHook(lowering)
        if mode == tf.estimator.ModeKeys.TRAIN:
            saver = tf.train.Saver(tf.global_variables(),
                                   sharded=True,
                                   max_to_keep=10,
                                   keep_checkpoint_every_n_hours=2,
                                   defer_build=False,
                                   save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            # Checkpoints are written every 1000 steps through this hook.
            saver_hook = tf.train.CheckpointSaverHook(
                FLAGS.output_dir,
                save_steps=1000,
                saver=saver,
                listeners=[saver_listener])

            return tf.estimator.tpu.TPUEstimatorSpec(
                tf.estimator.ModeKeys.TRAIN,
                loss=tf_loss,
                train_op=train_op,
                training_hooks=[restore_hook, saver_hook])
        elif mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.tpu.TPUEstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                evaluation_hooks=[restore_hook],
                loss=tf_loss,
                eval_metrics=eval_metrics)
labels=y_, name="xentropy") loss = tf.reduce_mean(xentropy, name='loss') optimizer = tf.train.AdamOptimizer() train_op = optimizer.minimize(loss, name="train_op") with tf.name_scope("eval"): correct = tf.equal(tf.argmax(logits, axis=1), tf.argmax(y_, axis=1)) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) with tf.name_scope("init_and_save"): init_op = tf.global_variables_initializer() saver = tf.train.Saver() # We need to add a saver Op # Now we add averything we'll need in future to a collection tf.add_to_collection('train_var', train_op) tf.add_to_collection('train_var', accuracy) tf.add_to_collection('train_var', x) tf.add_to_collection('train_var', y_) n_epoch = 100 with tf.Session() as sess: sess.run(init_op) graph = tf.get_default_graph() print2(graph.get_name_scope()) for epoch in range(n_epoch): # One step of the training
def image(name, tensor, is_tpu=True):
    """Record an image summary, deferring it when running on TPU.

    Args:
      name: summary name.
      tensor: image tensor to summarize.
      is_tpu: when truthy, the `(name, tensor)` pair is only appended to the
        'image_summaries' graph collection; otherwise `tf.summary.image` is
        called directly.
    """
    logging.info('Adding image summary {}'.format(Pair(name, tensor)))
    if not is_tpu:
        tf.summary.image(name, tensor)
        return
    tf.add_to_collection('image_summaries', Pair(name, tensor))
def add_output_tensor_nodes(postprocessed_tensors,
                            output_collection_name='inference_op'):
    """Adds named identity output nodes for the detection results.

    Every emitted node is a `tf.identity` of the corresponding entry in
    `postprocessed_tensors`, named after its field, and is also appended to
    the `output_collection_name` graph collection.

    Nodes that are always added:
      * detection_boxes: float32 [batch_size, num_boxes, 4], detected boxes.
      * detection_scores: float32 [batch_size, num_boxes], box scores.
      * detection_classes: float32 [batch_size, num_boxes], class predictions
        shifted by a label id offset of 1.
      * num_detections: float32 [batch_size].

    Nodes that are added only when the field is present:
      * detection_multiclass_scores: float32
        [batch_size, num_boxes, num_classes_with_background], class score
        distribution per box, including background if any.
      * detection_features: float32
        [batch, num_boxes, roi_height, roi_width, depth], classifier features
        per box.
      * raw_detection_boxes and raw_detection_scores.
      * detection_keypoints: float32 [batch_size, num_boxes, num_keypoints, 2].
      * detection_masks: float32
        [batch_size, num_boxes, mask_height, mask_width].

    Args:
      postprocessed_tensors: a dictionary keyed by the field names above
        ('detection_boxes', 'detection_scores', 'detection_classes',
        'num_detections' and the optional fields).
      output_collection_name: Name of collection to add output tensors to.

    Returns:
      A tensor dict containing the added output tensor nodes.
    """
    detection_fields = fields.DetectionResultFields
    label_id_offset = 1
    lookup = postprocessed_tensors.get

    # Shift the class ids before any output node is created.
    classes = lookup(detection_fields.detection_classes) + label_id_offset

    # (field name, tensor, always emitted?) in the order the nodes are added.
    candidates = (
        (detection_fields.detection_boxes,
         lookup(detection_fields.detection_boxes), True),
        (detection_fields.detection_scores,
         lookup(detection_fields.detection_scores), True),
        (detection_fields.detection_multiclass_scores,
         lookup(detection_fields.detection_multiclass_scores), False),
        (detection_fields.detection_features,
         lookup(detection_fields.detection_features), False),
        (detection_fields.detection_classes, classes, True),
        (detection_fields.num_detections,
         lookup(detection_fields.num_detections), True),
        (detection_fields.raw_detection_boxes,
         lookup(detection_fields.raw_detection_boxes), False),
        (detection_fields.raw_detection_scores,
         lookup(detection_fields.raw_detection_scores), False),
        (detection_fields.detection_keypoints,
         lookup(detection_fields.detection_keypoints), False),
        (detection_fields.detection_masks,
         lookup(detection_fields.detection_masks), False),
    )

    outputs = {}
    for field_name, tensor, always_emitted in candidates:
        if always_emitted or tensor is not None:
            outputs[field_name] = tf.identity(tensor, name=field_name)

    for output_tensor in outputs.values():
        tf.add_to_collection(output_collection_name, output_tensor)
    return outputs
def _model_fn(input_fea, input_lab):
    """Creates a model, add summary, modes (train or eval), and hooks.

    `train_or_eval`, `mesh_context`, `captured_hooks` and
    `captured_output_dtypes_shapes` are not defined in this function;
    presumably they are captured from an enclosing scope -- TODO confirm.

    Args:
      input_fea: input features; a single value is wrapped into a list.
      input_lab: input labels; a single value is wrapped into a list.

    Returns:
      In 'train' mode, a grouped op running the loss and all update ops.
      Otherwise, on TPU an outfeed-enqueue op for the predictions, and off TPU
      the list of predictions. In both non-train cases the loss and the
      global step are appended to the predictions first.
    """
    # input_fea and input_lab should be a list (laid_out_tensors).
    if not isinstance(input_fea, list):
        input_fea = [input_fea]
    if not isinstance(input_lab, list):
        input_lab = [input_lab]

    def _add_summary(lowering, train_or_eval, tf_loss, scalars, global_step):
        """Add all summaries and return the loss that depends on them."""
        # Export every scalar that is not yet a tf.Tensor. Only values are
        # replaced, so the key set stays stable during iteration.
        for k in scalars.keys():
            if not isinstance(scalars[k], tf.Tensor):
                scalars[k] = tf.cast(
                    lowering.export_to_tf_tensor(scalars[k]), tf.float32)

        def _host_loss_summary(global_step, tf_loss, **scalars):
            """Add summary.scalar in host side."""
            gs = tf.cast(global_step, tf.int64)
            sum_loss = contrib_summary.scalar(
                '{}_loss'.format(train_or_eval), tf_loss, step=gs)
            sum_ops = [sum_loss.op]
            # dict.iteritems() does not exist on Python 3; items() works on
            # both Python 2 and 3.
            for description, tf_metric in scalars.items():
                sum_metric = contrib_summary.scalar(
                    '{}_{}'.format(train_or_eval, description),
                    tf_metric, step=gs)
                sum_ops.append(sum_metric)
            # Returning the loss under these dependencies forces the summary
            # ops to run whenever the loss is evaluated.
            with tf.control_dependencies(sum_ops):
                return tf.identity(tf_loss)

        if FLAGS.use_tpu:
            # Cast the global step to tf.int32, since
            # outside_compilation does not support tf.int64.
            tf_loss = tpu.outside_compilation(
                _host_loss_summary,
                tf.cast(global_step, tf.int32),
                tf_loss,
                **scalars)
        else:
            tf_loss = _host_loss_summary(
                tf.cast(global_step, tf.int32),
                tf_loss,
                **scalars)

        return tf_loss

    global_step = tf.train.get_or_create_global_step()
    graph, mesh, mesh_impl = mesh_context.create_graph_mesh_and_mesh_impl()

    with mtf.utils.outside_all_rewrites():
        # Do not tpu_rewrite this part. Inside this unet, using TensorFlow
        # instead of Mesh-TensorFlow would cause host-to-TPU send/recv.
        preds, loss, scalars, bn_update_ops = (
            unet.unet_with_spatial_partition(
                mesh, mesh_impl, train_or_eval, input_fea, input_lab))

    if train_or_eval == 'train':
        var_grads = mtf.gradients(
            [loss], [v.outputs[0] for v in graph.trainable_variables])

        # Step-wise decay: lr * lr_drop_rate ** floor(step / lr_drop_steps).
        lr = FLAGS.lr * tf.pow(
            FLAGS.lr_drop_rate,
            tf.floor(tf.cast(global_step, tf.float32) / FLAGS.lr_drop_steps))
        scalars['learning_rate'] = lr

        optimizer = mtf.optimize.AdafactorOptimizer(learning_rate=lr)
        update_ops = optimizer.apply_grads(
            var_grads, graph.trainable_variables)

        # This is where the actual tf graph got built.
        lowering = mtf.Lowering(graph, {mesh: mesh_impl})

        tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
        tf_update_ops.append(tf.assign_add(global_step, 1))
        tf_update_ops.extend(
            [lowering.lowered_operation(op) for op in bn_update_ops])

    else:  # train_or_eval == 'eval':
        preds = [mtf.anonymize(pred) for pred in preds]

        # This is where the actual tf graph got built.
        lowering = mtf.Lowering(graph, {mesh: mesh_impl})

        tf_preds = [tf.cast(
            lowering.export_to_tf_tensor(pred), tf.float32) for pred in preds]

    tf_loss = tf.cast(lowering.export_to_tf_tensor(loss), tf.float32)
    if FLAGS.write_summary:
        tf_loss = _add_summary(
            lowering, train_or_eval, tf_loss, scalars, global_step)
    master_to_slice_hook = mtf.MtfRestoreHook(lowering)

    if train_or_eval == 'train':
        with mtf.utils.outside_all_rewrites():
            saver = tf.train.Saver(tf.global_variables(),
                                   save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            slice_to_master_hook = tf.train.CheckpointSaverHook(
                FLAGS.checkpoint_dir,
                save_steps=FLAGS.save_checkpoints_steps,
                saver=saver, listeners=[saver_listener])
            captured_hooks.capture(
                [master_to_slice_hook, slice_to_master_hook])
            return tf.group([tf_loss] + tf_update_ops)

    else:  # train_or_eval == 'eval':
        if FLAGS.use_tpu:
            tf_preds.extend([tf_loss, global_step])
            tf_preds_dtypes = [tf_pred.dtype for tf_pred in tf_preds]
            tf_preds_shapes = [tf_pred.shape for tf_pred in tf_preds]
            captured_hooks.capture([master_to_slice_hook, None])
            # Dtypes and shapes of the enqueued tuple are captured alongside
            # the hooks.
            captured_output_dtypes_shapes.capture(
                [tf_preds_dtypes, tf_preds_shapes])
            return tpu_ops.outfeed_enqueue_tuple(tf_preds)

        else:
            tf_preds.extend([tf_loss, global_step])
            captured_hooks.capture([master_to_slice_hook, None])
            return tf_preds
def eval_op(batch, hparams, config_name):
    """Define an evaluation op.

    Encodes and decodes the batch with the model from the named config module,
    tracks the mean MSE loss as a metric, and adds spectrogram summaries for
    the inputs, the reconstructions, their difference, a halfway linear
    interpolation between neighbouring examples, and pitch shifts of +/-2.

    Note that `batch["pitch"]` is overwritten in place while the pitch-shifted
    reconstructions are built; on return it holds the +2 shifted pitch.

    Args:
      batch: Batch produced by NSynthReader.
      hparams: Hyperparameters.
      config_name: Name of config module.

    Returns:
      eval_op: A list with the metric update ops.
    """
    phase = not (hparams.mag_only or hparams.raw_audio)

    config = utils.get_module("baseline.models.ae_configs.%s" % config_name)

    if not hparams.raw_audio:
        x = batch["spectrogram"]
    else:
        # Add height and channel dims to the raw audio.
        x = tf.expand_dims(tf.expand_dims(batch["audio"], 1), -1)

    # Define the model.
    with tf.name_scope("Model"):
        z = config.encode(x, hparams, is_training=False)
        xhat = config.decode(z, batch, hparams, is_training=False)

    # Expose tensors for interpolation.
    for key, value in (("x", x), ("pitch", batch["pitch"]),
                       ("z", z), ("xhat", xhat)):
        tf.add_to_collection(key, value)

    total_loss = compute_mse_loss(x, xhat, hparams)

    # Define the metrics.
    metric_map = {
        "Loss": slim.metrics.mean(total_loss),
    }
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map(
        metric_map)

    # Define the summaries.
    for metric_name, metric_value in names_to_values.items():
        slim.summaries.add_scalar_summary(
            metric_value, metric_name, print_summary=True)

    # Interpolate.
    with tf.name_scope("Interpolation"):
        xhat = config.decode(z, batch, hparams, reuse=True, is_training=False)

        # Linear interpolation: average each code with the next example's
        # code (the batch is rotated by one).
        z_rotated = tf.concat([z[1:], z[:1]], 0)
        z_midpoint = (z + z_rotated) / 2.0
        xhat_linear_half = config.decode(
            z_midpoint, batch, hparams, reuse=True, is_training=False)

        # Pitch shift, clipped to the range [0, 127].
        pitch_plus_2 = tf.clip_by_value(batch["pitch"] + 2, 0, 127)
        pitch_minus_2 = tf.clip_by_value(batch["pitch"] - 2, 0, 127)

        batch["pitch"] = pitch_minus_2
        xhat_pitch_minus_2 = config.decode(
            z, batch, hparams, reuse=True, is_training=False)
        batch["pitch"] = pitch_plus_2
        xhat_pitch_plus_2 = config.decode(
            z, batch, hparams, reuse=True, is_training=False)

    summarize = utils.specgram_summaries
    summarize(x, "Training Examples", hparams, phase=phase)
    summarize(xhat, "Reconstructions", hparams, phase=phase)
    summarize(x - xhat, "Difference", hparams, audio=False, phase=phase)
    summarize(xhat_linear_half, "Linear Interp. 0.5", hparams, phase=phase)
    summarize(xhat_pitch_plus_2, "Pitch +2", hparams, phase=phase)
    summarize(xhat_pitch_minus_2, "Pitch -2", hparams, phase=phase)

    return list(names_to_updates.values())
# Parameters of a 9 -> 50 -> 50 -> 1 network: weights drawn from a truncated
# normal (stddev 0.1), biases initialized to 0.1.
weight1 = tf.Variable(tf.truncated_normal([9, 50], stddev=0.1))
bias1 = tf.Variable(tf.constant(0.1, shape=[50]))
weight2 = tf.Variable(tf.truncated_normal([50, 50], stddev=0.1))
bias2 = tf.Variable(tf.constant(0.1, shape=[50]))
weight3 = tf.Variable(tf.truncated_normal([50, 1], stddev=0.1))
bias3 = tf.Variable(tf.constant(0.1, shape=[1]))
sample_size = len(data)
# Output y
y = hidden_layer(x, weight1, bias1, weight2, bias2, weight3, bias3)
# Loss function: sum of squared errors divided by the number of samples.
error_loss = tf.reduce_sum(tf.pow(y_ - y, 2)) / sample_size
tf.add_to_collection("losses", error_loss)
# Add regularization: L2 penalty on the three weight matrices.
#regularizer = tf.contrib.layers.l2_regularizer(0.01)
regularizer = tf.keras.regularizers.l2(0.001)
regularization = regularizer(weight1) + regularizer(weight2) + regularizer(
    weight3)
tf.add_to_collection("losses", regularization)
# Total loss is the sum of everything in the "losses" collection.
loss = tf.add_n(tf.get_collection("losses"))
# Define the optimizer
train_op = tf.train.AdamOptimizer(0.05).minimize(loss)
#train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
# Define the accuracy
def merge(tensors_list, mode, axis=1, name="Merge"):
    """ Merge.

    Merge a list of `Tensor` into a single one. A merging 'mode' must be
    specified, check below for the different options.

    Input:
        List of Tensors.

    Output:
        Merged Tensors.

    Arguments:
        tensors_list: A list of `Tensor`, A list of tensors to merge.
        mode: `str`. Merging mode, it supports:
            ```
            'concat': concatenate outputs along specified axis
            'elemwise_sum': outputs element-wise sum
            'elemwise_mul': outputs element-wise mul
            'sum': outputs element-wise sum along specified axis
            'mean': outputs element-wise average along specified axis
            'prod': outputs element-wise multiplication along specified axis
            'max': outputs max elements along specified axis
            'min': outputs min elements along specified axis
            'and': `logical and` btw outputs elements along specified axis
            'or': `logical or` btw outputs elements along specified axis
            ```
        axis: `int`. Represents the axis to use for merging mode.
            In most cases: 0 for concat and 1 for other modes.
        name: A name for this layer (optional). Default: 'Merge'.

    The reducing modes ('sum' through 'or') first concatenate the tensors
    along `axis` and then reduce over that same axis. The merged tensor is
    also added to the `tf.GraphKeys.LAYER_TENSOR + '/' + name` collection.

    """
    assert len(tensors_list) > 1, "Merge required 2 or more tensors."

    # Modes folding the tensors pairwise, mapped to the `tf` binary op name.
    pairwise_ops = {'elemwise_sum': 'add', 'elemwise_mul': 'multiply'}
    # Modes that concat then reduce, mapped to the `tf` reduction name.
    reduce_ops = {
        'sum': 'reduce_sum',
        'mean': 'reduce_mean',
        'prod': 'reduce_prod',
        'max': 'reduce_max',
        'min': 'reduce_min',
        'and': 'reduce_all',
        'or': 'reduce_any',
    }

    with tf.name_scope(name):
        tensors = list(tensors_list)

        if mode == 'concat':
            inference = tf.concat(tensors, axis)
        elif mode in pairwise_ops:
            combine = getattr(tf, pairwise_ops[mode])
            inference = tensors[0]
            for other in tensors[1:]:
                inference = combine(inference, other)
        elif mode in reduce_ops:
            reduce_fn = getattr(tf, reduce_ops[mode])
            inference = reduce_fn(tf.concat(tensors, axis),
                                  reduction_indices=axis)
        else:
            raise Exception("Unknown merge mode", str(mode))

    # Track output tensor.
    tf.add_to_collection(tf.GraphKeys.LAYER_TENSOR + '/' + name, inference)

    return inference
def model_fn(self, features, labels, mode, config = None, params = None):
  """Estimator model_fn.

  Validates and packs the inputs, runs `inference_network_fn`, and then builds
  the mode-specific EstimatorSpec: export outputs for PREDICT, train op,
  hooks, scaffold and saver for TRAIN, and metrics and hooks for EVAL.

  Args:
    features: This is the first item returned from the input_fn and parsed by
      tensorspec_utils.validate_and_pack. A spec_structure which fulfills the
      requirements of the self.get_feature_specification.
    labels: This is the second item returned from the input_fn and parsed by
      tensorspec_utils.validate_and_pack. A spec_structure which fulfills the
      requirements of the self.get_feature_specification.
    mode: (ModeKeys) Specifies if this is training, evaluation or prediction.
    config: (Optional tf.estimator.RunConfig or contrib_tpu.RunConfig) Will
      receive what is passed to Estimator in config parameter, or the default
      config (tf.estimator.RunConfig). Allows updating things in your model_fn
      based on configuration such as num_ps_replicas, or model_dir.
    params: An optional dict of hyper parameters that will be passed into
      input_fn and model_fn. Keys are names of parameters, values are basic
      python types. There are reserved keys for TPUEstimator, including
      'batch_size'.

  Raises:
    ValueError: If the mode key is not supported, not in [PREDICT, TRAIN,
      EVAL]; if inference_network_fn returns a tuple whose length is not 2;
      if create_export_outputs_fn returns neither a tuple nor a dict; or if
      model_train_fn returns neither a tf.Tensor nor a tuple.

  Returns:
    An EstimatorSpec.
  """
  features = tensorspec_utils.validate_and_pack(
      expected_spec=self.get_feature_specification(mode),
      actual_tensors_or_spec=features,
      ignore_batch=True)
  # NOTE(review): this is a truthiness test, so an empty label structure is
  # skipped as well as None -- confirm labels is never a bare tf.Tensor here.
  if labels:
    labels = tensorspec_utils.validate_and_pack(
        expected_spec=self.get_label_specification(mode),
        actual_tensors_or_spec=labels,
        ignore_batch=True)
  inference_outputs = self.inference_network_fn(features, labels, mode,
                                                config, params)
  # inference_network_fn may return either the outputs alone or a
  # (outputs, update_ops) pair; normalize to the two separate names.
  update_ops = None
  if isinstance(inference_outputs, tuple):
    if len(inference_outputs) != 2:
      raise ValueError('Unknown output of inference_network_fn: '
                       'tuple of length %d' % len(inference_outputs))
    outputs = inference_outputs[0]
    update_ops = inference_outputs[1]
    inference_outputs = outputs

  if mode == tf.estimator.ModeKeys.PREDICT:
    model_fn_results = self.create_export_outputs_fn(features,
                                                     inference_outputs, mode,
                                                     config, params)
    export_outputs = None
    if isinstance(model_fn_results, tuple):
      # Explicit (predictions, export_outputs) pair.
      predictions = model_fn_results[0]
      export_outputs = model_fn_results[1]
    elif isinstance(model_fn_results, dict):
      # Only predictions were returned: derive the export outputs from them.
      export_outputs = {}
      if len(model_fn_results) == 1:
        # A single prediction is additionally exported as a RegressionOutput
        # under its own name.
        name, output = list(model_fn_results.items())[0]
        export_outputs[name] = tf.estimator.export.RegressionOutput(output)
      export_outputs[tf.saved_model.signature_constants
                     .DEFAULT_SERVING_SIGNATURE_DEF_KEY] = (
                         tf.estimator.export.PredictOutput(model_fn_results))
      predictions = model_fn_results
    else:
      raise ValueError('The create_export_outputs_fn should return a '
                       'tuple(predictions, export_outputs) or predictions.')

    return tf.estimator.EstimatorSpec(
        mode=mode, predictions=predictions, export_outputs=export_outputs)

  # TRAIN and EVAL both need the loss (and optional auxiliary outputs).
  train_fn_result = self.model_train_fn(features, labels, inference_outputs,
                                        mode, config, params)
  if isinstance(train_fn_result, tf.Tensor):
    train_loss = train_fn_result
    train_outputs = {}
  elif isinstance(train_fn_result, tuple):
    train_loss = train_fn_result[0]
    train_outputs = train_fn_result[1]
  else:
    raise ValueError('The model_train_fn should return a '
                     'tuple(loss, train_outputs) or loss.')

  if mode == tf.estimator.ModeKeys.TRAIN:
    # Create the tf.train.Optimizer.
    optimizer = self.create_optimizer()

    train_op = self.create_train_op(train_loss, optimizer, update_ops,
                                    train_outputs)

    self.add_summaries(features, labels, inference_outputs, train_loss,
                       train_outputs, mode, config, params)

    # Now the optimizer has been created, therefore, the checkpoint could be
    # initialized.
    # No new variables are allowed to be added, otherwise
    # we would not initialize these variables.
    # Note, this feature is only available for train to bootstrap a model
    # (partially) from a different model. As soon as this checkpoint is
    # written all other modes will use the local checkpoint within model_dir.
    self.maybe_init_from_checkpoint()
    training_hooks = []

    # EstimatorSpec has training_chief_hooks, but TPUEstimatorSpec does not,
    # so we have to use training_hooks here and check is_chief.
    if config and config.is_chief:  # pytype: disable=attribute-error
      training_hooks.append(
          gin_utils.GinConfigSaverHook(
              config.model_dir, summarize_config=True))

    if hasattr(self, 'writer_init_ops'):
      training_hooks.append(V2SummaryInitHook(self.writer_init_ops[mode]))

    # `SyncReplicasOptimizer` needs to attach a training hook.
    # NOTE(review): config is dereferenced without a None check here, unlike
    # the guarded uses above -- confirm config is always set in this case.
    if self._sync_replicas_optimizer:
      training_hooks.append(
          self._sync_replicas_optimizer.make_session_run_hook(
              config.is_chief))  # pytype: disable=attribute-error

    # Return the value of the property first since it might be changed.
    scaffold_fn = self.scaffold_fn
    scaffold = scaffold_fn()

    # In order to export asynchronously the saver has to be registered
    # in the graph collection. The scaffold function might register a
    # saver already, which is why it is checked here and a saver is only
    # added if none has been added.
    if not tf.get_collection(tf.GraphKeys.SAVERS):
      # TODO(T2R_CONTRIBUTORS): Switch to using gin config for all saver params.
      keep_checkpoint_every_n_hours = None
      max_to_keep = None
      if config is not None:
        keep_checkpoint_every_n_hours = config.keep_checkpoint_every_n_hours
        max_to_keep = config.keep_checkpoint_max
      saver = gin_configurable_saver(
          keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
          max_to_keep=max_to_keep,
      )
      tf.add_to_collection(tf.GraphKeys.SAVERS, saver)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=train_loss,
        train_op=train_op,
        training_hooks=training_hooks,
        scaffold=scaffold)

  if mode == tf.estimator.ModeKeys.EVAL:
    self.add_summaries(features, labels, inference_outputs, train_loss,
                       train_outputs, mode, config, params)

    eval_metrics = self.model_eval_fn(features, labels, inference_outputs,
                                      train_loss, train_outputs, mode, config,
                                      params)
    evaluation_hooks = self.get_eval_hooks(config, params)
    if config and config.is_chief:  # pytype: disable=attribute-error
      # NOTE(review): params defaults to None but is dereferenced here --
      # confirm callers always pass params in EVAL mode.
      eval_name = params.get('eval_name', 'eval')  # pytype: disable=attribute-error
      evaluation_hooks.append(
          gin_utils.GinConfigSaverHook(
              os.path.join(config.model_dir, eval_name),
              summarize_config=True))
    if hasattr(self, 'writer_init_ops'):
      evaluation_hooks.append(V2SummaryInitHook(self.writer_init_ops[mode]))

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=train_loss,
        eval_metric_ops=eval_metrics,
        evaluation_hooks=evaluation_hooks)

  raise ValueError('The mode {} is not supported yet.'.format(mode))
def create_swapping_saver_scaffold(saver=None):
  """Builds a Scaffold around a swapping saver for the enclosing `optimizer`.

  The saver is also registered in the `tf.GraphKeys.SAVERS` collection.

  Args:
    saver: Ignored; a new swapping saver is always created from `optimizer`.

  Returns:
    A `tf.train.Scaffold` using the newly created swapping saver.
  """
  del saver  # Never read: the swapping saver below always takes its place.
  swapping_saver = optimizers.create_swapping_saver(optimizer)
  tf.add_to_collection(tf.GraphKeys.SAVERS, swapping_saver)
  scaffold = tf.train.Scaffold(saver=swapping_saver)
  return scaffold
def fully_connected(inputs, num_outputs,
                    activation_fn=tf.nn.relu,
                    scope=None,
                    collection=None,
                    distribution=NoiseDistribution.INDEPENDENT,
                    summary_writer=None):
  """Creates a fully connected layer with noise.

  The layer computes `activation_fn(inputs @ w + b)` with
  `w = mu_w + sigma_w * epsilon_w` and `b = mu_b + sigma_b * epsilon_b`, where
  the `mu_*` / `sigma_*` tensors are trainable variables and the `epsilon_*`
  tensors are noise drawn with `tf.truncated_normal`.

  Args:
    inputs: Input tensor; the size of its last dimension must be statically
      known, and it is multiplied with `tf.matmul` by a
      `(num_inputs, num_outputs)` weight matrix.
    num_outputs: Number of output units.
    activation_fn: Callable applied to the biased output, or None for a
      linear layer.
    scope: Variable scope for the layer variables. When None a unique scope
      derived from 'noisy_fully_connected' is used.
    collection: Optional graph collection name to which the four layer
      variables are added.
    distribution: A `NoiseDistribution` value selecting independent or
      factorised noise.
    summary_writer: When not None, a scalar summary of the mean of `sigma_w`
      is emitted.

  Returns:
    The output tensor of the layer.

  Raises:
    ValueError: If `distribution` is not a known noise distribution.
  """
  num_inputs = int(inputs.get_shape()[-1])
  weight_shape = (num_inputs, num_outputs)
  biases_shape = [num_outputs]

  # Parameters for each noise distribution, see Section 3.2 in original paper.
  if distribution == NoiseDistribution.INDEPENDENT:
    stddev = np.sqrt(3. / num_inputs)
    constant = 0.017
    epsilon_w = tf.truncated_normal(weight_shape)
    epsilon_b = tf.truncated_normal(biases_shape)
  elif distribution == NoiseDistribution.FACTORISED:
    stddev = np.sqrt(1. / num_inputs)
    # Float literal so the division is never an integer division.
    constant = 0.5 * np.sqrt(1. / num_inputs)
    # Factorised noise uses one noise value per input unit and one per output
    # unit; the weight noise is their outer product. The input noise therefore
    # has to be a vector of length num_inputs (not a full weight-shaped
    # matrix), and it has to be the left factor so the product has shape
    # (num_inputs, num_outputs), matching mu_w and sigma_w.
    noise_input = tf.truncated_normal([num_inputs])
    noise_output = tf.truncated_normal(biases_shape)
    epsilon_w = tf.matmul(
        signed_sqrt(noise_input)[:, None], signed_sqrt(noise_output)[None, :])
    epsilon_b = signed_sqrt(noise_output)
  else:
    raise ValueError('Unknown noise distribution')

  mu_initializer = tf.initializers.random_uniform(minval=-stddev, maxval=stddev)
  sigma_initializer = tf.constant_initializer(value=constant)

  # default_name makes the documented default scope=None usable;
  # tf.variable_scope rejects a None scope when no default name is given.
  with tf.variable_scope(scope, default_name='noisy_fully_connected'):
    mu_w = tf.get_variable('mu_w', weight_shape, trainable=True,
                           initializer=mu_initializer)
    sigma_w = tf.get_variable('sigma_w', weight_shape, trainable=True,
                              initializer=sigma_initializer)
    mu_b = tf.get_variable('mu_b', biases_shape, trainable=True,
                           initializer=mu_initializer)
    sigma_b = tf.get_variable('sigma_b', biases_shape, trainable=True,
                              initializer=sigma_initializer)
    if collection is not None:
      tf.add_to_collection(collection, mu_w)
      tf.add_to_collection(collection, mu_b)
      tf.add_to_collection(collection, sigma_w)
      tf.add_to_collection(collection, sigma_b)
    w = mu_w + sigma_w * epsilon_w
    b = mu_b + sigma_b * epsilon_b
    layer = tf.matmul(inputs, w)
    layer_bias = tf.nn.bias_add(layer, b)
    if summary_writer is not None:
      with tf.variable_scope('Noisy'):
        tf.summary.scalar('Sigma', tf.reduce_mean(sigma_w))
    if activation_fn is not None:
      layer_bias = activation_fn(layer_bias)
  return layer_bias
def get_customized_apply_compression_op(self,
                                        a_matrix_tfvar,
                                        matrix_compressor,
                                        layer_obj,
                                        weight_params_fn,
                                        weight_init_obj,
                                        scope='default_scope'):
  """Returns pruning + kmeans compressed operator for a customized layer.

  Creates the pruning variables (mask, threshold and, for gradient-based
  prune options, gradient / old-weight buffers) and the compression variables
  (alpha, b_matrix, c_matrix) on `layer_obj`, then builds an op that yields
  the masked weight outside the compression step range and a blend of the
  masked weight and the masked, codebook-reconstructed weight inside it.

  Args:
    a_matrix_tfvar: TF variable representing a tensor variable in a model.
    matrix_compressor: MatrixCompressorInferface object to specify the
      compression algorithm. Must return two matrices b_matrix,c_matrix in its
      compression.
    layer_obj: a customized layer object that handles variable creation.
    weight_params_fn: functional handle to create model parameters.
    weight_init_obj: a weight initialization object.
    scope: TF scope used for creating new TF variables.

  Returns:
    A two-element list `[final_op, update_op]`: `final_op` is the TF node that
    has the compressed version of a_matrix_tfvar and `update_op` is a
    `tf.no_op()`.
  """
  self.matrix_compressor = matrix_compressor
  # The compressor is run on an all-zeros matrix of the same shape; only the
  # shapes of the resulting b_matrix / c_matrix are used below.
  a_matrix = np.zeros(shape=a_matrix_tfvar.shape)
  if getattr(self._spec, 'do_transpose', False):
    a_matrix = np.transpose(a_matrix)
  [b_matrix, c_matrix] = matrix_compressor.static_matrix_compressor(a_matrix)
  self.uncompressed_size = matrix_compressor.uncompressed_size
  self.compressed_size = matrix_compressor.compressed_size

  p = layer_obj.params
  with tf.variable_scope(scope) as scope:
    # Create pruning relevant variables.
    mask_pc = weight_params_fn(a_matrix.shape, weight_init_obj.Constant(1.0),
                               p.dtype)
    threshold_pc = weight_params_fn([], weight_init_obj.Constant(0.0),
                                    tf.float32)
    self._create_layer_variable(layer_obj, 'mask', mask_pc, None, False)
    self._create_layer_variable(layer_obj, 'threshold', threshold_pc, None,
                                False)
    # Register weight / mask / threshold once per layer in the pruning
    # collections.
    if layer_obj.vars.mask not in tf.get_collection(pruning.MASK_COLLECTION):
      tf.add_to_collection(pruning.WEIGHT_COLLECTION, layer_obj.vars.wm)
      tf.add_to_collection(pruning.MASK_COLLECTION, layer_obj.vars.mask)
      tf.add_to_collection(pruning.THRESHOLD_COLLECTION,
                           layer_obj.vars.threshold)
    # Gradient-based pruning additionally needs gradient and previous-weight
    # buffers, all zero-initialized with the weight's shape.
    if self.pruning_obj.get_spec().prune_option in [
        'first_order_gradient', 'second_order_gradient'
    ]:
      grad_pc = weight_params_fn(a_matrix.shape,
                                 weight_init_obj.Constant(0.0), p.dtype)
      self._create_layer_variable(layer_obj, 'gradient', grad_pc, None, False)
      self._create_layer_variable(layer_obj, 'old_weight', grad_pc, None,
                                  False)
      self._create_layer_variable(layer_obj, 'old_old_weight', grad_pc, None,
                                  False)
      tf.add_to_collection(pruning.WEIGHT_GRADIENT_COLLECTION,
                           layer_obj.vars.gradient)
      tf.add_to_collection(pruning.OLD_WEIGHT_COLLECTION,
                           layer_obj.vars.old_weight)
      tf.add_to_collection(pruning.OLD_OLD_WEIGHT_COLLECTION,
                           layer_obj.vars.old_old_weight)

    # Compression variables: b_matrix is looked up by the int32 indices in
    # c_matrix; alpha weights the uncompressed term in the blend below.
    b_matrix_pc = weight_params_fn(b_matrix.shape,
                                   weight_init_obj.Constant(1.0), p.dtype)
    c_matrix_pc = weight_params_fn(c_matrix.shape,
                                   weight_init_obj.Constant(1), tf.int32)
    alpha_pc = weight_params_fn([], weight_init_obj.Constant(1.0), tf.float32)

    self._create_layer_variable(layer_obj, 'alpha', alpha_pc, None, False)
    self._create_layer_variable(
        layer_obj,
        'b_matrix_tfvar',
        b_matrix_pc,
        None,
        trainable=self.matrix_compressor.get_spec().is_b_matrix_trainable)
    self._create_layer_variable(
        layer_obj,
        'c_matrix_tfvar',
        c_matrix_pc,
        None,
        trainable=self.matrix_compressor.get_spec().is_c_matrix_trainable)

    self.b_matrix_tfvar = layer_obj.vars.b_matrix_tfvar
    self.c_matrix_tfvar = layer_obj.vars.c_matrix_tfvar
    self.alpha = layer_obj.vars.alpha
    self.a_matrix_tfvar = a_matrix_tfvar
    self.mask = layer_obj.vars.mask
    self.threshold = layer_obj.vars.threshold

    # Pruned (masked) but uncompressed weight.
    self.pruned_a_matrix_tfvar = tf.multiply(layer_obj.vars.wm,
                                             layer_obj.vars.mask,
                                             'masked_weight')

    def maybe_apply_compression():
      """Decide whether global step is within compression range.

      Returns:
        is_step_within_compression_range: bool.
      """
      with tf.compat.v1.name_scope(self._spec.name):
        # Compress if current step is more than begin_compression_step and
        # less than end_compression_step (unless it's negative)
        global_step = tf.train.get_global_step()

        def real_global_step_fn():
          return tf.cast(tf.train.get_global_step(), tf.int32)

        def mock_global_step_fn():
          # Used when no global step exists in the graph.
          return self._spec.begin_compression_step

        def is_global_step_none(global_step):
          return tf.constant(global_step is None, dtype=tf.bool)

        global_step = tf.cond(
            is_global_step_none(global_step), mock_global_step_fn,
            real_global_step_fn)
        is_step_within_compression_range = tf.logical_and(
            tf.greater_equal(
                tf.cast(global_step, tf.int32),
                self._spec.begin_compression_step),
            tf.logical_or(
                tf.less_equal(
                    tf.cast(global_step, tf.int32),
                    self._spec.end_compression_step),
                tf.less(self._spec.end_compression_step, 0)))
        return is_step_within_compression_range

    # Blend: alpha * masked weight + (1 - alpha) * masked reconstruction from
    # the (b_matrix, c_matrix) lookup. With do_transpose the reconstruction is
    # reshaped to the transposed shape and transposed back.
    if getattr(self._spec, 'do_transpose', False):
      self.pruning_and_compression_op = (
          self.alpha * self.pruned_a_matrix_tfvar +
          (1 - self.alpha) * tf.math.multiply(
              tf.transpose(
                  tf.reshape(
                      tf.nn.embedding_lookup(self.b_matrix_tfvar,
                                             self.c_matrix_tfvar),
                      tf.transpose(a_matrix_tfvar).shape)),
              self.mask,
              name='pruned_compressed_weight'))
    else:
      self.pruning_and_compression_op = (
          self.alpha * self.pruned_a_matrix_tfvar +
          (1 - self.alpha) * tf.math.multiply(
              tf.reshape(
                  tf.nn.embedding_lookup(self.b_matrix_tfvar,
                                         self.c_matrix_tfvar),
                  a_matrix_tfvar.shape),
              self.mask,
              name='pruned_compressed_weight'))

    def pruned_a_matrix_fn():
      return self.pruned_a_matrix_tfvar

    def quantized_pruned_a_matrix_fn():
      return self.pruning_and_compression_op

    # Inside the compression range use the blended op, otherwise the plain
    # masked weight.
    self.final_op = tf.cond(maybe_apply_compression(),
                            quantized_pruned_a_matrix_fn, pruned_a_matrix_fn)

  self.add_compression_summaries()
  self.pruning_obj.add_pruning_summaries()
  # No separate update step is built here; callers still receive an op.
  self.update_op = tf.no_op()
  return [self.final_op, self.update_op]
def __call__(self, x, training, distname='batch_normalization'):
  """Applies batch normalization, syncing statistics across TPU replicas.

  In training, the mean and mean-of-squares of `x` are reduced over axes
  0, 1 and 2, summed across replicas with `tf.tpu.cross_replica_sum`, and
  used to normalize `x`; moving-average updates are added to
  `tf.GraphKeys.UPDATE_OPS`. Otherwise the stored moving statistics are used
  through `tf.nn.fused_batch_norm`.

  Args:
    x: Input tensor; it is cast to float32 and reduced over axes 0, 1, 2.
    training: Truthy to use batch statistics, falsy to use moving statistics.
    distname: Unused.

  Returns:
    The normalized float32 tensor.
  """
  channel_shape = [x.shape[-1]]
  with tf.variable_scope('batch_normalization'):
    init_ones = tf.initializers.ones()
    init_zeros = tf.initializers.zeros()
    gamma = tf.get_variable(
        'gamma', channel_shape, initializer=init_ones, trainable=True,
        use_resource=True)
    beta = tf.get_variable(
        'beta', channel_shape, initializer=init_zeros, trainable=True,
        use_resource=True)
    moving_mean = tf.get_variable(
        'moving_mean', channel_shape, initializer=init_zeros, trainable=False,
        use_resource=True)
    moving_variance = tf.get_variable(
        'moving_variance', channel_shape, initializer=init_ones,
        trainable=False, use_resource=True)

  replica_count = FLAGS.num_replicas
  x = tf.cast(x, tf.float32)

  # Inference path: normalize with the stored moving statistics.
  if not training:
    normalized, _, _ = tf.nn.fused_batch_norm(
        x, scale=gamma, offset=beta, mean=moving_mean,
        variance=moving_variance, epsilon=self.epsilon, is_training=False)
    return normalized

  # Decide which replicas share statistics. Up to 8 replicas form one group;
  # beyond that, replicas are partitioned into groups of `shards_per_group`.
  if replica_count > 8:
    per_replica_batch = FLAGS.train_batch_size / replica_count
    shards_per_group = max(
        1, int(FLAGS.batch_norm_batch_size / per_replica_batch))
    replica_ids = np.arange(replica_count, dtype=np.int32)
    group_assign = replica_ids.reshape([-1, shards_per_group]).tolist()
    group_shards = tf.cast(shards_per_group, tf.float32)
  else:
    group_assign = None
    group_shards = tf.cast(replica_count, tf.float32)

  local_mean = tf.reduce_mean(x, [0, 1, 2])
  mean = tf.tpu.cross_replica_sum(local_mean, group_assign) / group_shards

  # Var[x] = E[x^2] - E[x]^2
  local_mean_sq = tf.reduce_mean(tf.math.square(x), [0, 1, 2])
  mean_sq = tf.tpu.cross_replica_sum(local_mean_sq, group_assign) / group_shards
  variance = mean_sq - tf.math.square(mean)

  decay = tf.cast(1. - self.momentum, tf.float32)

  def update_moving_stat(moving, batch_stat, name):
    # Average the statistic over all replicas, then move `moving` towards it
    # by a fraction `decay` of the gap.
    replica_count_fp = tf.cast(replica_count, tf.float32)
    global_stat = tf.tpu.cross_replica_sum(batch_stat) / replica_count_fp
    delta = decay * (moving - global_stat)
    return tf.assign_sub(moving, delta, use_locking=True, name=name)

  mean_update = update_moving_stat(moving_mean, mean, name='moving_mean')
  tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
  variance_update = update_moving_stat(
      moving_variance, variance, name='moving_variance')
  tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)

  return tf.nn.batch_normalization(
      x, mean=mean, variance=variance, offset=beta, scale=gamma,
      variance_epsilon=self.epsilon)
def call(self, inputs, training=None):
  """Runs the parent layer and registers its updates in UPDATE_OPS.

  Args:
    inputs: Forwarded unchanged to the parent `call`.
    training: Forwarded unchanged to the parent `call`.

  Returns:
    Whatever the parent `call` returns.
  """
  result = super().call(inputs, training)
  # A temporary hack for tf1 compatibility with keras batch norm.
  update_collection = tf.GraphKeys.UPDATE_OPS
  for update_op in self.updates:
    tf.add_to_collection(update_collection, update_op)
  return result
def build():
  """Builds the Tensorflow graph.

  Reads `mode`, `hparams`, `encoder_decoder`, `input_size`, `num_classes`,
  `no_event_label` and `sequence_example_file_paths` from the enclosing scope.
  Nothing is returned; results are published through graph collections:

    * 'train' mode: the train op in 'train_op', plus one collection (and a
      scalar summary) per entry of the summarized loss/metric dict.
    * 'eval' mode: metric update ops in 'eval_ops', plus one collection (and
      a scalar summary) per metric value.
    * 'generate' mode: 'inputs', 'temperature', 'softmax', and the flattened
      'initial_state' / 'final_state' tensors.
  """
  inputs, labels, lengths = None, None, None

  if mode in ('train', 'eval'):
    # Scalar labels for a single label per step, vector labels otherwise.
    if isinstance(no_event_label, numbers.Number):
      label_shape = []
    else:
      label_shape = [len(no_event_label)]
    inputs, labels, lengths = magenta.common.get_padded_batch(
        sequence_example_file_paths, hparams.batch_size, input_size,
        label_shape=label_shape, shuffle=mode == 'train')

  elif mode == 'generate':
    inputs = tf.placeholder(tf.float32, [hparams.batch_size, None,
                                         input_size])

  # One-hot-index encoders feed a single index per step; expand it to a
  # one-hot vector of the encoder's input depth before the RNN.
  if isinstance(encoder_decoder,
                magenta.music.OneHotIndexEventSequenceEncoderDecoder):
    expanded_inputs = tf.one_hot(
        tf.cast(tf.squeeze(inputs, axis=-1), tf.int64),
        encoder_decoder.input_depth)
  else:
    expanded_inputs = inputs

  dropout_keep_prob = 1.0 if mode == 'generate' else hparams.dropout_keep_prob

  if hparams.use_cudnn:
    outputs, initial_state, final_state = make_cudnn(
        expanded_inputs, hparams.rnn_layer_sizes, hparams.batch_size, mode,
        dropout_keep_prob=dropout_keep_prob,
        residual_connections=hparams.residual_connections)

  else:
    cell = make_rnn_cell(
        hparams.rnn_layer_sizes,
        dropout_keep_prob=dropout_keep_prob,
        attn_length=hparams.attn_length,
        residual_connections=hparams.residual_connections)

    initial_state = cell.zero_state(hparams.batch_size, tf.float32)

    # Feed the expanded inputs, exactly as the cuDNN path does. Passing the
    # raw `inputs` here would silently skip the one-hot expansion above and
    # make the two paths disagree for one-hot-index encoders.
    outputs, final_state = tf.nn.dynamic_rnn(
        cell, expanded_inputs, sequence_length=lengths,
        initial_state=initial_state, swap_memory=True)

  outputs_flat = magenta.common.flatten_maybe_padded_sequences(
      outputs, lengths)
  # With several label heads, all heads share one linear layer whose output
  # is sliced per head further down.
  if isinstance(num_classes, numbers.Number):
    num_logits = num_classes
  else:
    num_logits = sum(num_classes)
  logits_flat = contrib_layers.linear(outputs_flat, num_logits)

  if mode in ('train', 'eval'):
    labels_flat = magenta.common.flatten_maybe_padded_sequences(
        labels, lengths)

    if isinstance(num_classes, numbers.Number):
      softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels_flat, logits=logits_flat)
      predictions_flat = tf.argmax(logits_flat, axis=1)
    else:
      # logits_offsets[i]:logits_offsets[i + 1] is the logit slice of head i.
      logits_offsets = np.cumsum([0] + num_classes)
      softmax_cross_entropy = []
      predictions = []
      for i in range(len(num_classes)):
        softmax_cross_entropy.append(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels_flat[:, i],
                logits=logits_flat[
                    :, logits_offsets[i]:logits_offsets[i + 1]]))
        predictions.append(
            tf.argmax(logits_flat[
                :, logits_offsets[i]:logits_offsets[i + 1]], axis=1))
      predictions_flat = tf.stack(predictions, 1)

    correct_predictions = tf.to_float(
        tf.equal(labels_flat, predictions_flat))
    event_positions = tf.to_float(tf.not_equal(labels_flat, no_event_label))
    no_event_positions = tf.to_float(tf.equal(labels_flat, no_event_label))

    # Compute the total number of time steps across all sequences in the
    # batch. For some models this will be different from the number of RNN
    # steps.
    def batch_labels_to_num_steps(batch_labels, lengths):
      num_steps = 0
      for labels, length in zip(batch_labels, lengths):
        num_steps += encoder_decoder.labels_to_num_steps(labels[:length])
      return np.float32(num_steps)
    num_steps = tf.py_func(
        batch_labels_to_num_steps, [labels, lengths], tf.float32)

    if mode == 'train':
      loss = tf.reduce_mean(softmax_cross_entropy)
      perplexity = tf.exp(loss)
      accuracy = tf.reduce_mean(correct_predictions)
      event_accuracy = (
          tf.reduce_sum(correct_predictions * event_positions) /
          tf.reduce_sum(event_positions))
      no_event_accuracy = (
          tf.reduce_sum(correct_predictions * no_event_positions) /
          tf.reduce_sum(no_event_positions))

      loss_per_step = tf.reduce_sum(softmax_cross_entropy) / num_steps
      perplexity_per_step = tf.exp(loss_per_step)

      optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)

      train_op = contrib_slim.learning.create_train_op(
          loss, optimizer, clip_gradient_norm=hparams.clip_norm)
      tf.add_to_collection('train_op', train_op)

      vars_to_summarize = {
          'loss': loss,
          'metrics/perplexity': perplexity,
          'metrics/accuracy': accuracy,
          'metrics/event_accuracy': event_accuracy,
          'metrics/no_event_accuracy': no_event_accuracy,
          'metrics/loss_per_step': loss_per_step,
          'metrics/perplexity_per_step': perplexity_per_step,
      }
    elif mode == 'eval':
      vars_to_summarize, update_ops = contrib_metrics.aggregate_metric_map({
          'loss': tf.metrics.mean(softmax_cross_entropy),
          'metrics/accuracy': tf.metrics.accuracy(
              labels_flat, predictions_flat),
          'metrics/per_class_accuracy':
              tf.metrics.mean_per_class_accuracy(
                  labels_flat, predictions_flat, num_classes),
          'metrics/event_accuracy': tf.metrics.recall(
              event_positions, correct_predictions),
          'metrics/no_event_accuracy': tf.metrics.recall(
              no_event_positions, correct_predictions),
          'metrics/loss_per_step': tf.metrics.mean(
              tf.reduce_sum(softmax_cross_entropy) / num_steps,
              weights=num_steps),
      })
      for updates_op in update_ops.values():
        tf.add_to_collection('eval_ops', updates_op)

      # Perplexity is just exp(loss) and doesn't need its own update op.
      vars_to_summarize['metrics/perplexity'] = tf.exp(
          vars_to_summarize['loss'])
      vars_to_summarize['metrics/perplexity_per_step'] = tf.exp(
          vars_to_summarize['metrics/loss_per_step'])

    for var_name, var_value in six.iteritems(vars_to_summarize):
      tf.summary.scalar(var_name, var_value)
      tf.add_to_collection(var_name, var_value)

  elif mode == 'generate':
    temperature = tf.placeholder(tf.float32, [])
    if isinstance(num_classes, numbers.Number):
      softmax_flat = tf.nn.softmax(
          tf.div(logits_flat, tf.fill([num_classes], temperature)))
      softmax = tf.reshape(
          softmax_flat, [hparams.batch_size, -1, num_classes])
    else:
      # One temperature-scaled softmax per label head.
      logits_offsets = np.cumsum([0] + num_classes)
      softmax = []
      for i in range(len(num_classes)):
        sm = tf.nn.softmax(
            tf.div(
                logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]],
                tf.fill([num_classes[i]], temperature)))
        sm = tf.reshape(sm, [hparams.batch_size, -1, num_classes[i]])
        softmax.append(sm)

    tf.add_to_collection('inputs', inputs)
    tf.add_to_collection('temperature', temperature)
    tf.add_to_collection('softmax', softmax)
    # Flatten state tuples for metagraph compatibility.
    for state in tf_nest.flatten(initial_state):
      tf.add_to_collection('initial_state', state)
    for state in tf_nest.flatten(final_state):
      tf.add_to_collection('final_state', state)
def module_fn(is_training):
  """Module function.

  Builds the ALBERT graph on int32 placeholders, registers the config and
  vocabulary files as assets, and declares the "tokens", "mlm" and
  "tokenization_info" hub signatures.

  Args:
    is_training: Forwarded to `modeling.AlbertModel`.
  """

  def int_matrix_placeholder(tensor_name):
    return tf.placeholder(tf.int32, [None, None], tensor_name)

  input_ids = int_matrix_placeholder("input_ids")
  input_mask = int_matrix_placeholder("input_mask")
  segment_ids = int_matrix_placeholder("segment_ids")
  mlm_positions = int_matrix_placeholder("mlm_positions")

  albert_dir = FLAGS.albert_directory
  albert_config_path = os.path.join(albert_dir, "albert_config.json")
  albert_config = modeling.AlbertConfig.from_json_file(albert_config_path)
  model = modeling.AlbertModel(
      config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=False,
      use_einsum=FLAGS.use_einsum)

  mlm_logits = get_mlm_logits(model, albert_config, mlm_positions)

  # (constant name, file path) for every asset, in registration order.
  # The "vocab_file" asset is only for visualization purpose.
  asset_specs = (
      ("config_file", albert_config_path),
      ("vocab_model", os.path.join(albert_dir, "30k-clean.model")),
      ("vocab_file", os.path.join(albert_dir, "30k-clean.vocab")),
  )
  assets = {}
  for asset_name, asset_path in asset_specs:
    asset_tensor = tf.constant(
        value=asset_path, dtype=tf.string, name=asset_name)
    # By adding `config_file, vocab_model and vocab_file`
    # to the ASSET_FILEPATHS collection, TF-Hub will
    # rewrite this tensor so that this asset is portable.
    tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset_tensor)
    assets[asset_name] = asset_tensor

  token_inputs = {
      "input_ids": input_ids,
      "input_mask": input_mask,
      "segment_ids": segment_ids,
  }

  hub.add_signature(
      name="tokens",
      inputs=dict(token_inputs),
      outputs={
          "sequence_output": model.get_sequence_output(),
          "pooled_output": model.get_pooled_output(),
      })

  hub.add_signature(
      name="mlm",
      inputs=dict(token_inputs, mlm_positions=mlm_positions),
      outputs={
          "sequence_output": model.get_sequence_output(),
          "pooled_output": model.get_pooled_output(),
          "mlm_logits": mlm_logits,
      })

  # The "vocab_file" output carries the vocab_model asset.
  hub.add_signature(
      name="tokenization_info",
      inputs={},
      outputs={
          "vocab_file": assets["vocab_model"],
          "do_lower_case": tf.constant(FLAGS.do_lower_case),
      })
def model_fn(features, labels, mode, params):
    """Estimator model_fn that builds the model as a mesh-tensorflow graph.

    Imports the features into an mtf mesh, builds the model (sampling graph
    for PREDICT, loss graph otherwise), lowers it to TensorFlow and returns
    the TPUEstimatorSpec for the given mode.

    Args:
        features: input tensor, or a dict holding it under the "feature" key.
        labels: label tensor (same handling as `features`), or None.
        mode: a tf.estimator.ModeKeys value (PREDICT, TRAIN or EVAL).
        params: dict of hyperparameters; it is mutated here (for example
            "num_microbatches" and "remove_partial_sequences" are written).

    Returns:
        A tpu_estimator.TPUEstimatorSpec for `mode`.

    Raises:
        Exception: if params["model"] is not "GPT" in TRAIN / EVAL mode.
        AssertionError: if `mode` is not PREDICT, TRAIN or EVAL.
    """
    # Get global step
    global_step = tf.train.get_global_step()

    # Construct mtf graph + mesh from params
    graph = mtf.Graph()
    mesh_shape = mtf.convert_to_shape(params["mesh_shape"])
    layout_rules = mtf.convert_to_layout_rules(params["layout"])

    # Mesh setup
    if params["use_tpu"]:
        var_placer, mesh_impl = simd_mesh_setup(params, mesh_shape, layout_rules)
    else:
        var_placer = None
        gpu_ids = params["gpu_ids"]
        mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
            mesh_shape, layout_rules, gpu_ids)

    # Trainable variable precision
    # Store to checkpoints in master type, train in slice type, compute in activation type
    if params["precision"] == "bfloat16":
        variable_dtype = mtf.VariableDType(master_dtype=tf.bfloat16, slice_dtype=tf.float32,
                                           activation_dtype=tf.bfloat16)
    else:
        variable_dtype = mtf.VariableDType(master_dtype=tf.float32, slice_dtype=tf.float32, activation_dtype=tf.float32)

    # Build mtf mesh object
    mesh = mtf.Mesh(graph, "my_mesh", var_placer)

    # Build mtf_features & seq length dict for getting number of microbatches
    # We need to pack inputs into a dict to pass into serialize_training_step
    features_dict = {"inputs": features, "labels": labels}
    sequence_length_dict = {
        "inputs": params["n_ctx"], "labels": params["n_ctx"]
    }

    params = add_mode_to_params(params, mode)
    batch_size = get_batch_size(params)

    batch_dim = mtf.Dimension("batch", batch_size)
    batch_dims = [batch_dim]
    feature_length = sequence_length_dict["inputs"]
    length_dim = mtf.Dimension("sequence", feature_length)

    # Import every present feature as a fully replicated int32 mtf tensor of
    # shape [batch, sequence].
    mtf_features = {}
    for key, x in features_dict.items():
        if x is not None:
            feature_shape = mtf.Shape(batch_dims + [length_dim])
            # Dict-valued features carry the actual tensor under "feature".
            if type(features_dict[key]) == dict:
                features_dict[key] = features_dict[key]["feature"]
            x = tf.cast(features_dict[key], tf.int32)
            x = tf.reshape(x, feature_shape.to_integer_list)
            mtf_features[key] = mtf.import_fully_replicated(
                mesh, x, feature_shape, name=key)

    # Instantiate dict for dimensions, bias, etc that can be calculated here once then passed into model
    other_features = {}
    memory_length_dim = mtf.Dimension("memory_length", length_dim.size)

    attn_bias = biasmask_attn_weights(
        mesh, length_dim, memory_length_dim, variable_dtype) if params["causal"] else None

    # Add attn_bias into mtf_features
    other_features["attn_bias"] = attn_bias

    # Define other Dimensions that we'll need inside the model
    embd_dim = mtf.Dimension("embd", params["n_embd"])
    vocab_dim = mtf.Dimension("vocab", params["n_vocab"])
    # We need this because gathering when both the args have the same dimension in them breaks things
    # This dim is specifically for the weights
    # This prevents the "Einsum has lhs dimension without corresponding rhs or output dimension." error
    embed_sequence_dim = mtf.Dimension("embed_sequence", params["n_ctx"])

    other_features["embd_dim"] = embd_dim
    other_features["vocab_dim"] = vocab_dim
    other_features["embed_sequence_dim"] = embed_sequence_dim
    other_features["memory_length_dim"] = memory_length_dim

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Set up the model for prediction
        inputs = mtf_features["inputs"]
        if params["remove_partial_sequences"] is None:
            params["remove_partial_sequences"] = False

        export = params.get("export", False)

        if not export:
            mtf_samples = sample_autoregressive(
                inputs, other_features=other_features, params=params, variable_dtype=variable_dtype,
                remove_partial_sequences=params["remove_partial_sequences"], stop_at_token=params["eos_id"],
                sampling_use_entmax=params['sampling_use_entmax'])
        else:
            # Export path: a single forward pass instead of autoregressive
            # sampling; only the first model output is used below.
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    mtf_samples, loss, loss_batch = gpt2.model(
                        mtf_features, other_features, params, mesh, variable_dtype=variable_dtype, context=None)

        mtf_samples = mtf.anonymize(mtf_samples)
        inputs = mtf.anonymize(inputs)
        lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
        inputs = lowering.export_to_tf_tensor(inputs)
        outputs = lowering.export_to_tf_tensor(mtf_samples)
        predictions = {"inputs": inputs, "outputs": outputs}

        def scaffold_fn():
            # The local init op also copies master variables to their slices.
            return tf.train.Scaffold(
                local_init_op=tf.group(
                    tf.train.Scaffold.default_local_init_op(),
                    lowering.copy_masters_to_slices(),
                    name="mtf_local_init_op"),
                ready_op=tf.concat([
                    tf.report_uninitialized_variables(),
                    resources.report_uninitialized_resources()
                ], axis=0, name="mtf_ready_op"))

        return tpu_estimator.TPUEstimatorSpec(
            mode=tf.estimator.ModeKeys.PREDICT,
            predictions=predictions,
            scaffold_fn=scaffold_fn,
            prediction_hooks=[mtf.MtfRestoreHook(lowering)])

    # We're not predicting, so we better be training or evaluating
    assert (mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Gets number of microbatches per batch for serialized training
        # if param tokens_per_mb_per_replica = None, this defaults to 1 and no microbatching is performed
        num_microbatches = int(
            mtf_transformer.utils.serialize_num_microbatches(
                batch_dim=batch_dim,
                sequence_length=sequence_length_dict,
                mesh_shape=mesh_shape,
                layout_rules=layout_rules,
                tokens_per_microbatch_per_replica=params[
                    "tokens_per_mb_per_replica"]))
    else:
        num_microbatches = 1

    params[
        "num_microbatches"] = num_microbatches  # Add num microbatches to params

    if num_microbatches > 1:

        # For serialize_training_step we need to modify the model to output results in a dict
        def serialized_fn(mtf_features):
            if params["model"] == "GPT":
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(
                        mtf_features, other_features, params, mesh, variable_dtype=variable_dtype)
                return {
                    "logits": logits, "loss": loss, "loss_batch": loss_batch
                }
            else:
                raise Exception(
                    f"'{params['model']}' is not a valid model - please select from [GPT]"
                )

        # Serialize the training step - Gradients are accumulated locally and reduced once.
        var_grads, output_dict = mtf.serialize_training_step(
            mtf_features, serialized_fn, batch_dim, num_microbatches)
        loss = output_dict["loss"]
        loss_batch = output_dict["loss_batch"]
        logits = output_dict["logits"]
    else:
        # If we're not splitting into microbatches, return logits & loss as is
        if params["model"] == "GPT":
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(
                        mtf_features, other_features, params, mesh, variable_dtype=variable_dtype, context=None)
        else:
            raise Exception(
                f"'{params['model']}' is not a valid model - please select from [GPT]"
            )

    # Auto layout generation
    if params["auto_layout"]:
        auto_layout(graph, mesh_shape, logits, loss)
    if params["auto_layout_and_mesh_shape"]:
        auto_layout_and_mesh_shape(graph, params["num_cores"], logits, loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # In TRAIN mode, get optimizer
        if params["num_microbatches"] > 1:
            # If we are splitting the batch into microbatches, var grads are created in the serialize_training_step fn
            # So we pass them in here
            _, update_ops, var_grads = get_optimizer(
                mesh, loss, params, variable_dtype=variable_dtype, inp_var_grads=var_grads)
        else:
            # Otherwise, they are created in the get_optimizer fn, so we leave inp_var_grads blank
            _, update_ops, var_grads = get_optimizer(
                mesh, loss, params, variable_dtype=variable_dtype)
        # Log summaries to tensorboard
        mtf.scalar_summary("loss", loss)
        # Log gradients if in params
        if params["log_grads"] not in [None, False]:
            for g in var_grads:
                # L2 norm of each gradient; the last two characters of the
                # gradient's name are dropped from the summary tag.
                grad_norm = mtf.sqrt(mtf.reduce_sum(mtf.square(g)))
                mtf.scalar_summary("grads/norm" + g.name[:-2], grad_norm)
    else:
        # For now, we can only export fully-replicated tensors.
        # This has to be done before lowering or they will not be included in the graph
        mean_logits = mtf.reduce_mean(logits, reduced_dim=vocab_dim)
        max_logits = mtf.argmax(logits, vocab_dim)
        del logits
        fully_replicated_mean_logits = mtf.anonymize(mean_logits)
        fully_replicated_max_logits = mtf.anonymize(max_logits)
        fully_replicated_loss_batch = mtf.anonymize(loss_batch)

    # Gets & prints info about no. trainable vars in the model & dimension names
    get_graph_info(graph)

    # 'lowers' mtf tensors into a tf graph - this enables us to export results as tf tensors
    lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
    tf_loss = lowering.export_to_tf_tensor(loss)
    tf_loss = tf.cast(tf_loss, tf.float32)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Use our patched version until mtf updates theirs
        host_call = create_host_call(params['model_path'])
        mtf.utils.remove_summaries()

        # Creates train_op
        tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
        tf_update_ops.append(tf.assign_add(
            global_step, 1))  # Need to manually increment global_step
        tf.logging.info(f"tf_update_ops: {tf_update_ops}")
        train_op = tf.group(tf_update_ops)
    else:
        tf_mean_logits = lowering.export_to_tf_tensor(
            fully_replicated_mean_logits)
        tf_max_logits = lowering.export_to_tf_tensor(
            fully_replicated_max_logits)
        tf_loss_batch = tf.to_float(
            lowering.export_to_tf_tensor(fully_replicated_loss_batch))

    with mtf.utils.outside_all_rewrites():
        # Copy master variables to slices. Must be called first.
        restore_hook = mtf.MtfRestoreHook(lowering)
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Set up the checkpoint server and return the TPUEstimatorSpec
            saver = tf.train.Saver(
                tf.global_variables(),
                sharded=True,
                max_to_keep=10,
                keep_checkpoint_every_n_hours=2,
                defer_build=False,
                save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            saver_hook = tf.train.CheckpointSaverHook(
                params["model_path"],
                save_steps=params["steps_per_checkpoint"],
                saver=saver,
                listeners=[saver_listener])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.TRAIN,
                loss=tf_loss,
                host_call=host_call,
                train_op=train_op,
                training_hooks=[restore_hook, saver_hook])

        elif mode == tf.estimator.ModeKeys.EVAL:
            # Evaluation metrics
            def _perplexity(loss):
                perplexity = tf.exp(loss)
                return tf.metrics.mean(perplexity)

            def _bits_per_byte(loss):
                # NOTE(review): 0.29335 looks like a dataset-specific
                # tokens-per-byte ratio -- confirm its origin.
                bpb = loss * (0.29335 / math.log(2))
                return tf.metrics.mean(bpb)

            def _metric_fn(tf_mean_logits, tf_loss_batch):
                mean_logits = tf.metrics.mean(tf_mean_logits)
                loss = tf.reduce_mean(tf_loss_batch)
                perp = _perplexity(loss)
                bpb = _bits_per_byte(loss)
                return {
                    "mean_logits": mean_logits, "perplexity": perp, "bits per byte": bpb
                }

            def _lambada_metric_fn(labels, tf_max_logits, tf_loss_batch):
                # Only positions whose label differs from the EOS token are
                # scored.
                eos_token = params["eos_id"]
                answer_positions = tf.where(
                    tf.math.not_equal(labels, eos_token))

                correct_answers = tf.gather_nd(
                    tf.math.equal(tf_max_logits, labels), answer_positions)
                accuracy = tf.metrics.mean(tf.cast(correct_answers, tf.float32))

                # I guess tf_loss_batch has z_loss and maybe other stuff added to it
                # so maybe this should be calculated separately in the future
                answer_loss = tf.gather_nd(tf_loss_batch, answer_positions)
                log_perplexity = tf.metrics.mean(answer_loss)

                return {
                    "lambada_acc": accuracy, "lambada_log_ppl": log_perplexity
                }

            eval_task = params["eval_task"]
            if eval_task == "lambada":
                eval_metrics = (_lambada_metric_fn,
                                [labels, tf_max_logits, tf_loss_batch])
            else:
                eval_metrics = (_metric_fn, [tf_mean_logits, tf_loss_batch])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                evaluation_hooks=[restore_hook],
                loss=tf_loss,
                eval_metrics=eval_metrics)
def input_fn(dataset,
             filepattern,
             skip_random_fraction_when_training,
             batch_size_means_tokens_param,
             batch_size_multiplier,
             max_length,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             force_repeat=False,
             prevent_repeat=False):
    """Builds input pipeline for problem.

    The pipeline shards (Horovod training only), repeats, optionally skips a
    random prefix, casts ints, batches (by examples, by padded TPU batches or
    by length buckets), standardizes shapes, optionally shuffles batches and
    splits long targets into chunks, and finally maps every element to the
    structure expected for `mode`.

    Args:
      dataset: the dataset to make input function from.
      filepattern: the pattern of files to read from.
      skip_random_fraction_when_training: whether to skip randomly when
        training.
      batch_size_means_tokens_param: whether batch size should mean tokens.
      batch_size_multiplier: how to multiply batch size when bucketing.
      max_length: maximum length,
      mode: tf.estimator.ModeKeys
      hparams: HParams, model hparams. NOTE: when
        `hparams.deterministic_dataset` is set, this function overwrites
        `hparams.batch_shuffle_size` with 0 on the object passed in.
      data_dir: str, data directory; if None, will use hparams.data_dir
      params: dict, may include "batch_size"
      config: RunConfig; should have the data_parallelism attribute if not
        using TPU
      force_repeat: bool, whether to repeat the data even if not training
      prevent_repeat: bool, whether to not repeat when in training mode.
        Overrides force_repeat.

    Returns:
      The transformed dataset. In PREDICT mode each element is the features
      dict with "targets" renamed to "infer_targets"; in every other mode each
      element is a `(features_dict, labels)` tuple where labels is the feature
      named by hparams "labels_feature_name" (default "targets").

    Raises:
      ValueError: if strided chunk training is requested but the batch size
        cannot be inferred statically.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    if config and config.use_tpu:
        num_threads = 64
    else:
        num_threads = cpu_count() if is_training else 1

    if config and hasattr(config,
                          "data_parallelism") and config.data_parallelism:
        num_shards = config.data_parallelism.n
    else:
        num_shards = 1

    if hasattr(hparams,
               'deterministic_dataset') and hparams.deterministic_dataset:
        # Deterministic mode: single-threaded maps, no random skip and no
        # batch-level shuffle. This mutates the caller's hparams.
        num_threads = 1
        skip_random_fraction_when_training = False
        hparams.batch_shuffle_size = 0

    def tpu_valid_size(example):
        return example_valid_size(example, hparams.min_length, max_length)

    def gpu_valid_size(example):
        # Long sequences are only dropped while training; otherwise the upper
        # bound is effectively disabled.
        drop_long_sequences = is_training
        max_validate_length = max_length if drop_long_sequences else 10**9
        return example_valid_size(example, hparams.min_length,
                                  max_validate_length)

    def define_shapes(example):
        # Evaluates to params["batch_size"] on TPU and to a falsy value
        # otherwise.
        batch_size = config and config.use_tpu and params["batch_size"]
        return standardize_shapes(example, batch_size=batch_size)

    # Read and preprocess
    # NOTE: the resolved data_dir is not read again inside this function.
    data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

    if is_training and hparams.use_horovod:
        dataset = dataset.shard(num_shards=hparams.hvd_size,
                                index=hparams.hvd_worker_id)

    if (force_repeat or is_training) and not prevent_repeat:
        # Repeat and skip a random number of records
        dataset = dataset.repeat()

    if is_training and skip_random_fraction_when_training:
        data_files = contrib.slim().parallel_reader.get_data_files(filepattern)
        # In continuous_train_and_eval when switching between train and
        # eval, this input_fn method gets called multiple times and it
        # would give you the exact same samples from the last call
        # (because the Graph seed is set). So this skip gives you some
        # shuffling.
        dataset = skip_random_fraction(dataset, data_files[0])

    dataset = dataset.map(cast_ints_to_int32, num_parallel_calls=num_threads)

    if batch_size_means_tokens_param:
        batch_size_means_tokens = True
    else:
        if _are_shapes_fully_defined(dataset.output_shapes):
            batch_size_means_tokens = False
        else:
            tf.logging.warning(
                "Shapes are not fully defined. Assuming batch_size means tokens."
            )
            batch_size_means_tokens = True

    # Batching
    if not batch_size_means_tokens:
        # Batch size means examples per datashard.
        if config and config.use_tpu:
            # on TPU, we use params["batch_size"], which specifies the number
            # of examples across all datashards
            batch_size = params["batch_size"]
            dataset = dataset.batch(batch_size, drop_remainder=True)
        else:
            batch_size = hparams.batch_size * num_shards
            dataset = dataset.batch(batch_size)
    else:
        # batch_size means tokens per datashard
        if config and config.use_tpu:
            dataset = dataset.filter(tpu_valid_size)
            padded_shapes = pad_for_tpu(dataset.output_shapes, hparams,
                                        max_length)
            # on TPU, we use params["batch_size"], which specifies the number
            # of examples across all datashards
            batch_size = params["batch_size"]
            if hparams.pad_batch:
                tf.logging.warning(
                    "Padding the batch to ensure that remainder eval batches are "
                    "processed. This may lead to incorrect metrics for "
                    "non-zero-padded features, e.g. images. Use a smaller batch "
                    "size that has no remainder in that case.")
                dataset = dataset.padded_batch(batch_size,
                                               padded_shapes,
                                               drop_remainder=False)
                dataset = dataset.map(
                    functools.partial(pad_batch, batch_multiple=batch_size),
                    num_parallel_calls=num_threads)
            else:
                dataset = dataset.padded_batch(batch_size,
                                               padded_shapes,
                                               drop_remainder=True)
        else:
            # On GPU, bucket by length
            dataset = dataset.filter(gpu_valid_size)
            cur_batching_scheme = hparams_to_batching_scheme(
                hparams,
                shard_multiplier=num_shards,
                length_multiplier=batch_size_multiplier)
            if hparams.use_fixed_batch_size:
                # Here batch_size really means examples per datashard.
                cur_batching_scheme["batch_sizes"] = [hparams.batch_size]
                cur_batching_scheme["boundaries"] = []

            force_fixed_batch_size = hparams.use_static_shapes
            # Only derive the static batch size when it is actually needed:
            # computing it unconditionally divides by hparams.max_length and
            # fails with ZeroDivisionError when that value is 0, even though
            # static shapes were never requested.
            fixed_batch_size = None
            if force_fixed_batch_size:
                fixed_batch_size = hparams.batch_size // hparams.max_length
                cur_batching_scheme["batch_sizes"] = [
                    fixed_batch_size, fixed_batch_size
                ]
                cur_batching_scheme["boundaries"] = [hparams.max_length + 1]

            dataset = dataset.apply(
                tf.data.experimental.bucket_by_sequence_length(
                    example_length,
                    cur_batching_scheme["boundaries"],
                    cur_batching_scheme["batch_sizes"],
                    pad_to_bucket_boundary=force_fixed_batch_size))

            if force_fixed_batch_size and is_training:

                def _force_shape(example):
                    """Pins dims 0 and 1 of every feature to static sizes."""
                    for t in example.values():
                        shape = t.get_shape().as_list()
                        shape[0] = fixed_batch_size
                        shape[1] = shape[1] or hparams.max_length
                        t.set_shape(t.get_shape().merge_with(shape))
                        # Assert shapes are fully known
                        t.get_shape().assert_is_fully_defined()
                    return example

                dataset = dataset.map(_force_shape,
                                      num_parallel_calls=num_threads)

            if not is_training:
                batch_multiple = num_shards
                if hparams.use_fixed_batch_size:
                    # Make sure the last batch has the same fixed size as the
                    # rest.
                    batch_multiple *= hparams.batch_size
                if batch_multiple > 1:
                    tf.logging.warning(
                        "Padding the batch to ensure that remainder eval batches have "
                        "a batch size divisible by the number of data shards. This may "
                        "lead to incorrect metrics for non-zero-padded features, e.g. "
                        "images. Use a single datashard (i.e. 1 GPU) in that case."
                    )
                    dataset = dataset.map(
                        functools.partial(pad_batch,
                                          batch_multiple=batch_multiple),
                        num_parallel_calls=num_threads)

    dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

    # Add shuffling for training batches. This is necessary along with record
    # level shuffling in the dataset generation. Record shuffling will shuffle
    # the examples. However, in some cases, it's possible that the shuffle
    # buffer size for record shuffling is smaller than the batch size. In such
    # cases, adding batch shuffling ensures that the data is in random order
    # during training
    if (is_training and hasattr(hparams, "batch_shuffle_size")
            and hparams.batch_shuffle_size):
        dataset = dataset.shuffle(hparams.batch_shuffle_size)

    # Split batches into chunks if targets are too long.
    # The new "chunk_number" feature is 0 for the first chunk and goes up then.
    # Chunks are reversed so the 0th chunk comes first, then the 1st and so on,
    # so models can attend to them in the order they arrive. The last chunk is
    # usually the one containing the end of the target sentence (EOS).
    chunk_length = hparams.get("split_targets_chunk_length", 0)
    max_chunks = hparams.get("split_targets_max_chunks", 100)
    if chunk_length > 0:

        def is_nonzero_chunk(example):
            """A chunk is zero if all targets are 0s."""
            return tf.less(0, tf.reduce_sum(tf.abs(example["targets"])))

        def split_on_length(example):
            """Split a batch of dicts on length."""
            x = example["targets"]
            # TODO(kitaev): This code breaks if chunk_length * max_chunks < batch_size
            length_diff = chunk_length * max_chunks - tf.shape(x)[1]
            padded_x = tf.pad(x, [(0, 0), (0, length_diff), (0, 0), (0, 0)])
            chunks = [
                padded_x[:, i * chunk_length:(i + 1) * chunk_length, :, :]
                for i in range(max_chunks - 1)
            ]
            chunks.append(padded_x[:, (max_chunks - 1) * chunk_length:, :, :])
            new_example = {}
            # Setting chunk_number to be tf.range(max_chunks) is incompatible
            # with TPU
            new_example["chunk_number"] = tf.concat([
                tf.expand_dims(tf.ones_like(c) * n, axis=0)
                for n, c in enumerate(chunks)
            ],
                                                    axis=0)
            new_example["targets"] = tf.concat(
                [tf.expand_dims(c, axis=0) for c in chunks], axis=0)
            for k in example:
                if k != "targets":
                    assert k != "chunk_number", (
                        "Chunking code expects the chunk_number feature name to be "
                        "available")
                    # Every non-target feature is replicated once per chunk.
                    new_example[k] = tf.concat([
                        tf.expand_dims(example[k], axis=0)
                        for _ in range(max_chunks)
                    ],
                                               axis=0)
            return tf.data.Dataset.from_tensor_slices(new_example)

        dataset = dataset.flat_map(split_on_length)
        dataset = dataset.filter(is_nonzero_chunk)

        # The chunking data pipeline thus far creates batches of examples where
        # all of the examples have the same chunk number. This can lead to
        # periodic fluctuations in the loss; for example, when all examples in
        # the batch have chunk number 0 the loss may be higher than midway
        # through a sequence. Enabling split_targets_strided_training adjusts
        # the data so that each batch includes examples at various points
        # within a sequence.
        if is_training and hparams.split_targets_strided_training:
            # TODO(kitaev): make sure that shape inference works on GPU, not
            # just TPU.
            inferred_batch_size = dataset.output_shapes["targets"].as_list()[0]
            if inferred_batch_size is None:
                raise ValueError(
                    "Strided training is only implemented when the batch size can be "
                    "inferred statically, for example when training on TPU.")
            chunk_stride = inferred_batch_size * max(
                1, max_chunks // inferred_batch_size) + 1

            def collapse_nested_datasets(example):
                """Converts a dataset of datasets to a dataset of tensor features."""
                new_example = {}
                for k, v in example.items():
                    v = tf.data.experimental.get_single_element(
                        v.batch(inferred_batch_size, drop_remainder=True))
                    new_example[k] = v
                return tf.data.Dataset.from_tensor_slices(new_example)

            dataset = dataset.unbatch()
            dataset = dataset.window(inferred_batch_size, inferred_batch_size,
                                     chunk_stride)
            dataset = dataset.flat_map(collapse_nested_datasets)
            dataset = dataset.batch(inferred_batch_size, drop_remainder=True)

    def prepare_for_output(example):
        if mode == tf.estimator.ModeKeys.PREDICT:
            example["infer_targets"] = example.pop("targets")
            return example
        else:
            return example, example[hparams.get(key="labels_feature_name",
                                                default="targets")]

    dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
    dataset = dataset.prefetch(2)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # This is because of a bug in the Estimator that short-circuits
        # prediction if it doesn't see a QueueRunner. DummyQueueRunner
        # implements the minimal expected interface but does nothing.
        tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, DummyQueueRunner())

    return dataset
def get_variable(name,
                 dtype=None,
                 initializer=None,
                 regularizer=None,
                 trainable=True,
                 collections=None,
                 caching_device=None,
                 validate_shape=True):
    """Returns TensorTrain object with tf.Variables as the TT-cores.

    Args:
      name: The name of the new or existing TensorTrain variable.
        Used to name the TT-cores.
      dtype: Type of the new or existing TensorTrain variable TT-cores
        (defaults to DT_FLOAT).
      initializer: TensorTrain or TensorTrainBatch, initializer for the
        variable if one is created.
      regularizer: A (TensorTrain -> Tensor or None) function; the result of
        applying it on a newly created variable will be added to the
        collection GraphKeys.REGULARIZATION_LOSSES and can be used for
        regularization.
      trainable: If True also add the variable to the graph collection
        GraphKeys.TRAINABLE_VARIABLES (see tf.Variable).
      collections: List of graph collections keys to add the Variables
        (underlying TT-cores). Defaults to [GraphKeys.GLOBAL_VARIABLES]
        (see tf.Variable).
      caching_device: Optional device string or function describing where the
        Variable should be cached for reading. Defaults to the Variable's
        device. If not None, caches on another device. Typical use is to
        cache on the device where the Ops using the Variable reside, to
        deduplicate copying through Switch and other conditional statements.
      validate_shape: If False, allows the variable to be initialized with a
        value of unknown shape. If True, the default, the shape of
        initial_value must be known. NOTE: currently accepted but not used
        by this function (see the TODOs below).

    Returns:
      The created or existing `TensorTrain` object with tf.Variables TT-cores.

    Raises:
      `ValueError`: when creating a new variable and no initializer is given,
        when the variable to reuse cannot be found, when violating reuse
        during variable creation, or when initializer dtype and dtype don't
        match. Reuse is set inside variable_scope.
    """
    # TODO: support validate shape: check that the tensor dimensions are
    # correct, but ignore the ranks.
    # TODO: add validate ranks flag.
    reuse = tf.get_variable_scope().reuse
    if not reuse and initializer is None:
        raise ValueError(
            'Scope reuse is False and initializer is not provided.')

    if reuse and not utils.in_eager_mode():
        # Find an existing variable in the collection.
        path = tf.get_variable_scope().name
        if path != '' and path[-1] != '/':
            path += '/'
        path += name

        found_v = None
        for v in tf.get_collection('TensorTrainVariables'):
            if v.name == path:
                found_v = v
                break
        if found_v is None:
            raise ValueError(
                'ValueError: Variable %s does not exist, or was not '
                'created with t3f.get_tt_variable(). Did you mean to '
                'set reuse=None in VarScope?' % name)
        with tf.variable_scope(name):
            # Try to get the first core through tf.get_variable to check that
            # we don't violate reuse: it will raise a ValueError otherwise.
            tf.get_variable('core_0', dtype=dtype)
        return found_v

    # Create new variable.
    # This point is also reached when the scope's reuse flag is set but the
    # lookup branch above is skipped (eager mode). Without this check a
    # missing initializer surfaced as an AttributeError on `None.ndims()`
    # instead of the documented ValueError.
    if initializer is None:
        raise ValueError(
            'An initializer is required to create variable %s, but none was '
            'provided.' % name)

    with tf.variable_scope(name):
        variable_cores = [
            tf.get_variable('core_%d' % i,
                            initializer=initializer.tt_cores[i],
                            dtype=dtype,
                            trainable=trainable,
                            collections=collections,
                            caching_device=caching_device)
            for i in range(initializer.ndims())
        ]
    if isinstance(initializer, TensorTrain):
        v = TensorTrain(variable_cores,
                        initializer.get_raw_shape(),
                        initializer.get_tt_ranks(),
                        convert_to_tensors=False)
    else:
        v = TensorTrainBatch(variable_cores,
                             initializer.get_raw_shape(),
                             initializer.get_tt_ranks(),
                             initializer.batch_size,
                             convert_to_tensors=False)

    # Add the created TensorTrain object into a collection so that it can be
    # found again by the reuse branch above.
    tf.add_to_collection('TensorTrainVariables', v)

    # Run the regularizer if requested and save the resulting loss.
    if regularizer:
        with tf.name_scope(name + "/Regularizer/"):
            loss = regularizer(v)
        if loss is not None:
            tf.logging.vlog(
                1, "Applied regularizer to %s and added the result %s "
                "to REGULARIZATION_LOSSES.", v.name, loss.name)
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, loss)
    return v
def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    Builds the detection model, runs prediction, and then, depending on
    `mode`, adds post-processing, checkpoint initialization, losses, the
    training op, export outputs and evaluation metrics before packing
    everything into an estimator spec.

    NOTE(review): this block reads several names it does not define
    (`use_tpu`, `train_config`, `eval_config`, `eval_input_config`,
    `configs`, `hparams`, `detection_model_fn`, `postprocess_on_cpu`).
    Presumably they are bound by an enclosing scope or at module level --
    confirm against the rest of the file.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator. It is
        normalized to a dict but not read again in this block.

    Returns:
      A `TPUEstimatorSpec` when `use_tpu` is set and mode is not EVAL,
      otherwise an `EstimatorSpec`, that encapsulates the model and its
      serving configurations.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Make sure to set the Keras learning phase. True during training,
    # False for inference.
    tf.keras.backend.set_learning_phase(is_training)
    # Set policy for mixed-precision training with Keras-based models.
    if use_tpu and train_config.use_bfloat16:
        from tensorflow.python.keras.engine import base_layer_utils  # pylint: disable=g-import-not-at-top
        # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0.
        base_layer_utils.enable_v2_dtype_behavior()
        tf2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
    detection_model = detection_model_fn(is_training=is_training,
                                         add_summaries=(not use_tpu))
    scaffold_fn = None

    # Unstack the batched groundtruth before handing it to the model.
    if mode == tf.estimator.ModeKeys.TRAIN:
        labels = unstack_batch(labels,
                               unpad_groundtruth_tensors=train_config.
                               unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
        # For evaling on train data, it is necessary to check whether groundtruth
        # must be unpadded.
        boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                       get_shape().as_list())
        unpad_groundtruth_tensors = boxes_shape[
            1] is not None and not use_tpu
        labels = unstack_batch(
            labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        provide_groundtruth(detection_model, labels)

    preprocessed_images = features[fields.InputDataFields.image]

    side_inputs = detection_model.get_side_inputs(features)

    # Forward pass. Under bfloat16 on TPU the predictions are converted back
    # to float32 right after `predict`.
    if use_tpu and train_config.use_bfloat16:
        with tf.tpu.bfloat16_scope():
            prediction_dict = detection_model.predict(
                preprocessed_images,
                features[fields.InputDataFields.true_image_shape],
                **side_inputs)
            prediction_dict = ops.bfloat16_to_float32_nested(
                prediction_dict)
    else:
        prediction_dict = detection_model.predict(
            preprocessed_images,
            features[fields.InputDataFields.true_image_shape],
            **side_inputs)

    # Single-argument wrapper so post-processing can be passed as a callable
    # with one tuple argument (see `outside_compilation` below).
    def postprocess_wrapper(args):
        return detection_model.postprocess(args[0], args[1])

    # Detections are only produced for EVAL and PREDICT; in TRAIN mode
    # `detections` stays None.
    if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
        if use_tpu and postprocess_on_cpu:
            detections = tf.tpu.outside_compilation(
                postprocess_wrapper,
                (prediction_dict,
                 features[fields.InputDataFields.true_image_shape]))
        else:
            detections = postprocess_wrapper(
                (prediction_dict,
                 features[fields.InputDataFields.true_image_shape]))

    # Optional warm start from a fine-tune checkpoint. It is only attempted
    # when both `train_config.fine_tune_checkpoint` and
    # `hparams.load_pretrained` are truthy.
    if mode == tf.estimator.ModeKeys.TRAIN:
        load_pretrained = hparams.load_pretrained if hparams else False
        if train_config.fine_tune_checkpoint and load_pretrained:
            if not train_config.fine_tune_checkpoint_type:
                # train_config.from_detection_checkpoint field is deprecated. For
                # backward compatibility, set train_config.fine_tune_checkpoint_type
                # based on train_config.from_detection_checkpoint.
                if train_config.from_detection_checkpoint:
                    train_config.fine_tune_checkpoint_type = 'detection'
                else:
                    train_config.fine_tune_checkpoint_type = 'classification'
            asg_map = detection_model.restore_map(
                fine_tune_checkpoint_type=train_config.
                fine_tune_checkpoint_type,
                load_all_detection_checkpoint_vars=(
                    train_config.load_all_detection_checkpoint_vars))
            available_var_map = (
                variables_helper.get_variables_available_in_checkpoint(
                    asg_map,
                    train_config.fine_tune_checkpoint,
                    include_global_step=False))
            if use_tpu:

                # On TPU the initialization is deferred into a scaffold
                # function that is handed to the TPUEstimatorSpec.
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(
                    train_config.fine_tune_checkpoint, available_var_map)

    # Losses (TRAIN and EVAL). EVAL may substitute a constant dummy loss.
    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        if (mode == tf.estimator.ModeKeys.EVAL
                and eval_config.use_dummy_loss_in_eval):
            total_loss = tf.constant(1.0)
            losses_dict = {'Loss/total_loss': total_loss}
        else:
            losses_dict = detection_model.loss(
                prediction_dict,
                features[fields.InputDataFields.true_image_shape])
            losses = [loss_tensor for loss_tensor in losses_dict.values()]
            if train_config.add_regularization_loss:
                regularization_losses = detection_model.regularization_losses(
                )
                if use_tpu and train_config.use_bfloat16:
                    regularization_losses = ops.bfloat16_to_float32_nested(
                        regularization_losses)
                if regularization_losses:
                    regularization_loss = tf.add_n(
                        regularization_losses, name='regularization_loss')
                    losses.append(regularization_loss)
                    losses_dict[
                        'Loss/regularization_loss'] = regularization_loss
            total_loss = tf.add_n(losses, name='total_loss')
            losses_dict['Loss/total_loss'] = total_loss

        if 'graph_rewriter_config' in configs:
            graph_rewriter_fn = graph_rewriter_builder.build(
                configs['graph_rewriter_config'], is_training=is_training)
            graph_rewriter_fn()

        # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
        # can write learning rate summaries on TPU without host calls.
        global_step = tf.train.get_or_create_global_step()
        training_optimizer, optimizer_summary_vars = optimizer_builder.build(
            train_config.optimizer)

    # Training op.
    if mode == tf.estimator.ModeKeys.TRAIN:
        if use_tpu:
            training_optimizer = tf.tpu.CrossShardOptimizer(
                training_optimizer)

        # Optionally freeze some layers by setting their gradients to be zero.
        trainable_variables = None
        include_variables = (train_config.update_trainable_variables if
                             train_config.update_trainable_variables else None)
        exclude_variables = (train_config.freeze_variables
                             if train_config.freeze_variables else None)
        trainable_variables = slim.filter_variables(
            tf.trainable_variables(),
            include_patterns=include_variables,
            exclude_patterns=exclude_variables)

        # Gradient clipping is only enabled for a positive configured norm.
        clip_gradients_value = None
        if train_config.gradient_clipping_by_norm > 0:
            clip_gradients_value = train_config.gradient_clipping_by_norm

        if not use_tpu:
            for var in optimizer_summary_vars:
                tf.summary.scalar(var.op.name, var)
        # An empty list is passed on TPU and None otherwise, unless gradient
        # summaries are explicitly requested.
        summaries = [] if use_tpu else None
        if train_config.summarize_gradients:
            summaries = [
                'gradients', 'gradient_norm', 'global_gradient_norm'
            ]
        train_op = slim.optimizers.optimize_loss(
            loss=total_loss,
            global_step=global_step,
            learning_rate=None,
            clip_gradients=clip_gradients_value,
            optimizer=training_optimizer,
            update_ops=detection_model.updates(),
            variables=trainable_variables,
            summaries=summaries,
            name='')  # Preventing scope prefix on all variables.

    # Serving signature for PREDICT.
    if mode == tf.estimator.ModeKeys.PREDICT:
        exported_output = exporter_lib.add_output_tensor_nodes(detections)
        export_outputs = {
            tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
            tf.estimator.export.PredictOutput(exported_output)
        }

    # Evaluation metrics.
    eval_metric_ops = None
    scaffold = None
    if mode == tf.estimator.ModeKeys.EVAL:
        # Detections without a classes field are treated as class-agnostic.
        class_agnostic = (fields.DetectionResultFields.detection_classes
                          not in detections)
        groundtruth = _prepare_groundtruth_for_eval(
            detection_model, class_agnostic,
            eval_input_config.max_number_of_boxes)
        use_original_images = fields.InputDataFields.original_image in features
        if use_original_images:
            eval_images = features[fields.InputDataFields.original_image]
            true_image_shapes = tf.slice(
                features[fields.InputDataFields.true_image_shape], [0, 0],
                [-1, 3])
            original_image_spatial_shapes = features[
                fields.InputDataFields.original_image_spatial_shape]
        else:
            eval_images = features[fields.InputDataFields.image]
            true_image_shapes = None
            original_image_spatial_shapes = None

        eval_dict = eval_util.result_dict_for_batched_example(
            eval_images,
            features[inputs.HASH_KEY],
            detections,
            groundtruth,
            class_agnostic=class_agnostic,
            scale_to_absolute=True,
            original_image_spatial_shapes=original_image_spatial_shapes,
            true_image_shapes=true_image_shapes)

        if fields.InputDataFields.image_additional_channels in features:
            eval_dict[fields.InputDataFields.
                      image_additional_channels] = features[
                          fields.InputDataFields.image_additional_channels]

        if class_agnostic:
            category_index = label_map_util.create_class_agnostic_category_index(
            )
        else:
            category_index = label_map_util.create_category_index_from_labelmap(
                eval_input_config.label_map_path)
        # Visualization metrics are only built off-TPU and when the original
        # images are present in `features`.
        vis_metric_ops = None
        if not use_tpu and use_original_images:
            keypoint_edges = [(kp.start, kp.end)
                              for kp in eval_config.keypoint_edge]

            eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                category_index,
                max_examples_to_draw=eval_config.num_visualizations,
                max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
                min_score_thresh=eval_config.min_score_threshold,
                use_normalized_coordinates=False,
                keypoint_edges=keypoint_edges or None)
            vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                eval_dict)

        # Eval metrics on a single example.
        eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
            eval_config, list(category_index.values()), eval_dict)
        for loss_key, loss_tensor in iter(losses_dict.items()):
            eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
        # Optimizer summary variables are reported as metrics whose update op
        # is a no-op.
        for var in optimizer_summary_vars:
            eval_metric_ops[var.op.name] = (var, tf.no_op())
        if vis_metric_ops is not None:
            eval_metric_ops.update(vis_metric_ops)
        # Normalize all metric keys to str.
        eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

        if eval_config.use_moving_averages:
            variable_averages = tf.train.ExponentialMovingAverage(0.0)
            variables_to_restore = variable_averages.variables_to_restore()
            keep_checkpoint_every_n_hours = (
                train_config.keep_checkpoint_every_n_hours)
            saver = tf.train.Saver(
                variables_to_restore,
                keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
            )
            scaffold = tf.train.Scaffold(saver=saver)

    # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
    if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
        # `eval_metric_ops` is still None on this path because it is only
        # populated in EVAL mode.
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            scaffold_fn=scaffold_fn,
            predictions=detections,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metric_ops,
            export_outputs=export_outputs)
    else:
        if scaffold is None:
            # No scaffold was built above: create a sharded saver, register
            # it in the SAVERS collection and wrap it in a scaffold.
            keep_checkpoint_every_n_hours = (
                train_config.keep_checkpoint_every_n_hours)
            saver = tf.train.Saver(
                sharded=True,
                keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            scaffold = tf.train.Scaffold(saver=saver)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=detections,
                                          loss=total_loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops,
                                          export_outputs=export_outputs,
                                          scaffold=scaffold)
def add_to_collection(trainable_variables, prefix):
    """Register grouped variables in per-group graph collections.

    Args:
      trainable_variables: mapping from a group key to an iterable of
        variables.
      prefix: string prepended to `str(key)` to form the collection name.
    """
    for group_key, group_vars in trainable_variables.items():
        # One collection per group, named "<prefix><key>".
        collection_name = prefix + str(group_key)
        for variable in group_vars:
            tf.add_to_collection(collection_name, variable)
def get_loss(mask_label, center_label,
             heading_class_label, heading_residual_label,
             size_class_label, size_residual_label,
             end_points,
             corner_loss_weight=10.0,
             box_loss_weight=1.0):
    """Build the combined loss for 3D object detection.

    Each component loss is also written as a scalar summary, and the final
    loss is added to the 'losses' graph collection.

    Args:
      mask_label: TF int32 tensor in shape (B,N)
      center_label: TF tensor in shape (B,3)
      heading_class_label: TF int32 tensor in shape (B,)
      heading_residual_label: TF tensor in shape (B,)
      size_class_label: TF int32 tensor in shape (B,)
      size_residual_label: TF tensor in shape (B,3); it is expanded to
        (B,1,3) and added to the per-cluster mean sizes below
      end_points: dict, outputs from the model; the keys read here are
        'mask_logits', 'center', 'stage1_center', 'heading_scores',
        'heading_residuals_normalized', 'heading_residuals', 'size_scores',
        'size_residuals_normalized' and 'size_residuals'
      corner_loss_weight: float scalar, weight of the corner loss inside the
        box loss
      box_loss_weight: float scalar, weight of the whole box loss relative to
        the mask loss

    Returns:
      total_loss: TF scalar tensor
    """

    def _mean_cross_entropy(logits_key, class_labels):
        # Mean sparse softmax cross-entropy of end_points[logits_key].
        return tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=end_points[logits_key], labels=class_labels))

    # --- 3D segmentation loss ---
    mask_loss = _mean_cross_entropy('mask_logits', mask_label)
    tf.summary.scalar('3d mask loss', mask_loss)

    # --- Center regression losses (final and stage-1 estimates) ---
    center_error = tf.norm(center_label - end_points['center'], axis=-1)
    center_loss = huber_loss(center_error, delta=2.0)
    tf.summary.scalar('center loss', center_loss)

    stage1_center_error = tf.norm(
        center_label - end_points['stage1_center'], axis=-1)
    stage1_center_loss = huber_loss(stage1_center_error, delta=1.0)
    tf.summary.scalar('stage1 center loss', stage1_center_loss)

    # --- Heading loss: bin classification + normalized residual ---
    heading_class_loss = _mean_cross_entropy('heading_scores',
                                             heading_class_label)
    tf.summary.scalar('heading class loss', heading_class_loss)

    heading_onehot = tf.one_hot(
        heading_class_label, depth=NUM_HEADING_BIN,
        on_value=1, off_value=0, axis=-1)  # (B,NH)
    heading_residual_target = heading_residual_label / (
        np.pi / NUM_HEADING_BIN)
    # Keep only the residual predicted for the ground-truth heading bin.
    heading_residual_pred = tf.reduce_sum(
        end_points['heading_residuals_normalized']
        * tf.to_float(heading_onehot),
        axis=1)
    heading_residual_normalized_loss = huber_loss(
        heading_residual_pred - heading_residual_target, delta=1.0)
    tf.summary.scalar('heading residual normalized loss',
                      heading_residual_normalized_loss)

    # --- Size loss: cluster classification + normalized residual ---
    size_class_loss = _mean_cross_entropy('size_scores', size_class_label)
    tf.summary.scalar('size class loss', size_class_loss)

    size_onehot = tf.one_hot(
        size_class_label, depth=NUM_SIZE_CLUSTER,
        on_value=1, off_value=0, axis=-1)  # (B,NS)
    size_onehot_xyz = tf.tile(
        tf.expand_dims(tf.to_float(size_onehot), -1), [1, 1, 3])  # (B,NS,3)
    size_residual_pred = tf.reduce_sum(
        end_points['size_residuals_normalized'] * size_onehot_xyz,
        axis=[1])  # (B,3)

    cluster_mean_sizes = tf.expand_dims(
        tf.constant(g_mean_size_arr, dtype=tf.float32), 0)  # (1,NS,3)
    gt_cluster_mean_size = tf.reduce_sum(
        size_onehot_xyz * cluster_mean_sizes, axis=[1])  # (B,3)
    size_residual_target = size_residual_label / gt_cluster_mean_size
    size_residual_error = tf.norm(
        size_residual_target - size_residual_pred, axis=-1)
    size_residual_normalized_loss = huber_loss(size_residual_error, delta=1.0)
    tf.summary.scalar('size residual normalized loss',
                      size_residual_normalized_loss)

    # --- Corner loss ---
    # Predicted corners are selected for the ground-truth heading bin and
    # size cluster by masking the full (B,NH,NS,8,3) corner tensor.
    all_corners = get_box3d_corners(
        end_points['center'],
        end_points['heading_residuals'],
        end_points['size_residuals'])  # (B,NH,NS,8,3)
    heading_mask = tf.tile(
        tf.expand_dims(heading_onehot, 2), [1, 1, NUM_SIZE_CLUSTER])
    size_mask = tf.tile(
        tf.expand_dims(size_onehot, 1), [1, NUM_HEADING_BIN, 1])
    gt_bin_mask = heading_mask * size_mask  # (B,NH,NS)
    pred_corners = tf.reduce_sum(
        tf.to_float(tf.expand_dims(tf.expand_dims(gt_bin_mask, -1), -1))
        * all_corners,
        axis=[1, 2])  # (B,8,3)

    # Ground-truth heading angle = bin center + residual, for the GT bin.
    bin_centers = tf.constant(
        np.arange(0, 2 * np.pi, 2 * np.pi / NUM_HEADING_BIN),
        dtype=tf.float32)  # (NH,)
    heading_per_bin = (tf.expand_dims(heading_residual_label, 1)
                       + tf.expand_dims(bin_centers, 0))  # (B,NH)
    gt_heading = tf.reduce_sum(
        tf.to_float(heading_onehot) * heading_per_bin, 1)

    # Ground-truth size = cluster mean size + residual, for the GT cluster.
    mean_sizes = tf.expand_dims(
        tf.constant(g_mean_size_arr, dtype=tf.float32), 0)  # (1,NS,3)
    size_per_cluster = (
        mean_sizes + tf.expand_dims(size_residual_label, 1))  # (B,NS,3)
    gt_size = tf.reduce_sum(
        tf.expand_dims(tf.to_float(size_onehot), -1) * size_per_cluster,
        axis=[1])  # (B,3)

    gt_corners = get_box3d_corners_helper(
        center_label, gt_heading, gt_size)  # (B,8,3)
    gt_corners_flipped = get_box3d_corners_helper(
        center_label, gt_heading + np.pi, gt_size)  # (B,8,3)
    # Use the smaller distance of the GT box and its pi-flipped version.
    corner_error = tf.minimum(
        tf.norm(pred_corners - gt_corners, axis=-1),
        tf.norm(pred_corners - gt_corners_flipped, axis=-1))
    corners_loss = huber_loss(corner_error, delta=1.0)
    tf.summary.scalar('corners loss', corners_loss)

    # --- Weighted sum of all losses ---
    box_loss = (center_loss
                + heading_class_loss
                + size_class_loss
                + heading_residual_normalized_loss * 20
                + size_residual_normalized_loss * 20
                + stage1_center_loss
                + corner_loss_weight * corners_loss)
    total_loss = mask_loss + box_loss_weight * box_loss
    tf.add_to_collection('losses', total_loss)

    return total_loss
def train(self,
          input_fn,
          checkpoint_path=None,
          save_checkpoint_steps=None,
          save_checkpoint_secs=None):
    """Build the training graph and run the train op until the session stops.

    A fresh graph is created, the model spec is built from `input_fn`
    under the selected device function, a sharded Saver is registered if
    the graph has none, and a `MonitoredTrainingSession` is driven in a
    loop. Each iteration is bracketed by `self._bridge.start(...)` and
    `self._bridge.commit()`; the bridge is always terminated on exit.

    Args:
        input_fn: forwarded to
            `self._get_features_and_labels_from_input_fn` together with
            `ModeKeys.TRAIN`.
        checkpoint_path: passed to the `CheckpointSaverHook` and as
            `checkpoint_dir` to the monitored session.
        save_checkpoint_steps: passed to the saver hook as `save_steps`.
        save_checkpoint_secs: passed to the saver hook as `save_secs`.

    Returns:
        `self`.

    Raises:
        ValueError: if `self._restore_datablock(...)` returns a falsy
            value.
    """
    # Single-process defaults; replaced below when a cluster spec is set.
    device_fn = None
    cluster_def = None
    target = None

    if self._cluster_spec is not None:
        worker_rank = self._worker_rank
        device_fn = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % worker_rank,
            merge_devices=True,
            cluster=self._cluster_spec)
        cluster_def = self._cluster_spec.as_cluster_def()
        local_address = self._cluster_spec.job_tasks('worker')[worker_rank]
        local_cluster = tf.train.ClusterSpec({'local': {0: local_address}})
        # NOTE(review): this object is never read afterwards; presumably
        # the reference keeps the in-process server alive for the
        # duration of training -- confirm before removing.
        local_server = tf.train.Server(
            local_cluster, job_name='local', task_index=0)
        target = 'grpc://' + local_address

    session_config = tf.ConfigProto(cluster_def=cluster_def)
    session_config.inter_op_parallelism_threads = 4
    session_config.intra_op_parallelism_threads = 4
    session_config.experimental.share_session_state_in_clusterspec_propagation = True
    tf.config.set_soft_device_placement(False)

    with tf.Graph().as_default():
        # Only model construction happens under the device function.
        with tf.device(device_fn):
            features, labels = self._get_features_and_labels_from_input_fn(
                input_fn, ModeKeys.TRAIN)
            spec, _ = self._get_model_spec(
                features, labels, ModeKeys.TRAIN)

        # Register a Saver explicitly when the graph does not have one.
        saver_registered = tf.get_collection(tf.GraphKeys.SAVERS)
        if not saver_registered:
            # save_relative_paths must be set for portability.
            tf.add_to_collection(
                tf.GraphKeys.SAVERS,
                tf.train.Saver(sharded=True,
                               defer_build=True,
                               save_relative_paths=True))

        checkpoint_listener = DataCheckpointSaverListener(
            self._trainer_master, self._application_id)
        saver_hook = tf.estimator.CheckpointSaverHook(
            checkpoint_path,
            save_secs=save_checkpoint_secs,
            save_steps=save_checkpoint_steps,
            listeners=[checkpoint_listener])

        self._bridge.connect()
        try:
            # The session's own checkpoint intervals are passed as None;
            # the caller's intervals go to saver_hook instead.
            with tf.train.MonitoredTrainingSession(
                    master=target,
                    config=session_config,
                    is_chief=(self._worker_rank == 0),
                    chief_only_hooks=[saver_hook],
                    checkpoint_dir=checkpoint_path,
                    save_checkpoint_steps=None,
                    save_checkpoint_secs=None,
                    hooks=spec.training_hooks) as sess:
                # Use the hook's data checkpoint when it exposes one.
                data_checkpoint_value = (
                    saver_hook.data_checkpoint
                    if hasattr(saver_hook, "data_checkpoint") else None)
                if not self._restore_datablock(data_checkpoint_value):
                    raise ValueError("Restore data checkpoint error")

                step = 0
                while not sess.should_stop():
                    self._bridge.start(step)
                    logging.debug('after bridge start.')

                    began_at = time.time()
                    sess.run(spec.train_op, feed_dict={})
                    elapsed = time.time() - began_at
                    metrics.emit_timer(
                        name="iter_timer", value=elapsed, tags={})
                    logging.debug('after session run.')

                    self._bridge.commit()
                    logging.debug('after bridge commit.')
                    step += 1
        finally:
            # Runs whether the loop finished normally or raised.
            self._bridge.terminate()

    return self