def tfprint(tensor, fun=None, prefix=""):
  """Forward `tensor` through a `tf.Print` op that logs a view of it.

  Args:
    tensor: Tensor that is passed through the print op.
    fun: Optional callable applied to `tensor`; its result is what gets
      printed. When None, `tensor` itself is printed.
    prefix: Message handed to `tf.Print` as its third positional argument.

  Returns:
    The output of `tf.Print(tensor, [fun(tensor)], prefix)`.
  """
  printed_value = tensor if fun is None else fun(tensor)
  return tf.Print(tensor, [printed_value], prefix)
def get_masked_sent_lm_output(bert_config,
                              input_tensor,
                              cur_sent_reps_doc_unmask,
                              sent_masked_positions,
                              sent_masked_weights,
                              debugging=False):
  """Compute the sentence level masked LM loss.

  Masked sentence prediction is treated as a multi-class classification
  problem, analogous to the masked word LM task: for every masked sentence
  position the sentence originally at that position is the positive example,
  and the other co-masked sentences of the same document and of the other
  documents in the batch are the negatives. The loss is a cross entropy over
  these candidates, following the masked word LM loss of the BERT model.

  Args:
    bert_config: BertConfig object. The configuration of the document level
      BERT model; `hidden_size`, `hidden_act` and `initializer_range` are
      read here.
    input_tensor: float Tensor of shape [batch, loop_sent_number_per_doc,
      hidden]. Contextualized sentence representations produced by the
      document level BERT model, i.e. the model prediction.
    cur_sent_reps_doc_unmask: float Tensor of shape [batch,
      loop_sent_number_per_doc, hidden]. Unmasked sentence representations of
      the current document; the source of the ground truth and the negatives.
    sent_masked_positions: int Tensor of shape [batch,
      max_masked_sent_per_doc]. Masked sentence positions per document.
    sent_masked_weights: float Tensor of shape [batch,
      max_masked_sent_per_doc]. Weights of the masked sentence positions.
    debugging: bool. When true, the label ids are routed through `tf.Print`.

  Returns:
    A tuple `(loss, per_example_loss, log_probs)`: the weighted mean loss,
    the loss per masked position with shape [batch * max_predictions_per_seq],
    and the log-softmax scores with shape [batch * max_predictions_per_seq,
    batch * max_predictions_per_seq].
  """
  batch_size = modeling.get_shape_list(input_tensor)[0]
  # For masked sentence prediction, max_predictions_per_seq equals
  # max_masked_sent_per_doc.
  max_predictions_per_seq = modeling.get_shape_list(sent_masked_positions)[1]
  # Size of the "sentence vocabulary": every masked slot in the batch.
  num_candidates = batch_size * max_predictions_per_seq

  # Model predictions at the masked positions,
  # shape [batch * max_predictions_per_seq, hidden].
  predicted_reps = gather_indexes(input_tensor, sent_masked_positions)
  # Ground truth input sentence embeddings at the same positions,
  # shape [batch * max_predictions_per_seq, hidden].
  target_reps = gather_indexes(cur_sent_reps_doc_unmask, sent_masked_positions)

  with tf.variable_scope("cls/sent_predictions", reuse=tf.AUTO_REUSE):
    # One extra non-linear transformation before the output layer. This
    # matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      predicted_reps = tf.layers.dense(
          predicted_reps,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      # Shape stays [batch * max_predictions_per_seq, hidden].
      predicted_reps = modeling.layer_norm(predicted_reps)

    # The output weights are the input sentence embeddings themselves; only
    # an output bias per predicted position is learned here.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[num_candidates],
        initializer=tf.zeros_initializer())

    # Score every prediction against every candidate embedding:
    # [batch * max_predictions_per_seq, batch * max_predictions_per_seq].
    logits = tf.matmul(predicted_reps, target_reps, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Row i is correct for candidate i, so the labels are simply
    # [0, 1, ..., batch * max_predictions_per_seq - 1] and the one hot label
    # matrix has ones on its diagonal only.
    label_ids = tf.range(0, num_candidates, dtype=tf.int32)
    if debugging:
      label_ids = tf.Print(
          label_ids, [label_ids],
          message="label_ids in get_masked_sent_lm_output",
          summarize=30)

    # Flattened sent_masked_weights, shape [batch * max_predictions_per_seq].
    label_weights = tf.reshape(sent_masked_weights, [-1])
    one_hot_labels = tf.one_hot(
        label_ids, depth=num_candidates, dtype=tf.float32)

    # Cross entropy per masked position.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    # Weighted mean; the epsilon guards against an all-zero weight vector.
    weighted_loss_sum = tf.reduce_sum(label_weights * per_example_loss)
    weight_sum = tf.reduce_sum(label_weights) + 1e-5
    loss = weighted_loss_sum / weight_sum
  return (loss, per_example_loss, log_probs)
def _define_collect(batch_env, ppo_hparams, scope, frame_stack_size, eval_phase,
                    sampling_temp, force_beginning_resets,
                    distributional_size=1):
  """Collect trajectories.

  Wraps `batch_env` in `StackWrapper` and `_MemoryWrapper`, then builds the
  graph ops that step the environment with the policy and write every step
  into non-trainable `collect_memory_*` variables.

  Args:
    batch_env: Batch environment.
    ppo_hparams: PPO hparams, defined in tensor2tensor.models.research.rl.
      `epoch_length` and `effective_num_agents` are read here and the object
      is forwarded to `get_policy`.
    scope: var scope.
    frame_stack_size: Number of last observations to feed into the policy.
    eval_phase: Whether this is an evaluation rollout. When true, the
      collection loop runs until the accumulated count of `done` flags
      reaches the number of agents instead of for `epoch_length` steps, and
      the `effective_num_agents` reshaping of the memory is skipped.
    sampling_temp: Sampling temperature for the policy.
    force_beginning_resets: Whether to reset at the beginning of each episode.
    distributional_size: optional, number of buckets in distributional RL.

  Returns:
    A tuple `(memory, summaries, initialization_lambda)`. `memory` is a list
    of tensors (observations, rewards, dones, actions, pdfs,
    values_functions) containing a rollout of environment from nested wrapped
    structure. `summaries` is the merged summary op. `initialization_lambda`
    takes a session and calls `initialize(sess)` on the environment and on
    each wrapper around it.
  """
  epoch_length = ppo_hparams.epoch_length

  # The raw environment and every wrapper around it; all of them get
  # initialized by `initialization_lambda`.
  to_initialize = []
  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    num_agents = batch_env.batch_size

    to_initialize.append(batch_env)
    wrappers = [(StackWrapper, {
        "history": frame_stack_size
    }), (_MemoryWrapper, {})]
    rollout_metadata = None
    speculum = None
    # Apply the wrappers in order; `batch_env` ends up as the outermost one.
    for w in wrappers:
      tf.logging.info("Applying wrapper %s(%s) to env %s." %
                      (str(w[0]), str(w[1]), str(batch_env)))
      batch_env = w[0](batch_env, **w[1])
      to_initialize.append(batch_env)

    rollout_metadata = _rollout_metadata(batch_env, distributional_size)
    speculum = batch_env.speculum

    def initialization_lambda(sess):
      # The loop variable is local to this function; it does not rebind the
      # enclosing `batch_env`.
      for batch_env in to_initialize:
        batch_env.initialize(sess)

    # One non-trainable buffer per rollout field, with a leading
    # `epoch_length` dimension.
    memory = [
        tf.get_variable(  # pylint: disable=g-complex-comprehension
            "collect_memory_%d_%s" % (epoch_length, name),
            shape=[epoch_length] + shape,
            dtype=dtype,
            initializer=tf.zeros_initializer(),
            trainable=False) for (shape, dtype, name) in rollout_metadata
    ]

    cumulative_rewards = tf.get_variable("cumulative_rewards",
                                         len(batch_env),
                                         trainable=False)

    eval_phase_t = tf.convert_to_tensor(eval_phase)
    # Starts as True so that the very first run resets the environments.
    should_reset_var = tf.Variable(True, trainable=False)
    zeros_tensor = tf.zeros(len(batch_env))

  force_beginning_resets = tf.convert_to_tensor(force_beginning_resets)

  def reset_ops_group():
    # Reset every environment and zero the running reward sums.
    return tf.group(batch_env.reset(tf.range(len(batch_env))),
                    tf.assign(cumulative_rewards, zeros_tensor))

  reset_op = tf.cond(
      tf.logical_or(should_reset_var.read_value(), force_beginning_resets),
      reset_ops_group, tf.no_op)

  # Clear the flag only after the (conditional) reset has been evaluated.
  with tf.control_dependencies([reset_op]):
    reset_once_op = tf.assign(should_reset_var, False)

  with tf.control_dependencies([reset_once_op]):

    def step(index, scores_sum, scores_num):
      """Single step."""
      index %= epoch_length  # Only needed in eval runs.
      # Note - the only way to ensure making a copy of tensor is to run simple
      # operation. We are waiting for tf.copy:
      # https://github.com/tensorflow/tensorflow/issues/11186
      obs_copy = batch_env.observ + 0
      value_fun_shape = (num_agents, )
      if distributional_size > 1:
        value_fun_shape = (num_agents, distributional_size)

      def env_step(arg1, arg2, arg3):  # pylint: disable=unused-argument
        """Step of the environment."""
        (logits, value_function) = get_policy(obs_copy, ppo_hparams,
                                              batch_env.action_space,
                                              distributional_size)
        action = common_layers.sample_with_temperature(
            logits, sampling_temp)
        action = tf.cast(action, tf.int32)
        action = tf.reshape(action, shape=(num_agents, ))

        reward, done = batch_env.simulate(action)

        # Probability the policy assigned to the sampled action.
        pdf = tfp.distributions.Categorical(logits=logits).prob(action)
        pdf = tf.reshape(pdf, shape=(num_agents, ))
        value_function = tf.reshape(value_function, shape=value_fun_shape)
        done = tf.reshape(done, shape=(num_agents, ))

        # Make the returned tensors depend on the simulation step.
        with tf.control_dependencies([reward, done]):
          return tf.identity(pdf), tf.identity(value_function), \
                 tf.identity(done)

      # TODO(piotrmilos): while_body is executed at most once,
      # thus should be replaced with tf.cond
      # The environment is stepped only while `speculum` is empty.
      pdf, value_function, top_level_done = tf.while_loop(
          lambda _1, _2, _3: tf.equal(speculum.size(), 0),
          env_step,
          [
              tf.constant(0.0, shape=(num_agents, )),
              tf.constant(0.0, shape=value_fun_shape),
              tf.constant(False, shape=(num_agents, ))
          ],
          parallel_iterations=1,
          back_prop=False,
      )

      with tf.control_dependencies([pdf, value_function]):
        obs, reward, done, action = speculum.dequeue()
        to_save = [obs, reward, done, action, pdf, value_function]
        # Write this step into slot `index` of every memory buffer.
        save_ops = [
            tf.scatter_update(memory_slot, index, value)
            for memory_slot, value in zip(memory, to_save)
        ]
        cumulate_rewards_op = cumulative_rewards.assign_add(reward)

        agent_indices_to_reset = tf.where(top_level_done)[:, 0]
      with tf.control_dependencies([cumulate_rewards_op]):
        # TODO(piotrmilos): possibly we need cumulative_rewards.read_value()
        scores_sum_delta = tf.reduce_sum(
            tf.gather(cumulative_rewards.read_value(), agent_indices_to_reset))
        scores_num_delta = tf.count_nonzero(done, dtype=tf.int32)
      # Reset only after the step is saved and the score deltas are computed.
      with tf.control_dependencies(save_ops +
                                   [scores_sum_delta, scores_num_delta]):
        reset_env_op = batch_env.reset(agent_indices_to_reset)
        reset_cumulative_rewards_op = tf.scatter_update(
            cumulative_rewards, agent_indices_to_reset,
            tf.gather(zeros_tensor, agent_indices_to_reset))
      with tf.control_dependencies(
          [reset_env_op, reset_cumulative_rewards_op]):
        return [
            index + 1, scores_sum + scores_sum_delta,
            scores_num + scores_num_delta
        ]

    def stop_condition(i, _, resets):
      # Eval: continue until `resets` reaches `num_agents`.
      # Otherwise: run exactly `epoch_length` steps.
      return tf.cond(eval_phase_t, lambda: resets < num_agents,
                     lambda: i < epoch_length)

    # Loop state: (step index, sum of scores, number of scores).
    init = [tf.constant(0), tf.constant(0.0), tf.constant(0)]
    index, scores_sum, scores_num = tf.while_loop(stop_condition,
                                                  step,
                                                  init,
                                                  parallel_iterations=1,
                                                  back_prop=False)

  # We handle force_beginning_resets differently. We assume that all envs are
  # reset at the end of the episode (though it happens at the beginning of
  # the next one).
  scores_num = tf.cond(force_beginning_resets,
                       lambda: scores_num + len(batch_env),
                       lambda: scores_num)

  with tf.control_dependencies([scores_sum]):
    scores_sum = tf.cond(
        force_beginning_resets, lambda: scores_sum + tf.reduce_sum(
            cumulative_rewards.read_value()), lambda: scores_sum)

  # Guard against division by zero when no score was recorded.
  mean_score = tf.cond(tf.greater(scores_num, 0),
                       lambda: scores_sum / tf.cast(scores_num, tf.float32),
                       lambda: 0.)
  printing = tf.Print(0, [mean_score, scores_sum, scores_num], "mean_score: ")
  # Read the buffers only after the collection loop and the print op ran.
  with tf.control_dependencies([index, printing]):
    memory = [mem.read_value() for mem in memory]
    # When generating real data together with PPO training we must use single
    # agent. For PPO to work we reshape the history, as if it was generated
    # by real_ppo_effective_num_agents.
    # NOTE(review): `not eval_phase` is evaluated in Python, so `eval_phase`
    # presumably has to be a Python bool here - confirm against callers.
    if ppo_hparams.effective_num_agents is not None and not eval_phase:
      new_memory = []
      effective_num_agents = ppo_hparams.effective_num_agents
      # NOTE(review): the two adjacent literals below concatenate without a
      # separating space ("amongsteffective_num_agents").
      assert epoch_length % ppo_hparams.effective_num_agents == 0, (
          "The rollout of ppo_hparams.epoch_length will be distributed amongst"
          "effective_num_agents of agents")
      new_epoch_length = int(epoch_length / effective_num_agents)
      for mem, info in zip(memory, rollout_metadata):
        shape, _, name = info
        new_shape = [effective_num_agents, new_epoch_length] + shape[1:]
        # Permutation that swaps the first two axes and keeps the rest.
        perm = list(range(len(shape) + 1))
        perm[0] = 1
        perm[1] = 0
        mem = tf.transpose(mem, perm=perm)
        mem = tf.reshape(mem, shape=new_shape)
        mem = tf.transpose(mem,
                           perm=perm,
                           name="collect_memory_%d_%s" %
                           (new_epoch_length, name))
        new_memory.append(mem)
      memory = new_memory

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
      # `str` called without arguments returns "", presumably standing in for
      # an empty serialized summary when there is no score - TODO confirm.
      mean_score_summary = tf.cond(
          tf.greater(scores_num, 0),
          lambda: tf.summary.scalar("mean_score_this_iter", mean_score), str)
      summaries = tf.summary.merge([
          mean_score_summary,
          tf.summary.scalar("episodes_finished_this_iter", scores_num)
      ])
      return memory, summaries, initialization_lambda
def main(_):
  """Build the style transfer evaluation graph and run the slim eval loop.

  Depending on flags the graph contains an image summary with a grid of
  stylized images (`FLAGS.style_grid`) and/or streaming means of the
  per-style losses (`FLAGS.learning_curves`).

  Raises:
    ValueError: if the number of style coefficients differs from
      `FLAGS.num_styles`.
  """
  with tf.Graph().as_default():
    # Create inputs in [0, 1], as expected by vgg_16.
    inputs, _ = image_utils.imagenet_inputs(FLAGS.batch_size,
                                            FLAGS.image_size)
    evaluation_images = image_utils.load_evaluation_images(FLAGS.image_size)

    # Process style and weight flags.
    if FLAGS.style_coefficients is not None:
      style_coefficients = ast.literal_eval(FLAGS.style_coefficients)
    else:
      style_coefficients = [1.0] * FLAGS.num_styles
    if len(style_coefficients) != FLAGS.num_styles:
      raise ValueError(
          'number of style coefficients differs from number of styles')
    content_weights = ast.literal_eval(FLAGS.content_weights)
    style_weights = ast.literal_eval(FLAGS.style_weights)

    # Load style images.
    style_images, labels, style_gram_matrices = image_utils.style_image_inputs(
        os.path.expanduser(FLAGS.style_dataset_file),
        batch_size=FLAGS.num_styles,
        image_size=FLAGS.image_size,
        square_crop=True,
        shuffle=False)
    labels = tf.unstack(labels)

    def _create_normalizer_params(style_label):
      """Creates normalizer parameters from a style label."""
      normalizer_params = {
          'labels': tf.expand_dims(style_label, 0),
          'num_categories': FLAGS.num_styles,
          'center': True,
          'scale': True,
      }
      return normalizer_params

    def _transform(images, style_label, reuse):
      """Applies the transformer network conditioned on one style label."""
      return model.transform(
          images,
          alpha=FLAGS.alpha,
          reuse=reuse,
          normalizer_params=_create_normalizer_params(style_label))

    # Dummy call to simplify the reuse logic: every later call can reuse.
    _transform(inputs, labels[0], False)

    def _style_sweep(inputs):
      """Transfers all styles onto the input one at a time."""
      inputs = tf.expand_dims(inputs, 0)
      stylized_inputs = [
          _transform(inputs, style_label, True) for style_label in labels
      ]
      return tf.concat([inputs] + stylized_inputs, 0)

    if FLAGS.style_grid:
      blank_cell = tf.ones([1, FLAGS.image_size, FLAGS.image_size, 3])
      style_row = tf.concat([blank_cell, style_images], 0)
      stylized_training_example = _style_sweep(inputs[0])
      stylized_evaluation_images = [
          _style_sweep(image) for image in tf.unstack(evaluation_images)
      ]
      stylized_noise = _style_sweep(
          tf.random_uniform([FLAGS.image_size, FLAGS.image_size, 3]))
      stylized_style_images = [
          _style_sweep(image) for image in tf.unstack(style_images)
      ]

      # Header row, training example and noise, then the evaluation images;
      # with style crossover the stylized style images are appended as well.
      grid_rows = ([style_row, stylized_training_example, stylized_noise] +
                   stylized_evaluation_images)
      num_grid_rows = 3 + evaluation_images.get_shape().as_list()[0]
      if FLAGS.style_crossover:
        grid_rows = grid_rows + stylized_style_images
        num_grid_rows = num_grid_rows + FLAGS.num_styles
      grid = tf.concat(grid_rows, 0)
      grid_shape = [num_grid_rows, 1 + FLAGS.num_styles]

      image_grid = image_utils.form_image_grid(
          grid, grid_shape, [FLAGS.image_size, FLAGS.image_size], 3)
      tf.summary.image('Style Grid', tf.cast(image_grid * 255.0, tf.uint8))

    if not FLAGS.learning_curves:
      eval_op = None
      num_evals = 1
    else:
      metrics = {}
      for i, label in enumerate(labels):
        # Keep only the i-th style's slice of every gram matrix.
        gram_matrices = {
            key: value[i:i + 1]
            for key, value in style_gram_matrices.items()
        }
        stylized_inputs = _transform(inputs, label, True)
        _, loss_dict = learning.total_loss(inputs,
                                           stylized_inputs,
                                           gram_matrices,
                                           content_weights,
                                           style_weights,
                                           reuse=i > 0)
        for key, value in loss_dict.items():
          metric_name = '{}_style_{}'.format(key, i)
          metrics[metric_name] = slim.metrics.streaming_mean(value)

      names_values, names_updates = slim.metrics.aggregate_metric_map(metrics)
      for name, value in names_values.items():
        # Register a printing version of each scalar summary.
        summary_op = tf.summary.scalar(name, value, [])
        print_op = tf.Print(summary_op, [value], name)
        tf.add_to_collection(tf.GraphKeys.SUMMARIES, print_op)
      eval_op = list(names_updates.values())
      num_evals = FLAGS.num_evals

    slim.evaluation.evaluation_loop(
        master=FLAGS.master,
        checkpoint_dir=os.path.expanduser(FLAGS.train_dir),
        logdir=os.path.expanduser(FLAGS.eval_dir),
        eval_op=eval_op,
        num_evals=num_evals,
        eval_interval_secs=FLAGS.eval_interval_secs)
import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# Create the graph: a single scalar variable x and the function f(x) = x^2.
tf.reset_default_graph()
x = tf.get_variable('x', shape=(), dtype=tf.float32)
f = x ** 2

# Logging with tf.Print: every evaluation of f also prints x and f.
f = tf.Print(f, [x, f], "x, f:")

# Say we want to minimize the function f.
optimizer = tf.train.GradientDescentOptimizer(0.1)
step = optimizer.minimize(f)

# Variables are trainable by default (the `trainable` argument of
# tf.get_variable), so there is no need to specify it again. The trainable
# variables can be listed with tf.trainable_variables().

# Making gradient descent steps: create a session and initialize the
# variables. The try/finally guarantees the session is closed even when a
# step raises.
sess = tf.InteractiveSession()
try:
  sess.run(tf.global_variables_initializer())

  # Say we want to take 10 gradient descent steps.
  for _ in range(10):
    # `step` is an op without an output, so the first printed element is None.
    print(sess.run([step, f]))
finally:
  # Close the session: safe practice.
  sess.close()
def inception_model_fn(features, labels, mode, params):
  """Inception v4 model using Estimator API.

  Args:
    features: Input tensor, or a dict whose 'feature' entry is the input
      tensor.
    labels: Label tensor; converted to one hot labels with
      `FLAGS.num_classes` classes.
    mode: One of the `tf.estimator.ModeKeys` values.
    params: Dict of parameters; 'model_transpose_dims' is read here and
      passed to `tensor_transform_fn`.

  Returns:
    A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
    `contrib_tpu.TPUEstimatorSpec`.

  Raises:
    ValueError: if `FLAGS.precision` or `FLAGS.optimizer` has an unsupported
      value.
  """
  num_classes = FLAGS.num_classes
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)
  is_eval = (mode == tf.estimator.ModeKeys.EVAL)

  if isinstance(features, dict):
    features = features['feature']

  features = tensor_transform_fn(features, params['model_transpose_dims'])

  # This nested function allows us to avoid duplicating the logic which
  # builds the network, for different values of --precision.
  def build_network():
    if FLAGS.precision == 'bfloat16':
      with contrib_tpu.bfloat16_scope():
        logits, end_points = inception.inception_v4(
            features, num_classes, is_training=is_training)
      logits = tf.cast(logits, tf.float32)
    elif FLAGS.precision == 'float32':
      logits, end_points = inception.inception_v4(
          features, num_classes, is_training=is_training)
    else:
      # Without this branch an unknown precision would surface as an
      # unbound-local error on `logits`.
      raise ValueError('Unknown precision: %s' % FLAGS.precision)
    return logits, end_points

  if FLAGS.clear_update_collections:
    # Batch norm updates are applied in place (updates_collections=None) and
    # the arg scope's own weight decay is disabled.
    with arg_scope(
        inception.inception_v4_arg_scope(
            weight_decay=0.0,
            batch_norm_decay=BATCH_NORM_DECAY,
            batch_norm_epsilon=BATCH_NORM_EPSILON,
            updates_collections=None)):
      logits, end_points = build_network()
  else:
    with arg_scope(
        inception.inception_v4_arg_scope(
            batch_norm_decay=BATCH_NORM_DECAY,
            batch_norm_epsilon=BATCH_NORM_EPSILON)):
      logits, end_points = build_network()

  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        })

  if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
      not FLAGS.use_tpu):
    # Print the predictions before the labels by making the label print op
    # depend on the prediction print op.
    with tf.control_dependencies([
        tf.Print(predictions['classes'], [predictions['classes']],
                 summarize=FLAGS.eval_batch_size,
                 message='prediction: ')
    ]):
      labels = tf.Print(labels, [labels],
                        summarize=FLAGS.eval_batch_size,
                        message='label: ')

  one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

  if 'AuxLogits' in end_points:
    tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                    logits=tf.cast(end_points['AuxLogits'],
                                                   tf.float32),
                                    weights=0.4,
                                    label_smoothing=0.1,
                                    scope='aux_loss')

  tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                  logits=logits,
                                  weights=1.0,
                                  label_smoothing=0.1)

  losses = tf.add_n(tf.losses.get_losses())
  # Explicit L2 regularization over the non-BatchNorm 'weights' variables.
  l2_loss = []
  for v in tf.trainable_variables():
    tf.logging.info(v.name)
    if 'BatchNorm' not in v.name and 'weights' in v.name:
      l2_loss.append(tf.nn.l2_loss(v))
  tf.logging.info(len(l2_loss))
  loss = losses + WEIGHT_DECAY * tf.add_n(l2_loss)

  initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
  # Adjust the initial learning rate for warmup
  initial_learning_rate /= (
      FLAGS.learning_rate_decay**((FLAGS.warmup_epochs + FLAGS.cold_epochs) /
                                  FLAGS.learning_rate_decay_epochs))
  final_learning_rate = 0.0001 * initial_learning_rate

  host_call = None
  train_op = None
  if is_training:
    batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    global_step = tf.train.get_or_create_global_step()
    current_epoch = tf.cast(
        (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

    clr = FLAGS.cold_learning_rate
    wlr = initial_learning_rate / (FLAGS.warmup_epochs + FLAGS.cold_epochs)
    # Three phases: a constant cold rate, then a warmup that grows linearly
    # with the epoch, then a staircase exponential decay.
    learning_rate = tf.where(
        tf.greater_equal(current_epoch, FLAGS.cold_epochs),
        (tf.where(
            tf.greater_equal(current_epoch,
                             FLAGS.warmup_epochs + FLAGS.cold_epochs),
            tf.train.exponential_decay(
                learning_rate=initial_learning_rate,
                global_step=global_step,
                decay_steps=int(
                    FLAGS.learning_rate_decay_epochs * batches_per_epoch),
                decay_rate=FLAGS.learning_rate_decay,
                staircase=True),
            tf.multiply(tf.cast(current_epoch, tf.float32), wlr))), clr)

    # Set a minimum boundary for the learning rate.
    learning_rate = tf.maximum(learning_rate,
                               final_learning_rate,
                               name='learning_rate')

    if FLAGS.optimizer == 'sgd':
      tf.logging.info('Using SGD optimizer')
      optimizer = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate)
    elif FLAGS.optimizer == 'momentum':
      tf.logging.info('Using Momentum optimizer')
      optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                             momentum=0.9)
    elif FLAGS.optimizer == 'RMS':
      tf.logging.info('Using RMS optimizer')
      optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                            RMSPROP_DECAY,
                                            momentum=RMSPROP_MOMENTUM,
                                            epsilon=RMSPROP_EPSILON)
    else:
      # The message needs a %s placeholder for its argument; raising keeps
      # the failure from resurfacing later as an unbound `optimizer`.
      tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)
      raise ValueError('Unknown optimizer: %s' % FLAGS.optimizer)

    if FLAGS.use_tpu:
      optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    # Run the collected update ops together with the training step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step=global_step)
    if FLAGS.moving_average:
      ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY,
                                              num_updates=global_step)
      variables_to_average = (tf.trainable_variables() +
                              tf.moving_average_variables())
      with tf.control_dependencies([train_op
                                   ]), tf.name_scope('moving_average'):
        train_op = ema.apply(variables_to_average)

    # To log the loss, current learning rate, and epoch for Tensorboard, the
    # summary op needs to be run on the host CPU via host_call. host_call
    # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
    # dimension. These Tensors are implicitly concatenated to
    # [params['batch_size']].
    gs_t = tf.reshape(global_step, [1])
    loss_t = tf.reshape(loss, [1])
    lr_t = tf.reshape(learning_rate, [1])
    ce_t = tf.reshape(current_epoch, [1])

    if not FLAGS.skip_host_call:

      def host_call_fn(gs, loss, lr, ce):
        """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the
        second element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
        gs = gs[0]
        with summary.create_file_writer(FLAGS.model_dir).as_default():
          with summary.always_record_summaries():
            summary.scalar('loss', tf.reduce_mean(loss), step=gs)
            summary.scalar('learning_rate', tf.reduce_mean(lr), step=gs)
            summary.scalar('current_epoch', tf.reduce_mean(ce), step=gs)

            return summary.all_summary_ops()

      host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

  eval_metrics = None
  if is_eval:

    def metric_fn(labels, logits):
      """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch, ]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
      predictions = tf.argmax(logits, axis=1)
      top_1_accuracy = tf.metrics.accuracy(labels, predictions)
      in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
      top_5_accuracy = tf.metrics.mean(in_top_5)

      return {
          'accuracy': top_1_accuracy,
          'accuracy@5': top_5_accuracy,
      }

    eval_metrics = (metric_fn, [labels, logits])

  return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      host_call=host_call,
                                      eval_metrics=eval_metrics)
def debugprint(x, name=''):
  """Wrap `x` in a `tf.Print` op that reports its min, mean and max.

  Args:
    x: Tensor to pass through; its `name` attribute is appended to the
      printed message.
    name: Optional prefix for the message, separated from `x.name` by a tab.

  Returns:
    The output of the `tf.Print` op wrapping `x`.
  """
  message = name + '\t' + x.name
  summary_stats = [tf.reduce_min(x), tf.reduce_mean(x), tf.reduce_max(x)]
  return tf.Print(x, summary_stats, message)
def fail_push():
  """Build the op that records a failed push.

  `failed_push` comes from the enclosing scope; presumably it is a counter
  variable, since `assign_add` is called on it - verify at the definition.

  Returns:
    A `tf.group` op named "fail_push" that increments `failed_push` by one
    and prints its value with the message "Failed to push".
  """
  report_failure = tf.Print(failed_push, [failed_push], "Failed to push")
  bump_counter = failed_push.assign_add(1)
  return tf.group(bump_counter, report_failure, name="fail_push")
def build_smith_dual_encoder(dual_encoder_config, train_mode, is_training, input_ids_1, input_mask_1, masked_lm_positions_1, masked_lm_ids_1, masked_lm_weights_1, input_ids_2, input_mask_2, masked_lm_positions_2, masked_lm_ids_2, masked_lm_weights_2, use_one_hot_embeddings, documents_match_labels, debugging=False): """Build the dual encoder SMITH model. Args: dual_encoder_config: the configuration file for the dual encoder model. train_mode: string. The train mode of the current. It can be finetune, pretrain or joint_train. is_training: bool. Whether it in training mode. input_ids_1: int Tensor with shape [batch, max_seq_length]. The input ids of input examples of text 1. input_mask_1: int Tensor with shape [batch, max_seq_length]. The input masks of input examples of text 1. masked_lm_positions_1: int Tensor with shape [batch, max_predictions_per_seq]. The input masked LM prediction positions of input examples of text 1. This can be useful to compute the masked word prediction LM loss. masked_lm_ids_1: int Tensor with shape [batch, max_predictions_per_seq]. The input masked LM prediction ids of input examples of text 1. It is the ground truth in the masked word LM prediction task. This can be useful to compute the masked word prediction LM loss. masked_lm_weights_1: float Tensor with shape [batch, max_predictions_per_seq]. The input masked LM prediction weights of input examples of text 1. input_ids_2: int Tensor with shape [batch, max_seq_length]. The input ids of input examples of text 2. input_mask_2: int Tensor with shape [batch, max_seq_length]. The input masks of input examples of text 2. masked_lm_positions_2: int Tensor with shape [batch, max_predictions_per_seq]. The input masked LM prediction positions of input examples of text 2. This can be useful to compute the masked word prediction LM loss. masked_lm_ids_2: int Tensor with shape [batch, max_predictions_per_seq]. The input masked LM prediction ids of input examples of text 2. 
It is the ground truth in the masked word LM prediction task. This can be useful to compute the masked word prediction LM loss. masked_lm_weights_2: float Tensor with shape [batch, max_predictions_per_seq]. The input masked LM prediction weights of input examples of text 2. use_one_hot_embeddings: bool. Whether use one hot embeddings. documents_match_labels: float Tensor with shape [batch]. The ground truth labels for the input examples. debugging: bool. Whether it is in the debugging mode. Returns: The masked LM loss, per example LM loss, masked sentence LM loss, per example masked sentence LM loss, sequence representations, text matching loss, per example text matching loss, text matching logits, text matching probabilities and text matching log probabilities. Raises: ValueError: if the doc_rep_combine_mode in dual_encoder_config is invalid. """ bert_config = modeling.BertConfig.from_json_file( dual_encoder_config.encoder_config.bert_config_file) doc_bert_config = modeling.BertConfig.from_json_file( dual_encoder_config.encoder_config.doc_bert_config_file) (input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, masked_lm_loss_doc_1, masked_lm_loss_doc_2, masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, masked_lm_weights_doc_1, masked_lm_weights_doc_2) = layers.learn_sent_reps_normal_loop( dual_encoder_config, is_training, train_mode, input_ids_1, input_mask_1, masked_lm_positions_1, masked_lm_ids_1, masked_lm_weights_1, input_ids_2, input_mask_2, masked_lm_positions_2, masked_lm_ids_2, masked_lm_weights_2, use_one_hot_embeddings) if debugging: input_mask_doc_level_1_tensor = tf.Print( input_mask_doc_level_1_tensor, [input_mask_doc_level_1_tensor, input_mask_doc_level_2_tensor], message="input_mask_doc_level_1_tensor in build_smith_dual_encoder", summarize=30) if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: batch_size_static = ( 
dual_encoder_config.train_eval_config.train_batch_size if is_training else dual_encoder_config.train_eval_config.eval_batch_size) # Generates the sentence masked document represenations. with tf.variable_scope("mask_sent_in_doc", reuse=tf.AUTO_REUSE): # Randomly initialize a masked sentence vector and reuse it. # We also need to return the masked sentence position index to get the # ground truth labels for the masked positions. The shape of # sent_mask_embedding is [hidden]. sent_mask_embedding = tf.get_variable( name="sentence_mask_embedding", shape=[bert_config.hidden_size], initializer=tf.truncated_normal_initializer( stddev=bert_config.initializer_range)) # Output Shape: [batch, loop_sent_number_per_doc, hidden]. (input_sent_reps_doc_1_masked, masked_sent_index_1, masked_sent_weight_1) = layers.get_doc_rep_with_masked_sent( input_sent_reps_doc=input_sent_reps_doc_1_unmask, sent_mask_embedding=sent_mask_embedding, input_mask_doc_level=input_mask_doc_level_1_tensor, batch_size_static=batch_size_static, max_masked_sent_per_doc=dual_encoder_config.encoder_config. max_masked_sent_per_doc, loop_sent_number_per_doc=dual_encoder_config.encoder_config. loop_sent_number_per_doc) (input_sent_reps_doc_2_masked, masked_sent_index_2, masked_sent_weight_2) = layers.get_doc_rep_with_masked_sent( input_sent_reps_doc=input_sent_reps_doc_2_unmask, sent_mask_embedding=sent_mask_embedding, input_mask_doc_level=input_mask_doc_level_2_tensor, batch_size_static=batch_size_static, max_masked_sent_per_doc=dual_encoder_config.encoder_config. max_masked_sent_per_doc, loop_sent_number_per_doc=dual_encoder_config.encoder_config. loop_sent_number_per_doc) # Learn the document representations based on masked sentence embeddings. # Note that the variables in the DocBert model are not within the # "mask_sent_in_doc" variable scope. 
model_doc_1 = modeling.DocBertModel( config=doc_bert_config, is_training=is_training, input_reps=input_sent_reps_doc_1_masked, input_mask=input_mask_doc_level_1_tensor) model_doc_2 = modeling.DocBertModel( config=doc_bert_config, is_training=is_training, input_reps=input_sent_reps_doc_2_masked, input_mask=input_mask_doc_level_2_tensor) # Shape of masked_sent_lm_loss_1 [1]. # Shape of masked_sent_lm_example_loss_1 is [batch * # max_predictions_per_seq]. (masked_sent_lm_loss_1, masked_sent_per_example_loss_1, _) = layers.get_masked_sent_lm_output( doc_bert_config, model_doc_1.get_sequence_output(), input_sent_reps_doc_1_unmask, masked_sent_index_1, masked_sent_weight_1) (masked_sent_lm_loss_2, masked_sent_per_example_loss_2, _) = layers.get_masked_sent_lm_output( doc_bert_config, model_doc_2.get_sequence_output(), input_sent_reps_doc_2_unmask, masked_sent_index_2, masked_sent_weight_2) else: # Learn the document representations based on unmasked sentence embeddings. model_doc_1 = modeling.DocBertModel( config=doc_bert_config, is_training=is_training, input_reps=input_sent_reps_doc_1_unmask, input_mask=input_mask_doc_level_1_tensor) model_doc_2 = modeling.DocBertModel( config=doc_bert_config, is_training=is_training, input_reps=input_sent_reps_doc_2_unmask, input_mask=input_mask_doc_level_2_tensor) masked_sent_lm_loss_1 = 0 masked_sent_lm_loss_2 = 0 masked_sent_per_example_loss_1 = tf.zeros(1) masked_sent_per_example_loss_2 = tf.zeros(1) masked_sent_weight_1 = tf.zeros(1) masked_sent_weight_2 = tf.zeros(1) with tf.variable_scope("seq_rep_from_bert_doc_dense", reuse=tf.AUTO_REUSE): normalized_doc_rep_1 = layers.get_seq_rep_from_bert(model_doc_1) normalized_doc_rep_2 = layers.get_seq_rep_from_bert(model_doc_2) # We also dump the contextualized sentence embedding output by document # level Transformer model. These representations maybe useful for sentence # level tasks. 
output_sent_reps_doc_1 = model_doc_1.get_sequence_output() output_sent_reps_doc_2 = model_doc_2.get_sequence_output() # Here we support multiple modes to generate the final document # representations based on the word/sentence/document level representations # 1. normal: only use the document level representation as the final document # representations. # 2. sum_concat: firstly compute the sum of all sentence level repsentations. # Then concatenate the sum vector with the document level representations. # 3. mean_concat: firstly compute the mean of all sentence level # repsentations. Then concatenate the mean vector with the document level # representations. # 4. attention: firstly compute the weighted sum of sentence level # representations with attention mechanism, then concatenate the weighted sum # vector with the document level representations. # The document level mask is to indicate whether each sentence is # a real sentence (1) or a paded sentence (0). The shape of # input_mask_doc_level_1_tensor is [batch, max_doc_length_by_sentence]. The # shape of input_sent_reps_doc_1_unmask is # [batch, max_doc_length_by_sentence, hidden]. final_doc_rep_combine_mode = dual_encoder_config.encoder_config.doc_rep_combine_mode if final_doc_rep_combine_mode == constants.DOC_COMBINE_NORMAL: final_doc_rep_1 = normalized_doc_rep_1 final_doc_rep_2 = normalized_doc_rep_2 elif final_doc_rep_combine_mode == constants.DOC_COMBINE_SUM_CONCAT: # Output Shape: [batch, 2*hidden]. 
final_doc_rep_1 = tf.concat([ tf.reduce_sum(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1 ], axis=1) final_doc_rep_2 = tf.concat([ tf.reduce_sum(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2 ], axis=1) elif final_doc_rep_combine_mode == constants.DOC_COMBINE_MEAN_CONCAT: final_doc_rep_1 = tf.concat([ tf.reduce_mean(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1 ], axis=1) final_doc_rep_2 = tf.concat([ tf.reduce_mean(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2 ], axis=1) elif final_doc_rep_combine_mode == constants.DOC_COMBINE_ATTENTION: final_doc_rep_1 = tf.concat([ layers.get_attention_weighted_sum( input_sent_reps_doc_1_unmask, bert_config, is_training, dual_encoder_config.encoder_config. doc_rep_combine_attention_size), normalized_doc_rep_1 ], axis=1) final_doc_rep_2 = tf.concat([ layers.get_attention_weighted_sum( input_sent_reps_doc_2_unmask, bert_config, is_training, dual_encoder_config.encoder_config. doc_rep_combine_attention_size), normalized_doc_rep_2 ], axis=1) else: raise ValueError( "Only normal, sum_concat, mean_concat and attention are" " supported: %s" % final_doc_rep_combine_mode) (siamese_loss, siamese_example_loss, siamese_logits) = loss_fns.get_prediction_loss_cosine( input_tensor_1=final_doc_rep_1, input_tensor_2=final_doc_rep_2, labels=documents_match_labels, similarity_score_amplifier=dual_encoder_config.loss_config. similarity_score_amplifier, neg_to_pos_example_ratio=dual_encoder_config.train_eval_config. neg_to_pos_example_ratio) # The shape of masked_lm_loss_doc is [1]. # The shape of masked_lm_example_loss_doc is [batch * max_predictions_per_seq, # max_doc_length_by_sentence]. 
return (masked_lm_loss_doc_1, masked_lm_loss_doc_2, masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, masked_lm_weights_doc_1, masked_lm_weights_doc_2, masked_sent_lm_loss_1, masked_sent_lm_loss_2, masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, masked_sent_weight_1, masked_sent_weight_2, final_doc_rep_1, final_doc_rep_2, input_sent_reps_doc_1_unmask, input_sent_reps_doc_2_unmask, output_sent_reps_doc_1, output_sent_reps_doc_2, siamese_loss, siamese_example_loss, siamese_logits)
def compute_teacher_loss(log_q, reward, baseline, std):
    """Return the mean of ``-log_q`` weighted by the absolute scaled advantage.

    Args:
        log_q: tensor multiplied (negated) with the advantage weight.
        reward: reward term of the advantage.
        baseline: value subtracted from ``reward``.
        std: divisor applied to ``reward - baseline``.

    Returns:
        Scalar tensor ``reduce_mean(-log_q * |(reward - baseline) / std|)``.
    """
    # The weight is wrapped in stop_gradient, so gradients reach the loss
    # only through log_q.
    weight = tf.stop_gradient(tf.abs((reward - baseline) / std))
    # Debug hook: prints log_q every time the loss is evaluated.
    traced_log_q = tf.Print(log_q, [log_q], "log_q: ")
    return tf.reduce_mean(-traced_log_q * weight)
def main(_):
    """Evaluate a checkpoint once over the configured dataset split.

    Builds the input pipeline, the network chosen by ``FLAGS.model_name`` and
    two streaming metrics in a fresh graph, then hands everything to
    ``slim.evaluation.evaluate_once``.

    Raises:
        ValueError: if ``FLAGS.dataset_dir`` is empty.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        global_step = slim.get_or_create_global_step()

        # --- Dataset and model selection ---
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        # --- Data provider: unshuffled read of (image, label) pairs ---
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        image, label = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        # --- Preprocessing; falls back to the model name / default size ---
        preprocessing_fn = preprocessing_factory.get_preprocessing(
            FLAGS.preprocessing_name or FLAGS.model_name,
            is_training=False,
            use_grayscale=FLAGS.use_grayscale)
        side = FLAGS.eval_image_size or network_fn.default_image_size
        image = preprocessing_fn(image, side, side)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        # --- Model definition ---
        logits, _ = network_fn(images)

        if FLAGS.quantize:
            contrib_quantize.create_eval_graph()

        if FLAGS.moving_average_decay:
            # Restore the moving-average shadows in place of the raw
            # model variables, plus the global step itself.
            averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
            variables_to_restore = averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[global_step.op.name] = global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # --- Metrics ---
        metric_values, metric_updates = slim.metrics.aggregate_metric_map({
            'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
            'Recall_5': slim.metrics.streaming_recall_at_k(logits, labels, 5),
        })

        # Each summary is wrapped in tf.Print so its value is also printed.
        for name, value in metric_values.items():
            summary_name = 'eval/%s' % name
            summary_op = tf.summary.scalar(
                summary_name, value, collections=[])
            summary_op = tf.Print(summary_op, [value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary_op)

        # TODO(sguada) use num_epochs=1
        # Without an explicit cap, make a single pass over all of the data.
        num_batches = (
            FLAGS.max_num_batches
            if FLAGS.max_num_batches
            else math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

        # A directory means "use its latest checkpoint"; otherwise the
        # flag value is used as the checkpoint path directly.
        checkpoint_path = (
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)
            if tf.gfile.IsDirectory(FLAGS.checkpoint_path)
            else FLAGS.checkpoint_path)

        tf.logging.info('Evaluating %s' % checkpoint_path)

        slim.evaluation.evaluate_once(
            master=FLAGS.master,
            checkpoint_path=checkpoint_path,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(metric_updates.values()),
            variables_to_restore=variables_to_restore)
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    '''Return yolo_loss tensor

    Parameters
    ----------
    args: list of tensors; the first ``len(anchors) // 3`` entries are taken
        as yolo_outputs (the output of yolo_body or tiny_yolo_body) and the
        remaining entries as y_true (the output of preprocess_true_boxes)
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, the iou threshold whether to ignore object
        confidence loss
    print_loss: bool, when True the running loss and its per-layer
        components are printed through tf.Print

    Returns
    -------
    loss: tensor, shape=(1,)

    '''
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs, y_true = args[:num_layers], args[num_layers:]
    if num_layers == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [1, 2, 3]]

    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
    true_dtype = K.dtype(y_true[0])
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
        for idx in range(num_layers)
    ]
    batch_size = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    batch_size_f = K.cast(batch_size, K.dtype(yolo_outputs[0]))

    loss = 0
    for idx in range(num_layers):
        truth = y_true[idx]
        layer_anchors = anchors[anchor_mask[idx]]
        object_mask = truth[..., 4:5]
        true_class_probs = truth[..., 5:]

        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[idx], layer_anchors, num_classes, input_shape,
            calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = truth[..., :2] * grid_shapes[idx][::-1] - grid
        raw_true_wh = K.log(
            truth[..., 2:4] / layer_anchors * input_shape[::-1])
        # Cells without an object get 0 instead of log(0) = -inf.
        raw_true_wh = K.switch(
            object_mask, raw_true_wh, K.zeros_like(raw_true_wh))
        box_loss_scale = 2 - truth[..., 2:3] * truth[..., 3:4]

        # Find ignore mask, iterate over each of batch.
        mask_array = tf.TensorArray(true_dtype, size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, mask_array):
            # Compare every predicted box of sample b with its true boxes;
            # the mask is 1 where the best IoU is below ignore_thresh.
            true_box = tf.boolean_mask(
                truth[b, ..., 0:4], object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            below_thresh = K.cast(
                best_iou < ignore_thresh, K.dtype(true_box))
            return b + 1, mask_array.write(b, below_thresh)

        _, mask_array = K.control_flow_ops.while_loop(
            lambda b, *_: b < batch_size, loop_body, [0, mask_array])
        ignore_mask = K.expand_dims(mask_array.stack(), -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
            raw_true_wh - raw_pred[..., 2:4])
        conf_bce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)
        confidence_loss = (object_mask * conf_bce +
                           (1 - object_mask) * conf_bce * ignore_mask)
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        # Each component is summed and divided by the batch size.
        xy_loss = K.sum(xy_loss) / batch_size_f
        wh_loss = K.sum(wh_loss) / batch_size_f
        confidence_loss = K.sum(confidence_loss) / batch_size_f
        class_loss = K.sum(class_loss) / batch_size_f

        layer_loss = xy_loss + wh_loss + confidence_loss + class_loss
        loss = loss + layer_loss
        if print_loss:
            loss = tf.Print(
                loss,
                [loss, xy_loss, wh_loss, confidence_loss, class_loss,
                 K.sum(ignore_mask)],
                message='loss: ')
    return loss
def debug_tensor(s, msg=None, summarize=10):
    """Wrap a tensor so its shape and value are printed when evaluated.

    Args:
        s: tensor to inspect.
        msg: prefix for the printed line; ``s.name`` is used when falsy.
        summarize: forwarded to ``tf.Print`` as ``summarize``.

    Returns:
        The new tensor produced by ``tf.Print``.
    """
    prefix = msg if msg else s.name
    return tf.Print(s, [tf.shape(s), s], prefix + " ", summarize=summarize)
# Script fragment: finishes building a recurrent CTC model on top of an
# existing tensor `x`, loads trained weights, then starts a loop over
# `phone_list`. `x`, `input_tensor`, `n_class`, `n_len`, `model_path`,
# `ctc_lambda_func` and `phone_list` are defined outside this fragment.

# Flatten every time step, then apply two stacked bidirectional LSTM layers.
x = TimeDistributed(Flatten())(x)
rnn_size = 128
x = Bidirectional(
    RNN(LSTMCell(rnn_size, recurrent_activation='sigmoid'),
        return_sequences=True))(x)
x = Bidirectional(
    RNN(LSTMCell(rnn_size, recurrent_activation='sigmoid'),
        return_sequences=True))(x)
# Per-time-step softmax over n_class outputs.
x = Dense(n_class, activation='softmax')(x)
# Model without the loss head.
base_model = Model(inputs=input_tensor, outputs=x)

# Extra inputs consumed by the CTC loss lambda.
labels = Input(name='the_labels', shape=[n_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')
# NOTE(review): the message says "shape of output", but the printed data is
# the tensor `x` itself, not tf.shape(x) -- confirm which was intended.
print_node = tf.Print(x, [x], "shape of output")
# The loss is computed inside a Lambda layer fed with the printed output.
loss_out = Lambda(ctc_lambda_func, output_shape=(1, ),
                  name='ctc')([print_node, labels, input_length, label_length])
model = Model(inputs=[input_tensor, labels, input_length, label_length],
              outputs=loss_out)
# The weights file name is appended directly to model_path (no separator is
# added here).
model.load_weights(model_path + 'ctc_best.h5')

final_notification_info = ''
for phone_index, phone in enumerate(phone_list):
    table = None
    out = None
    # Loops until `table` is set; the loop body is not part of this fragment.
    while table is None:
def _policy_loss(self, mean, logstd, old_mean, old_logstd, action, advantage,
                 length):
    """Build the policy loss and its summaries.

    The loss is the sum of three per-sequence terms:

    1. An importance-sampled surrogate: the probability ratio between the
       current and the behavioral policy times the (gradient-stopped)
       advantage.
    2. A KL penalty between the behavioral and the current policy, scaled
       by ``self._penalty``.
    3. A quadratic cutoff penalty that is non-zero only where the KL exceeds
       ``kl_target * kl_cutoff_factor``.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('policy_loss'):
        entropy = utility.diag_normal_entropy(mean, logstd)

        # Masked by sequence length, then averaged over axis 1.
        masked_kl = self._mask(
            utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
            length)
        kl = tf.reduce_mean(masked_kl, 1)

        # Probability ratio new / old, computed in log space.
        log_ratio = (
            utility.diag_normal_logpdf(mean, logstd, action) -
            utility.diag_normal_logpdf(old_mean, old_logstd, action))
        policy_gradient = tf.exp(log_ratio)
        weighted = self._mask(
            policy_gradient * tf.stop_gradient(advantage), length)
        surrogate_loss = -tf.reduce_mean(weighted, 1)

        kl_penalty = self._penalty * kl

        config = self._config
        cutoff_threshold = config.kl_target * config.kl_cutoff_factor
        cutoff_count = tf.reduce_sum(
            tf.cast(kl > cutoff_threshold, tf.int32))
        # Prints a notice whenever at least one sequence crosses the cutoff;
        # the other branch just yields 0.
        cutoff_notice = tf.cond(
            cutoff_count > 0,
            lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '),
            lambda: 0)
        with tf.control_dependencies([cutoff_notice]):
            over_cutoff = tf.cast(kl > cutoff_threshold, tf.float32)
            kl_cutoff = (
                config.kl_cutoff_coef * over_cutoff *
                (kl - cutoff_threshold)**2)

        policy_loss = surrogate_loss + kl_penalty + kl_cutoff
        summary = tf.summary.merge([
            tf.summary.histogram('entropy', entropy),
            tf.summary.histogram('kl', kl),
            tf.summary.histogram('surrogate_loss', surrogate_loss),
            tf.summary.histogram('kl_penalty', kl_penalty),
            tf.summary.histogram('kl_cutoff', kl_cutoff),
            tf.summary.histogram(
                'kl_penalty_combined', kl_penalty + kl_cutoff),
            tf.summary.histogram('policy_loss', policy_loss),
            tf.summary.scalar(
                'avg_surr_loss', tf.reduce_mean(surrogate_loss)),
            tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
            tf.summary.scalar(
                'avg_policy_loss', tf.reduce_mean(policy_loss)),
        ])
        mean_policy_loss = tf.reduce_mean(policy_loss, 0)
        return tf.check_numerics(mean_policy_loss, 'policy_loss'), summary
def define_ppo_epoch(memory, hparams, action_space, batch_size,
                     distributional_size=1, distributional_subscale=0.04,
                     distributional_threshold=0.0, epoch=-1):
    """Build the graph for one PPO optimization epoch.

    Args:
        memory: 6-tuple ``(observation, reward, done, action, old_pdf,
            value_sm)`` of collected rollout tensors.
        hparams: hyper-parameters; the GAE, epoch-length and optimization
            batch settings are read from it.
        action_space: forwarded unchanged to ``define_ppo_step``.
        batch_size: multiplies the number of optimization batches when
            ``hparams.effective_num_agents`` is set.
        distributional_size: when greater than 1, the value head is treated
            as distributional and discounted rewards are bucketized.
        distributional_subscale: width of one value bucket.
        distributional_threshold: forwarded to ``_distributional_to_value``.
        epoch: forwarded unchanged to ``define_ppo_step``.

    Returns:
        The merged losses summary, wrapped in ``tf.Print`` ops that print
        each summarized value.
    """
    observation, reward, done, action, old_pdf, value_sm = memory

    # This is to avoid propagating gradients through simulated environment.
    observation = tf.stop_gradient(observation)
    action = tf.stop_gradient(action)
    reward = tf.stop_gradient(reward)
    if hasattr(hparams, "rewards_preprocessing_fun"):
        reward = hparams.rewards_preprocessing_fun(reward)
    done = tf.stop_gradient(done)
    value_sm = tf.stop_gradient(value_sm)
    old_pdf = tf.stop_gradient(old_pdf)

    is_distributional = distributional_size > 1
    if is_distributional:
        value = _distributional_to_value(
            value_sm, distributional_size, distributional_subscale,
            distributional_threshold)
    else:
        value = value_sm

    advantage = calculate_generalized_advantage_estimator(
        reward, value, done, hparams.gae_gamma, hparams.gae_lambda)

    if is_distributional:
        # Create discounted reward values range.
        half = distributional_size // 2
        bucket_mids = tf.to_float(tf.range(-half, half)) + 0.5  # Mid-bucket.
        bucket_mids = bucket_mids * distributional_subscale
        # Acquire new discounted rewards by using the above range as
        # end-values.
        end_values = tf.expand_dims(bucket_mids, 0)
        discounted_reward = discounted_rewards(
            reward, done, hparams.gae_gamma, end_values)
        # Re-normalize the discounted rewards to the [0, dist_size] range.
        discounted_reward = discounted_reward / distributional_subscale
        discounted_reward = discounted_reward + half
        discounted_reward = tf.maximum(discounted_reward, 0.0)
        discounted_reward = tf.minimum(discounted_reward, distributional_size)
        # Multiply the rewards by 2 for greater fidelity and round to
        # integers.
        discounted_reward = tf.stop_gradient(tf.round(2 * discounted_reward))
        # The probabilities corresponding to the end values from old
        # predictions.
        discounted_reward_prob = tf.nn.softmax(
            tf.stop_gradient(value_sm[-1]), axis=-1)
    else:
        discounted_reward = tf.stop_gradient(advantage + value[:-1])
        discounted_reward_prob = discounted_reward  # Unused in this case.

    advantage_mean, advantage_variance = tf.nn.moments(
        advantage, axes=[0, 1], keep_dims=True)
    advantage_normalized = tf.stop_gradient(
        (advantage - advantage_mean) / (tf.sqrt(advantage_variance) + 1e-8))

    batch_len = hparams.optimization_batch_size
    number_of_batches = (
        (hparams.epoch_length - 1) * hparams.optimization_epochs // batch_len)
    epoch_length = hparams.epoch_length
    if hparams.effective_num_agents is not None:
        number_of_batches = number_of_batches * batch_size
        number_of_batches = number_of_batches // hparams.effective_num_agents
        epoch_length = epoch_length // hparams.effective_num_agents

    assert number_of_batches > 0, "Set the paremeters so that number_of_batches>0"
    lr = learning_rate.learning_rate_schedule(hparams)

    # One independent shuffle of the step indices per optimization epoch,
    # concatenated, truncated to a whole number of batches and reshaped to
    # one row per batch.
    per_epoch_indices = [
        tf.random.shuffle(tf.range(epoch_length - 1))
        for _ in range(hparams.optimization_epochs)
    ]
    flat_indices = tf.concat(per_epoch_indices, axis=0)
    flat_indices = flat_indices[:number_of_batches * batch_len]
    indices_of_batches = tf.reshape(flat_indices, shape=(-1, batch_len))
    input_tensors = [observation, action, discounted_reward,
                     discounted_reward_prob, advantage_normalized, old_pdf]

    def _accumulate_step(running_totals, batch_index):
        """Run one PPO step and add its losses to the running totals."""
        batch_inputs = [
            tf.gather(tensor, indices_of_batches[batch_index, :])
            for tensor in input_tensors
        ]
        step_losses = define_ppo_step(
            batch_inputs,
            hparams, action_space, lr,
            epoch=epoch,
            distributional_size=distributional_size,
            distributional_subscale=distributional_subscale)
        return [
            total + step for total, step in zip(running_totals, step_losses)
        ]

    ppo_step_rets = tf.scan(
        _accumulate_step,
        tf.range(number_of_batches),
        [0., 0., 0.],
        parallel_iterations=1)

    ppo_summaries = [
        tf.reduce_mean(ret) / number_of_batches for ret in ppo_step_rets
    ]
    ppo_summaries.append(lr)
    summaries_names = [
        "policy_loss", "value_loss", "entropy_loss", "learning_rate"
    ]

    named_summaries = list(zip(summaries_names, ppo_summaries))
    losses_summary = tf.summary.merge(
        [tf.summary.scalar(name, value) for name, value in named_summaries])
    # Chain one tf.Print per value so they are printed whenever the summary
    # is evaluated.
    for name, value in named_summaries:
        losses_summary = tf.Print(losses_summary, [value], name + ": ")
    return losses_summary
def mix_data(example):
    """Sample one example from the task datasets according to the schedule.

    The incoming ``example`` is discarded. ``hparams``, ``self``,
    ``problem_step``, ``dataset_iterators`` and the ``get_*`` helpers come
    from the enclosing scope.

    Returns:
        A single-element ``tf.data.Dataset`` holding the sampled example.

    Raises:
        ValueError: if ``hparams.multiproblem_mixing_schedule`` is unknown.
    """
    del example
    # This block computes the probability of mixing the primary task with
    # the secondary tasks. 0 = only the primary task, 1 = only the secondary
    # tasks.
    schedule = hparams.multiproblem_mixing_schedule
    if schedule == MixingSchedule.EXPONENTIAL:
        base_prob = get_exp_sched_prob()
        # Print the probability whenever problem_step is a multiple of 5e6.
        at_print_step = tf.equal(
            tf.floormod(problem_step, tf.cast(5e6, dtype=tf.int64)), 0)
        prob = tf.cond(
            at_print_step,
            lambda: tf.Print(base_prob, [base_prob], message="Probability"),
            lambda: base_prob)
    elif schedule == MixingSchedule.CONSTANT:
        prob = get_const_sched_prob()
    elif schedule == MixingSchedule.PRETRAIN:
        prob = get_pretrain_sched_prob()
    else:
        raise ValueError("Unknown schedule %s" % str(schedule))
    tf.logging.info(
        "Using the %s schedule to train the MultiProblem." % str(schedule))
    tf.logging.info("Schedule mixing threshold %.2f" %
                    hparams.multiproblem_schedule_threshold)

    # If per-task thresholds are specified, use them.
    thresholds = None
    if hparams.multiproblem_per_task_threshold:
        # Comma-separated weights -> floats.
        weights = [
            float(t)
            for t in hparams.multiproblem_per_task_threshold.split(",")
        ]
        total_weight = sum(weights)
        tf.logging.info("Per-task thresholds: %s." % str(weights))
        # Normalize, then turn into cumulative sums.
        weights = [w / total_weight for w in weights]
        thresholds = [sum(weights[:i + 1]) for i in range(len(weights))]
        tf.logging.info("Per-task threshold sums: %s." % str(thresholds))
        if len(thresholds) != len(self.task_list):
            # A count mismatch disables the per-task thresholds.
            tf.logging.warn(
                "Specified %d thresholds but encountered %d tasks." %
                (len(thresholds), len(self.task_list)))
            thresholds = None

    def sample_task(curr_task, num_tasks_left, randnum):
        """A recursive function to sample a task.

        This function treats the probability as the threshold for the
        primary task and divides the remaining probability mass across the
        other tasks.

        Args:
          curr_task: The index of the task being considered for sampling.
          num_tasks_left: Number of tasks remaining to possibly sample from.
          randnum: The random number used to select the dataset.

        Returns:
          A Tensor representing an example from the task that was sampled
          from.
        """
        def take_current():
            return get_next_from_dataset(dataset_iterators[curr_task])

        def try_next():
            return sample_task(curr_task + 1, num_tasks_left - 1, randnum)

        if num_tasks_left == 0:
            return take_current()

        if thresholds is not None:
            # Use per-task thresholds if specified.
            pick_current = randnum < thresholds[curr_task]
        else:
            # When curr_task is 0, the primary task, the new prob is the
            # same as the original probability. `tf.greater` indicates that
            # the primary task receives (1-prob) of the probability mass.
            # Otherwise, `prob` is divided equally amongst all the
            # secondary tasks.
            new_prob = prob - (curr_task * prob / (len(self.task_list) - 1))
            pick_current = tf.greater(randnum, new_prob)
        return tf.cond(pick_current, take_current, try_next)

    return tf.data.Dataset.from_tensors(
        sample_task(0, len(self.task_list) - 1, tf.random_uniform([])))
def parse_example_proto(example_serialized, has_3d=False):
    """Parse one serialized Example proto into image, label and metadata.

    Features read from the proto:
        'image/encoded'    : string, decoded with ``decode_jpeg``
        'image/height'     : int64, shape [1]
        'image/width'      : int64, shape [1]
        'image/filename'   : string
        'image/center'     : int64, shape (2, 1)
        'image/visibility' : int64, shape (1, 14)
        'image/x'          : float32, shape (1, 14)
        'image/y'          : float32, shape (1, 14)
        'image/face_pts'   : float32, shape (1, 15), defaults to zeros;
                             reshaped to [3, 5]

    If has_3d is on, it also reads:
        'mosh/pose'   : float32, shape (72,)
        'mosh/shape'  : float32, shape (10,)
        'mosh/gt3d'   : float32, shape (14 * 3,), reshaped to [14, 3]
        'meta/has_3d' : int64, defaults to 0, cast to bool

    Args:
        example_serialized: serialized Example proto.
        has_3d: whether to parse and return the 3D annotations as well.

    Returns:
        ``(image, image_size, label, center, fname)`` and, when ``has_3d``
        is set, additionally ``(pose, shape, gt3d, has_smpl3d)`` appended to
        the same tuple.
    """
    feature_map = {
        'image/encoded':
            tf.FixedLenFeature([], dtype=tf.string, default_value=''),
        'image/height':
            tf.FixedLenFeature([1], dtype=tf.int64, default_value=-1),
        'image/width':
            tf.FixedLenFeature([1], dtype=tf.int64, default_value=-1),
        'image/filename':
            tf.FixedLenFeature([], dtype=tf.string, default_value=''),
        'image/center':
            tf.FixedLenFeature((2, 1), dtype=tf.int64),
        'image/visibility':
            tf.FixedLenFeature((1, 14), dtype=tf.int64),
        'image/x':
            tf.FixedLenFeature((1, 14), dtype=tf.float32),
        'image/y':
            tf.FixedLenFeature((1, 14), dtype=tf.float32),
        'image/face_pts':
            tf.FixedLenFeature(
                (1, 15), dtype=tf.float32, default_value=[0.] * 15),
    }
    if has_3d:
        feature_map['mosh/pose'] = tf.FixedLenFeature(
            (72, ), dtype=tf.float32)
        feature_map['mosh/shape'] = tf.FixedLenFeature(
            (10, ), dtype=tf.float32)
        feature_map['mosh/gt3d'] = tf.FixedLenFeature(
            (14 * 3, ), dtype=tf.float32)
        # has_3d is for pose and shape: 0 for mpi_inf_3dhp, 1 for h3.6m.
        feature_map['meta/has_3d'] = tf.FixedLenFeature(
            (1), dtype=tf.int64, default_value=[0])

    features = tf.parse_single_example(example_serialized, feature_map)

    def as_float(key):
        return tf.cast(features[key], dtype=tf.float32)

    def as_int32(key):
        return tf.cast(features[key], dtype=tf.int32)

    height = as_int32('image/height')
    width = as_int32('image/width')
    center = as_int32('image/center')

    fname = tf.cast(features['image/filename'], dtype=tf.string)
    # Debug hook: prints the file name whenever fname is evaluated.
    fname = tf.Print(fname, [fname], message="image name: ")

    face_pts = tf.reshape(as_float('image/face_pts'), [3, 5])

    vis = as_float('image/visibility')
    x = as_float('image/x')
    y = as_float('image/y')

    # Stack x / y / visibility along axis 0, then append the face points
    # along axis 1.
    label = tf.concat([x, y, vis], 0)
    label = tf.concat([label, face_pts], 1)

    image = decode_jpeg(features['image/encoded'])
    image_size = tf.concat([height, width], 0)

    outputs = (image, image_size, label, center, fname)
    if not has_3d:
        return outputs

    pose = as_float('mosh/pose')
    shape = as_float('mosh/shape')
    gt3d = tf.reshape(as_float('mosh/gt3d'), [14, 3])
    has_smpl3d = tf.cast(features['meta/has_3d'], dtype=tf.bool)
    return outputs + (pose, shape, gt3d, has_smpl3d)
def knn_affinity(input_x,
                 n_nbrs,
                 scale=None,
                 scale_nbr=None,
                 local_scale=None,
                 verbose=False):
    """Calculates Gaussian affinity matrix.

    Calculates the symmetrized Gaussian affinity matrix with k1 nonzero
    affinities for each point, scaled by
    1) a provided scale,
    2) the median distance of the k2-th neighbor of each point in X, or
    3) a covariance matrix S where S_ii is the distance of the k2-th
    neighbor of each point i, and S_ij = 0 for all i != j
    Here, k1 = n_nbrs, k2 = scale_nbr

    Args:
      input_x: input dataset of size n
      n_nbrs: k1; a Python float is truncated to int, and a non-int32
        tf.Variable is cast to int32
      scale: provided scale
      scale_nbr: k2, used if scale not provided
      local_scale: if True, then we use the aforementioned option 3), else
        we use option 2)
      verbose: extra printouts

    Returns:
      n x n affinity matrix
    """
    # `np.float` was only an alias of the builtin `float` and no longer
    # exists in current NumPy releases, so test against `float` directly.
    if isinstance(n_nbrs, float):
        n_nbrs = int(n_nbrs)
    elif (isinstance(n_nbrs, tf.Variable) and
          n_nbrs.dtype.as_numpy_dtype != np.int32):
        n_nbrs = tf.cast(n_nbrs, np.int32)

    # get squared distance
    dist_x = squared_distance(input_x)
    # calculate the top k closest neighbors (top_k of the negated distances)
    nn = tf.nn.top_k(-dist_x, n_nbrs, sorted=True)

    vals = nn[0]
    # apply scale
    if scale is None:
        # if scale not provided, use local scale
        if scale_nbr is None:
            # With 0, the `scale_nbr - 1` index below becomes -1, i.e. the
            # last neighbor column.
            scale_nbr = 0
        else:
            assert scale_nbr > 0 and scale_nbr <= n_nbrs
        if local_scale:
            # One scale per point, repeated for each of its neighbors.
            scale = -nn[0][:, scale_nbr - 1]
            scale = tf.reshape(scale, [-1, 1])
            scale = tf.tile(scale, [1, n_nbrs])
            scale = tf.reshape(scale, [-1, 1])
            vals = tf.reshape(vals, [-1, 1])
            if verbose:
                vals = tf.Print(vals, [tf.shape(vals), tf.shape(scale)],
                                'vals, scale shape')
            vals = vals / (2 * scale)
            vals = tf.reshape(vals, [-1, n_nbrs])
        else:

            def get_median(scales, m):
                # The m-th largest entry of `scales`; top_k is pinned to
                # the CPU.
                with tf.device('/cpu:0'):
                    scales = tf.nn.top_k(scales, m)[0]
                scale = scales[m - 1]
                return scale, scales

            scales = -vals[:, scale_nbr - 1]
            const = tf.shape(input_x)[0] // 2
            scale, scales = get_median(scales, const)
            vals = vals / (2 * scale)
    else:
        # otherwise, use provided value for global scale
        vals = vals / (2 * scale**2)

    # get the affinity (vals are negated distances, so this is exp(-d/...))
    aff_vals = tf.exp(vals)
    # flatten this into a single vector of values to shove in a sparse matrix
    aff_vals = tf.reshape(aff_vals, [-1])
    # get the matrix of indices corresponding to each rank
    # with 1 in the first column and k in the kth column
    nn_ind = nn[1]
    # get the j index for the sparse matrix
    j_index = tf.reshape(nn_ind, [-1, 1])
    # the i index is just sequential to the j matrix
    i_index = tf.range(tf.shape(nn_ind)[0])
    i_index = tf.reshape(i_index, [-1, 1])
    i_index = tf.tile(i_index, [1, tf.shape(nn_ind)[1]])
    i_index = tf.reshape(i_index, [-1, 1])
    # concatenate the indices to build the sparse matrix
    indices = tf.concat((i_index, j_index), axis=1)
    # assemble the sparse weight matrix
    weight_mat = tf.SparseTensor(
        indices=tf.cast(indices, dtype='int64'),
        values=aff_vals,
        dense_shape=tf.cast(tf.shape(dist_x), dtype='int64'))
    # fix the ordering of the indices
    weight_mat = tf.sparse_reorder(weight_mat)
    # convert to dense tensor
    weight_mat = tf.sparse_tensor_to_dense(weight_mat)
    # symmetrize
    weight_mat = (weight_mat + tf.transpose(weight_mat)) / 2.0

    return weight_mat
def train(self, sess):
    """Main training function/loop.

    Shuffles ``self.train_examples`` at the start of every epoch and runs
    one training step per batch of ``self.batch_size`` consecutive
    examples. Every ``FLAGS.log_frequency`` batches the property and
    regularization losses are fetched as well, a progress line is logged
    and a non-full evaluation is run. After the last epoch the model is
    saved under ``self.ckpt_dir`` when that is not None.

    Args:
      sess: a tf session object
    """
    # For debugging/pushing limits of model: one tf.Print op per logical GPU
    # that reports its memory limit and peak usage in MB.
    gpu_mb = tf.constant(1024 * 1024, dtype=tf.int64)
    gpus = tf.config.experimental.list_logical_devices("GPU")
    memory_footprints = []
    for gpu in gpus:
        with tf.device(gpu.name):
            memory_footprint = tf.Print(
                tf.constant(0), [
                    contrib_memory_stats.BytesLimit() / gpu_mb,
                    contrib_memory_stats.MaxBytesInUse() / gpu_mb
                ],
                message=gpu.name)
        memory_footprints.append(memory_footprint)

    epochs = FLAGS.num_epochs
    prints = FLAGS.log_frequency

    training_start_time = time.time()
    epochs_start_time = time.time()

    # At least one batch per epoch, even with fewer examples than
    # batch_size.
    num_batches = max(len(self.train_examples) // self.batch_size, 1)
    tf.logging.info("Num batches per epoch: {}".format(num_batches))

    # Additional logging
    losses = np.zeros((epochs * num_batches))
    accuracies = np.zeros((epochs * num_batches))

    for epoch in range(epochs):
        random.shuffle(self.train_examples)
        for batch in range(num_batches):
            batch_no = epoch * num_batches + batch
            should_sample = (batch_no % prints == 0)

            train_ops_to_run = {
                "train_step": self.train_step,
                "loss": self.model.loss,
                "accuracy": self.model.accuracy,
                "accuracy_per_example": self.model.accuracy_per_ex,
                "output_relations": self.model.log_decoded_relations,
            }
            if should_sample:
                train_ops_to_run["props"] = self.model.property_loss
                train_ops_to_run["regularization"] = (
                    self.model.regularization)
                for i, memory_footprint in enumerate(memory_footprints):
                    train_ops_to_run["memory_footprint_{}".format(i)] = (
                        memory_footprint)

            # Batch `batch` covers examples
            # [batch * batch_size, (batch + 1) * batch_size). Starting the
            # slice at the bare batch index would make consecutive batches
            # overlap in all but one example and leave most of the shuffled
            # data unvisited.
            start = batch * self.batch_size
            batch_examples = self.train_examples[
                start:start + self.batch_size]
            feed_dict = self._compute_feed_dict(batch_examples)
            train_output = sess.run(train_ops_to_run, feed_dict)
            losses[batch_no] = train_output["loss"]
            accuracies[batch_no] = train_output["accuracy"]

            if should_sample:
                # Timing info: wall time since the previous logged batch.
                epochs_end_time = time.time()
                epochs_time_str = str(datetime.timedelta(
                    seconds=epochs_end_time - epochs_start_time))
                epochs_start_time = epochs_end_time
                precision, recall = self._evaluate_sample(
                    sess, train_output, feed_dict, batch_examples,
                    full_log=True)
                if precision and recall:
                    pr_string = "\tPrecision: {:.3f}\tRecall {:.3f}".format(
                        np.mean(precision), np.mean(recall))
                else:
                    pr_string = ""
                tf.logging.info(
                    ("[{}] Epoch: {}.{}\tLoss: {:.3f}|{:.3f}|{:.3f}\t" +
                     "Accuracy: {:.3f}{}\n").format(
                         epochs_time_str,
                         epoch, batch,
                         train_output["loss"],
                         train_output["props"],
                         train_output["regularization"],
                         train_output["accuracy"],
                         pr_string))
                # Do a dev run, it doesn't take that long
                self.evaluate(sess, full=False)

    training_end_time = time.time()
    tf.logging.info("Training took: %s" % str(datetime.timedelta(
        seconds=training_end_time - training_start_time)))
    if self.ckpt_dir is not None:
        save_path = self.saver.save(
            sess, os.path.join(self.ckpt_dir, "model.ckpt"))
        tf.logging.info("Saved model at {}".format(save_path))
def _body(i, posterior, center, wx, activation_biases, sigma_biases,
          input_activation, tile_filter):
    """Body of EM while loop.

    Runs one routing iteration: derives vote confidences from the current
    posterior, updates the centers and variances, computes the activation
    logit and re-normalizes the posterior over overlapping patches.

    ``final_beta``, ``layer_name``, ``h``, ``ih``, ``k``, ``s``, ``b``,
    ``c``, ``input_dim``, ``min_var`` and ``num_out_atoms`` are read from
    the enclosing scope.

    Args:
      i: iteration counter; it also drives the beta schedule.
      posterior: current routing posterior.
      center: centers from the previous iteration (blended in with weight
        0.1).
      wx: votes.
      activation_biases: bias term subtracted inside the logit.
      sigma_biases: bias whose square enters the variance prior.
      input_activation: activations multiplied into the posterior.
      tile_filter: filter for the transposed convolution that sums priors
        over patches.

    Returns:
      Tuple ``(i + 1, posterior, logit, center, masses)``.
    """
    tf.logging.info('  Wx: %s', wx)
    # beta approaches final_beta as i grows.
    beta = final_beta * (1 - tf.pow(0.95, tf.cast(i + 1, tf.float32)))

    # Debug hook: prints the posterior range on every iteration.
    posterior = tf.Print(posterior, [
        layer_name, i, h, ih,
        tf.reduce_min(posterior),
        tf.reduce_max(posterior)
    ], message='posterior')
    # route: [outdim, height?, width?, batch, indim]
    with tf.name_scope('vote_conf'):
        vote_conf = posterior * input_activation
        vote_conf = tf.maximum(vote_conf, 0.0)

    # masses: [batch, 1, outdim, 1, height, width, 1, 1]
    with tf.name_scope('masses'):
        # The small constant keeps the divisions by masses below finite.
        masses = tf.reduce_sum(
            vote_conf, axis=[1, -1, -2], keepdims=True,
            name='masses_calculation') + 0.0000001
    with tf.name_scope('preactivate_unrolled'):
        preactivate_unrolled = vote_conf * wx

    # center: [batch, 1, outdim, outatom, height, width]
    with tf.name_scope('center'):
        # 0.9 / 0.1 blend of the new weighted mean and the previous center.
        center = .9 * tf.reduce_sum(
            preactivate_unrolled, axis=[1, -1, -2],
            keepdims=True) / masses + .1 * center

    # Rematerialization to save GPU memory. (+22ms/-1.6GB)
    # @tf.contrib.layers.recompute_grad
    def compute_noise_and_variance(wx, center, vote_conf, masses):
        noise = tf.squared_difference(wx, center)
        variance = min_var + tf.reduce_sum(
            vote_conf * noise, axis=[1, -1, -2], keepdims=True,
            name='variance_calculation') / masses
        return noise, variance

    with tf.name_scope('compute_noise_and_variance'):
        noise, variance = compute_noise_and_variance(
            wx, center, vote_conf, masses)

    with tf.name_scope('win'):
        log_variance = tf.log(variance)
        p_i = -1 * tf.reduce_sum(log_variance, axis=3, keepdims=True)
        log_2pi = tf.log(2 * math.pi)
        sigma_b = tf.log(sigma_biases * sigma_biases + min_var)
        win = masses * (p_i - num_out_atoms * (sigma_b + log_2pi + 1.0))
    with tf.name_scope('logit'):
        logit = beta * (win - activation_biases * 50 * num_out_atoms)
    with tf.name_scope('activation_update'):
        # min(0, x) - log(1 + exp(-|x|)) is a numerically stable
        # log(sigmoid(x)).
        activation_update = tf.minimum(
            0.0, logit) - tf.log(1 + tf.exp(-tf.abs(logit)))
    with tf.name_scope('sigma_update'):
        log_det_sigma = -1 * p_i
        sigma_update = (num_out_atoms * log_2pi + log_det_sigma) / 2.0
    with tf.name_scope('exp_update'):
        # `keepdims` (not the deprecated `keep_dims`) to match every other
        # reduction in this function.
        exp_update = tf.reduce_sum(
            noise / (2 * variance), axis=3, keepdims=True)
    prior_update = tf.subtract(
        activation_update - sigma_update, exp_update,
        name='prior_update_sub')
    # Subtract the maximum before exponentiating so exp() cannot overflow.
    max_prior_update = tf.reduce_max(
        prior_update, axis=[2, 3, 4, 5, 6, 7], keepdims=True,
        name='max_prior_opdate')
    prior_normal = tf.add(prior_update, -1 * max_prior_update)
    prior_exp = tf.exp(prior_normal)
    prior_exp_out = tf.reduce_sum(
        prior_exp, axis=2, keepdims=True, name='prior_exp_out')
    prior_exp_reshape = tf.reshape(
        prior_exp_out, [-1, h, h, k * k], name='prior_exp_reshape')

    # Sum the priors over patches, then tile the sums back into patch
    # layout to serve as the normalizer.
    sum_prior = tf.nn.conv2d_transpose(
        prior_exp_reshape,
        tile_filter,
        output_shape=[b * c, ih, ih, 1],
        strides=[1, s, s, 1],
        padding='VALID')
    sum_prior = tf.maximum(1e-6, sum_prior)

    sum_prior_patch = utils.kernel_tile(
        sum_prior, k, s, 1, name='sum_prior_patch')

    with utils.maybe_jit_scope(), tf.name_scope('posterior'):
        sum_prior_reshape = tf.reshape(
            sum_prior_patch, [-1, input_dim, 1, 1, h, h, k, k])
        posterior = prior_exp / sum_prior_reshape
    return (i + 1, posterior, logit, center, masses)
def call(self, x):
    """Compute the detection loss for one output scale and log statistics.

    Args:
      x: sequence unpacked as ``(input_image, y_pred, y_true, true_boxes)``.
        ``y_pred`` is reshaped here to
        ``[batch, grid_h, grid_w, 3, 4+1+nb_class]``.

    Returns:
      The sum of the xy, wh, confidence and class losses (each reduced over
      axes 1-4) multiplied by ``self.grid_scale``. The tensor is threaded
      through a chain of ``tf.Print`` ops, so the running statistics are
      printed whenever it is evaluated.
    """
    input_image, y_pred, y_true, true_boxes = x

    def _centre_size_iou(p_xy, p_wh, t_xy, t_wh):
        # IoU between boxes given as centre/size pairs; the operands only
        # need to broadcast against each other.
        t_half = t_wh / 2.
        t_lo = t_xy - t_half
        t_hi = t_xy + t_half

        p_half = p_wh / 2.
        p_lo = p_xy - p_half
        p_hi = p_xy + p_half

        lo = tf.maximum(p_lo, t_lo)
        hi = tf.minimum(p_hi, t_hi)
        overlap_wh = tf.maximum(hi - lo, 0.)
        overlap = overlap_wh[..., 0] * overlap_wh[..., 1]

        t_area = t_wh[..., 0] * t_wh[..., 1]
        p_area = p_wh[..., 0] * p_wh[..., 1]
        return tf.truediv(overlap, p_area + t_area - overlap)

    def _ratio(values, denom):
        # Sum of `values` over all elements, divided by a smoothed count.
        return tf.reduce_sum(input_tensor=values) / (denom + 1e-3)

    # View the prediction as [batch, grid_h, grid_w, 3, 4+1+nb_class].
    head_shape = tf.concat(
        [tf.shape(input=y_pred)[:3], tf.constant([3, -1])], axis=0)
    y_pred = tf.reshape(y_pred, head_shape)

    # Objectness of the ground truth, with a trailing axis added.
    object_mask = tf.expand_dims(y_true[..., 4], 4)

    # Counter of the batches processed so far (drives the warm-up phase).
    batch_seen = tf.Variable(0.)

    # Grid factor and network-input factor, both ordered (w, h).
    true_shape = tf.shape(input=y_true)
    grid_h = true_shape[1]
    grid_w = tf.shape(input=y_true)[2]
    grid_factor = tf.reshape(
        tf.cast([grid_w, grid_h], tf.float32), [1, 1, 1, 1, 2])

    net_h = tf.shape(input=input_image)[1]
    net_w = tf.shape(input=input_image)[2]
    net_factor = tf.reshape(
        tf.cast([net_w, net_h], tf.float32), [1, 1, 1, 1, 2])

    # ---- Adjust prediction ----
    pred_box_xy = (self.cell_grid[:, :grid_h, :grid_w, :, :] +
                   tf.sigmoid(y_pred[..., :2]))  # sigma(t_xy) + c_xy
    pred_box_wh = y_pred[..., 2:4]  # t_wh
    pred_box_conf = tf.expand_dims(tf.sigmoid(y_pred[..., 4]), 4)
    pred_box_class = y_pred[..., 5:]

    # ---- Adjust ground truth ----
    true_box_xy = y_true[..., 0:2]  # (sigma(t_xy) + c_xy)
    true_box_wh = y_true[..., 2:4]  # t_wh
    true_box_conf = tf.expand_dims(y_true[..., 4], 4)
    true_box_class = tf.argmax(input=y_true[..., 5:], axis=-1)

    # ---- Compare each predicted box to all true boxes ----
    # Start by pulling the objectness of every box towards 0 ...
    conf_delta = pred_box_conf - 0

    # ... then drop the boxes whose best IoU with a true box reaches the
    # ignore threshold.
    all_true_xy = true_boxes[..., 0:2] / grid_factor
    all_true_wh = true_boxes[..., 2:4] / net_factor
    wide_pred_xy = tf.expand_dims(pred_box_xy / grid_factor, 4)
    wide_pred_wh = tf.expand_dims(
        tf.exp(pred_box_wh) * self.anchors / net_factor, 4)
    pairwise_iou = _centre_size_iou(
        wide_pred_xy, wide_pred_wh, all_true_xy, all_true_wh)

    best_ious = tf.reduce_max(input_tensor=pairwise_iou, axis=4)
    keep = tf.cast(best_ious < self.ignore_thresh, dtype=tf.float32)
    conf_delta *= tf.expand_dims(keep, 4)

    # ---- Compute some online statistics ----
    cell_true_xy = true_box_xy / grid_factor
    cell_true_wh = tf.exp(true_box_wh) * self.anchors / net_factor
    cell_pred_xy = pred_box_xy / grid_factor
    cell_pred_wh = tf.exp(pred_box_wh) * self.anchors / net_factor
    iou_scores = _centre_size_iou(
        cell_pred_xy, cell_pred_wh, cell_true_xy, cell_true_wh)
    iou_scores = object_mask * tf.expand_dims(iou_scores, 4)

    count = tf.reduce_sum(input_tensor=object_mask)
    count_noobj = tf.reduce_sum(input_tensor=1 - object_mask)
    detect_mask = tf.cast((pred_box_conf * object_mask) >= 0.5,
                          dtype=tf.float32)
    predicted_class = tf.argmax(input=pred_box_class, axis=-1)
    class_mask = tf.expand_dims(
        tf.cast(tf.equal(predicted_class, true_box_class),
                dtype=tf.float32), 4)

    recall50 = _ratio(
        tf.cast(iou_scores >= 0.5, dtype=tf.float32) * detect_mask *
        class_mask, count)
    recall75 = _ratio(
        tf.cast(iou_scores >= 0.75, dtype=tf.float32) * detect_mask *
        class_mask, count)
    avg_iou = _ratio(iou_scores, count)
    avg_obj = _ratio(pred_box_conf * object_mask, count)
    avg_noobj = _ratio(pred_box_conf * (1 - object_mask), count_noobj)
    avg_cat = _ratio(object_mask * class_mask, count)

    # ---- Warm-up training ----
    batch_seen = tf.assign_add(batch_seen, 1.)

    def _warmup_targets():
        # During warm-up the xy targets of object-free cells are shifted by
        # the cell grid plus 0.5, and every cell contributes to xy/wh.
        return [
            true_box_xy +
            (0.5 + self.cell_grid[:, :grid_h, :grid_w, :, :]) *
            (1 - object_mask),
            true_box_wh + tf.zeros_like(true_box_wh) * (1 - object_mask),
            tf.ones_like(object_mask)
        ]

    def _regular_targets():
        return [true_box_xy, true_box_wh, object_mask]

    target_xy, target_wh, xywh_mask = tf.cond(
        pred=tf.less(batch_seen, self.warmup_batches + 1),
        true_fn=_warmup_targets,
        false_fn=_regular_targets)

    # ---- Compare each true box to all anchor boxes ----
    wh_scale = tf.exp(target_wh) * self.anchors / net_factor
    # The smaller the box, the bigger the scale.
    wh_scale = tf.expand_dims(
        2 - wh_scale[..., 0] * wh_scale[..., 1], axis=4)

    xy_delta = (xywh_mask * (pred_box_xy - target_xy) * wh_scale *
                self.xywh_scale)
    wh_delta = (xywh_mask * (pred_box_wh - target_wh) * wh_scale *
                self.xywh_scale)
    conf_delta = (
        object_mask * (pred_box_conf - true_box_conf) * self.obj_scale +
        (1 - object_mask) * conf_delta * self.noobj_scale)
    class_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=true_box_class, logits=pred_box_class)
    class_delta = (object_mask * tf.expand_dims(class_xent, 4) *
                   self.class_scale)

    non_batch_axes = list(range(1, 5))
    loss_xy = tf.reduce_sum(input_tensor=tf.square(xy_delta),
                            axis=non_batch_axes)
    loss_wh = tf.reduce_sum(input_tensor=tf.square(wh_delta),
                            axis=non_batch_axes)
    loss_conf = tf.reduce_sum(input_tensor=tf.square(conf_delta),
                              axis=non_batch_axes)
    loss_class = tf.reduce_sum(input_tensor=class_delta,
                               axis=non_batch_axes)

    loss = loss_xy + loss_wh + loss_conf + loss_class

    # Each report is attached as an identity-with-side-effect on the loss.
    reports = [
        ([grid_h, avg_obj], 'avg_obj \t\t'),
        ([grid_h, avg_noobj], 'avg_noobj \t\t'),
        ([grid_h, avg_iou], 'avg_iou \t\t'),
        ([grid_h, avg_cat], 'avg_cat \t\t'),
        ([grid_h, recall50], 'recall50 \t'),
        ([grid_h, recall75], 'recall75 \t'),
        ([grid_h, count], 'count \t'),
        ([
            grid_h,
            tf.reduce_sum(input_tensor=loss_xy),
            tf.reduce_sum(input_tensor=loss_wh),
            tf.reduce_sum(input_tensor=loss_conf),
            tf.reduce_sum(input_tensor=loss_class)
        ], 'loss xy, wh, conf, class: \t'),
    ]
    for tensors, label in reports:
        loss = tf.Print(loss, tensors, message=label, summarize=1000)

    return loss * self.grid_scale
def main(unused_argv):
    """Build the evaluation graph and run the slim evaluation loop.

    All keep-probability flags are forced to 1.0, predictions are computed
    single-scale when ``FLAGS.eval_scales == (1.0,)`` and multi-scale
    otherwise, and per-output streaming metrics plus image summaries are
    registered before ``contrib_slim.evaluation.evaluation_loop`` is started.

    Args:
      unused_argv: ignored.
    """
    FLAGS.comb_dropout_keep_prob = 1.0
    FLAGS.image_keep_prob = 1.0
    FLAGS.elements_keep_prob = 1.0

    # Get dataset-dependent information.
    tf.gfile.MakeDirs(FLAGS.eval_logdir)
    tf.logging.info('Evaluating on %s set', FLAGS.split)

    with tf.Graph().as_default():
        samples = model_input.get_input_fn(FLAGS)()

        # Get model segmentation predictions.
        descriptor = model_input.dataset_descriptors[FLAGS.dataset]
        num_classes = descriptor.num_classes
        output_to_num_classes = model.get_output_to_num_classes(FLAGS)

        # Keyword arguments common to the single- and multi-scale paths.
        shared_kwargs = dict(
            outputs_to_num_classes=output_to_num_classes,
            merge_method=FLAGS.merge_method,
            atrous_rates=FLAGS.atrous_rates,
            add_image_level_feature=FLAGS.add_image_level_feature,
            aspp_with_batch_norm=FLAGS.aspp_with_batch_norm,
            aspp_with_separable_conv=FLAGS.aspp_with_separable_conv,
            multi_grid=FLAGS.multi_grid,
            depth_multiplier=FLAGS.depth_multiplier,
            output_stride=FLAGS.output_stride,
            decoder_output_stride=FLAGS.decoder_output_stride,
            decoder_use_separable_conv=FLAGS.decoder_use_separable_conv,
            crop_size=[FLAGS.image_size, FLAGS.image_size],
            logits_kernel_size=FLAGS.logits_kernel_size,
            model_variant=FLAGS.model_variant)

        if tuple(FLAGS.eval_scales) == (1.0, ):
            tf.logging.info('Performing single-scale test.')
            predictions, probs = model.predict_labels(
                samples['image'],
                samples,
                FLAGS,
                image_pyramid=FLAGS.image_pyramid,
                **shared_kwargs)
        else:
            tf.logging.info('Performing multi-scale test.')
            predictions, probs = model.predict_labels_multi_scale(
                samples['image'],
                samples,
                FLAGS,
                eval_scales=FLAGS.eval_scales,
                add_flipped_images=FLAGS.add_flipped_images,
                **shared_kwargs)

        def label_summary(labels, weights, name):
            # Scale class ids to 0..255, blank out zero-weight pixels and
            # emit up to 8 images per summary.
            tf.summary.image(
                name,
                tf.reshape(
                    tf.cast(
                        tf.to_float(labels * 255) / tf.to_float(num_classes),
                        tf.uint8) * tf.cast(weights, tf.uint8),
                    [-1, FLAGS.image_size, FLAGS.image_size, 1]), 8)

        metric_map = {}
        for output in output_to_num_classes:
            output_predictions = predictions[output]
            output_probs = probs[output]
            if output == 'segment':
                output_predictions = tf.expand_dims(output_predictions, 3)
                if num_classes == 2:
                    labels = samples['label']
                    iou, weights = model.foreground_iou(
                        labels, output_predictions, FLAGS)
                    soft_iou, _ = model.foreground_iou(
                        labels, output_probs[:, :, :, 1:2], FLAGS)
                    metric_map['mIOU'] = tf.metrics.mean(iou)
                    metric_map['soft_mIOU'] = tf.metrics.mean(soft_iou)
                    high_prob_overlaps = calc_high_prob_overlaps(
                        labels, output_probs, weights)
                    metric_map['highestOverlaps'] = tf.metrics.mean(
                        high_prob_overlaps)
                    output_probs *= weights
                else:
                    output_predictions = tf.reshape(output_predictions,
                                                    shape=[-1])
                    labels = tf.reshape(samples['label'], shape=[-1])
                    weights = tf.to_float(
                        tf.not_equal(labels, descriptor.ignore_label))

                    # Set ignore_label regions to label 0, because
                    # metrics.mean_iou requires range of
                    # labels=[0, dataset.num_classes). Note the ignore_label
                    # regions are not evaluated since the corresponding
                    # regions contain weights=0.
                    labels = tf.where(
                        tf.equal(labels, descriptor.ignore_label),
                        tf.zeros_like(labels), labels)

                    predictions_tag = 'mIOU'
                    for eval_scale in FLAGS.eval_scales:
                        predictions_tag += '_' + str(eval_scale)
                    if FLAGS.add_flipped_images:
                        predictions_tag += '_flipped'

                    # Define the evaluation metric.
                    metric_map[predictions_tag] = (
                        contrib_slim.metrics.mean_iou(output_predictions,
                                                      labels,
                                                      num_classes,
                                                      weights=weights))

                label_summary(labels, weights, 'label')
                label_summary(output_predictions, weights,
                              'output_predictions')
                tf.summary.image(
                    'logits', tf.expand_dims(output_probs[:, :, :, 1], 3))
            elif output == 'regression':
                labels = samples['label']
                ignore_mask = model.get_ignore_mask(labels, FLAGS)
                accurate = calc_accuracy_in_box(labels, output_probs,
                                                ignore_mask)
                metric_map['inBoxAccuracy'] = tf.metrics.mean(accurate)

        tf.summary.image('image', samples['image'], 8)

        metrics_to_values, metrics_to_updates = (
            contrib_slim.metrics.aggregate_metric_map(metric_map))

        # dict.items() works on Python 2 and 3; dict.iteritems() exists only
        # on Python 2.
        for metric_name, metric_value in metrics_to_values.items():
            metric_value = tf.Print(metric_value, [metric_value],
                                    metric_name)
            tf.summary.scalar(metric_name, metric_value)

        num_batches = int(
            math.ceil(FLAGS.num_samples / float(FLAGS.batch_size)))

        tf.logging.info('Eval num images %d', FLAGS.num_samples)
        tf.logging.info('Eval batch size %d and num batch %d',
                        FLAGS.batch_size, num_batches)

        contrib_slim.evaluation.evaluation_loop(
            master='',
            checkpoint_dir=FLAGS.checkpoint_dir,
            logdir=FLAGS.eval_logdir,
            num_evals=num_batches,
            # Materialise the update ops as a list rather than passing a
            # dict view.
            eval_op=list(metrics_to_updates.values()),
            summary_op=tf.summary.merge_all(),
            max_number_of_evaluations=None,
            eval_interval_secs=FLAGS.eval_interval_secs)
def main(unused_argv):
    """Evaluate the segmentation (and optional classification) branches.

    Builds a non-shuffled, non-repeating evaluation dataset, predicts labels
    single-scale (multi-scale raises ``NotImplementedError``), registers
    overall and per-class IoU metrics, adds multi-label classification
    metrics when ``FLAGS.weakly`` is set, and runs
    ``contrib_training.evaluate_repeatedly``.

    Args:
      unused_argv: ignored.

    Raises:
      NotImplementedError: if ``FLAGS.eval_scales`` is anything but (1.0,).
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Parse the crop size once; it is needed for the dataset, the model
    # options and the static image shape.
    crop_size = [int(sz) for sz in FLAGS.eval_crop_size]

    dataset = data_generator.Dataset(
        dataset_name=FLAGS.dataset,
        split_name=FLAGS.eval_split,
        dataset_dir=FLAGS.dataset_dir,
        batch_size=FLAGS.eval_batch_size,
        crop_size=list(crop_size),
        min_resize_value=FLAGS.min_resize_value,
        max_resize_value=FLAGS.max_resize_value,
        resize_factor=FLAGS.resize_factor,
        model_variant=FLAGS.model_variant,
        num_readers=2,
        is_training=False,
        should_shuffle=False,
        should_repeat=False,
        with_cls=True,
        cls_only=False,
        output_valid=True)

    tf.gfile.MakeDirs(FLAGS.eval_logdir)
    tf.logging.info('Evaluating on %s set', FLAGS.eval_split)

    with tf.Graph().as_default():
        samples = dataset.get_one_shot_iterator().get_next()

        model_options = common.ModelOptions(
            outputs_to_num_classes={
                common.OUTPUT_TYPE: dataset.num_of_classes
            },
            crop_size=list(crop_size),
            atrous_rates=FLAGS.atrous_rates,
            output_stride=FLAGS.output_stride)

        # Set shape in order for tf.contrib.tfprof.model_analyzer to work
        # properly.
        samples[common.IMAGE].set_shape(
            [FLAGS.eval_batch_size, crop_size[0], crop_size[1], 3])

        if tuple(FLAGS.eval_scales) == (1.0, ):
            tf.logging.info('Performing single-scale test.')
            predictions = model.predict_labels(
                samples[common.IMAGE],
                model_options,
                image_pyramid=FLAGS.image_pyramid)
        else:
            tf.logging.info('Performing multi-scale test.')
            raise NotImplementedError('Multi-scale is not supported yet!')

        metric_map = {}

        ## Extract cls logits
        if FLAGS.weakly:
            _, end_points = feature_extractor.extract_features(
                samples[common.IMAGE],
                output_stride=model_options.output_stride,
                multi_grid=model_options.multi_grid,
                model_variant=model_options.model_variant,
                depth_multiplier=model_options.depth_multiplier,
                divisible_by=model_options.divisible_by,
                reuse=tf.AUTO_REUSE,
                is_training=False,
                preprocessed_images_dtype=model_options.
                preprocessed_images_dtype,
                global_pool=True,
                num_classes=dataset.num_of_classes - 1)
            # ResNet beta version has an additional suffix in
            # FLAGS.model_variant, but it shares the same variable names with
            # original version. Add a special handling here for beta version
            # ResNet.
            logits = end_points['{}/logits'.format(
                FLAGS.model_variant).replace('_beta', '')]
            logits = tf.reshape(logits, [-1, dataset.num_of_classes - 1])
            cls_pred = tf.sigmoid(logits)

            # Multi-label classification evaluation: threshold at 0.5.
            cls_label = samples['cls_label']
            cls_pred = tf.cast(tf.greater_equal(cls_pred, 0.5), tf.int32)

            ## For classification
            metric_map['eval/cls_overall'] = tf.metrics.accuracy(
                labels=cls_label, predictions=cls_pred)
            metric_map['eval/cls_precision'] = tf.metrics.precision(
                labels=cls_label, predictions=cls_pred)
            metric_map['eval/cls_recall'] = tf.metrics.recall(
                labels=cls_label, predictions=cls_pred)

        ## For segmentation branch eval
        predictions = predictions[common.OUTPUT_TYPE]
        predictions = tf.reshape(predictions, shape=[-1])
        labels = tf.reshape(samples[common.LABEL], shape=[-1])
        weights = tf.to_float(tf.not_equal(labels, dataset.ignore_label))

        # Set ignore_label regions to label 0, because metrics.mean_iou
        # requires range of labels = [0, dataset.num_classes). Note the
        # ignore_label regions are not evaluated since the corresponding
        # regions contain weights = 0.
        labels = tf.where(tf.equal(labels, dataset.ignore_label),
                          tf.zeros_like(labels), labels)

        predictions_tag = 'miou'
        # Define the evaluation metric.
        num_classes = dataset.num_of_classes

        ## For segmentation
        metric_map['eval/%s_overall' %
                   predictions_tag] = tf.metrics.mean_iou(
                       labels=labels,
                       predictions=predictions,
                       num_classes=num_classes,
                       weights=weights)

        # IoU for each class.
        one_hot_predictions = tf.one_hot(predictions, num_classes)
        one_hot_predictions = tf.reshape(one_hot_predictions,
                                         [-1, num_classes])
        one_hot_labels = tf.one_hot(labels, num_classes)
        one_hot_labels = tf.reshape(one_hot_labels, [-1, num_classes])
        for c in range(num_classes):
            predictions_tag_c = '%s_class_%d' % (predictions_tag, c)
            tp, tp_op = tf.metrics.true_positives(
                labels=one_hot_labels[:, c],
                predictions=one_hot_predictions[:, c],
                weights=weights)
            fp, fp_op = tf.metrics.false_positives(
                labels=one_hot_labels[:, c],
                predictions=one_hot_predictions[:, c],
                weights=weights)
            fn, fn_op = tf.metrics.false_negatives(
                labels=one_hot_labels[:, c],
                predictions=one_hot_predictions[:, c],
                weights=weights)
            tp_fp_fn_op = tf.group(tp_op, fp_op, fn_op)
            # A class with no ground-truth pixels reports NaN. np.nan is used
            # because the np.NaN alias was removed in NumPy 2.0.
            iou = tf.where(tf.greater(tp + fn, 0.0), tp / (tp + fn + fp),
                           tf.constant(np.nan))
            metric_map['eval/%s' % predictions_tag_c] = (iou, tp_fp_fn_op)

        (metrics_to_values,
         metrics_to_updates) = contrib_metrics.aggregate_metric_map(
             metric_map)

        summary_ops = []
        for metric_name, metric_value in six.iteritems(metrics_to_values):
            op = tf.summary.scalar(metric_name, metric_value)
            # Print the metric value whenever its summary is evaluated.
            op = tf.Print(op, [metric_value], metric_name)
            summary_ops.append(op)

        summary_op = tf.summary.merge(summary_ops)
        summary_hook = contrib_training.SummaryAtEndHook(
            log_dir=FLAGS.eval_logdir, summary_op=summary_op)
        hooks = [summary_hook]

        # None is passed through unless a positive limit is configured.
        num_eval_iters = None
        if FLAGS.max_number_of_evaluations > 0:
            num_eval_iters = FLAGS.max_number_of_evaluations

        if FLAGS.quantize_delay_step >= 0:
            contrib_quantize.create_eval_graph()

        contrib_tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            tfprof_options=contrib_tfprof.model_analyzer.
            TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
        contrib_tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            tfprof_options=contrib_tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
        contrib_training.evaluate_repeatedly(
            checkpoint_dir=FLAGS.checkpoint_dir,
            master=FLAGS.master,
            eval_ops=list(metrics_to_updates.values()),
            max_number_of_evaluations=num_eval_iters,
            hooks=hooks,
            eval_interval_secs=FLAGS.eval_interval_secs)
def my_model_fn(features, labels, mode, params=None, config=None):
    """Estimator model function.

    Builds a Mesh TensorFlow graph for the requested mode. Many names used
    here (use_tpu, mesh_shape, layout_rules, transformer_model, batch_size,
    sequence_length, ...) are not parameters; they come from an enclosing
    scope that is not visible in this block.

    Args:
      features: dictionary where keys are strings like "inputs" and "targets"
        and the values are the actual values of "inputs". See TPUEstimator's
        docs for more information
      labels: ignored argument
      mode: a tf.estimator.ModeKeys
      params: dictionary containing the key "context"
      config: ignored argument

    Returns:
      a TPUEstimatorSpec, except in TRAIN mode without TPU, where a
      tf.estimator.EstimatorSpec is returned.

    Raises:
      ValueError: if the model class is not recognized or variable_filter has
        an unsupported type.
    """
    del labels, config
    global_step = tf.train.get_global_step()
    # NOTE(review): with the default params=None the membership test below
    # raises TypeError when use_tpu is truthy - confirm callers always pass
    # params on TPU.
    if use_tpu and "context" in params:
        ctx = params["context"]
        num_hosts = ctx.num_hosts
        host_placement_fn = ctx.tpu_host_placement_function
        device_list = [
            host_placement_fn(host_id=t) for t in range(num_hosts)
        ]
        # TODO(ylc): Better estimation of replica cache size?
        replica_cache_size = 300 * 1000000  # 300M per replica
        # Worker 0 caches all the TPU binaries.
        worker0_mem = replica_cache_size * ctx.num_replicas
        devices_memeory_usage = [worker0_mem] + [0] * (num_hosts - 1)
        var_placer = mtf.utils.BalancedVariablePlacer(
            device_list, devices_memeory_usage)
        # deprecated
        mesh_devices = [""] * mesh_shape.size
        physical_shape = list(
            params["context"].device_assignment.topology.mesh_shape)
        logical_to_physical = mtf.simd_mesh_impl.auto_logical_to_physical_tpu(
            mesh_shape.to_integer_list, physical_shape)
        mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
            mesh_shape,
            layout_rules,
            mesh_devices,
            ctx.device_assignment,
            logical_to_physical=logical_to_physical)
    else:
        var_placer = None
        # deprecated
        mesh_devices = [""] * mesh_shape.size
        mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
            mesh_shape, layout_rules, mesh_devices)

    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "my_mesh", var_placer)

    # Import every feature as a fully replicated int32 mtf tensor of shape
    # [ensemble?] + [outer_batch, batch, length].
    mtf_features = {}
    for key, x in features.items():
        outer_batch_dim = mtf.Dimension("outer_batch", outer_batch_size)
        batch_dim = mtf.Dimension("batch", batch_size // outer_batch_size)
        # Some auxiliary features may have been generated in packing.
        # The names of these new features are of the form
        #   "<original_feature_name>_<suffix>", e.g. "inputs_segmentation".
        #   We look up the lengths based on the original feature name, without
        #   the "_<suffix>".
        feature_length = sequence_length[key.split("_")[0]]
        length_dim = mtf.Dimension("length", feature_length)
        ensemble_dims = ([mtf.Dimension("ensemble", ensemble_inputs)]
                         if ensemble_inputs else [])
        feature_shape = mtf.Shape(ensemble_dims +
                                  [outer_batch_dim, batch_dim, length_dim])
        x = tf.cast(features[key], tf.int32)
        x = tf.reshape(x, feature_shape.to_integer_list)
        if not use_tpu:
            # Off TPU, log the feature and print its values (first 10 calls).
            tf.logging.info("feature %s : %s" % (key, x))
            x = tf.Print(x, [x],
                         "import feature %s" % key,
                         summarize=1000,
                         first_n=10)
        mtf_features[key] = mtf.import_fully_replicated(mesh,
                                                        x,
                                                        feature_shape,
                                                        name=key)
        # NOTE(review): anon_targets is overwritten for each matching key, so
        # the value used by EVAL depends on the iteration order of `features`
        # - confirm which key is intended.
        if key == "targets" or key == "codeprefixedtargets" or key == "controlcode":
            anon_targets = mtf.anonymize(mtf_features[key])

    if mode == tf.estimator.ModeKeys.PREDICT:

        def _feature_shape(key):
            # Prediction uses a flat [batch, length] shape per feature.
            feature_length = sequence_length[key.split("_")[0]]
            return mtf.Shape([
                mtf.Dimension("batch", batch_size),
                mtf.Dimension("length", feature_length)
            ])

        mtf_features = {
            k: mtf.reshape(v, _feature_shape(k))
            for k, v in six.iteritems(mtf_features)
        }
        inputs = mtf_features["inputs"]

        if attribute_embedding:
            attributes = mtf_features["attribute"]
        else:
            attributes = None

        if has_partial_sequences:
            controlcodes = mtf_features["controlcode"]
        else:
            controlcodes = None

        if predict_fn:
            mtf_samples = predict_fn(model=transformer_model,
                                     features=mtf_features,
                                     variable_dtype=get_variable_dtype())
        elif isinstance(transformer_model, transformer.Unitransformer):
            # pad so that there is enough room for the targets
            # NOTE(review): length_dim is whatever the feature loop above
            # assigned last; only its name is used here.
            inputs = mtf.pad(inputs, [0, sequence_length["targets"]],
                             length_dim.name)
            mtf_samples = transformer_model.sample_autoregressive(
                inputs,
                variable_dtype=get_variable_dtype(),
                remove_partial_sequences=True)
        elif isinstance(transformer_model, Bitransformer_ll):
            mtf_samples = transformer_model.decode(
                inputs,
                attributes=attributes,
                controlcodes=controlcodes,
                has_partial_sequences=has_partial_sequences,
                remove_partial_sequences=remove_partial_sequences,
                variable_dtype=get_variable_dtype())  #
        elif isinstance(
                transformer_model,
            (transformer.Bitransformer, transformer.StudentTeacher)):
            mtf_samples = transformer_model.decode(
                inputs, variable_dtype=get_variable_dtype())
        else:
            raise ValueError("unrecognized class")
        mtf_samples = mtf.anonymize(mtf_samples)
        inputs = mtf.anonymize(inputs)
        lowering = mtf.Lowering(graph, {mesh: mesh_impl},
                                autostack=autostack)
        inputs = lowering.export_to_tf_tensor(inputs)
        outputs = lowering.export_to_tf_tensor(mtf_samples)
        predictions = {"inputs": inputs, "outputs": outputs}

        # When exporting a model, we need to communicate to TF-Serving that
        # master variables need to be copied to their slave slice variables.
        # Estimator uses a Scaffold's "local_init_op" for this purpose, so we
        # augment the default "local_init_op" here.
        #
        # The "ready_op" is also constructed here to ensure the variables
        # initialized by "local_init_op" are the same ones checked by "ready_op".
        #
        # WARNING: Any variables created outside of this model_fn()
        # (e.g. tpu_estimator/iterations_per_loop) will NOT be initialized nor
        # checked by these ops.
        def scaffold_fn():
            return tf.train.Scaffold(
                local_init_op=tf.group(
                    tf.train.Scaffold.default_local_init_op(),
                    lowering.copy_masters_to_slices(),
                    name="mtf_local_init_op"),
                ready_op=tf.concat([
                    tf.report_uninitialized_variables(),
                    resources.report_uninitialized_resources()
                ],
                                   axis=0,
                                   name="mtf_ready_op"))

        return tpu_estimator.TPUEstimatorSpec(
            mode=tf.estimator.ModeKeys.PREDICT,
            predictions=predictions,
            scaffold_fn=scaffold_fn,
            prediction_hooks=[mtf.MtfRestoreHook(lowering)])

    assert (mode == tf.estimator.ModeKeys.TRAIN
            or mode == tf.estimator.ModeKeys.EVAL)

    def logits_and_loss(mtf_features):
        """Compute logits and loss.

        For a Bitransformer_ll model with cycle_consistency_loss set, the
        returned loss is lambda_ae * l_ae + lambda_cycle * l_cycle and the
        returned logits are those of the cycle pass.

        Args:
          mtf_features: a dictionary

        Returns:
          logits: a mtf.Tensor
          loss: a mtf.Tensor
        """
        if model_type == "lm":  # TOTRY Adapt that to our case
            if "inputs" in mtf_features:
                mtf_features = _dynamic_text2self(mtf_features)
            _, _, length_dim = mtf_features["targets"].shape
            # The LM input is the target sequence shifted by one position.
            inputs = mtf.shift(mtf_features["targets"],
                               offset=1,
                               dim=length_dim,
                               wrap=False)
        else:
            inputs = mtf_features["inputs"]

        if attribute_embedding:
            attributes = mtf_features["attribute"]
        else:
            attributes = None

        if control_codes:
            codeprefixedtargets = mtf_features["codeprefixedtargets"]
        else:
            codeprefixedtargets = None

        # Packing metadata is optional: missing keys are passed as None.
        if isinstance(transformer_model, transformer.Unitransformer):
            position_kwargs = dict(
                sequence_id=mtf_features.get("targets_segmentation", None),
                position=mtf_features.get("targets_position", None),
            )
        elif isinstance(transformer_model, transformer.Bitransformer
                        ) or model_type == "bi_student_teacher":
            if control_codes:
                # Decoder-side metadata follows the code-prefixed targets.
                position_kwargs = dict(
                    encoder_sequence_id=mtf_features.get(
                        "inputs_segmentation", None),
                    decoder_sequence_id=mtf_features.get(
                        "codeprefixedtargets_segmentation", None),
                    decoder_subsequence_id=mtf_features.get(
                        "codeprefixedtargets_subsegmentation", None),
                    encoder_position=mtf_features.get(
                        "inputs_position", None),
                    decoder_position=mtf_features.get(
                        "codeprefixedtargets_position", None),
                )
            else:
                position_kwargs = dict(
                    encoder_sequence_id=mtf_features.get(
                        "inputs_segmentation", None),
                    decoder_sequence_id=mtf_features.get(
                        "targets_segmentation", None),
                    decoder_subsequence_id=mtf_features.get(
                        "targets_subsegmentation", None),
                    encoder_position=mtf_features.get(
                        "inputs_position", None),
                    decoder_position=mtf_features.get(
                        "targets_position", None),
                )
        else:
            raise ValueError("unrecognized class")

        if isinstance(transformer_model, Bitransformer_ll):
            if cycle_consistency_loss:
                # First pass: inputs -> targets.
                logits_ae, l_ae = transformer_model.call_simple(
                    inputs=inputs,
                    targets=mtf_features["targets"],
                    compute_loss=True,
                    attributes=attributes,
                    codeprefixedtargets=codeprefixedtargets,
                    mode=mode,
                    variable_dtype=get_variable_dtype(),
                    **position_kwargs)

                if has_partial_sequences:
                    controlcodes = mtf_features["controlcode"]
                else:
                    controlcodes = None

                # NOTE(review): confirm that only decode() is meant to run
                # under the 'training' gin scope.
                with gin.config_scope('training'):
                    mtf_samples = transformer_model.decode(
                        inputs,
                        attributes=attributes,
                        controlcodes=controlcodes,
                        has_partial_sequences=has_partial_sequences,
                        remove_partial_sequences=remove_partial_sequences,
                        variable_dtype=get_variable_dtype())
                # mtf_samples = mtf.anonymize(mtf_samples)
                outputs = mtf_samples

                # Second pass: the decoded samples are fed back as inputs.
                logits_cycle, l_cycle = transformer_model.call_simple(
                    inputs=outputs,
                    targets=mtf_features["targets"],
                    compute_loss=True,
                    attributes=attributes,
                    codeprefixedtargets=codeprefixedtargets,
                    mode=mode,
                    variable_dtype=get_variable_dtype(),
                    **position_kwargs)

                loss_ae_cycle = lambda_ae * l_ae + lambda_cycle * l_cycle
                return logits_cycle, loss_ae_cycle
            else:
                return transformer_model.call_simple(
                    inputs=inputs,
                    targets=mtf_features["targets"],
                    compute_loss=True,
                    attributes=attributes,
                    codeprefixedtargets=codeprefixedtargets,
                    mode=mode,
                    variable_dtype=get_variable_dtype(),
                    **position_kwargs)
        else:
            # NOTE(review): within this function num_microbatches is only
            # assigned in the TRAIN branch below - confirm this path is not
            # reached in EVAL mode.
            return transformer_model.call_simple(
                inputs=inputs,
                targets=mtf_features["targets"],
                compute_loss=True,
                mode=mode,
                variable_dtype=get_variable_dtype(),
                num_microbatches=num_microbatches,
                **position_kwargs)

    if mode == tf.estimator.ModeKeys.TRAIN:
        num_microbatches = serialize_num_microbatches(
            batch_dim, sequence_length, mesh_shape, layout_rules)
        if num_microbatches > 1:

            def serialized_fn(mtf_features):
                # Each microbatch contributes 1/num_microbatches of the loss.
                return {
                    "loss":
                    (logits_and_loss(mtf_features)[1] / num_microbatches)
                }

            var_grads, loss_dict = mtf.serialize_training_step(
                mtf_features, serialized_fn, batch_dim, num_microbatches)
            loss = loss_dict["loss"]
        else:
            loss = logits_and_loss(mtf_features)[1]
            var_grads = mtf.gradients(
                [loss], [v.outputs[0] for v in graph.trainable_variables])

        if tpu_summaries:
            mtf.scalar_summary("loss", loss)

        # learning_rate_schedule is either a callable of the step or a value
        # used as-is.
        if callable(learning_rate_schedule):
            # the following happens on CPU since TPU can't handle summaries.
            with mtf.utils.outside_all_rewrites():
                learning_rate = learning_rate_schedule(
                    step=tf.train.get_global_step())
                tf.summary.scalar("learning_rate", learning_rate)
        else:
            learning_rate = learning_rate_schedule

        # variable_filter selects which variables are trained: a regex string
        # searched in the variable name, None for all, or a predicate.
        if isinstance(variable_filter, str):
            pattern = re.compile(variable_filter)
            variable_filter_fn = lambda v: pattern.search(v.name)
        elif variable_filter is None:
            variable_filter_fn = lambda v: True
        elif callable(variable_filter):
            variable_filter_fn = variable_filter
        else:
            raise ValueError(
                "variable_filter must be None, a string, or a callable function"
            )
        trainable_vars = [
            v for v in graph.trainable_variables if variable_filter_fn(v)
        ]
        trainable_var_grads = [
            g for g, v in zip(var_grads, graph.trainable_variables)
            if variable_filter_fn(v)
        ]
        if len(trainable_vars) != len(graph.trainable_variables):
            tf.logging.info("Variables being trained:")
            tf.logging.info([v.name for v in trainable_vars])
            tf.logging.info("Variables not being trained:")
            tf.logging.info([
                v.name for v in graph.trainable_variables
                if not variable_filter_fn(v)
            ])

        update_ops = optimizer(learning_rate=learning_rate).apply_grads(
            trainable_var_grads, trainable_vars)

        lowering = mtf.Lowering(graph, {mesh: mesh_impl},
                                autostack=autostack)

        tf_loss = lowering.export_to_tf_tensor(loss)
        tf_loss = tf.cast(tf_loss, tf.float32)
        if not use_tpu:
            # NOTE(review): the printed tensors are [tf_loss, global step],
            # in the opposite order to the "step, tf_loss" label.
            tf_loss = tf.Print(
                tf_loss, [tf_loss, tf.train.get_global_step()],
                "step, tf_loss")

        tf_update_ops = [
            lowering.lowered_operation(op) for op in update_ops
        ]
        # The global step is incremented together with the variable updates.
        tf_update_ops.append(tf.assign_add(global_step, 1))
        train_op = tf.group(tf_update_ops)

        if hasattr(transformer_model, "initialize"):
            with mtf.utils.outside_all_rewrites():
                transformer_model.initialize()

        if tpu_summaries:
            # has to be outside of
            # with mtf.utils.outside_all_rewrites()
            host_call = mtf.utils.create_host_call(model_dir)
            mtf.utils.remove_summaries()
        else:
            host_call = None

        with mtf.utils.outside_all_rewrites():

            if init_checkpoint:
                # Warm-start only the variables present in both the
                # checkpoint and the graph, and log both differences.
                ckpt_vars = {
                    v
                    for v, _ in tf.train.list_variables(init_checkpoint)
                }
                global_vars = {v.op.name for v in tf.global_variables()}
                restore_vars = ckpt_vars.intersection(global_vars)
                tf.logging.info("Initializing variables from %s:",
                                init_checkpoint)
                tf.logging.debug("\n".join(sorted(restore_vars)))
                tf.logging.info("Variables in %s but not in graph:",
                                init_checkpoint)
                tf.logging.info("\n".join(sorted(ckpt_vars - global_vars)))
                tf.logging.info("Variables in graph but not in %s:",
                                init_checkpoint)
                tf.logging.info("\n".join(sorted(global_vars - ckpt_vars)))
                tf.train.init_from_checkpoint(init_checkpoint,
                                              {v: v
                                               for v in restore_vars})

            # Copy master variables to slices. Must be called first.
            restore_hook = mtf.MtfRestoreHook(lowering)
            saver = tf.train.Saver(tf.global_variables(),
                                   sharded=True,
                                   max_to_keep=keep_checkpoint_max,
                                   keep_checkpoint_every_n_hours=2,
                                   defer_build=False,
                                   save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            saver_hook = tf.train.CheckpointSaverHook(
                model_dir,
                save_steps=save_checkpoints_steps,
                saver=saver,
                listeners=[saver_listener])
            gin_config_saver_hook = gin.tf.GinConfigSaverHook(
                model_dir,
                summarize_config=True,
                include_step_in_filename=False)

            if use_tpu:
                return tpu_estimator.TPUEstimatorSpec(
                    mode=tf.estimator.ModeKeys.TRAIN,
                    loss=tf_loss,
                    train_op=train_op,
                    host_call=host_call,
                    training_hooks=[
                        restore_hook,
                        saver_hook,
                        gin_config_saver_hook,
                    ])
            else:
                return tf.estimator.EstimatorSpec(
                    tf.estimator.ModeKeys.TRAIN,
                    loss=tf_loss,
                    train_op=train_op,
                    training_chief_hooks=[
                        restore_hook,
                        saver_hook,
                        gin_config_saver_hook,
                    ])
    elif mode == tf.estimator.ModeKeys.EVAL:
        logits, loss = logits_and_loss(mtf_features)
        anon_logits = mtf.anonymize(logits)
        lowering = mtf.Lowering(graph, {mesh: mesh_impl},
                                autostack=autostack)
        tf_loss = tf.cast(lowering.export_to_tf_tensor(loss), tf.float32)
        # NOTE(review): tf_loss is already float32 here; this second cast
        # looks redundant.
        tf_loss = tf.cast(tf_loss, tf.float32)
        tf_logits = tf.cast(lowering.export_to_tf_tensor(anon_logits),
                            tf.float32)

        def simple_metrics(logits, labels):
            """Simple metrics for teacher-forced eval.

            Label id 0 gets weight 0. A sequence counts as correct when all
            of its weighted tokens are predicted correctly.
            """
            weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
            xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits)
            predictions = tf.cast(tf.argmax(logits, axis=-1), labels.dtype)
            token_correct = tf.cast(tf.equal(predictions, labels),
                                    tf.float32) * weights
            sequence_correct = tf.to_float(
                tf.equal(tf.reduce_sum(token_correct, -1),
                         tf.reduce_sum(weights, -1)))
            # Sequences with no weighted tokens are excluded.
            sequence_weights = tf.to_float(
                tf.not_equal(tf.reduce_sum(weights, -1), 0))
            return {
                "neg_log_perplexity":
                tf.metrics.mean(-xent, weights),
                "token_accuracy":
                tf.metrics.mean(token_correct, weights),
                "sequence_accuracy":
                tf.metrics.mean(sequence_correct, sequence_weights)
            }

        labels = lowering.export_to_tf_tensor(anon_targets)
        eval_metrics = (simple_metrics, [tf_logits, labels])
        with mtf.utils.outside_all_rewrites():
            restore_hook = mtf.MtfRestoreHook(lowering)
        return tpu_estimator.TPUEstimatorSpec(
            tf.estimator.ModeKeys.EVAL,
            evaluation_hooks=[restore_hook],
            loss=tf_loss,
            eval_metrics=eval_metrics)
def main(_):
  """Evaluates a checkpoint of the Inception V3 FCN network once over a dataset.

  Builds the input pipeline, the network and the streaming metrics in a fresh
  graph, then runs `slim.evaluation.evaluate_once` for enough batches to cover
  `dataset.num_samples` examples.

  Args:
    _: Unused positional argument supplied by the app runner.

  Raises:
    ValueError: If --dataset_dir is not set, or if --checkpoint_path is a
      directory that does not contain any checkpoint.
  """
  if not FLAGS.dataset_dir:
    raise ValueError(
        'You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    # Required when creating the session.
    _ = slim.get_or_create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                          FLAGS.dataset_split_name,
                                          FLAGS.dataset_dir)

    #########################
    # Configure the network #
    #########################
    inception_params = network_params.InceptionV3FCNParams(
        receptive_field_size=FLAGS.receptive_field_size,
        prelogit_dropout_keep_prob=0.8,
        depth_multiplier=0.1,
        min_depth=16,
        inception_fcn_stride=0,
    )
    conv_params = network_params.ConvScopeParams(
        dropout=False,
        dropout_keep_prob=0.8,
        batch_norm=True,
        batch_norm_decay=0.99,
        l2_weight_decay=4e-05,
    )
    network_fn = inception_v3_fcn.get_inception_v3_fcn_network_fn(
        inception_params,
        conv_params,
        num_classes=dataset.num_classes,
        is_training=False,
    )

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    [image, label] = provider.get(['image', 'label'])

    #####################################
    # Select the preprocessing function #
    #####################################
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        'inception_v3', is_training=False)
    # The evaluation crop is square, with side equal to the receptive field.
    eval_image_size = FLAGS.receptive_field_size
    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

    images, labels = tf.train.batch([image, label],
                                    batch_size=FLAGS.batch_size,
                                    num_threads=PREPROCESSING_THREADS,
                                    capacity=5 * FLAGS.batch_size)

    ####################
    # Define the model #
    ####################
    logits, _ = network_fn(images)

    variables_to_restore = slim.get_variables_to_restore()

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'Recall_2': slim.metrics.streaming_recall_at_k(logits, labels, 2),
    })

    # Print the summaries to screen.
    for name, value in names_to_values.items():
      summary_name = 'eval/%s' % name
      op = tf.summary.scalar(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # This ensures that we make a single pass over all of the data.
    num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
      # latest_checkpoint returns None when the directory holds no checkpoint;
      # fail loudly here instead of handing None to the evaluation loop.
      if checkpoint_path is None:
        raise ValueError(
            'No checkpoint found in directory: %s' % FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Evaluating %s', checkpoint_path)

    slim.evaluation.evaluate_once(
        master='',
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        session_config=tf.ConfigProto(allow_soft_placement=True),
        variables_to_restore=variables_to_restore)
def get_sent_reps_masks_normal_loop(sent_index,
                                    input_sent_reps_doc,
                                    input_mask_doc_level,
                                    masked_lm_loss_doc,
                                    masked_lm_example_loss_doc,
                                    masked_lm_weights_doc,
                                    dual_encoder_config,
                                    is_training,
                                    train_mode,
                                    input_ids,
                                    input_mask,
                                    masked_lm_positions,
                                    masked_lm_ids,
                                    masked_lm_weights,
                                    use_one_hot_embeddings,
                                    debugging=False):
  """Encodes one sentence of a document and accumulates its masked word LM loss.

  The token ids and mask of the sentence at `sent_index` are sliced out of the
  document level inputs and fed to the sentence level BERT model. The resulting
  sentence representation and sentence mask are appended to the running
  per-document lists. In the pretrain and joint_train modes the masked word LM
  loss of this sentence is computed and accumulated as well.

  Args:
    sent_index: The index of the current looped sentence.
    input_sent_reps_doc: List collecting the representations of all sentences
      in the doc learned by BERT. Appended to in place.
    input_mask_doc_level: List collecting the document level input masks, which
      indicate whether a sentence is a real sentence or a padded sentence.
      Appended to in place.
    masked_lm_loss_doc: The running sum of all the masked word LM loss.
    masked_lm_example_loss_doc: List collecting the per example masked word LM
      loss. Appended to in place.
    masked_lm_weights_doc: List collecting the weights of the masked LM words.
      A real masked word has weight 1.0; a padded mask has weight 0. Appended
      to in place.
    dual_encoder_config: The config of the dual encoder.
    is_training: Whether it is in the training mode.
    train_mode: string. The train mode which can be finetune, joint_train, or
      pretrain.
    input_ids: The ids of the input tokens.
    input_mask: The mask of the input tokens.
    masked_lm_positions: The positions of the masked words in the language
      model training.
    masked_lm_ids: The ids of the masked words in LM model training.
    masked_lm_weights: The weights of the masked words in LM model training.
    use_one_hot_embeddings: Whether use one hot embedding. It should be true
      for the runs on TPUs.
    debugging: bool. Whether it is in the debugging mode.

  Returns:
    A tuple (input_sent_reps_doc, input_mask_doc_level, masked_lm_loss_doc,
    masked_lm_example_loss_doc, masked_lm_weights_doc) holding the updated
    sentence representations, sentence masks and masked word LM loss state.
  """

  def _take_columns(tensor, first_column, num_columns):
    """Slices `num_columns` columns starting at `first_column`, all rows."""
    return tf.slice(tensor, [0, first_column], [-1, num_columns])

  bert_config = modeling.BertConfig.from_json_file(
      dual_encoder_config.encoder_config.bert_config_file)
  words_per_sent = dual_encoder_config.encoder_config.max_sent_length_by_word
  bert_is_trainable = dual_encoder_config.encoder_config.sent_bert_trainable
  preds_per_sent = dual_encoder_config.encoder_config.max_predictions_per_seq

  # Token ids and mask of the current sentence, each of shape
  # [batch, max_sent_length_by_word].
  first_token = sent_index * words_per_sent
  input_ids_cur_sent = _take_columns(input_ids, first_token, words_per_sent)
  input_mask_cur_sent = _take_columns(input_mask, first_token, words_per_sent)

  # One mask value per document ([batch]); the list grows to
  # [loop_sent_number_per_doc, batch] over the loop.
  input_mask_doc_level.append(tf.reduce_max(input_mask_cur_sent, 1))

  if debugging:
    input_ids_cur_sent = tf.Print(
        input_ids_cur_sent, [input_ids_cur_sent, input_mask_cur_sent],
        message="input_ids_cur_sent in get_sent_reps_masks_lm_loss",
        summarize=20)

  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids_cur_sent,
      input_mask=input_mask_cur_sent,
      use_one_hot_embeddings=use_one_hot_embeddings,
      sent_bert_trainable=bert_is_trainable)
  with tf.variable_scope("seq_rep_from_bert_sent_dense", reuse=tf.AUTO_REUSE):
    sent_rep = get_seq_rep_from_bert(model)
  input_sent_reps_doc.append(sent_rep)

  # Only the pretrain and joint_train modes need the masked word LM loss.
  if train_mode not in (constants.TRAIN_MODE_PRETRAIN,
                        constants.TRAIN_MODE_JOINT_TRAIN):
    return (input_sent_reps_doc, input_mask_doc_level, masked_lm_loss_doc,
            masked_lm_example_loss_doc, masked_lm_weights_doc)

  # Masked token information of the current sentence, each of shape
  # [batch, max_predictions_per_seq].
  first_prediction = sent_index * preds_per_sent
  positions_cur_sent = _take_columns(masked_lm_positions, first_prediction,
                                     preds_per_sent)
  ids_cur_sent = _take_columns(masked_lm_ids, first_prediction, preds_per_sent)
  weights_cur_sent = _take_columns(masked_lm_weights, first_prediction,
                                   preds_per_sent)

  # The masked LM positions in the processed data are global indices counted
  # from the first token of the whole sequence. Convert them to 0-based
  # positions local to the current sentence:
  # local_index = global_index mod max_sent_length_by_word.
  positions_cur_sent = tf.mod(positions_cur_sent, words_per_sent)

  # loss_cur_sent has shape [1]; example_loss_cur_sent is the per example loss.
  loss_cur_sent, example_loss_cur_sent, _ = get_masked_lm_output(
      bert_config, model.get_sequence_output(), model.get_embedding_table(),
      positions_cur_sent, ids_cur_sent, weights_cur_sent)

  masked_lm_loss_doc += loss_cur_sent
  masked_lm_example_loss_doc.append(example_loss_cur_sent)
  masked_lm_weights_doc.append(weights_cur_sent)

  return (input_sent_reps_doc, input_mask_doc_level, masked_lm_loss_doc,
          masked_lm_example_loss_doc, masked_lm_weights_doc)
def setup(act_fun):
    """Build the model, sampling graph, losses, session and saver.

    Selects a network class from FLAGS, creates the input placeholders,
    builds one tower per entry of `range(FLAGS.num_gpus)` (positive energy,
    the `langevin_step` sampling loop, negative energy and, when FLAGS.train
    is set, the training loss and gradients), then creates the session,
    initializes variables and optionally restores a checkpoint.

    NOTE(review): the block nesting below was reconstructed from a flattened
    copy of this function; confirm which statements belong to the per-tower
    loop before relying on it.

    Args:
        act_fun: Activation function forwarded to the model constructor.

    Returns:
        A tuple `(target_vars, saver, sess, resume_itr)`: the dict of graph
        tensors/ops keyed by name, the `tf.train.Saver`, the `tf.Session`,
        and the iteration to resume from (0 unless a checkpoint is restored).
    """
    channel_num = 3

    # Pick the network architecture; the first matching flag wins.
    if FLAGS.mnist_model:
        print("------------------Using MNIST model------------")
        model = MnistNet(
            num_channels=channel_num, num_filters=128, act_fun=act_fun)
    elif FLAGS.large_model:
        print("------------------Using ResNet32Large model------------")
        model = ResNet32Large(
            num_channels=channel_num, num_filters=128, train=True, act_fun=act_fun)
    elif FLAGS.larger_model:
        print("------------------Using ResNet32Larger model------------")
        model = ResNet32Larger(
            num_channels=channel_num, num_filters=128, act_fun=act_fun)
    elif FLAGS.wider_model:
        print("------------------Using ResNet32Wider model------------")
        model = ResNet32Wider(
            num_channels=channel_num, num_filters=192, act_fun=act_fun)
    else:
        print("------------------Using ResNet32 model------------")
        model = ResNet32(
            num_channels=channel_num, num_filters=128, act_fun=act_fun)

    # NOTE(review): batch_size is assigned here but never read in this function.
    batch_size = FLAGS.batch_size

    weights = [model.construct_weights('context_0')]

    # Input placeholders: 32x32x3 images and 10-way label vectors.
    Y = tf.placeholder(shape=(None), dtype=tf.int32)

    # This None is overwritten by the placeholder assigned to LABEL below.
    LABEL = None
    X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
    X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
    LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
    LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

    # Variables to run in training
    # Each placeholder is split into FLAGS.num_gpus pieces, one per tower.
    X_SPLIT = tf.split(X, FLAGS.num_gpus)
    X_NOISE_SPLIT = tf.split(X_NOISE, FLAGS.num_gpus)
    LABEL_SPLIT = tf.split(LABEL, FLAGS.num_gpus)
    LABEL_POS_SPLIT = tf.split(LABEL_POS, FLAGS.num_gpus)
    # NOTE(review): LABEL_SPLIT_INIT, tower_gen_grads and x_mod_list are never
    # read again in this function.
    LABEL_SPLIT_INIT = list(LABEL_SPLIT)
    tower_grads = []
    tower_gen_grads = []
    x_mod_list = []

    optimizer = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.999)
    # hvd is presumably Horovod -- confirm against the file's imports.
    optimizer = hvd.DistributedOptimizer(optimizer)

    for j in range(FLAGS.num_gpus):

        if FLAGS.model_cclass:
            # Score every image against each of the 10 one-hot labels, then
            # sample a label per image from the resulting energies.
            ind_batch_size = FLAGS.batch_size // FLAGS.num_gpus
            # NOTE(review): this constant has FLAGS.batch_size * 10 rows while
            # x_split below has ind_batch_size * 10 rows; the two only agree
            # when FLAGS.num_gpus == 1 -- confirm.
            label_tensor = tf.Variable(
                tf.convert_to_tensor(
                    np.reshape(
                        np.tile(np.eye(10), (FLAGS.batch_size, 1, 1)),
                        (FLAGS.batch_size * 10, 10)),
                    dtype=tf.float32),
                trainable=False,
                dtype=tf.float32)
            # Repeat each image 10 times, once per candidate label.
            x_split = tf.tile(
                tf.reshape(
                    X_SPLIT[j], (ind_batch_size, 1, 32, 32, 3)), (1, 10, 1, 1, 1))
            x_split = tf.reshape(x_split, (ind_batch_size * 10, 32, 32, 3))
            energy_pos = model.forward(
                x_split, weights[0], label=label_tensor, stop_at_grad=False)

            energy_pos_full = tf.reshape(energy_pos, (ind_batch_size, 10))
            energy_partition_est = tf.reduce_logsumexp(
                energy_pos_full, axis=1, keepdims=True)
            # argmax over (negated energy + -log(-log(uniform)) noise) draws
            # one label index per row.
            uniform = tf.random_uniform(tf.shape(energy_pos_full))
            label_tensor = tf.argmax(-energy_pos_full - tf.log(-tf.log(uniform)) - energy_partition_est, axis=1)
            label = tf.one_hot(label_tensor, 10, dtype=tf.float32)
            label = tf.Print(label, [label_tensor, energy_pos_full])
            # The sampled labels replace the fed labels for this tower.
            LABEL_SPLIT[j] = label
            energy_pos = tf.concat(energy_pos, axis=0)
        else:
            energy_pos = [
                model.forward(
                    X_SPLIT[j], weights[0], label=LABEL_POS_SPLIT[j], stop_at_grad=False)]
            energy_pos = tf.concat(energy_pos, axis=0)

        print("Building graph...")
        # Sampling starts from the fed noise images; x_orig is never read again.
        x_mod = x_orig = X_NOISE_SPLIT[j]

        x_grads = []

        energy_negs = []
        # NOTE(review): loss_energys is never appended to or read.
        loss_energys = []

        # Energy of the starting samples; exported below as 'energy_start'.
        energy_negs.extend([model.forward(tf.stop_gradient(
            x_mod), weights[0], label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True)])
        eps_begin = tf.zeros(1)

        steps = tf.constant(0)
        # Loop condition: run FLAGS.num_steps sampling steps.
        c = lambda i, x: tf.less(i, FLAGS.num_steps)

        def langevin_step(counter, x_mod):
            """One sampling step: add noise, step along the energy gradient, clip."""
            x_mod = x_mod + tf.random_normal(tf.shape(x_mod), mean=0.0, stddev=0.005 * FLAGS.rescale * FLAGS.noise_scale)

            energy_noise = energy_start = tf.concat(
                [model.forward(
                    x_mod, weights[0], label=LABEL_SPLIT[j], reuse=True, stop_at_grad=False, stop_batch=True)], axis=0)

            # Only x_grad is used below; label_grad and energy_noise_old are not.
            x_grad, label_grad = tf.gradients(
                FLAGS.temperature * energy_noise, [x_mod, LABEL_SPLIT[j]])
            energy_noise_old = energy_noise

            lr = FLAGS.step_lr

            # Optional gradient clipping: by norm ('l2') or by value ('li').
            if FLAGS.proj_norm != 0.0:
                if FLAGS.proj_norm_type == 'l2':
                    x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
                elif FLAGS.proj_norm_type == 'li':
                    x_grad = tf.clip_by_value(
                        x_grad, -FLAGS.proj_norm, FLAGS.proj_norm)
                else:
                    print("Other types of projection are not supported!!!")
                    assert False

            # Clip gradient norm for now
            if FLAGS.hmc:
                # Step size should be tuned to get around 65% acceptance
                def energy(x):
                    return FLAGS.temperature * \
                        model.forward(x, weights[0], label=LABEL_SPLIT[j], reuse=True)

                x_last = hmc(x_mod, 15., 10, energy)
            else:
                x_last = x_mod - (lr) * x_grad

            x_mod = x_last
            # Keep the samples inside [0, FLAGS.rescale].
            x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

            counter = counter + 1

            return counter, x_mod

        steps, x_mod = tf.while_loop(c, langevin_step, (steps, x_mod))

        # Gradient of the energy at the final samples, exported as 'x_grad'.
        energy_eval = model.forward(x_mod, weights[0], label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True)
        x_grad = tf.gradients(FLAGS.temperature * energy_eval, [x_mod])[0]
        x_grads.append(x_grad)

        # Negative energy is evaluated with gradients to the samples stopped.
        energy_negs.append(
            model.forward(
                tf.stop_gradient(x_mod), weights[0], label=LABEL_SPLIT[j], stop_at_grad=False, reuse=True))

        test_x_mod = x_mod

        temp = FLAGS.temperature

        energy_neg = energy_negs[-1]
        # Mean absolute difference between the final samples and the real images.
        x_off = tf.reduce_mean(
            tf.abs(x_mod[:tf.shape(X_SPLIT[j])[0]] - X_SPLIT[j]))

        # Uses the full LABEL placeholder, not this tower's LABEL_SPLIT[j].
        loss_energy = model.forward(
            x_mod, weights[0], reuse=True, label=LABEL, stop_grad=True)

        print("Finished processing loop construction ...")

        target_vars = {}

        if FLAGS.cclass or FLAGS.model_cclass:
            # Entropy of the label distribution of the first tower's labels.
            label_sum = tf.reduce_sum(LABEL_SPLIT[0], axis=0)
            label_prob = label_sum / tf.reduce_sum(label_sum)
            label_ent = -tf.reduce_sum(label_prob * tf.math.log(label_prob + 1e-7))
        else:
            label_ent = tf.zeros(1)

        target_vars['label_ent'] = label_ent

        if FLAGS.train:

            # NOTE(review): loss_ml is only assigned for the objectives
            # 'logsumexp', 'cd' and 'softplus'; any other FLAGS.objective
            # leaves it unbound when loss_total is computed below.
            if FLAGS.objective == 'logsumexp':
                # pos_term is computed but not used.
                pos_term = temp * energy_pos
                energy_neg_reduced = (energy_neg - tf.reduce_min(energy_neg))
                coeff = tf.stop_gradient(tf.exp(-temp * energy_neg_reduced))
                norm_constant = tf.stop_gradient(tf.reduce_sum(coeff)) + 1e-4
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = coeff * (-1 * temp * energy_neg) / norm_constant
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'cd':
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = -tf.reduce_mean(temp * energy_neg)
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'softplus':
                loss_ml = FLAGS.ml_coeff * \
                    tf.nn.softplus(temp * (energy_pos - energy_neg))

            loss_total = tf.reduce_mean(loss_ml)

            if not FLAGS.zero_kl:
                loss_total = loss_total + tf.reduce_mean(loss_energy)

            # L2 penalty on the magnitudes of both energies.
            loss_total = loss_total + \
                FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square((energy_neg))))

            print("Started gradient computation...")
            gvs = optimizer.compute_gradients(loss_total)
            # Drop pairs whose first element (the gradient) is None.
            gvs = [(k, v) for (k, v) in gvs if k is not None]

            print("Applying gradients...")

            tower_grads.append(gvs)

            print("Finished applying gradients.")

            target_vars['loss_ml'] = loss_ml
            target_vars['total_loss'] = loss_total
            target_vars['loss_energy'] = loss_energy
            target_vars['weights'] = weights
            target_vars['gvs'] = gvs

        target_vars['X'] = X
        target_vars['Y'] = Y
        target_vars['LABEL'] = LABEL
        target_vars['LABEL_POS'] = LABEL_POS
        target_vars['X_NOISE'] = X_NOISE
        target_vars['energy_pos'] = energy_pos
        target_vars['energy_start'] = energy_negs[0]

        if len(x_grads) >= 1:
            target_vars['x_grad'] = x_grads[-1]
            target_vars['x_grad_first'] = x_grads[0]
        else:
            target_vars['x_grad'] = tf.zeros(1)
            target_vars['x_grad_first'] = tf.zeros(1)

        target_vars['x_mod'] = x_mod
        target_vars['x_off'] = x_off
        target_vars['temp'] = temp
        target_vars['energy_neg'] = energy_neg
        target_vars['test_x_mod'] = test_x_mod
        target_vars['eps_begin'] = eps_begin

    if FLAGS.train:
        # Combine the per-tower gradients into a single update op.
        grads = average_gradients(tower_grads)
        train_op = optimizer.apply_gradients(grads)
        target_vars['train_op'] = train_op

    config = tf.ConfigProto()

    # With more than one hvd process, restrict this one to its local device.
    if hvd.size() > 1:
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    sess = tf.Session(config=config)

    saver = loader = tf.train.Saver(max_to_keep=30, keep_checkpoint_every_n_hours=6)

    # Count trainable parameters as the product of each variable's dimensions.
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print("Model has a total of {} parameters".format(total_parameters))

    sess.run(tf.global_variables_initializer())

    resume_itr = 0

    # Only rank 0 restores from disk; the broadcast below then pushes rank 0's
    # variables to every other process.
    # NOTE(review): `logdir` is not defined in this function; presumably a
    # module-level name -- confirm.
    if (FLAGS.resume_iter != -1 or not FLAGS.train) and hvd.rank() == 0:
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        # saver.restore(sess, model_file)
        optimistic_restore(sess, model_file)

    sess.run(hvd.broadcast_global_variables(0))

    return target_vars, saver, sess, resume_itr
def my_fn(x):
  """Wraps every tensor of a feature dict in a `tf.Print` op.

  Args:
    x: dict mapping string keys to tensors.

  Returns:
    A new dict with the same keys, where each value is the result of
    `tf.Print` on the original value with the message "<key>: ".
  """
  printed = {}
  for key, tensor in x.items():
    printed[key] = tf.Print(tensor, [tensor], key + ": ")
  return printed
def model_fn(features, labels, mode, params):
  """Mobilenet v1 model using Estimator API.

  Args:
    features: Input image tensor, or a dict holding it under the key
      'feature'.
    labels: Integer class labels; one-hot encoded here for the loss.
    mode: A `tf.estimator.ModeKeys` value.
    params: dict of hyperparameters. Keys read here: 'num_classes',
      'input_perm', 'use_tpu', 'eval_batch_size', 'learning_rate',
      'train_batch_size', 'num_train_images', 'learning_rate_decay_epochs',
      'learning_rate_decay', 'optimizer', 'moving_average' and 'use_logits'.

  Returns:
    A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
    `tf.estimator.tpu.TPUEstimatorSpec` carrying the loss, the train op (TRAIN
    only) and the eval metrics (EVAL only).
  """
  num_classes = params['num_classes']
  training_active = (mode == tf.estimator.ModeKeys.TRAIN)
  eval_active = (mode == tf.estimator.ModeKeys.EVAL)

  if isinstance(features, dict):
    features = features['feature']

  features = supervised_images.tensor_transform_fn(
      features, params['input_perm'])

  model = tf.keras.applications.MobileNet(
      input_tensor=features,
      include_top=True,
      weights=None,
      classes=num_classes)

  # NOTE(review): with include_top=True the Keras MobileNet head may already
  # apply a softmax, in which case these are probabilities rather than raw
  # logits -- confirm against the Keras version in use.
  logits = model(features, training=training_active)

  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        })

  # Optional debugging output of predictions and labels during evaluation;
  # disabled when running on TPU.
  if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
      not params['use_tpu']):
    with tf.control_dependencies([
        tf.Print(
            predictions['classes'], [predictions['classes']],
            summarize=params['eval_batch_size'],
            message='prediction: ')
    ]):
      labels = tf.Print(
          labels, [labels],
          summarize=params['eval_batch_size'],
          message='label: ')

  one_hot_labels = tf.one_hot(labels, params['num_classes'], dtype=tf.int32)

  tf.losses.softmax_cross_entropy(
      onehot_labels=one_hot_labels,
      logits=logits,
      weights=1.0,
      label_smoothing=0.1)
  loss = tf.losses.get_total_loss(add_regularization_losses=True)

  # The base learning rate is scaled linearly with the batch size (relative to
  # 256) and is never decayed below 0.0001 times its initial value.
  initial_learning_rate = params['learning_rate'] * params['train_batch_size'] / 256  # pylint: disable=line-too-long
  final_learning_rate = 0.0001 * initial_learning_rate

  train_op = None
  if training_active:
    batches_per_epoch = params['num_train_images'] // params['train_batch_size']
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.train.exponential_decay(
        learning_rate=initial_learning_rate,
        global_step=global_step,
        decay_steps=params['learning_rate_decay_epochs'] * batches_per_epoch,
        decay_rate=params['learning_rate_decay'],
        staircase=True)

    # Set a minimum boundary for the learning rate.
    learning_rate = tf.maximum(
        learning_rate, final_learning_rate, name='learning_rate')

    if params['optimizer'] == 'sgd':
      absl.logging.info('Using SGD optimizer')
      optimizer = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate)
    elif params['optimizer'] == 'momentum':
      absl.logging.info('Using Momentum optimizer')
      optimizer = tf.train.MomentumOptimizer(
          learning_rate=learning_rate, momentum=0.9)
    elif params['optimizer'] == 'RMS':
      absl.logging.info('Using RMS optimizer')
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          RMSPROP_DECAY,
          momentum=RMSPROP_MOMENTUM,
          epsilon=RMSPROP_EPSILON)
    else:
      # The message needs a %s placeholder for the argument; without it the
      # log record cannot be formatted and the optimizer name is lost.
      absl.logging.fatal('Unknown optimizer: %s', params['optimizer'])

    if params['use_tpu']:
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    # Run the Keras model's update ops together with the optimizer step.
    update_ops = model.updates
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step=global_step)

    if params['moving_average']:
      ema = tf.train.ExponentialMovingAverage(
          decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
      variables_to_average = (tf.trainable_variables() +
                              tf.moving_average_variables())
      # The moving averages are updated after each optimizer step.
      with tf.control_dependencies([train_op]), tf.name_scope('moving_average'):
        train_op = ema.apply(variables_to_average)

  eval_metrics = None
  if eval_active:
    def metric_fn(labels, predictions):
      accuracy = tf.metrics.accuracy(labels, tf.argmax(
          input=predictions, axis=1))
      return {'accuracy': accuracy}

    # Always bind eval_predictions: previously it was only assigned when
    # params['use_logits'] was set, so evaluation otherwise raised NameError.
    # metric_fn takes an argmax over axis 1, which works on either tensor.
    if params['use_logits']:
      eval_predictions = logits
    else:
      eval_predictions = predictions['probabilities']

    eval_metrics = (metric_fn, [labels, eval_predictions])

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode, loss=loss, train_op=train_op, eval_metrics=eval_metrics)