def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        is_real_example, label_ids = None, None
        if FLAGS.export_dir is None:
            label_ids = features["label_ids"]
            if "is_real_example" in features:
                is_real_example = tf.cast(features["is_real_example"],
                                          dtype=tf.float32)
            else:
                is_real_example = tf.ones(tf.shape(label_ids),
                                          dtype=tf.float32)

        is_training = (mode == tf_estimator.ModeKeys.TRAIN)

        (total_loss, per_example_loss, logits,
         probabilities) = create_model(bert_config, is_training, input_ids,
                                       input_mask, segment_ids, label_ids,
                                       num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf_estimator.ModeKeys.TRAIN:

            train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                     num_train_steps,
                                                     num_warmup_steps, use_tpu)

            output_spec = tf_estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn)
        elif mode == tf_estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, logits,
                          is_real_example):
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                accuracy = tf.metrics.accuracy(labels=label_ids,
                                               predictions=predictions,
                                               weights=is_real_example)
                loss = tf.metrics.mean(values=per_example_loss,
                                       weights=is_real_example)
                return {
                    "eval_accuracy": accuracy,
                    "eval_loss": loss,
                }

            eval_metrics = (metric_fn, [
                per_example_loss, label_ids, logits, is_real_example
            ])
            output_spec = tf_estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            probabilities = tf.identity(probabilities, name="probabilities")
            output_spec = tf_estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions={"probabilities": probabilities},
                scaffold_fn=scaffold_fn)
        return output_spec
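For context, a minimal sketch of how a model_fn like this is typically handed to
a TPUEstimator in TF 1.x. The run_config, batch sizes, and train_input_fn here
are illustrative assumptions, not part of the snippet above:

    # Hypothetical wiring; run_config would be a tf_estimator.tpu.RunConfig.
    estimator = tf_estimator.tpu.TPUEstimator(
        use_tpu=use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=32,
        eval_batch_size=8,
        predict_batch_size=8)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)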
Example #2
    def model(inputs, is_training):
        """Creation of the model graph."""
        with tf.variable_scope(name, 'resnet_model'):
            inputs = conv2d_fixed_padding(inputs=inputs,
                                          filters=64,
                                          kernel_size=7,
                                          strides=2,
                                          pruning_method=pruning_method,
                                          data_format=data_format,
                                          name='initial_conv')
            inputs = tf.identity(inputs, 'initial_conv')
            inputs = batch_norm_relu(inputs,
                                     is_training,
                                     data_format=data_format)

            inputs = tf.layers.max_pooling2d(inputs=inputs,
                                             pool_size=3,
                                             strides=2,
                                             padding='SAME',
                                             data_format=data_format,
                                             name='initial_max_pool')
            inputs = tf.identity(inputs, 'initial_max_pool')

            inputs = block_group(inputs=inputs,
                                 filters=64,
                                 block_fn=block_fn,
                                 blocks=lst_layers[0],
                                 strides=1,
                                 is_training=is_training,
                                 name='block_group1',
                                 pruning_method=pruning_method,
                                 data_format=data_format)
            inputs = block_group(inputs=inputs,
                                 filters=128,
                                 block_fn=block_fn,
                                 blocks=lst_layers[1],
                                 strides=2,
                                 is_training=is_training,
                                 name='block_group2',
                                 pruning_method=pruning_method,
                                 data_format=data_format)
            inputs = block_group(inputs=inputs,
                                 filters=256,
                                 block_fn=block_fn,
                                 blocks=lst_layers[2],
                                 strides=2,
                                 is_training=is_training,
                                 name='block_group3',
                                 pruning_method=pruning_method,
                                 data_format=data_format)
            inputs = block_group(inputs=inputs,
                                 filters=512,
                                 block_fn=block_fn,
                                 blocks=lst_layers[3],
                                 strides=2,
                                 is_training=is_training,
                                 name='block_group4',
                                 pruning_method=pruning_method,
                                 data_format=data_format)

            # Pool over the full spatial extent. Note this indexing assumes
            # channels_last (NHWC); with channels_first the spatial dimensions
            # would be inputs.shape[2] and inputs.shape[3].
            pool_size = (inputs.shape[1], inputs.shape[2])
            inputs = tf.layers.average_pooling2d(inputs=inputs,
                                                 pool_size=pool_size,
                                                 strides=1,
                                                 padding='VALID',
                                                 data_format=data_format,
                                                 name='final_avg_pool')
            inputs = tf.identity(inputs, 'final_avg_pool')
            inputs = tf.reshape(inputs, [-1, 2048])
            inputs = tf.layers.dense(
                inputs=inputs,
                units=num_classes,
                kernel_initializer=tf.random_normal_initializer(stddev=.01),
                name='final_dense')
            inputs = tf.identity(inputs, 'final_dense')
        return inputs
Example #3
def build_model(images,
                model_name,
                training,
                override_params=None,
                model_dir=None,
                fine_tuning=False,
                features_only=False,
                pooled_features_only=False):
    """A helper function to create a model and return predicted logits.

  Args:
    images: input images tensor.
    model_name: string, the predefined model name.
    training: boolean, whether the model is constructed for training.
    override_params: A dictionary of params for overriding. Fields must exist in
      efficientnet_model.GlobalParams.
    model_dir: string, optional model dir for saving configs.
    fine_tuning: boolean, whether the model is used for finetuning.
    features_only: build the base feature network only (excluding final
      1x1 conv layer, global pooling, dropout and fc head).
    pooled_features_only: build the base network for features extraction (after
      1x1 conv layer and global pooling, but before dropout and fc head).

  Returns:
    logits: the logits tensor of classes.
    endpoints: the endpoints for each layer.

  Raises:
    When model_name specifies an undefined model, raises NotImplementedError.
    When override_params has invalid fields, raises ValueError.
  """
    assert isinstance(images, tf.Tensor)
    assert not (features_only and pooled_features_only)

    # For backward compatibility.
    if override_params and override_params.get('drop_connect_rate', None):
        override_params[
            'survival_prob'] = 1 - override_params['drop_connect_rate']

    if not training or fine_tuning:
        if not override_params:
            override_params = {}
        override_params['batch_norm'] = utils.BatchNormalization
    blocks_args, global_params = get_model_params(model_name, override_params)

    if model_dir:
        param_file = os.path.join(model_dir, 'model_params.txt')
        if not tf.gfile.Exists(param_file):
            if not tf.gfile.Exists(model_dir):
                tf.gfile.MakeDirs(model_dir)
            with tf.gfile.GFile(param_file, 'w') as f:
                logging.info('writing to %s', param_file)
                f.write('model_name= %s\n\n' % model_name)
                f.write('global_params= %s\n\n' % str(global_params))
                f.write('blocks_args= %s\n\n' % str(blocks_args))

    model = efficientnet_model.Model(blocks_args, global_params, model_name)
    outputs = model(images,
                    training=training,
                    features_only=features_only,
                    pooled_features_only=pooled_features_only)
    features, endpoints = outputs[0], outputs[1:]
    if features_only:
        features = tf.identity(features, 'features')
    elif pooled_features_only:
        features = tf.identity(features, 'pooled_features')
    else:
        features = tf.identity(features, 'logits')
    return features, endpoints
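A hedged usage sketch: building logits for a batch of images. The input shape
and the num_classes override are illustrative assumptions:

    images = tf.placeholder(tf.float32, [None, 224, 224, 3])
    logits, endpoints = build_model(
        images,
        model_name='efficientnet-b0',
        training=False,
        override_params={'num_classes': 10})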
Example #4
    def testLoss(self):
        """
        Tests the loss of the FasterRCNN
        """

        # Create prediction_dict's structure
        prediction_dict_random = {
            'rpn_prediction': {},
            'classification_prediction': {
                'rcnn': {
                    'cls_score': None,
                    'bbox_offsets': None
                },
                'target': {},
                '_debug': {
                    'losses': {}
                }
            }
        }
        prediction_dict_perf = {
            'rpn_prediction': {},
            'classification_prediction': {
                'rcnn': {
                    'cls_score': None,
                    'bbox_offsets': None
                },
                'target': {},
                '_debug': {
                    'losses': {}
                }
            }
        }

        # Set seeds for stable results
        rand_seed = 13
        target_seed = 43
        image_size = (60, 80)
        num_anchors = 1000

        config = EasyDict(self.config)
        config.model.rpn.l2_regularization_scale = 0.0
        config.model.rcnn.l2_regularization_scale = 0.0
        config.model.base_network.arg_scope.weight_decay = 0.0

        #   RPN

        # Random generation of cls_targets for rpn
        # where:
        #       {-1}:   Ignore
        #       { 0}:   Background
        #       { 1}:   Object
        rpn_cls_target = tf.floor(
            tf.random_uniform([num_anchors],
                              minval=-1,
                              maxval=2,
                              dtype=tf.float32,
                              seed=target_seed,
                              name=None))

        # Create cls_scores that put a large score on one class and 0 on the
        # other: on_value=10 for the deliberately-wrong scores below and
        # on_value=100 for the perfect ones.

        # Generation of opposite cls_score for rpn
        rpn_cls_score = tf.cast(
            tf.one_hot(tf.cast(tf.mod(tf.identity(rpn_cls_target) + 1, 2),
                               tf.int32),
                       depth=2,
                       on_value=10), tf.float32)
        # Generation of correct cls_score for rpn
        rpn_cls_perf_score = tf.cast(
            tf.one_hot(tf.cast(tf.identity(rpn_cls_target), tf.int32),
                       depth=2,
                       on_value=100), tf.float32)

        # Random generation of target bbox deltas
        rpn_bbox_target = tf.floor(
            tf.random_uniform([num_anchors, 4],
                              minval=-1,
                              maxval=1,
                              dtype=tf.float32,
                              seed=target_seed,
                              name=None))

        # Random generation of predicted bbox deltas
        rpn_bbox_predictions = tf.floor(
            tf.random_uniform([num_anchors, 4],
                              minval=-1,
                              maxval=1,
                              dtype=tf.float32,
                              seed=rand_seed,
                              name=None))

        prediction_dict_random['rpn_prediction'][
            'rpn_cls_score'] = rpn_cls_score
        prediction_dict_random['rpn_prediction'][
            'rpn_cls_target'] = rpn_cls_target
        prediction_dict_random['rpn_prediction'][
            'rpn_bbox_target'] = rpn_bbox_target
        prediction_dict_random['rpn_prediction'][
            'rpn_bbox_pred'] = rpn_bbox_predictions

        prediction_dict_perf['rpn_prediction'][
            'rpn_cls_score'] = rpn_cls_perf_score
        prediction_dict_perf['rpn_prediction'][
            'rpn_cls_target'] = rpn_cls_target
        prediction_dict_perf['rpn_prediction'][
            'rpn_bbox_target'] = rpn_bbox_target
        prediction_dict_perf['rpn_prediction'][
            'rpn_bbox_pred'] = rpn_bbox_target

        #   RCNN

        # Set the number of classes
        num_classes = config.model.network.num_classes

        # Randomly generate the bbox_offsets for the correct class = 1
        prediction_dict_random['classification_prediction']['target'] = {
            'bbox_offsets':
            tf.random_uniform([1, 4],
                              minval=-1,
                              maxval=1,
                              dtype=tf.float32,
                              seed=target_seed,
                              name=None),
            'cls': [1]
        }

        # Set the same bbox_offsets and cls for the perfect prediction
        prediction_dict_perf['classification_prediction'][
            'target'] = prediction_dict_random['classification_prediction'][
                'target'].copy()

        # Generate random scores for the num_classes + the background class
        rcnn_cls_score = tf.random_uniform([1, num_classes + 1],
                                           minval=-100,
                                           maxval=100,
                                           dtype=tf.float32,
                                           seed=rand_seed,
                                           name=None)

        # Generate a perfect prediction with the correct class score = 100
        # and the rest set to 0
        rcnn_cls_perf_score = tf.cast(
            tf.one_hot([1], depth=num_classes + 1, on_value=100), tf.float32)

        # Generate the random delta prediction for each class
        rcnn_bbox_offsets = tf.random_uniform([1, num_classes * 4],
                                              minval=-1,
                                              maxval=1,
                                              dtype=tf.float32,
                                              seed=rand_seed,
                                              name=None)

        # Copy the random prediction and set the correct class prediction
        # as the target one
        target_bbox_offsets = prediction_dict_random[
            'classification_prediction']['target']['bbox_offsets']
        initial_val = 1 * 4  # cls value * 4
        rcnn_bbox_perf_offsets = tf.Variable(
            tf.reshape(
                tf.random_uniform([1, num_classes * 4],
                                  minval=-1,
                                  maxval=1,
                                  dtype=tf.float32,
                                  seed=target_seed,
                                  name=None), [-1]))
        rcnn_bbox_perf_offsets = tf.reshape(
            tf.scatter_update(rcnn_bbox_perf_offsets,
                              tf.range(initial_val, initial_val + 4),
                              tf.reshape(target_bbox_offsets, [-1])), [1, -1])

        prediction_dict_random['classification_prediction']['rcnn'][
            'cls_score'] = rcnn_cls_score
        prediction_dict_random['classification_prediction']['rcnn'][
            'bbox_offsets'] = rcnn_bbox_offsets

        prediction_dict_perf['classification_prediction']['rcnn'][
            'cls_score'] = rcnn_cls_perf_score
        prediction_dict_perf['classification_prediction']['rcnn'][
            'bbox_offsets'] = rcnn_bbox_perf_offsets

        loss_perfect = self._get_losses(config, prediction_dict_perf,
                                        image_size)
        loss_random = self._get_losses(config, prediction_dict_random,
                                       image_size)

        loss_random_compare = {
            'rcnn_cls_loss': 5,
            'rcnn_reg_loss': 3,
            'rpn_cls_loss': 5,
            'rpn_reg_loss': 3,
            'no_reg_loss': 16,
            'regularization_loss': 0,
            'total_loss': 22,
        }
        for loss in loss_random:
            self.assertGreaterEqual(loss_random[loss],
                                    loss_random_compare[loss], loss)
            self.assertEqual(loss_perfect[loss], 0, loss)
Example #5
    def pre_bottleneck(self, inputs, state, input_index):
        """Apply pre-bottleneck projection to inputs.

    Pre-bottleneck operation maps features of different channels into the same
    dimension. The purpose of this op is to share the features from both large
    and small models in the same LSTM cell.

    Args:
      inputs: 4D Tensor with shape [batch_size x width x height x input_size].
      state: 4D Tensor with shape [batch_size x width x height x state_size].
      input_index: integer index indicating which base features the inputs
        correspond to.

    Returns:
      inputs: pre-bottlenecked inputs.
    Raises:
      ValueError: If pre_bottleneck is not set or inputs is not rank 4.
    """
        # Sometimes state is a tuple, in which case it cannot be modified, e.g.
        # during training, tf.contrib.training.SequenceQueueingStateSaver
        # returns the state as a tuple. This should not be an issue since we
        # only need to modify state[1] during export, when state should be a
        # list.
        if not self._pre_bottleneck:
            raise ValueError(
                'Only applied when pre_bottleneck is set to true.')
        if len(inputs.shape) != 4:
            raise ValueError('Expect a rank 4 feature tensor.')
        if not self._flatten_state and len(state.shape) != 4:
            raise ValueError('Expect rank 4 state tensor.')
        if self._flatten_state and len(state.shape) != 2:
            raise ValueError(
                'Expect rank 2 state tensor when flatten_state is set.')

        with tf.name_scope(None):
            state = tf.identity(state,
                                name='raw_inputs/init_lstm_h_%d' %
                                (input_index + 1))
        if self._flatten_state:
            batch_size = inputs.shape[0]
            height = inputs.shape[1]
            width = inputs.shape[2]
            state = tf.reshape(state, [batch_size, height, width, -1])
        with tf.variable_scope('conv_lstm_cell', reuse=tf.AUTO_REUSE):
            state_split = tf.split(state, self._groups, axis=3)
            with tf.variable_scope('bottleneck_%d' % input_index):
                bottleneck_out = []
                for k in range(self._groups):
                    with tf.variable_scope('group_%d' % k):
                        bottleneck_out.append(
                            lstm_utils.quantizable_separable_conv2d(
                                lstm_utils.quantizable_concat(
                                    [inputs, state_split[k]],
                                    axis=3,
                                    is_training=self._is_training,
                                    is_quantized=self._is_quantized,
                                    scope='quantized_concat'),
                                # Integer division so num_outputs is an int.
                                self.output_size[-1] // self._groups,
                                self._filter_size,
                                is_quantized=self._is_quantized,
                                depth_multiplier=1,
                                activation_fn=tf.nn.relu6,
                                normalizer_fn=None,
                                scope='project'))
                inputs = lstm_utils.quantizable_concat(
                    bottleneck_out,
                    axis=3,
                    is_training=self._is_training,
                    is_quantized=self._is_quantized,
                    scope='bottleneck_out/quantized_concat')
            # For exporting inference graph, we only mark the first timestep.
            with tf.name_scope(None):
                inputs = tf.identity(inputs,
                                     name='raw_outputs/base_endpoint_%d' %
                                     (input_index + 1))
        return inputs
Example #6
    def _create_loss_optimizer(self):
        """
        Create the loss optimizer which will minimise the cost function

        The loss is composed of two terms:

        1. log likelihood. This is a measure of how likely the data are given the
           current posterior, i.e. how well the data fit the model using
           the inferred parameters.

        2. The latent loss. This is a measure of how closely the posterior fits the
           prior
        """
        # Generate a set of samples from the posterior [W x P x B]
        samples = self.post.sample(self.sample_size)

        #samples = tf.boolean_mask(samples, self.voxel_mask)
        #data = tf.boolean_mask(self.data_train, self.voxel_mask)

        # Part 1: Reconstruction loss
        #
        # This deals with how well the parameters replicate the data and is defined as the
        # log-likelihood of the data (given the parameters).
        #
        # This is calculated from the noise model, as it boils down to how likely the deviations
        # from the model prediction to the data are within the noise model (with its current
        # parameters)

        # Get the model prediction for the current set of parameters
        model_prediction = self._get_model_prediction(samples)

        # Unpack noise parameter - this is placed at the end of the list of parameters when
        # they are converted from internal (transformed) values to real values
        noise_samples = self.log_tf(tf.identity(tf.squeeze(self.model_samples[-1]), name="noise_samples"))

        # Note that we pass the total number of time points as we need to scale this term correctly
        # when the batch size is not the full data size
        model_prediction_voxels = self.data_model.vertices_to_voxels(model_prediction)
        noise_samples_voxels = self.data_model.vertices_to_voxels(noise_samples)
        reconstr_loss = self.noise.log_likelihood(self.data_train, model_prediction_voxels, noise_samples_voxels, self.nt_full)
        self.reconstr_loss = self.log_tf(tf.identity(reconstr_loss, name="reconstr_loss"))

        # Part 2: Latent loss
        #
        # This penalises parameters which are far from the prior
        # If both the prior and posterior are represented by an MVN we can calculate an analytic
        # expression for this cost. If not, we need to do it numerically using the posterior
        # sample obtained earlier. Note that the mean log pdf of the posterior based on sampling
        # from itself is just the distribution entropy so we allow it to be calculated without
        # sampling.
        if self.analytic_latent_loss:
            latent_loss = tf.identity(self.post.latent_loss(self.prior), name="latent_loss")
        else:
            latent_loss = tf.subtract(self.post.entropy(samples), self.prior.mean_log_pdf(samples), name="latent_loss")

        self.latent_loss = self.log_tf(latent_loss)

        # Voxelwise cost is the sum of the latent and reconstruction cost but we have the possibility
        # of gradually introducing the latent loss via the latent_weight variable. This is based on
        # the theory that you should let the model fit the data first and then allow the fit to
        # be perturbed by the priors.
        if self.latent_weight == 0:
            self.cost = tf.identity(self.reconstr_loss, name="cost")
        else:
            self.cost = tf.add(self.reconstr_loss, self.latent_weight * self.latent_loss, name="cost")

        # Combine the costs from each voxel and use a single Adam optimizer to
        # optimize the mean cost. Optimizing the total cost is also possible,
        # but that makes runs with different numbers of voxels harder to compare.
        self.mean_cost = tf.reduce_mean(self.cost, name="mean_cost")
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.optimize = self.optimizer.minimize(self.mean_cost, global_step=self.global_step)
Example #7
 def _add_subgroups_to_targets(self, features, targets):
     """Adds subgroup information to targets dictionary."""
     for sensitive_column_name in self.sensitive_column_names:
         targets[sensitive_column_name] = tf.reshape(
             tf.identity(features[sensitive_column_name]), [-1, 1])
     return targets
Example #8
 def insert_and_return_index():
     # The next free index is the current size of the lookup table.
     new_index = tf.cast(self._key_to_index.size(), tf.int32)
     with tf.control_dependencies([new_index]):
         add_key = self._key_to_index.insert(key, new_index)
     with tf.control_dependencies([add_key]):
         # tf.identity ties the returned tensor to the insert op, so the
         # key is guaranteed to be in the table before the index is used.
         return tf.identity(new_index)
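The same dependency idiom in isolation: a self-contained TF 1.x graph-mode
sketch (the names below are illustrative, not from the snippet above):

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    counter = tf.Variable(0, dtype=tf.int32)
    with tf.control_dependencies([counter.assign_add(1)]):
        # The read runs only after the assign_add, so `bumped` always
        # observes the incremented value.
        bumped = tf.identity(counter.read_value())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(bumped))  # 1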
Example #9
    def __init__(
        self,
        learning_rate,
        num_layers,
        size,
        size_layer,
        output_size,
        kernel_size=3,
        n_attn_heads=16,
        dropout=0.9,
    ):
        self.X = tf.placeholder(tf.float32, (None, None, size))
        self.Y = tf.placeholder(tf.float32, (None, output_size))

        encoder_embedded = tf.layers.dense(self.X, size_layer)

        # Keep a copy of the embedded inputs for the residual "memory" below.
        e = tf.identity(encoder_embedded)
        for i in range(num_layers):
            z = layer(
                encoder_embedded,
                encoder_block,
                kernel_size,
                size_layer * 2,
                encoder_embedded,
            )
            # Note: the `dropout` argument is used as the keep probability.
            z = tf.nn.dropout(z, keep_prob=dropout)
            encoder_embedded = z

        encoder_output, output_memory = z, z + e
        g = tf.identity(encoder_embedded)

        for i in range(num_layers):
            attn_res = h = layer(
                encoder_embedded,
                decoder_block,
                kernel_size,
                size_layer * 2,
                residual=tf.zeros_like(encoder_embedded),
            )
            C = []
            for j in range(n_attn_heads):
                h_ = tf.layers.dense(h, size_layer // n_attn_heads)
                g_ = tf.layers.dense(g, size_layer // n_attn_heads)
                zu_ = tf.layers.dense(encoder_output,
                                      size_layer // n_attn_heads)
                ze_ = tf.layers.dense(output_memory,
                                      size_layer // n_attn_heads)

                d = tf.layers.dense(h_, size_layer // n_attn_heads) + g_
                dz = tf.matmul(d, tf.transpose(zu_, [0, 2, 1]))
                a = tf.nn.softmax(dz)
                c_ = tf.matmul(a, ze_)
                C.append(c_)

            c = tf.concat(C, 2)
            h = tf.layers.dense(attn_res + c, size_layer)
            h = tf.nn.dropout(h, keep_prob=dropout)
            encoder_embedded = h

        encoder_embedded = tf.sigmoid(encoder_embedded[-1])
        self.logits = tf.layers.dense(encoder_embedded, output_size)
        self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost)
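A hypothetical training-step sketch, assuming the enclosing class is named
Model and that the helper functions it uses (layer, encoder_block,
decoder_block) from the same module are in scope; the data shapes are
illustrative:

    import numpy as np
    model = Model(learning_rate=1e-3, num_layers=1, size=8,
                  size_layer=128, output_size=1)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    batch_x = np.random.randn(1, 50, 8).astype(np.float32)  # [batch, time, size]
    batch_y = np.random.randn(1, 1).astype(np.float32)      # [batch, output_size]
    loss, _ = sess.run([model.cost, model.optimizer],
                       feed_dict={model.X: batch_x, model.Y: batch_y})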
Example #10
def deploy(config,
           model_fn,
           args=None,
           kwargs=None,
           optimizer=None,
           summarize_gradients=False):
    """Deploys a Slim-constructed model across multiple clones.

  The deployment options are specified by the config object and support
  deploying one or several clones on different GPUs and one or several replicas
  of such clones.

  The argument `model_fn` is called `config.num_clones` times to create the
  model clones as `model_fn(*args, **kwargs)`.

  The optional argument `optimizer` is an `Optimizer` object.  If not `None`,
  the deployed model is configured for training with that optimizer.

  If `config` specifies deployment on multiple replicas then the default
  TensorFlow device is set appropriately for each call to `model_fn` and for the
  slim variable creation functions: model and global variables will be created
  on the `ps` device, the clone operations will be on the `worker` device.

  Args:
    config: A `DeploymentConfig` object.
    model_fn: A callable. Called as `model_fn(*args, **kwargs)`
    args: Optional list of arguments to pass to `model_fn`.
    kwargs: Optional dict of keyword arguments to pass to `model_fn`.
    optimizer: Optional `Optimizer` object.  If passed the model is deployed
      for training with that optimizer.
    summarize_gradients: Whether or not to add summaries for the gradients.

  Returns:
    A `DeployedModel` namedtuple.

  """
    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Create Clones.
    clones = create_clones(config, model_fn, args, kwargs)
    first_clone = clones[0]

    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by model_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone.scope)

    train_op = None
    total_loss = None
    with tf.device(config.optimizer_device()):
        if optimizer:
            # Place the global step on the device storing the variables.
            with tf.device(config.variables_device()):
                global_step = slim.get_or_create_global_step()

            # Compute the gradients for the clones.
            total_loss, clones_gradients = optimize_clones(clones, optimizer)

            if clones_gradients:
                if summarize_gradients:
                    # Add summaries to the gradients.
                    summaries |= set(
                        _add_gradients_summaries(clones_gradients))

                # Create gradient updates.
                grad_updates = optimizer.apply_gradients(
                    clones_gradients, global_step=global_step)
                update_ops.append(grad_updates)

                update_op = tf.group(*update_ops)
                with tf.control_dependencies([update_op]):
                    train_op = tf.identity(total_loss, name='train_op')
        else:
            clones_losses = []
            regularization_losses = tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES)
            for clone in clones:
                with tf.name_scope(clone.scope):
                    clone_loss = _gather_clone_loss(clone, len(clones),
                                                    regularization_losses)
                    if clone_loss is not None:
                        clones_losses.append(clone_loss)
                    # Only use regularization_losses for the first clone
                    regularization_losses = None
            if clones_losses:
                total_loss = tf.add_n(clones_losses, name='total_loss')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone.scope))

        if total_loss is not None:
            # Add total_loss to summary.
            summaries.add(tf.summary.scalar('total_loss', total_loss))

        if summaries:
            # Merge all summaries together.
            summary_op = tf.summary.merge(list(summaries), name='summary_op')
        else:
            summary_op = None

    return DeployedModel(train_op, summary_op, total_loss, clones)
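A hedged sketch of the intended call pattern; DeploymentConfig comes from the
same model_deploy module, while load_data() and build_network() are
hypothetical helpers standing in for a real input pipeline and network:

    with tf.Graph().as_default():
        config = DeploymentConfig(num_clones=2)
        with tf.device(config.inputs_device()):
            images, labels = load_data()  # hypothetical

        def model_fn(images, labels):
            predictions = build_network(images)  # hypothetical
            tf.losses.log_loss(labels, predictions)

        optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
        model = deploy(config, model_fn, args=[images, labels],
                       optimizer=optimizer)
        # model.train_op runs one optimization step and returns total_loss.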
Example #11
def define_vggish_slim(features_tensor=None, training=False):
    """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is either a tensor passed in via the optional 'features_tensor'
  argument or a placeholder created below named 'vggish/input_features'. The
  input is expected to have dtype float32 and shape [batch_size, num_frames,
  num_bands] where batch_size is variable and num_frames and num_bands are
  constants, and [num_frames, num_bands] represents a log-mel-scale spectrogram
  patch covering num_bands frequency bands and num_frames time frames (where
  each frame step is usually 10ms). This is produced by computing the stabilized
  log(mel-spectrogram + params.LOG_OFFSET).  The output is a tensor named
  'vggish/embedding' which produces the pre-activation values of a 128-D
  embedding layer, which is usually the penultimate layer when used as part of a
  full model with a final classifier layer.

  Args:
    features_tensor: If not None, the tensor containing the input features.
      If None, a placeholder input is created.
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embedding'.
  """
    # Defaults:
    # - All weights are initialized to N(0, INIT_STDDEV).
    # - All biases are initialized to 0.
    # - All activations are ReLU.
    # - All convolutions are 3x3 with stride 1 and SAME padding.
    # - All max-pools are 2x2 with stride 2 and SAME padding.
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        weights_initializer=tf.truncated_normal_initializer(
                            stddev=params.INIT_STDDEV),
                        biases_initializer=tf.zeros_initializer(),
                        activation_fn=tf.nn.relu,
                        trainable=training), \
         slim.arg_scope([slim.conv2d],
                        kernel_size=[3, 3], stride=1, padding='SAME'), \
         slim.arg_scope([slim.max_pool2d],
                        kernel_size=[2, 2], stride=2, padding='SAME'), \
         tf.variable_scope('vggish'):
        # Input: a batch of 2-D log-mel-spectrogram patches.
        if features_tensor is None:
            features_tensor = tf.placeholder(tf.float32,
                                             shape=(None, params.NUM_FRAMES,
                                                    params.NUM_BANDS),
                                             name='input_features')
        # Reshape to 4-D so that we can convolve a batch with conv2d().
        net = tf.reshape(features_tensor,
                         [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

        # The VGG stack of alternating convolutions and max-pools.
        net = slim.conv2d(net, 64, scope='conv1')
        net = slim.max_pool2d(net, scope='pool1')
        net = slim.conv2d(net, 128, scope='conv2')
        net = slim.max_pool2d(net, scope='pool2')
        net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
        net = slim.max_pool2d(net, scope='pool3')
        net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
        net = slim.max_pool2d(net, scope='pool4')

        # Flatten before entering fully-connected layers
        net = slim.flatten(net)
        net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
        # The embedding layer.
        net = slim.fully_connected(net,
                                   params.EMBEDDING_SIZE,
                                   scope='fc2',
                                   activation_fn=None)
        return tf.identity(net, name='embedding')
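A minimal inference sketch (in practice the VGGish checkpoint would be restored
before running; here random initialization just exercises the graph):

    import numpy as np
    with tf.Graph().as_default(), tf.Session() as sess:
        embedding_tensor = define_vggish_slim(training=False)
        sess.run(tf.global_variables_initializer())
        features = sess.graph.get_tensor_by_name('vggish/input_features:0')
        batch = np.random.rand(1, params.NUM_FRAMES,
                               params.NUM_BANDS).astype(np.float32)
        emb = sess.run(embedding_tensor, feed_dict={features: batch})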
Example #12
 def sample_z_e(self):
     """Sample from the distribution of probability of the latent embeddings."""
     sample_z_e = tf.identity(self.z_e, name="z_e")
     return sample_z_e
Example #13
    def call(self, inputs, training=True, survival_prob=None):
        """Implementation of call().

    Args:
      inputs: the inputs tensor.
      training: boolean, whether the model is constructed for training.
      survival_prob: float, between 0 and 1, the survival probability for drop
        connect.

    Returns:
      An output tensor.
    """
        logging.info('Block %s input shape: %s', self.name, inputs.shape)
        x = inputs

        fused_conv_fn = self._fused_conv
        expand_conv_fn = self._expand_conv
        depthwise_conv_fn = self._depthwise_conv
        project_conv_fn = self._project_conv

        if self._block_args.condconv:
            pooled_inputs = self._avg_pooling(inputs)
            routing_weights = self._routing_fn(pooled_inputs)
            # Capture routing weights as additional input to CondConv layers
            fused_conv_fn = functools.partial(self._fused_conv,
                                              routing_weights=routing_weights)
            expand_conv_fn = functools.partial(self._expand_conv,
                                               routing_weights=routing_weights)
            depthwise_conv_fn = functools.partial(
                self._depthwise_conv, routing_weights=routing_weights)
            project_conv_fn = functools.partial(
                self._project_conv, routing_weights=routing_weights)

        # Space-to-depth block: a convolution with a 2x2 kernel.
        if self._block_args.space2depth == 1:
            with tf.variable_scope('space2depth'):
                x = self._relu_fn(
                    self._bnsp(self._space2depth(x), training=training))
            logging.info('Block start with space2depth shape: %s', x.shape)

        if self._block_args.fused_conv:
            # If use fused mbconv, skip expansion and use regular conv.
            x = self._relu_fn(self._bn1(fused_conv_fn(x), training=training))
            logging.info('Conv2D shape: %s', x.shape)
        else:
            # Otherwise, first apply expansion and then apply depthwise conv.
            if self._block_args.expand_ratio != 1:
                x = self._relu_fn(
                    self._bn0(expand_conv_fn(x), training=training))
                logging.info('Expand shape: %s', x.shape)

            x = self._relu_fn(
                self._bn1(depthwise_conv_fn(x), training=training))
            logging.info('DWConv shape: %s', x.shape)

        if self._has_se:
            with tf.variable_scope('se'):
                x = self._call_se(x)

        self.endpoints = {'expansion_output': x}

        x = self._bn2(project_conv_fn(x), training=training)
        # Add identity so that quantization-aware training can insert quantization
        # ops correctly.
        x = tf.identity(x)
        if self._clip_projection_output:
            x = tf.clip_by_value(x, -6, 6)
        if self._block_args.id_skip:
            if all(s == 1
                   for s in self._block_args.strides) and inputs.get_shape(
                   ).as_list()[-1] == x.get_shape().as_list()[-1]:
                # Apply only if the skip connection is present.
                if survival_prob:
                    x = utils.drop_connect(x, training, survival_prob)
                x = tf.add(x, inputs)
        logging.info('Project shape: %s', x.shape)
        return x
Example #14
            return a + extra_losses
        else:
            return a + l

    (_, final_state), training_loss = learning_process.fold_learning_process(
        unroll_n_steps,
        loss_and_next_state_fn,
        accumulate_fn=accumulate_fn,
        start_state=current_state,
        accumulator_start_state=tf.constant(0., dtype=tf.float32),
    )

    meta_loss = (training_loss) / (tf.to_float(unroll_n_steps) *
                                   tf.to_float(extra_loss_eval))

    return tf.identity(meta_loss), nest.map_structure(tf.identity, final_state)


def make_push_op(learner, ds, failed_push, should_push, to_push, final_state,
                 pre_step_index):
    """Helper that make the op that pushes gradients, and assigns next state."""
    # This is what pushes gradient tensors to a shared location.
    push = lambda: ds.push_tensors(to_push, pre_step_index)

    def fail_push():
        pop = tf.Print(failed_push, [failed_push], "Failed to push")
        return tf.group(failed_push.assign_add(1), pop, name="fail_push")

    push_gradients_op = tf.cond(should_push, push, fail_push)

    pre_assign = tf.group(push_gradients_op, name="push_gradient_op")
Example #15
 def preprocess(self, resized_inputs):
     return tf.identity(resized_inputs)
Example #16
def build_all_reduce_iterations(all_device_tensors, tower_devices,
                                variable_mgr, num_iters):
    """Builds the all-reduce ops for multiple iterations to aggregate tensors.

  The tensors in `all_device_tensors` are aggregated `num_iters` times. Each
  iteration aggregates the results from the previous iteration. The iterations
  are run sequentially, so the aggregations for an iteration do not start
  running until the previous iteration has completed. Each iteration after the
  first is aggregating already-aggregated values, but it does not matter because
  we are only aggregating for benchmarking purposes.

  Args:
    all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
      a tensor, where t is the tower the tensor is on and i is the index of
      the tensor.
    tower_devices: A list of device strings. tower_devices[t] is the device
      of the tensors in all_device_tensors[t].
    variable_mgr: The VariableMgr to perform the all-reduce.
    num_iters: Number of iterations to aggregate tensors for.
  Returns:
    An op that when run, causes the all-reduce ops to run.
  """
    for i in range(num_iters):
        with tf.name_scope('iteration_%d' % i):
            # Step 1: Do the aggregation.
            with tf.name_scope('tensor_aggregation'):
                all_device_tensors = all_reduce(all_device_tensors,
                                                variable_mgr)

            # Step 2. Create identity ops, to bring the aggregated results back to
            # each device.
            new_all_device_tensors = []
            for device, device_tensors in zip(tower_devices,
                                              all_device_tensors):
                with tf.device(device):
                    new_all_device_tensors.append([
                        tf.identity(t, name='identity_after_allreduce')
                        for t in device_tensors
                    ])
            all_device_tensors = new_all_device_tensors

            # Step 3. Add control dependencies to delay the next iteration until this
            # iteration is complete. To avoid extra overhead, we do not have any
            # cross-device control dependencies, which means it's possible for two
            # iterations to slightly overlap.
            new_all_device_tensors = []
            for device_tensors in all_device_tensors:
                new_all_device_tensors.append([
                    control_flow_ops.with_dependencies(
                        device_tensors, t, name='identity_after_dependencies')
                    for t in device_tensors
                ])
            all_device_tensors = new_all_device_tensors

    # To prevent the dependency optimizer from removing every op we created,
    # we store the results in variables.
    ops_to_run = []
    for device, device_tensors in zip(tower_devices, all_device_tensors):
        with tf.device(device):
            for t in device_tensors:
                # The placeholder initial value is never run.
                var = tf.Variable(tf.placeholder(tf.float32, t.shape),
                                  collections=[])
                ops_to_run.append(var.assign(t))
    return tf.group(*ops_to_run)
Example #17
def create_id3_embedding(videos):
    """Embeds the given videos using the Inflated 3D Convolution network.

  Downloads the graph of the I3D from tf.hub and adds it to the graph on the
  first call.

  Args:
    videos: <float32>[batch_size, num_frames, height=224, width=224, depth=3].
      Expected range is [-1, 1].

  Returns:
    embedding: <float32>[batch_size, embedding_size]. embedding_size depends
               on the model used.

  Raises:
    ValueError: when a provided embedding_layer is not supported.
  """

    batch_size = 16
    module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1"

    # Making sure that we import the graph separately for
    # each different input video tensor.
    module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str(
        videos.name).replace(":", "_")

    assert_ops = [
        tf.Assert(
            tf.reduce_max(videos) <= 1.001,
            ["max value in frame is > 1", videos]),
        tf.Assert(
            tf.reduce_min(videos) >= -1.001,
            ["min value in frame is < -1", videos]),
        tf.assert_equal(tf.shape(videos)[0],
                        batch_size,
                        ["invalid frame batch size: ",
                         tf.shape(videos)],
                        summarize=6),
    ]
    with tf.control_dependencies(assert_ops):
        videos = tf.identity(videos)

    module_scope = "%s_apply_default/" % module_name

    # To check whether the module has already been loaded into the graph, we look
    # for a given tensor name. If this tensor name exists, we assume the function
    # has been called before and the graph was imported. Otherwise we import it.
    # Note: in theory, the tensor could exist but have the wrong shape.
    # This will happen if create_id3_embedding is called with a frames_placeholder
    # of the wrong size/batch size, because even though that will throw a tf.Assert
    # at graph-execution time, it will insert the tensor (with the wrong shape) into
    # the graph. This is why we need the following assert.
    video_batch_size = int(videos.shape[0])
    assert video_batch_size in [batch_size, -1, None], "Invalid batch size"
    tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
    if not _is_in_graph(tensor_name):
        i3d_model = hub.Module(module_spec, name=module_name)
        i3d_model(videos)

    # gets the kinetics-i3d-400-logits layer
    tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
    tensor = tf.get_default_graph().get_tensor_by_name(tensor_name)
    return tensor
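A hedged usage sketch. The batch size must be exactly 16 to satisfy the assert
above; the frame count here is an illustrative choice:

    videos = tf.placeholder(tf.float32, [16, 15, 224, 224, 3])  # range [-1, 1]
    embedding = create_id3_embedding(videos)  # [16, embedding_size]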
Example #18
  def model(inputs, is_training):
    """Creation of the model graph."""
    inputs = conv2d_fixed_padding(
        inputs=inputs, filters=64, kernel_size=7, strides=2,
        data_format=data_format)
    inputs = tf.identity(inputs, 'initial_conv')
    if not pre_activation:
      inputs = norm_activation(inputs, is_training, data_format=data_format,
                               layer=norm_act_layer)

    inputs = tf.layers.max_pooling2d(
        inputs=inputs, pool_size=3, strides=2, padding='SAME',
        data_format=data_format)
    inputs = tf.identity(inputs, 'initial_max_pool')

    custom_block_group = functools.partial(
        block_group,
        data_format=data_format,
        dropblock_size=dropblock_size,
        pre_activation=pre_activation,
        norm_act_layer=norm_act_layer)

    inputs = custom_block_group(
        inputs=inputs, filters=64, block_fn=block_fn, blocks=layers[0],
        strides=1, is_training=is_training, name='block_group1',
        dropblock_keep_prob=dropblock_keep_probs[0])
    inputs = custom_block_group(
        inputs=inputs, filters=128, block_fn=block_fn, blocks=layers[1],
        strides=2, is_training=is_training, name='block_group2',
        dropblock_keep_prob=dropblock_keep_probs[1])
    inputs = custom_block_group(
        inputs=inputs, filters=256, block_fn=block_fn, blocks=layers[2],
        strides=2, is_training=is_training, name='block_group3',
        dropblock_keep_prob=dropblock_keep_probs[2])
    inputs = custom_block_group(
        inputs=inputs, filters=512, block_fn=block_fn, blocks=layers[3],
        strides=2, is_training=is_training, name='block_group4',
        dropblock_keep_prob=dropblock_keep_probs[3])

    if pre_activation:
      inputs = norm_activation(inputs, is_training, data_format=data_format,
                               layer=norm_act_layer)

    # The activation is 7x7 so this is a global average pool.
    # TODO(huangyp): reduce_mean will be faster.
    if data_format == 'channels_last':
      pool_size = (inputs.shape[1], inputs.shape[2])
    else:
      pool_size = (inputs.shape[2], inputs.shape[3])
    inputs = tf.layers.average_pooling2d(
        inputs=inputs, pool_size=pool_size, strides=1, padding='VALID',
        data_format=data_format)
    inputs = tf.identity(inputs, 'final_avg_pool')
    inputs = tf.reshape(
        inputs, [-1, 2048 if block_fn is bottleneck_block else 512])
    inputs = tf.layers.dense(
        inputs=inputs,
        units=num_classes,
        kernel_initializer=tf.random_normal_initializer(stddev=.01))
    inputs = tf.identity(inputs, 'final_dense')
    return inputs
Example #19
plt.scatter(x_data[:, 0], x_data[:, 1])
plt.show()

# Make sure the shape and data are OK

print(x_data, "\nx_data shpae: ", x_data.shape)
print(y_data, "\ny_data shape: ", y_data.shape)

# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 3], name="input")
Y = tf.placeholder(tf.float32, shape=[None, 1], name="output")
W = tf.Variable(tf.random_normal([3, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

hypothesis = tf.matmul(X, W) + b
tf.identity(hypothesis, "hypothesis")

# Simplified cost/loss function: mean squared error between hypothesis and Y
cost = tf.reduce_mean(tf.square(hypothesis - Y))

# Minimize
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.000000129)
train = optimizer.minimize(cost)

# Launch the graph in a session.
sess = tf.Session()
# Initialize global variables in the graph.
sess.run(tf.global_variables_initializer())

for step in range(100000):
    # Training loop body (assumed): run one optimization step and
    # periodically log the cost.
    cost_val, _ = sess.run([cost, train], feed_dict={X: x_data, Y: y_data})
    if step % 10000 == 0:
        print(step, "Cost:", cost_val)
Example #20
def identity(x):
  return tf.identity(x)
Example #21
def create_train_op(total_loss,
                    optimizer,
                    global_step=_USE_GLOBAL_STEP,
                    update_ops=None,
                    variables_to_train=None,
                    transform_grads_fn=None,
                    gate_gradients=tf.train.Optimizer.GATE_OP,
                    aggregation_method=None,
                    colocate_gradients_with_ops=False,
                    check_numerics=True):
  """Creates an `Operation` that evaluates the gradients and returns the loss.

  Args:
    total_loss: A `Tensor` representing the total loss.
    optimizer: A tf.Optimizer to use for computing the gradients.
    global_step: A `Tensor` representing the global step variable. If left as
      `_USE_GLOBAL_STEP`, then tf.train.get_or_create_global_step() is used.
    update_ops: An optional list of updates to execute. If `update_ops` is
      `None`, then the update ops are set to the contents of the
      `tf.GraphKeys.UPDATE_OPS` collection. If `update_ops` is not `None`, but
      it doesn't contain all of the update ops in `tf.GraphKeys.UPDATE_OPS`, a
      warning will be displayed.
    variables_to_train: an optional list of variables to train. If None, it will
      default to all tf.compat.v1.trainable_variables().
    transform_grads_fn: A function which takes a single argument, a list of
      gradient to variable pairs (tuples), performs any requested gradient
      updates, such as gradient clipping or multipliers, and returns the updated
      list.
    gate_gradients: How to gate the computation of gradients. See tf.Optimizer.
    aggregation_method: Specifies the method used to combine gradient terms.
      Valid values are defined in the class `AggregationMethod`.
    colocate_gradients_with_ops: Whether or not to try colocating the gradients
      with the ops that generated them.
    check_numerics: Whether or not we apply check_numerics.

  Returns:
    A `Tensor` that when evaluated, computes the gradients and returns the total
      loss value.
  """
  if global_step is _USE_GLOBAL_STEP:  # pylint: disable=g-int-id-comparison
    # global_step can be None when passed into the optimizer in case we do not
    # want apply_gradients to factor that in. This is different from default
    # behaviour where we use the standard global step.
    global_step = tf.train.get_or_create_global_step()

  # Update ops use GraphKeys.UPDATE_OPS collection if update_ops is None.
  global_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
  if update_ops is None:
    update_ops = global_update_ops
  else:
    update_ops = set(update_ops)
  if not global_update_ops.issubset(update_ops):
    tf.logging.warning('update_ops in create_train_op does not contain all the '
                       'update_ops in GraphKeys.UPDATE_OPS')

  # Make sure update_ops are computed before total_loss.
  if update_ops:
    with tf.control_dependencies(update_ops):
      barrier = tf.no_op(name='update_barrier')
    with tf.control_dependencies([barrier]):
      total_loss = tf.identity(total_loss)

  if variables_to_train is None:
    # Default to tf.compat.v1.trainable_variables()
    variables_to_train = tf.trainable_variables()
  else:
    # Make sure that variables_to_train are in
    # tf.compat.v1.trainable_variables()
    for v in variables_to_train:
      assert v.trainable or v in tf.trainable_variables()

  assert variables_to_train

  # Create the gradients. Note that apply_gradients adds the gradient
  # computation to the current graph.
  grads = optimizer.compute_gradients(
      total_loss,
      variables_to_train,
      gate_gradients=gate_gradients,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops)

  if transform_grads_fn:
    grads = transform_grads_fn(grads)

  # Create gradient updates.
  grad_updates = optimizer.apply_gradients(grads, global_step=global_step)

  with tf.name_scope('train_op'):
    # Make sure total_loss is valid.
    if check_numerics:
      total_loss = tf.check_numerics(total_loss,
                                     'LossTensor is inf or nan')

    # Ensure the train_tensor computes grad_updates.
    with tf.control_dependencies([grad_updates]):
      train_op = tf.identity(total_loss)

  # Add the operation used for training to the 'train_op' collection
  train_ops = tf.get_collection_ref(tf.GraphKeys.TRAIN_OP)
  if train_op not in train_ops:
    train_ops.append(train_op)

  return train_op
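A short usage sketch; the loss construction is assumed, and any tf.train
optimizer works:

    loss = tf.losses.get_total_loss()
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
    train_op = create_train_op(loss, optimizer)
    with tf.train.MonitoredTrainingSession() as sess:
        while not sess.should_stop():
            loss_value = sess.run(train_op)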
Example #22
def main(args):

    network = importlib.import_module(args.model_def)

    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(
            log_dir):  # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(
            model_dir):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    # Write arguments to a text file
    facenet.write_arguments_to_file(args, os.path.join(log_dir,
                                                       'arguments.txt'))

    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    facenet.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)
    train_set = facenet.get_dataset(args.data_dir)

    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    if args.pretrained_model:
        print('Pre-trained model: %s' %
              os.path.expanduser(args.pretrained_model))

    if args.lfw_dir:
        print('LFW directory: %s' % args.lfw_dir)
        # Read the file containing the pairs used for testing
        pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
        # Get the paths for the corresponding images
        lfw_paths, actual_issame = lfw.get_paths(
            os.path.expanduser(args.lfw_dir), pairs)

    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        global_step = tf.Variable(0, trainable=False)

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32,
                                                   name='learning_rate')

        batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')

        phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')

        image_paths_placeholder = tf.placeholder(tf.string,
                                                 shape=(None, 3),
                                                 name='image_paths')
        labels_placeholder = tf.placeholder(tf.int64,
                                            shape=(None, 3),
                                            name='labels')

        input_queue = data_flow_ops.FIFOQueue(capacity=100000,
                                              dtypes=[tf.string, tf.int64],
                                              shapes=[(3, ), (3, )],
                                              shared_name=None,
                                              name=None)
        enqueue_op = input_queue.enqueue_many(
            [image_paths_placeholder, labels_placeholder])

        nrof_preprocess_threads = 4
        images_and_labels = []
        for _ in range(nrof_preprocess_threads):
            filenames, label = input_queue.dequeue()
            images = []
            for filename in tf.unstack(filenames):
                file_contents = tf.read_file(filename)
                image = tf.image.decode_image(file_contents, channels=3)
                image = tf.to_float(image)
                if args.random_crop:
                    image = tf.random_crop(
                        image, [args.image_size, args.image_size, 3])
                else:
                    image = tf.image.resize_image_with_crop_or_pad(
                        image, args.image_size, args.image_size)
                if args.random_flip:
                    image = tf.image.random_flip_left_right(image)

                #pylint: disable=no-member
                image.set_shape((args.image_size, args.image_size, 3))
                images.append(tf.image.per_image_standardization(image))
            images_and_labels.append([images, label])

        image_batch, labels_batch = tf.train.batch_join(
            images_and_labels,
            batch_size=batch_size_placeholder,
            shapes=[(args.image_size, args.image_size, 3), ()],
            enqueue_many=True,
            capacity=4 * nrof_preprocess_threads * args.batch_size,
            allow_smaller_final_batch=True)
        image_batch = tf.identity(image_batch, 'image_batch')
        image_batch = tf.identity(image_batch, 'input')
        labels_batch = tf.identity(labels_batch, 'label_batch')

        # Build the inference graph
        prelogits, _ = network.inference(
            image_batch,
            args.keep_probability,
            phase_train=phase_train_placeholder,
            bottleneck_layer_size=args.embedding_size,
            weight_decay=args.weight_decay)

        embeddings = tf.nn.l2_normalize(prelogits, 1, 1e-10, name='embeddings')
        # Split embeddings into anchor, positive and negative and calculate triplet loss
        anchor, positive, negative = tf.unstack(
            tf.reshape(embeddings, [-1, 3, args.embedding_size]), 3, 1)
        triplet_loss = facenet.triplet_loss(anchor, positive, negative,
                                            args.alpha)

        learning_rate = tf.train.exponential_decay(
            learning_rate_placeholder,
            global_step,
            args.learning_rate_decay_epochs * args.epoch_size,
            args.learning_rate_decay_factor,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        # Calculate the total losses
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n([triplet_loss] + regularization_losses,
                              name='total_loss')

        # Build a Graph that trains the model with one batch of examples and updates the model parameters
        train_op = facenet.train(total_loss, global_step, args.optimizer,
                                 learning_rate, args.moving_average_decay,
                                 tf.global_variables())

        # Create a saver
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=3)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        # Initialize variables
        sess.run(tf.global_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})
        sess.run(tf.local_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)

        with sess.as_default():

            if args.pretrained_model:
                print('Restoring pretrained model: %s' % args.pretrained_model)
                saver.restore(sess, os.path.expanduser(args.pretrained_model))

            # Training and validation loop
            epoch = 0
            while epoch < args.max_nrof_epochs:
                step = sess.run(global_step, feed_dict=None)
                epoch = step // args.epoch_size
                # Train for one epoch
                train(args, sess, train_set, epoch, image_paths_placeholder,
                      labels_placeholder, labels_batch, batch_size_placeholder,
                      learning_rate_placeholder, phase_train_placeholder,
                      enqueue_op, input_queue, global_step, embeddings,
                      total_loss, train_op, summary_op, summary_writer,
                      args.learning_rate_schedule_file, args.embedding_size,
                      anchor, positive, negative, triplet_loss)

                # Save variables and the metagraph if it doesn't exist already
                save_variables_and_metagraph(sess, saver, summary_writer,
                                             model_dir, subdir, step)

                # Evaluate on LFW
                if args.lfw_dir:
                    evaluate(sess, lfw_paths, embeddings, labels_batch,
                             image_paths_placeholder, labels_placeholder,
                             batch_size_placeholder, learning_rate_placeholder,
                             phase_train_placeholder, enqueue_op,
                             actual_issame, args.batch_size,
                             args.lfw_nrof_folds, log_dir, step,
                             summary_writer, args.embedding_size)

    return model_dir
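
For reference, a hedged sketch of the loss computed by facenet.triplet_loss above; the real implementation may differ in reduction details:

import tensorflow.compat.v1 as tf

def triplet_loss_sketch(anchor, positive, negative, alpha):
    # L = mean(max(0, ||a - p||^2 - ||a - n||^2 + alpha))
    pos_dist = tf.reduce_sum(tf.square(anchor - positive), axis=1)
    neg_dist = tf.reduce_sum(tf.square(anchor - negative), axis=1)
    return tf.reduce_mean(tf.maximum(pos_dist - neg_dist + alpha, 0.0))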
Example #23
    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM) with bottlenecking.

    Includes logic for quantization-aware training. Note that all concats and
    activations use fixed ranges unless stated otherwise.

    Args:
      inputs: Input tensor at the current timestep.
      state: Tuple of tensors, the state at the previous timestep.
      scope: Optional scope.

    Returns:
      A tuple where the first element is the LSTM output and the second is
      an LSTMStateTuple of the state at the current timestep.
    """
        scope = scope or 'conv_lstm_cell'
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            c, h = state

            # Set nodes to be under raw_inputs/ name scope for tfmini export.
            with tf.name_scope(None):
                c = tf.identity(c, name='raw_inputs/init_lstm_c')
                # When pre_bottleneck is enabled, the input h is handled in
                # rnn_decoder.py.
                if not self._pre_bottleneck:
                    h = tf.identity(h, name='raw_inputs/init_lstm_h')

            # unflatten state if necessary
            if self._flatten_state:
                c = tf.reshape(c, [-1] + self.output_size)
                h = tf.reshape(h, [-1] + self.output_size)

            c_list = tf.split(c, self._groups, axis=3)
            if self._pre_bottleneck:
                inputs_list = tf.split(inputs, self._groups, axis=3)
            else:
                h_list = tf.split(h, self._groups, axis=3)
            out_bottleneck = []
            out_c = []
            out_h = []
            # summary of input passed into cell
            if self._viz_gates:
                slim.summaries.add_histogram_summary(inputs, 'cell_input')

            for k in range(self._groups):
                if self._pre_bottleneck:
                    bottleneck = inputs_list[k]
                else:
                    if self._use_batch_norm:
                        b_x = lstm_utils.quantizable_separable_conv2d(
                            inputs,
                            self._num_units // self._groups,
                            self._filter_size,
                            is_quantized=self._is_quantized,
                            depth_multiplier=1,
                            activation_fn=None,
                            normalizer_fn=None,
                            scope='bottleneck_%d_x' % k)
                        b_h = lstm_utils.quantizable_separable_conv2d(
                            h_list[k],
                            self._num_units // self._groups,
                            self._filter_size,
                            is_quantized=self._is_quantized,
                            depth_multiplier=1,
                            activation_fn=None,
                            normalizer_fn=None,
                            scope='bottleneck_%d_h' % k)
                        b_x = slim.batch_norm(b_x,
                                              scale=True,
                                              is_training=self._is_training,
                                              scope='BatchNorm_%d_X' % k)
                        b_h = slim.batch_norm(b_h,
                                              scale=True,
                                              is_training=self._is_training,
                                              scope='BatchNorm_%d_H' % k)
                        bottleneck = b_x + b_h
                    else:
                        # All concats use fixed quantization ranges to prevent rescaling
                        # at inference. Both |inputs| and |h_list| are tensors resulting
                        # from Relu6 operations so we fix the ranges to [0, 6].
                        bottleneck_concat = lstm_utils.quantizable_concat(
                            [inputs, h_list[k]],
                            axis=3,
                            is_training=False,
                            is_quantized=self._is_quantized,
                            scope='bottleneck_%d/quantized_concat' % k)

                        bottleneck = lstm_utils.quantizable_separable_conv2d(
                            bottleneck_concat,
                            self._num_units // self._groups,
                            self._filter_size,
                            is_quantized=self._is_quantized,
                            depth_multiplier=1,
                            activation_fn=self._activation,
                            normalizer_fn=None,
                            scope='bottleneck_%d' % k)

                concat = lstm_utils.quantizable_separable_conv2d(
                    bottleneck,
                    4 * self._num_units // self._groups,
                    self._filter_size,
                    is_quantized=self._is_quantized,
                    depth_multiplier=1,
                    activation_fn=None,
                    normalizer_fn=None,
                    scope='concat_conv_%d' % k)

                # Since there is no activation in the previous separable conv, we
                # quantize here. A starting range of [-6, 6] is used because the
                # tensors are input to a Sigmoid function that saturates at these
                # ranges.
                concat = lstm_utils.quantize_op(
                    concat,
                    is_training=self._is_training,
                    default_min=-6,
                    default_max=6,
                    is_quantized=self._is_quantized,
                    scope='gates_%d/act_quant' % k)

                # i = input_gate, j = new_input, f = forget_gate, o = output_gate
                i, j, f, o = tf.split(concat, 4, 3)

                f_add = f + self._forget_bias
                f_add = lstm_utils.quantize_op(
                    f_add,
                    is_training=self._is_training,
                    default_min=-6,
                    default_max=6,
                    is_quantized=self._is_quantized,
                    scope='forget_gate_%d/add_quant' % k)
                f_act = tf.sigmoid(f_add)
                # The quantization range is fixed for the sigmoid to ensure that zero
                # is exactly representable.
                f_act = lstm_utils.fixed_quantize_op(
                    f_act,
                    fixed_min=0.0,
                    fixed_max=1.0,
                    is_quantized=self._is_quantized,
                    scope='forget_gate_%d/act_quant' % k)

                a = c_list[k] * f_act
                a = lstm_utils.quantize_op(a,
                                           is_training=self._is_training,
                                           is_quantized=self._is_quantized,
                                           scope='forget_gate_%d/mul_quant' %
                                           k)

                i_act = tf.sigmoid(i)
                # The quantization range is fixed for the sigmoid to ensure that zero
                # is exactly representable.
                i_act = lstm_utils.fixed_quantize_op(
                    i_act,
                    fixed_min=0.0,
                    fixed_max=1.0,
                    is_quantized=self._is_quantized,
                    scope='input_gate_%d/act_quant' % k)

                j_act = self._activation(j)
                # The quantization range is fixed for the relu6 to ensure that zero
                # is exactly representable.
                j_act = lstm_utils.fixed_quantize_op(
                    j_act,
                    fixed_min=0.0,
                    fixed_max=6.0,
                    is_quantized=self._is_quantized,
                    scope='new_input_%d/act_quant' % k)

                b = i_act * j_act
                b = lstm_utils.quantize_op(b,
                                           is_training=self._is_training,
                                           is_quantized=self._is_quantized,
                                           scope='input_gate_%d/mul_quant' % k)

                new_c = a + b
                # The quantization range is fixed to [0, 6] due to an optimization in
                # TFLite. The order of operations is as follows:
                #     Add -> FakeQuant -> Relu6 -> FakeQuant -> Concat.
                # The fakequant ranges to the concat must be fixed to ensure all inputs
                # to the concat have the same range, removing the need for rescaling.
                # The quantization ranges input to the relu6 are propagated to its
                # output. Any mismatch between these two ranges will cause an error.
                new_c = lstm_utils.fixed_quantize_op(
                    new_c,
                    fixed_min=0.0,
                    fixed_max=6.0,
                    is_quantized=self._is_quantized,
                    scope='new_c_%d/add_quant' % k)

                if not self._is_quantized:
                    if self._scale_state:
                        normalizer = tf.maximum(
                            1.0,
                            tf.reduce_max(new_c, axis=(1, 2, 3)) / 6)
                        new_c /= tf.reshape(normalizer,
                                            [tf.shape(new_c)[0], 1, 1, 1])
                    elif self._clip_state:
                        new_c = tf.clip_by_value(new_c, -6, 6)

                new_c_act = self._activation(new_c)
                # The quantization range is fixed for the relu6 to ensure that zero
                # is exactly representable.
                new_c_act = lstm_utils.fixed_quantize_op(
                    new_c_act,
                    fixed_min=0.0,
                    fixed_max=6.0,
                    is_quantized=self._is_quantized,
                    scope='new_c_%d/act_quant' % k)

                o_act = tf.sigmoid(o)
                # The quantization range is fixed for the sigmoid to ensure that zero
                # is exactly representable.
                o_act = lstm_utils.fixed_quantize_op(
                    o_act,
                    fixed_min=0.0,
                    fixed_max=1.0,
                    is_quantized=self._is_quantized,
                    scope='output_%d/act_quant' % k)

                new_h = new_c_act * o_act
                # The quantization range is fixed since it is input to a concat.
                # A range of [0, 6] is used since |new_h| is a product of ranges [0, 6]
                # and [0, 1].
                new_h_act = lstm_utils.fixed_quantize_op(
                    new_h,
                    fixed_min=0.0,
                    fixed_max=6.0,
                    is_quantized=self._is_quantized,
                    scope='new_h_%d/act_quant' % k)

                out_bottleneck.append(bottleneck)
                out_c.append(new_c_act)
                out_h.append(new_h_act)

            # Since all inputs to the below concats are already quantized, we can use
            # a regular concat operation.
            new_c = tf.concat(out_c, axis=3)
            new_h = tf.concat(out_h, axis=3)

            # |bottleneck| is input to a concat with |new_h|. We must use
            # quantizable_concat() with a fixed range that matches |new_h|.
            bottleneck = lstm_utils.quantizable_concat(
                out_bottleneck,
                axis=3,
                is_training=False,
                is_quantized=self._is_quantized,
                scope='out_bottleneck/quantized_concat')

            # summary of cell output and new state
            if self._viz_gates:
                slim.summaries.add_histogram_summary(new_h, 'cell_output')
                slim.summaries.add_histogram_summary(new_c, 'cell_state')

            output = new_h
            if self._output_bottleneck:
                output = lstm_utils.quantizable_concat(
                    [new_h, bottleneck],
                    axis=3,
                    is_training=False,
                    is_quantized=self._is_quantized,
                    scope='new_output/quantized_concat')

            # reflatten state to store it
            if self._flatten_state:
                new_c = tf.reshape(new_c, [-1, self._param_count],
                                   name='lstm_c')
                new_h = tf.reshape(new_h, [-1, self._param_count],
                                   name='lstm_h')

            # Set nodes to be under raw_outputs/ name scope for tfmini export.
            with tf.name_scope(None):
                new_c = tf.identity(new_c, name='raw_outputs/lstm_c')
                new_h = tf.identity(new_h, name='raw_outputs/lstm_h')
            states_and_output = contrib_rnn.LSTMStateTuple(new_c, new_h)

            return output, states_and_output
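
A hedged sketch of what the fixed-range quantization helpers used above presumably reduce to: when quantization is enabled, a FakeQuant node with a hard-coded range, so every input to a downstream concat shares one range. The actual lstm_utils implementation may differ.

import tensorflow.compat.v1 as tf

def fixed_quantize_sketch(t, fixed_min, fixed_max, is_quantized, scope=None):
    if not is_quantized:
        return t  # identity when quantization-aware training is off
    with tf.name_scope(scope, 'fixed_quant'):
        return tf.quantization.fake_quant_with_min_max_args(
            t, min=fixed_min, max=fixed_max, num_bits=8)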
Example #24
def make_estimator(mode,
                   loss=None,
                   eval_metrics=None,
                   predictions=None,
                   predict_fn=None,
                   predict_input=None,
                   train_scalar_summaries=None,
                   polyak_averaging=False):
    """Returns an EstimatorSpec (maybe TPU) for all modes."""

    # Always use TPUEstimator; even without a TPU it's (almost) a no-op.
    spec_type = TPUEstimatorSpec

    if mode == tf.estimator.ModeKeys.PREDICT:
        # For backwards-compatibility, still accept `predictions`.
        if predictions is None:
            # What we do here is create the hub module and use its predictions.
            assert predict_fn is not None, 'Need to pass `predict_fn` arg.'
            assert predict_input is not None, 'Need to pass `predict_input` arg.'
            tf_hub_module = make_hub_predictor(predict_fn)
            predictions = tf_hub_module(predict_input)

        if polyak_averaging:
            with eval_ema_scope():
                # Use `tf.identity` to ensure that the dependencies are executed first.
                # (Otherwise, since loss is constructed outside of this function, the
                # `eval_ema_scope` scope would do nothing)
                predictions = tf.identity(predictions)

        return spec_type(mode=mode, predictions=predictions)

    if mode == tf.estimator.ModeKeys.EVAL:
        if polyak_averaging:
            with eval_ema_scope():
                # Use `tf.identity` to ensure that the dependencies are executed first.
                # (Otherwise, since loss is constructed outside of this function, the
                # `eval_ema_scope()` scope would do nothing)
                loss = tf.identity(loss)

                # `eval_metrics` is an ordered pair of a lambda, and a list of tensors
                # that are evaluated and fed into the lambda. Do "surgery" to wrap only
                # the tensors into `tf.identity` (see comment above)
                eval_metrics = (
                    eval_metrics[0],
                    [tf.identity(x) for x in eval_metrics[1]],
                )

        return spec_type(mode=mode, loss=loss, eval_metrics=eval_metrics)

    if mode == tf.estimator.ModeKeys.TRAIN:
        assert loss is not None, 'Need to pass `loss` arg.'
        trainer = Trainer(update_batchnorm_params=True)

        if polyak_averaging:
            # Set EMA half-life to one epoch
            ema_decay = 0.5**(1.0 / trainer.steps_per_epoch)
            ema = tf.train.ExponentialMovingAverage(ema_decay,
                                                    zero_debias=True,
                                                    name='EMA')

        if FLAGS.use_summaries:
            # Need to reshape with a fake batch for summaries on TPU host.
            # Also need to explicitly note which tensors are used, and pass
            # them in explicitly.
            summary_names = ['lr', 'loss']
            summary_reshaped_tensors = [
                tf.reshape(trainer.lr, [1]),
                tf.reshape(loss, [1])
            ]

            if train_scalar_summaries is not None:
                for name, summary_tensor in train_scalar_summaries.items():
                    summary_names.append(name)
                    summary_reshaped_tensors.append(
                        tf.reshape(summary_tensor, [1]))

            def host_call_fn(gs, *summary_tensors):
                gs = gs[0]
                with contrib_summary.create_file_writer(
                        FLAGS.workdir).as_default():
                    with contrib_summary.always_record_summaries():
                        for name, reshaped_tensor in zip(
                                summary_names, summary_tensors):
                            contrib_summary.scalar(
                                name, tf.reduce_mean(reshaped_tensor), step=gs)
                        return contrib_summary.all_summary_ops()

            gs_t = tf.reshape(tf.train.get_global_step(), [1])
            host_call = (host_call_fn, [gs_t] + summary_reshaped_tensors)
        else:
            host_call = None

        train_op = trainer.get_train_op(loss,
                                        use_tpu=FLAGS.tpu_name is not None)
        if polyak_averaging:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(tf.trainable_variables())

        return spec_type(mode=mode,
                         loss=loss,
                         train_op=train_op,
                         host_call=host_call)

    raise ValueError('Unsupported mode %s' % mode)
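
A hedged wiring sketch: inside an Estimator model_fn, the helper above would be called once per mode. build_loss, metric_fn and predict_fn are hypothetical stand-ins for model-specific pieces.

def model_fn(features, labels, mode, params):
    loss = build_loss(features, labels)  # hypothetical helper
    if mode == tf.estimator.ModeKeys.TRAIN:
        return make_estimator(mode, loss=loss, polyak_averaging=True)
    if mode == tf.estimator.ModeKeys.EVAL:
        # `eval_metrics` is the (fn, tensors) pair documented above.
        return make_estimator(mode, loss=loss,
                              eval_metrics=(metric_fn, [loss]))
    return make_estimator(mode, predict_fn=predict_fn,
                          predict_input=features['image'])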
Example #25
def expanded_conv(input_tensor,
                  num_outputs,
                  expansion_size=expand_input_by_factor(6),
                  stride=1,
                  rate=1,
                  kernel_size=(3, 3),
                  residual=True,
                  normalizer_fn=None,
                  split_projection=1,
                  split_expansion=1,
                  split_divisible_by=8,
                  expansion_transform=None,
                  depthwise_location='expansion',
                  depthwise_channel_multiplier=1,
                  endpoints=None,
                  use_explicit_padding=False,
                  padding='SAME',
                  inner_activation_fn=None,
                  depthwise_activation_fn=None,
                  project_activation_fn=tf.identity,
                  depthwise_fn=slim.separable_conv2d,
                  expansion_fn=split_conv,
                  projection_fn=split_conv,
                  depthwise_params=None,
                  expansion_params=None,
                  projection_params=None,
                  scope=None):
    """Depthwise Convolution Block with expansion.

  Builds a composite convolution with the following structure:
  expansion (1x1) -> depthwise (kernel_size) -> projection (1x1).

  Args:
    input_tensor: input
    num_outputs: number of outputs in the final layer.
    expansion_size: the size of expansion, could be a constant or a callable.
      If the latter, it is given 'num_inputs' as an input. For forward
      compatibility it should accept arbitrary keyword arguments.
      Default will expand the input by a factor of 6.
    stride: depthwise stride
    rate: depthwise rate
    kernel_size: depthwise kernel
    residual: whether to include residual connection between input
      and output.
    normalizer_fn: batchnorm or otherwise
    split_projection: how many ways to split projection operator
      (that is conv expansion->bottleneck)
    split_expansion: how many ways to split expansion op
      (that is conv bottleneck->expansion) ops will keep depth divisible
      by this value.
    split_divisible_by: make sure every split group is divisible by this number.
    expansion_transform: Optional function that takes expansion
      as a single input and returns output.
    depthwise_location: where to put the depthwise convolution. Supported
      values: None, 'input', 'output', 'expansion'.
    depthwise_channel_multiplier: depthwise channel multiplier: each input
      channel is replicated (with different filters) that many times, so an
      input with c channels produces c * depthwise_channel_multiplier
      channels.
    endpoints: An optional dictionary into which intermediate endpoints are
      placed. The keys "expansion_output", "depthwise_output",
      "projection_output" and "expansion_transform" are always populated, even
      if the corresponding functions are not invoked.
    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
      inputs so that the output dimensions are the same as if 'SAME' padding
      were used.
    padding: Padding type to use if `use_explicit_padding` is not set.
    inner_activation_fn: activation function to use in all inner convolutions.
      If None, relies on slim default scopes.
    depthwise_activation_fn: activation function to use for depthwise only.
      If not provided, relies on slim default scopes. If both
      inner_activation_fn and depthwise_activation_fn are provided,
      depthwise_activation_fn takes precedence over inner_activation_fn.
    project_activation_fn: activation function for the project layer
      (note this layer is not affected by inner_activation_fn).
    depthwise_fn: Depthwise convolution function.
    expansion_fn: Expansion convolution function. If a custom function is
      given, "split_expansion" and "split_divisible_by" are ignored.
    projection_fn: Projection convolution function. If a custom function is
      given, "split_projection" and "split_divisible_by" are ignored.

    scope: optional scope.

  Returns:
    Tensor of depth num_outputs

  Raises:
    TypeError: on invalid input (an unknown `depthwise_location`, or
      `use_explicit_padding` used with padding other than 'SAME').
  """
    # The *_params dicts default to None; normalize them so the **-expansions
    # below do not fail.
    depthwise_params = depthwise_params or {}
    expansion_params = expansion_params or {}
    projection_params = projection_params or {}
    conv_defaults = {}
    dw_defaults = {}
    if inner_activation_fn is not None:
        conv_defaults['activation_fn'] = inner_activation_fn
        dw_defaults['activation_fn'] = inner_activation_fn
    if depthwise_activation_fn is not None:
        dw_defaults['activation_fn'] = depthwise_activation_fn
    # pylint: disable=g-backslash-continuation
    with tf.variable_scope(scope, default_name='expanded_conv') as s, \
         tf.name_scope(s.original_name_scope), \
        slim.arg_scope((slim.conv2d,), **conv_defaults), \
         slim.arg_scope((slim.separable_conv2d,), **dw_defaults):
        prev_depth = input_tensor.get_shape().as_list()[3]
        if depthwise_location not in [None, 'input', 'output', 'expansion']:
            raise TypeError('%r is unknown value for depthwise_location' %
                            depthwise_location)
        if use_explicit_padding:
            if padding != 'SAME':
                raise TypeError(
                    '`use_explicit_padding` should only be used with '
                    '"SAME" padding.')
            padding = 'VALID'
        depthwise_func = functools.partial(
            depthwise_fn,
            num_outputs=None,
            kernel_size=kernel_size,
            depth_multiplier=depthwise_channel_multiplier,
            stride=stride,
            rate=rate,
            normalizer_fn=normalizer_fn,
            padding=padding,
            scope='depthwise',
            **depthwise_params)
        # b1 -> b2 * r -> b2
        #   i -> (o * r) (bottleneck) -> o
        input_tensor = tf.identity(input_tensor, 'input')
        net = input_tensor

        if depthwise_location == 'input':
            if use_explicit_padding:
                net = _fixed_padding(net, kernel_size, rate)
            net = depthwise_func(net, activation_fn=None)
            net = tf.identity(net, name='depthwise_output')
            if endpoints is not None:
                endpoints['depthwise_output'] = net

        if callable(expansion_size):
            inner_size = expansion_size(num_inputs=prev_depth)
        else:
            inner_size = expansion_size

        if inner_size > net.shape[3]:
            if expansion_fn == split_conv:
                expansion_fn = functools.partial(
                    expansion_fn,
                    num_ways=split_expansion,
                    divisible_by=split_divisible_by,
                    stride=1)
            net = expansion_fn(net,
                               inner_size,
                               scope='expand',
                               normalizer_fn=normalizer_fn,
                               **expansion_params)
            net = tf.identity(net, 'expansion_output')
            if endpoints is not None:
                endpoints['expansion_output'] = net

        if depthwise_location == 'expansion':
            if use_explicit_padding:
                net = _fixed_padding(net, kernel_size, rate)
            net = depthwise_func(net)
            net = tf.identity(net, name='depthwise_output')
            if endpoints is not None:
                endpoints['depthwise_output'] = net

        if expansion_transform:
            net = expansion_transform(expansion_tensor=net,
                                      input_tensor=input_tensor)
        # Note in contrast with expansion, we always have
        # projection to produce the desired output size.
        if projection_fn == split_conv:
            projection_fn = functools.partial(projection_fn,
                                              num_ways=split_projection,
                                              divisible_by=split_divisible_by,
                                              stride=1)
        net = projection_fn(net,
                            num_outputs,
                            scope='project',
                            normalizer_fn=normalizer_fn,
                            activation_fn=project_activation_fn,
                            **projection_params)
        if endpoints is not None:
            endpoints['projection_output'] = net
        if depthwise_location == 'output':
            if use_explicit_padding:
                net = _fixed_padding(net, kernel_size, rate)
            net = depthwise_func(net, activation_fn=None)
            net = tf.identity(net, name='depthwise_output')
            if endpoints is not None:
                endpoints['depthwise_output'] = net

        if callable(residual):  # custom residual
            net = residual(input_tensor=input_tensor, output_tensor=net)
        elif (residual and
              # stride check enforces that we don't add residuals when spatial
              # dimensions are None
              stride == 1 and
              # Depth matches
              net.get_shape().as_list()[3]
              == input_tensor.get_shape().as_list()[3]):
            net += input_tensor
        return tf.identity(net, name='output')
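
A hedged usage sketch of the block above in the spirit of a MobileNetV2 inverted-residual layer; shapes and hyperparameters are illustrative:

images = tf.placeholder(tf.float32, [None, 56, 56, 16])
endpoints = {}
net = expanded_conv(images,
                    num_outputs=24,
                    expansion_size=expand_input_by_factor(6),
                    stride=2,  # stride 2 disables the residual connection
                    normalizer_fn=slim.batch_norm,
                    endpoints=endpoints)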
Example #26
    def __init__(self, observation_size, net_arch, initializer, activation,
                 clip_range, value_coef, entropy_coef, learning_rate,
                 pre_training_learning_rate, action_bounds, policy):
        """
        :param observation_size:
        :param net_arch:
        :param initializer:
        :param activation:
        :param clip_range:
        :param value_coef:
        :param entropy_coef:
        :param learning_rate:
        :param pre_training_learning_rate:
        :param action_bounds:
        :param policy:
        """
        """Set class constants"""
        self.observation_size = observation_size
        self.net_arch = net_arch
        self.initializer = initializer
        self.activation = activation
        self.clip_range = clip_range
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef

        if action_bounds is None:
            action_bounds = [0.0, 1.5]
        self.action_bounds = action_bounds
        self.learning_rate = learning_rate
        self.pre_training_learning_rate = pre_training_learning_rate

        if policy is None:
            policy = GaussFull()
        self.policy = policy
        """Set up the tensorflow graph"""
        self.graph = Graph()

        with self.graph.as_default():
            self.sess = Session(graph=self.graph)
            """ core """
            # place holders
            self.observation_string_ph = placeholder(
                shape=(None, 1), dtype=string, name="observation_string_ph")
            self.action_ph = placeholder(dtype=float32,
                                         shape=(None, 1),
                                         name="action_ph")
            self.old_neg_logits = placeholder(dtype=float32,
                                              shape=(None, 1),
                                              name="old_neg_logits")
            self.advantage_ph = placeholder(dtype=float32,
                                            shape=(None, 1),
                                            name="advantage_ph")
            self.value_target_ph = placeholder(dtype=float32,
                                               shape=(None, 1),
                                               name="value_target_ph")
            # learning rate tensors
            self.learning_rate_ph = placeholder_with_default(
                input=self.learning_rate, shape=())
            self.pre_training_learning_rate_ph = placeholder_with_default(
                input=self.pre_training_learning_rate, shape=())

            # observation tensor
            replaced1 = regex_replace(self.observation_string_ph, "/", "_")
            replaced2 = regex_replace(replaced1, r"\+", "-")
            byte_tensor = decode_base64(replaced2)
            decoded = decode_raw(byte_tensor, out_type=float32)
            squeezed = squeeze(decoded, axis=1)
            self.observation_input = ensure_shape(
                squeezed,
                shape=(None, self.observation_size),
                name="observation_input")

            # policy net
            latent_policy = net_core(self.observation_input, self.net_arch,
                                     self.initializer, self.activation)
            self.policy.construct(latent_policy=latent_policy)

            self.clipped_action = clip_by_value(
                cast(self.policy.action, float32), self.action_bounds[0],
                self.action_bounds[1], "clipped_action")

            # value net
            latent_value = net_core(self.observation_input, self.net_arch,
                                    self.initializer, self.activation)
            self.value = identity(
                input=Dense(units=1,
                            activation=None,
                            kernel_initializer=self.initializer)(latent_value),
                name="value")
            """loss calculation"""
            # policy loss
            self.neg_logits = self.policy.neg_logits_from_actions(
                self.action_ph)
            ratio = exp(self.old_neg_logits - self.neg_logits)

            standardized_adv = (self.advantage_ph - reduce_mean(
                self.advantage_ph)) / (reduce_std(self.advantage_ph) + 1e-8)
            raw_policy_loss = -standardized_adv * ratio
            clipped_policy_loss = -standardized_adv * clip_by_value(
                ratio, 1 - self.clip_range, 1 + self.clip_range)
            self.policy_loss = reduce_mean(
                maximum(raw_policy_loss, clipped_policy_loss))

            self.value_loss = mean_squared_error(self.value_target_ph,
                                                 self.value)

            # entropy loss
            self.entropy_loss = -reduce_mean(self.policy.entropy)

            # total loss
            self.total_loss = self.policy_loss + self.value_coef * self.value_loss + self.entropy_coef * self.entropy_loss

            # optimizer
            optimizer = AdamOptimizer(learning_rate=self.learning_rate_ph)

            # training ops
            self.training_op = optimizer.minimize(self.total_loss)

            # pre training
            self.dist_param_target_ph = placeholder(
                dtype=float32,
                shape=(None, self.policy.dist_params.shape[1]),
                name="dist_param_label_ph")
            self.pre_training_loss = mean_squared_error(
                self.dist_param_target_ph, self.policy.dist_params)
            pre_training_optimizer = GradientDescentOptimizer(
                learning_rate=self.pre_training_learning_rate_ph)
            self.pre_training_op = pre_training_optimizer.minimize(
                self.pre_training_loss)
            """utility nodes"""
            # inspect model weights
            self.trainable_variables = trainable_variables()

            # saver
            self.saver = Saver()

            # tensorboard summaries
            self.summary = merge([
                histogram("values", self.value),
                histogram("advantages", standardized_adv),
                histogram("actions", self.clipped_action),
                histogram("det_actions",
                          replace_nan(self.policy.det_action, 0.0)),
                histogram("value_targets", self.value_target_ph),
                scalar("policy_loss", self.policy_loss),
                scalar("value_loss", self.value_loss),
                scalar("entropy_loss", self.entropy_loss)
            ])

            self.pre_summary = merge([
                histogram("pretraining_actions", self.clipped_action),
                scalar("pretraining_loss", self.pre_training_loss)
            ])

            # initialization
            init = global_variables_initializer()
            self.sess.run(init)
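
A hedged client-side sketch of producing the value fed into observation_string_ph above: raw float32 bytes, base64-encoded (the graph rewrites '/' and '+' into the web-safe alphabet before decode_base64):

import base64
import numpy as np

def encode_observation(obs):
    # obs: 1-D float32 array of length observation_size.
    return base64.b64encode(obs.astype(np.float32).tobytes())

payload = encode_observation(np.zeros(8, np.float32))
# Fed with shape (None, 1), e.g.:
# model.sess.run(..., feed_dict={model.observation_string_ph: [[payload]]})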
Example #27
def model_fn(features, labels, mode, params):
	"""The model_fn argument for creating an Estimator."""
	tf.logging.info("features = %s labels = %s mode = %s params=%s" %
					(features, labels, mode, params))
	global_step = tf.train.get_global_step()
	graph = mtf.Graph()
	mesh = mtf.Mesh(graph, "my_mesh")
	logits, loss = mnist_model(features, labels, mesh)
	
	variables = graph._all_variables
	tf.logging.info("[variable num]: {}".format(len(variables)))
	for v in variables:
		tf.logging.info("[variable] (name, shape): ({},{})".format(v.name,v.shape))

	mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
	layout_rules = mtf.auto_mtf.layout(graph, mesh_shape, [logits, loss])
	tf.logging.info("[auto mtf search] strategy: {}".format(layout_rules))
	
	mesh_size = mesh_shape.size
	mesh_devices = ["gpu:0", "gpu:1", "gpu:2", "gpu:3"]
	mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
		mesh_shape, layout_rules, mesh_devices)

	if mode == tf.estimator.ModeKeys.TRAIN:
		var_grads = mtf.gradients(
			[loss], [v.outputs[0] for v in graph.trainable_variables])
		optimizer = mtf.optimize.AdafactorOptimizer()
		update_ops = optimizer.apply_grads(var_grads, graph.trainable_variables)

	lowering = mtf.Lowering(graph, {mesh: mesh_impl})
	restore_hook = mtf.MtfRestoreHook(lowering)

	tf_logits = lowering.export_to_tf_tensor(logits)
	if mode != tf.estimator.ModeKeys.PREDICT:
		tf_loss = lowering.export_to_tf_tensor(loss)
		tf.summary.scalar("loss", tf_loss)

	if mode == tf.estimator.ModeKeys.TRAIN:
		tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
		tf_update_ops.append(tf.assign_add(global_step, 1))
		train_op = tf.group(tf_update_ops)
		saver = tf.train.Saver(
			tf.global_variables(),
			sharded=True,
			max_to_keep=10,
			keep_checkpoint_every_n_hours=2,
			defer_build=False, save_relative_paths=True)
		tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
		saver_listener = mtf.MtfCheckpointSaverListener(lowering)
		saver_hook = tf.train.CheckpointSaverHook(
			FLAGS.model_dir,
			save_steps=1000,
			saver=saver,
			listeners=[saver_listener])

		accuracy = tf.metrics.accuracy(
			labels=labels, predictions=tf.argmax(tf_logits, axis=1))

		# Name tensors to be logged with LoggingTensorHook.
		tf.identity(tf_loss, "cross_entropy")
		tf.identity(accuracy[1], name="train_accuracy")

		# Save accuracy scalar to Tensorboard output.
		tf.summary.scalar("train_accuracy", accuracy[1])

		# restore_hook must come before saver_hook
		return tf.estimator.EstimatorSpec(
			tf.estimator.ModeKeys.TRAIN, loss=tf_loss, train_op=train_op,
			training_chief_hooks=[restore_hook, saver_hook])

	if mode == tf.estimator.ModeKeys.PREDICT:
		predictions = {
			"classes": tf.argmax(tf_logits, axis=1),
			"probabilities": tf.nn.softmax(tf_logits),
		}
		return tf.estimator.EstimatorSpec(
			mode=tf.estimator.ModeKeys.PREDICT,
			predictions=predictions,
			prediction_hooks=[restore_hook],
			export_outputs={
				"classify": tf.estimator.export.PredictOutput(predictions)
			})
	if mode == tf.estimator.ModeKeys.EVAL:
		return tf.estimator.EstimatorSpec(
			mode=tf.estimator.ModeKeys.EVAL,
			loss=tf_loss,
			evaluation_hooks=[restore_hook],
			eval_metric_ops={
				"accuracy":
				tf.metrics.accuracy(
					labels=labels, predictions=tf.argmax(tf_logits, axis=1)),
			})
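
A hedged sketch of driving the model_fn above with a standard Estimator; the input pipeline is a zero-filled stand-in for real MNIST data, and FLAGS is assumed to be configured as in the surrounding script:

def input_fn():
    ds = tf.data.Dataset.from_tensor_slices(
        (tf.zeros([64, 28, 28, 1]), tf.zeros([64], tf.int64)))
    return ds.batch(32).repeat()

estimator = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir=FLAGS.model_dir)
estimator.train(input_fn=input_fn, steps=100)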
Example #28
    def __call__(self, inputs, training):
        """Add operations to classify a batch of input images.

    Args:
      inputs: A Tensor representing a batch of input images.
      training: A boolean. Set to True to add operations required only when
        training the classifier.

    Returns:
      A logits Tensor with shape [<batch_size>, self.num_classes].
    """

        with self._model_variable_scope():
            if self.data_format == 'channels_first':
                # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
                # This provides a large performance boost on GPU. See
                # https://www.tensorflow.org/performance/performance_guide#data_formats
                inputs = tf.transpose(a=inputs, perm=[0, 3, 1, 2])

            inputs = conv2d_fixed_padding(inputs=inputs,
                                          filters=self.num_filters,
                                          kernel_size=self.kernel_size,
                                          strides=self.conv_stride,
                                          data_format=self.data_format)
            inputs = tf.identity(inputs, 'initial_conv')

            # We do not include batch normalization or activation functions in V2
            # for the initial conv1 because the first ResNet unit will perform these
            # for both the shortcut and non-shortcut paths as part of the first
            # block's projection. Cf. Appendix of [2].
            if self.resnet_version == 1:
                inputs = batch_norm(inputs, training, self.data_format)
                inputs = tf.nn.relu(inputs)

            if self.first_pool_size:
                inputs = tf.compat.v1.layers.max_pooling2d(
                    inputs=inputs,
                    pool_size=self.first_pool_size,
                    strides=self.first_pool_stride,
                    padding='SAME',
                    data_format=self.data_format)
                inputs = tf.identity(inputs, 'initial_max_pool')

            for i, num_blocks in enumerate(self.block_sizes):
                num_filters = self.num_filters * (2**i)
                inputs = block_layer(inputs=inputs,
                                     filters=num_filters,
                                     bottleneck=self.bottleneck,
                                     block_fn=self.block_fn,
                                     blocks=num_blocks,
                                     strides=self.block_strides[i],
                                     training=training,
                                     name='block_layer{}'.format(i + 1),
                                     data_format=self.data_format)

            # Only apply BN and ReLU for models that do pre-activation in
            # each building/bottleneck block, e.g. ResNet v2.
            if self.pre_activation:
                inputs = batch_norm(inputs, training, self.data_format)
                inputs = tf.nn.relu(inputs)

            # The current top layer has shape
            # `batch_size x pool_size x pool_size x final_size`.
            # ResNet does an Average Pooling layer over pool_size,
            # but that is the same as doing a reduce_mean. We do a reduce_mean
            # here because it performs better than AveragePooling2D.
            axes = [2, 3] if self.data_format == 'channels_first' else [1, 2]
            inputs = tf.reduce_mean(input_tensor=inputs,
                                    axis=axes,
                                    keepdims=True)
            inputs = tf.identity(inputs, 'final_reduce_mean')

            inputs = tf.squeeze(inputs, axes)
            inputs = tf.compat.v1.layers.dense(inputs=inputs,
                                               units=self.num_classes)
            inputs = tf.identity(inputs, 'final_dense')
            return inputs
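
The reduce_mean above is exactly global average pooling; a hedged equivalence check:

import numpy as np
import tensorflow.compat.v1 as tf

x = tf.random.uniform([2, 7, 7, 512])  # NHWC feature map
gap = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
pool = tf.layers.average_pooling2d(x, pool_size=7, strides=1)
with tf.Session() as sess:
    a, b = sess.run([gap, pool])
    np.testing.assert_allclose(a, b, rtol=1e-5)  # same values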
Example #29
def compute_spectral_norm(w_tensor,
                          power_iteration_rounds=1,
                          training=True,
                          name=None):
    """Estimates the largest singular value in the weight tensor.

  **NOTE**: When `training=True`, repeatedly running inference actually changes
  the variables, since the spectral norm is repeatedly approximated by a power
  iteration method.

  Args:
    w_tensor: The weight matrix whose spectral norm should be computed.
    power_iteration_rounds: The number of iterations of the power method to
      perform. A higher number yields a better approximation.
    training: Whether to update the spectral normalization on variable
      access. This is useful to turn off during eval, for example, to not affect
      the graph during evaluation.
    name: An optional scope name.

  Returns:
    The largest singular value (the spectral norm) of w.

  Raises:
    ValueError: If TF is executing eagerly.
    ValueError: If called within a distribution strategy that is not supported.
  """
    if tf.executing_eagerly():
        # Under eager mode, get_variable() creates a new variable on every call.
        raise ValueError(
            '`compute_spectral_norm` doesn\'t work when executing eagerly.')
    with tf.variable_scope(name, 'spectral_norm'):
        # The paper says to flatten convnet kernel weights from
        # (C_out, C_in, KH, KW) to (C_out, C_in * KH * KW). But TensorFlow's Conv2D
        # kernel weight shape is (KH, KW, C_in, C_out), so it should be reshaped to
        # (KH * KW * C_in, C_out), and similarly for other layers that put output
        # channels as last dimension.
        # n.b. this means that w here is equivalent to w.T in the paper.
        w = tf.reshape(w_tensor, (-1, w_tensor.get_shape()[-1]))

        # Persisted approximation of first left singular vector of matrix `w`.
        # Requires an appropriate aggregation method since we explicitly control
        # updates.
        replica_context = tf.distribute.get_replica_context()
        if replica_context is None:  # cross-replica strategy.
            # TODO(joelshor): Determine appropriate aggregation method.
            raise ValueError("spectral norm isn't supported in cross-replica "
                             "distribution strategy.")
        elif not tf.distribute.has_strategy():  # default strategy.
            aggregation = None
        else:
            aggregation = tf.VariableAggregation.ONLY_FIRST_REPLICA

        u_var = tf.get_variable(_PERSISTED_U_VARIABLE_SUFFIX,
                                shape=(w.shape[0], 1),
                                dtype=w.dtype,
                                initializer=tf.initializers.random_normal(),
                                trainable=False,
                                aggregation=aggregation)
        u = u_var

        # Use power iteration method to approximate spectral norm.
        for _ in range(power_iteration_rounds):
            # `v` approximates the first right singular vector of matrix `w`.
            v = tf.nn.l2_normalize(tf.matmul(a=w, b=u, transpose_a=True))
            u = tf.nn.l2_normalize(tf.matmul(w, v))

        # Update persisted approximation.
        if training:
            with tf.control_dependencies([u_var.assign(u, name='update_u')]):
                u = tf.identity(u)

        u = tf.stop_gradient(u)
        v = tf.stop_gradient(v)

        # Largest singular value of `w`.
        spectral_norm = tf.matmul(tf.matmul(a=u, b=w, transpose_a=True), v)
        spectral_norm.shape.assert_is_fully_defined()
        spectral_norm.shape.assert_is_compatible_with([1, 1])

        return spectral_norm[0][0]
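
For intuition, a hedged NumPy sketch of the same power iteration; the TF version persists u across calls instead of re-seeding:

import numpy as np

def spectral_norm_np(w, rounds=50, seed=0):
    u = np.random.RandomState(seed).randn(w.shape[0], 1)
    for _ in range(rounds):
        v = w.T @ u          # approximate first right singular vector
        v /= np.linalg.norm(v)
        u = w @ v            # approximate first left singular vector
        u /= np.linalg.norm(u)
    return float(u.T @ w @ v)

w = np.random.RandomState(1).randn(64, 32)
print(spectral_norm_np(w))  # approaches np.linalg.norm(w, 2)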
Example #30
def mean_var_with_update():
    # Run the EMA update first, then return the (unchanged) batch moments.
    with tf.control_dependencies([ema_apply_op]):
        return tf.identity(batch_mean), tf.identity(batch_var)
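
For context, a hedged sketch of the batch-normalization pattern this closure usually lives in; variable names match the fragment above:

import tensorflow.compat.v1 as tf

x = tf.placeholder(tf.float32, [None, 128])
is_training = tf.placeholder(tf.bool, [])

batch_mean, batch_var = tf.nn.moments(x, axes=[0])
ema = tf.train.ExponentialMovingAverage(decay=0.99)
ema_apply_op = ema.apply([batch_mean, batch_var])

def mean_var_with_update():
    with tf.control_dependencies([ema_apply_op]):
        return tf.identity(batch_mean), tf.identity(batch_var)

# Fresh batch statistics (with an EMA update) in training, the EMA otherwise.
mean, var = tf.cond(is_training, mean_var_with_update,
                    lambda: (ema.average(batch_mean), ema.average(batch_var)))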