Example #1
0
    def ModuleFn(training):
        """Builds the graph and signature for the stub TF-hub module."""
        image_data = tf.placeholder(
            shape=[None, input_image_height, input_image_width, 3],
            dtype=tf.float32)
        # Linearly project image_data to shape [1, output_feature_dim] features.
        encoder_output = tf.compat.v1.layers.dense(
            tf.reshape(image_data,
                       [-1, input_image_height * input_image_width * 3]),
            output_feature_dim)

        # Add a non-trainable 'count' variable that can be updated through an
        # UPDATE_OP. This is analogous to a batch-norm moving average that should be
        # updated during fine-tuning.
        v = tf.get_variable('count',
                            initializer=0,
                            dtype=tf.int32,
                            trainable=False)
        if training:
            update_op = v.assign_add(1).op
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op)

        hub.add_signature('default',
                          inputs={'images': image_data},
                          outputs=encoder_output)
Example #2
0
    def _TpuEmbLookup(self) -> Dict[str, tf.Tensor]:
        """TPU Embedding lookup."""
        activations = self._tpu_embedding.get_activations()
        task = py_utils.GetTaskCallScope()
        # We expect either None (if this is the first call) or a single item in a
        # list.
        tpu_embedding_activations = tf.get_collection(
            py_utils.TPU_EMBEDDING_ACTIVATIONS)
        if not tpu_embedding_activations:
            # Create a dict from task -> activations dict.
            tpu_embedding_activations_dict = {}
            tpu_embedding_activations_dict[task] = activations
            tf.add_to_collection(py_utils.TPU_EMBEDDING_ACTIVATIONS,
                                 tpu_embedding_activations_dict)
        else:
            # This is a subsequent call, so the dictionary already exists.
            tpu_embedding_activations_dict = tpu_embedding_activations[0]
            tpu_embedding_activations_dict[task] = activations

        ret = py_utils.NestedMap()
        for k, v in activations.items():
            if k in self._sequence_features:
                ret[k] = v
            else:
                # Non-sequence embeddings, we fill the "time" dimension with 1.
                ret[k] = tf.expand_dims(v, axis=[1])
        return ret
Example #3
0
 def Get(cls):
   """Returns the TpuEmbeddingCollection associated with the current graph."""
   emb_collection = tf.get_collection(cls.GRAPH_COLLECTION_NAME)
   assert len(emb_collection) <= 1
   if len(emb_collection) == 1:
     tf.logging.info(
         'TpuEmbeddingCollection singleton already exists, reusing')
     return emb_collection[0]
   else:
     singleton = cls()
     tf.add_to_collection(cls.GRAPH_COLLECTION_NAME, singleton)
     return singleton
Example #4
0
 def TpuEmbLookup(ids_map):
     """TPU Embedding lookup."""
     del ids_map
     activations = self._tpu_embedding.get_activations()
     tf.add_to_collection(py_utils.TPU_EMBEDDING_ACTIVATIONS,
                          activations)
     ret = py_utils.NestedMap()
     for k, v in activations.items():
         if k in self._sequence_features:
             ret[k] = v
         else:
             # Non-sequence embeddings, we fill the "time" dimension with 1.
             ret[k] = tf.expand_dims(v, axis=[1])
     return ret
Example #5
0
def apply_mask(x, scope=''):
    """Apply mask to a given weight tensor.

  Args:
    x: Input weight tensor
    scope: The current variable scope. Defaults to "".
  Returns:
    Tensor representing masked_weights
  """

    mask = pruning_utils.weight_mask_variable(x, scope)
    threshold = pruning_utils.weight_threshold_variable(x, scope)
    # Add masked_weights in the weights namescope so as to make it easier
    # for the quantization library to add quant ops.
    masked_weights = tf.multiply(mask, x, _MASKED_WEIGHT_NAME)

    # Make sure the mask for a given variable are not added multiple times to the
    # collection. This is particularly important when applying mask to RNN's
    # weight variables
    if mask not in tf.get_collection_ref(_MASK_COLLECTION):
        tf.add_to_collection(_THRESHOLD_COLLECTION, threshold)
        tf.add_to_collection(_MASK_COLLECTION, mask)
        tf.add_to_collection(_MASKED_WEIGHT_COLLECTION, masked_weights)
        tf.add_to_collection(_WEIGHT_COLLECTION, x)
    return masked_weights
Example #6
0
 def _CreateSaveable(self, opaque_params, input_dim, cell_dim, direction,
                     scope):
     rnn_cell_name = 'rnn_cell'
     if direction == UNIDIR:
         saveable = cudnn_rnn_utils.CuDNNLSTMSaveable(
             opaque_params, cell_dim, input_dim, rnn_cell_name, scope,
             opaque_params.name + '_saveable')
     else:
         fwd_cell_name = 'fwd'
         bak_cell_name = 'bak'
         saveable = cudnn_rnn_utils.BidiCuDNNLSTMSaveable(
             opaque_params, cell_dim, input_dim, fwd_cell_name,
             bak_cell_name, scope, opaque_params.name + '_saveable')
     tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable)
     return saveable
    def _CreateLayerVariables(self):
        super()._CreateLayerVariables()
        p = self.params

        load_op_list = []
        retrieve_op_list = []

        # At the feature level, track which are associated
        # with "sequence embeddings".
        self._sequence_features = {}

        if py_utils.use_tpu():
            num_cores = self.cluster.params.worker.tpus_per_replica
            global_batch_size = (self.params.batch_size *
                                 self.cluster.num_splits_per_client)
            table_to_config_dict = {}
            feature_to_config_dict = {}
            for table in self.tables:
                table_to_config_dict[table.table_name] = table.table_config
                load_op_list += table.load_op_list
                retrieve_op_list += table.retrieve_op_list
                for feature in table.input_keys:
                    if table.max_sequence_length > 0:
                        self._sequence_features[feature] = True
                    feature_to_config_dict[
                        feature] = tpu_embedding_lib.FeatureConfig(
                            table.table_name,
                            max_sequence_length=table.max_sequence_length)
            tf.logging.info('adding load and retrieve ops to collection.')
            tf.add_to_collection(py_utils.TPU_EMBEDDING_LOAD_OPS, load_op_list)
            tf.add_to_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS,
                                 retrieve_op_list)

            tpu_embedding_collection = tf.get_collection(
                py_utils.TPU_EMBEDDING)
            assert len(tpu_embedding_collection) <= 1
            if len(tpu_embedding_collection) == 1:
                tf.logging.info(
                    'TPUEmbedding API singleton already exists, reusing')
                self._tpu_embedding = tpu_embedding_collection[0]
            else:
                mode = tpu_embedding_lib.TRAINING
                device_config = tpu_embedding_lib.DeviceConfig(
                    num_cores=num_cores,
                    num_hosts=self.params.tables[0].num_tpu_hosts,
                    job_name=self.cluster.params.worker.name)
                self._tpu_embedding = tpu_embedding_lib.TPUEmbedding(
                    table_to_config_dict,
                    feature_to_config_dict,
                    global_batch_size,
                    mode,
                    master=None,
                    pipeline_execution_with_tensor_core=(
                        self.params.pipeline_execution_with_tensor_core),
                    partition_strategy=p.partition_strategy,
                    device_config=device_config)
                tf.add_to_collection(py_utils.TPU_EMBEDDING,
                                     self._tpu_embedding)
Example #8
0
        def _BuildTpuEmbeddingApi():
            load_op_list = []
            retrieve_op_list = []

            num_cores = self.cluster.params.worker.tpus_per_replica
            global_batch_size = (self.params.batch_size *
                                 self.cluster.num_splits_per_client)
            table_to_config_dict = {}
            feature_to_config_dict = {}
            for table in self.tables:
                table_to_config_dict[table.table_name] = table.table_config
                load_op_list += table.load_op_list
                retrieve_op_list += table.retrieve_op_list
                for feature in table.input_keys:
                    feature_to_config_dict[
                        feature] = tpu_embedding_lib.FeatureConfig(
                            table.table_name,
                            max_sequence_length=table.max_sequence_length)

            mode = tpu_embedding_lib.TRAINING
            device_config = tpu_embedding_lib.DeviceConfig(
                num_cores=num_cores,
                num_hosts=self.params.tables[0].num_tpu_hosts,
                job_name=self.cluster.params.worker.name)
            tpu_embedding = tpu_embedding_lib.TPUEmbedding(
                table_to_config_dict,
                feature_to_config_dict,
                global_batch_size,
                mode,
                master=None,
                pipeline_execution_with_tensor_core=(
                    self.params.pipeline_execution_with_tensor_core),
                partition_strategy=p.partition_strategy,
                device_config=device_config)

            with tf.init_scope():
                dummy_variables, dummy_variables_init = (
                    tpu_embedding_gradient.create_dummy_table_variables(
                        tpu_embedding))
            load_op_list += [dummy_variables_init]

            tf.add_to_collection(py_utils.TPU_EMBEDDING, tpu_embedding)
            tf.add_to_collection(py_utils.TPU_EMBEDDING_DUMMY_VARS,
                                 dummy_variables)
            tf.add_to_collection(py_utils.TPU_EMBEDDING_LOAD_OPS, load_op_list)
            tf.add_to_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS,
                                 retrieve_op_list)
Example #9
0
    def CreateTpuFeeds(self):
        """Creates the TPU infeed queue from preprocessed batch."""
        p = self.params
        cluster = self.cluster
        num_tpu_hosts = cluster.num_tpu_hosts
        num_cores_per_host = cluster.total_worker_devices // num_tpu_hosts
        tf.logging.info(
            'CreateTPUFeeds num_splits_per_client={} '
            'num_devices_per_split={} num_tpu_hosts={} use_per_host_infeed={}'.
            format(cluster.num_splits_per_client,
                   cluster.num_devices_per_split, num_tpu_hosts,
                   p.use_per_host_infeed))

        assert num_tpu_hosts > 0, ('num_tpu_hosts: %d' % num_tpu_hosts)
        if (cluster.num_devices_per_split > num_cores_per_host
                and p.use_per_host_infeed):
            tf.logging.fatal(
                'Doesn\'t support per host infeed mode when '
                'num_devices_per_split({}) > num_cores_per_host({})'.format(
                    cluster.num_devices_per_split, num_cores_per_host))
        num_infeed_hosts = num_tpu_hosts if p.use_per_host_infeed else 1

        with py_utils.outside_all_rewrites():
            assert py_utils.use_tpu()
            assert not self._made_tpu_infeed

            shards = tpu_function.get_tpu_context(
            ).number_of_shards // num_infeed_hosts
            tf.logging.info('shards {}'.format(shards))

            input_ops_list = []
            queues = []
            tpu_embedding_collection = tf.get_collection(
                py_utils.TPU_EMBEDDING)
            tpu_embedding = (tpu_embedding_collection[0]
                             if tpu_embedding_collection else None)

            if num_tpu_hosts > 1 and tpu_embedding is not None:
                if not p.use_per_host_infeed:
                    tf.logging.fatal(
                        'TPU Embedding must be used with per_host_infeed with multiple '
                        'TPU host topologies.')
            tpu_emb_input_keys = (list(
                tpu_embedding.feature_to_config_dict.keys())
                                  if tpu_embedding is not None else [])
            tf.logging.info('tpu_emb_input_keys: %r', tpu_emb_input_keys)

            batch = None
            for task_id in range(num_infeed_hosts):
                host_device = '/task:{}/device:CPU:0'.format(task_id)
                with tf.device(host_device):
                    batch = self.GetPreprocessedInputBatch()
                    if isinstance(batch, py_utils.NestedMap):
                        # Hack: bucket_keys and xxx.bucket_keys are not needed on TPU.
                        # Note that when MultiTaskData is used, bucket_keys will be at the
                        # second level of the dictionary.
                        batch = batch.FilterKeyVal(
                            lambda k, _: not k.endswith('bucket_keys'))
                    tf.logging.info('host_device: %s, batch: %r', host_device,
                                    batch)

                    if tpu_embedding is not None:
                        enqueue_dict_per_core = [
                            {} for _ in range(tpu_embedding.num_cores_per_host)
                        ]
                        num_cores_per_host = tpu_embedding.num_cores_per_host
                        for key in tpu_emb_input_keys:
                            feat = batch[key]
                            tpu_emb_feat_splitted = tf.split(
                                feat, num_cores_per_host)
                            for core, split in enumerate(
                                    tpu_emb_feat_splitted):
                                # Dense to sparse. Note the assumption of a padding id.
                                sample_indices = tf.where(
                                    tf.not_equal(split, -1))
                                embedding_indices = tf.gather_nd(
                                    split, sample_indices)
                                enqueue_data = tpu_embedding_lib.EnqueueData(
                                    embedding_indices, sample_indices)
                                enqueue_dict_per_core[core][key] = enqueue_data
                        input_ops_list += tpu_embedding.generate_enqueue_ops(
                            enqueue_dict_per_core)

                    for k, x in batch.FlattenItems():
                        assert x.shape.is_fully_defined(), (
                            'Shape must be fully defined: %s: %s' % (k, x))
                        # TODO(cwhipkey): if it's a string (or other type not supported on
                        # TPU), drop it from feeding and on the other end add in an op that
                        # fails if used.
                    shapes = batch.Transform(lambda x: x.shape).Flatten()
                    dtypes = batch.Transform(lambda x: x.dtype).Flatten()
                    tf.logging.info('host_device: %s infeed shapes: %r',
                                    host_device, shapes)
                    tf.logging.info('host_device: %s infeed dtypes: %r',
                                    host_device, dtypes)
                    if p.use_partitioned_infeed_queue:
                        device_assignment = py_utils.GetTpuDeviceAssignment()

                        host_device = device_assignment.host_device(
                            replica=0, job=tf.flags.FLAGS.tf_master)
                        host_id = int(
                            host_device.split('/task:')[1].split('/device:')
                            [0])
                        tf.logging.info('host_id: {} host_device: {}'.format(
                            host_id, host_device))
                        q = tpu_feed._PartitionedInfeedQueue(  # pylint: disable=protected-access
                            number_of_tuple_elements=len(dtypes),
                            device_assignment=device_assignment,
                            host_id=host_id,
                            input_partition_dims=[[p.num_partitions, 1]
                                                  for _ in dtypes],
                            tuple_types=dtypes,
                            tuple_shapes=shapes)
                    else:
                        q = tpu_feed.InfeedQueue(tuple_types=dtypes,
                                                 tuple_shapes=shapes)
                        assert shards is not None
                        q.set_number_of_shards(shards)

                    queues.append(q)
                    tf.logging.info('q=%r', q)

                    if p.use_partitioned_infeed_queue:
                        input_ops = q.generate_enqueue_ops([batch.Flatten()])
                    elif p.use_per_host_infeed:
                        # TODO(ylc/zhifengc): Add this to a policy module and test it.
                        def TPUOrdinalFunction(shard_index_in_host):
                            device_assignment = py_utils.GetTpuDeviceAssignment(
                            )
                            if device_assignment:
                                # We put both enqueue/dequeue ops at core 0 in each replica.
                                replica = device_assignment.lookup_replicas(
                                    task_id, 0)[shard_index_in_host]  # pylint: disable=cell-var-from-loop
                                return device_assignment.tpu_ordinal(
                                    replica=replica)
                            else:
                                return shard_index_in_host

                        input_ops = q.split_inputs_and_generate_enqueue_ops(
                            batch.Flatten(),
                            placement_function=lambda x: host_device,  # pylint: disable=cell-var-from-loop
                            tpu_ordinal_function=TPUOrdinalFunction)
                    else:
                        input_ops = q.split_inputs_and_generate_enqueue_ops(
                            batch.Flatten(),
                            device_assignment=py_utils.GetTpuDeviceAssignment(
                            ))

                    input_ops_list += input_ops
            tf.logging.info('input_ops_list %s', input_ops_list)
            tpu_infeed_op = tf.group(*input_ops_list)
        self._made_tpu_infeed = True
        # Let trainer.py use multiple threads to drive the infeed op.
        for _ in range(p.tpu_infeed_parallelism):
            tf.add_to_collection(py_utils.ENQUEUE_OPS, tpu_infeed_op)

        self._tpu_infeed_op = tpu_infeed_op

        with tf.device(tf.tpu.core(0)):
            tensors = queues[0].generate_dequeue_op()
        return batch.Pack(tensors)
Example #10
0
def _AddTpuEmbeddingSummaryTensor(name, value, weight=1.0):
    tf.add_to_collection(py_utils.TPU_EMBEDDING_SUMMARY_TENSORS,
                         (name, value, tf.convert_to_tensor(weight)))
Example #11
0
    def CreateTpuFeeds(self):
        """Creates the TPU infeed queue from preprocessed batch."""
        p = self.params
        cluster = self.cluster
        num_tpu_hosts = cluster.num_tpu_hosts
        num_cores_per_host = cluster.total_worker_devices // num_tpu_hosts
        tf.logging.info('num_cores_per_host {}'.format(num_cores_per_host))
        tf.logging.info('num_devices_per_split {}'.format(
            cluster.num_devices_per_split))

        assert num_tpu_hosts > 0, ('num_tpu_hosts: %d' % num_tpu_hosts)
        if (cluster.num_devices_per_split > num_cores_per_host
                and p.use_per_host_infeed):
            tf.logging.fatal(
                'Doesn\'t support per host infeed mode when '
                'num_devices_per_split({}) > num_cores_per_host({})'.format(
                    cluster.num_devices_per_split, num_cores_per_host))
        num_infeed_hosts = num_tpu_hosts if p.use_per_host_infeed else 1

        with py_utils.outside_all_rewrites():
            assert py_utils.use_tpu()
            assert not self._made_tpu_infeed

            shards = tpu_function.get_tpu_context(
            ).number_of_shards // num_infeed_hosts
            input_ops_list = []
            queues = []
            tpu_embedding_collection = tf.get_collection(
                py_utils.TPU_EMBEDDING)
            tpu_embedding = (tpu_embedding_collection[0]
                             if tpu_embedding_collection else None)

            tpu_emb_input_keys = (list(
                tpu_embedding.feature_to_config_dict.keys())
                                  if tpu_embedding is not None else [])
            tf.logging.info('tpu_emb_input_keys: %r', tpu_emb_input_keys)

            batch = None
            for task_id in range(num_infeed_hosts):
                host_device = '/task:{}/device:CPU:0'.format(task_id)
                with tf.device(host_device):
                    batch = self.GetPreprocessedInputBatch()
                    if 'bucket_keys' in batch:
                        # Hack: bucket_keys are not needed on TPU.
                        del batch['bucket_keys']
                    tf.logging.info('host_device: %s, batch: %r', host_device,
                                    batch)

                    if tpu_embedding is not None:
                        enqueue_dict_per_core = [
                            {} for _ in range(tpu_embedding.num_cores_per_host)
                        ]
                        num_cores_per_host = tpu_embedding.num_cores_per_host
                        for key in tpu_emb_input_keys:
                            feat = batch[key]
                            tpu_emb_feat_splitted = tf.split(
                                feat, num_cores_per_host)
                            for core, split in enumerate(
                                    tpu_emb_feat_splitted):
                                # Dense to sparse. Note the assumption of a padding id.
                                sample_indices = tf.where(
                                    tf.not_equal(split, -1))
                                embedding_indices = tf.gather_nd(
                                    split, sample_indices)
                                enqueue_data = tpu_embedding_lib.EnqueueData(
                                    embedding_indices, sample_indices)
                                enqueue_dict_per_core[core][key] = enqueue_data
                        input_ops_list += tpu_embedding.generate_enqueue_ops(
                            enqueue_dict_per_core)

                    for k, x in batch.FlattenItems():
                        assert x.shape.is_fully_defined(), (
                            'Shape must be fully defined: %s: %s' % (k, x))
                        # TODO(cwhipkey): if it's a string (or other type not supported on
                        # TPU), drop it from feeding and on the other end add in an op that
                        # fails if used.
                    shapes = batch.Transform(lambda x: x.shape).Flatten()
                    dtypes = batch.Transform(lambda x: x.dtype).Flatten()
                    tf.logging.info('host_device: %s infeed shapes: %r',
                                    host_device, shapes)
                    tf.logging.info('host_device: %s infeed dtypes: %r',
                                    host_device, dtypes)
                    q = tpu_feed.InfeedQueue(tuple_types=dtypes,
                                             tuple_shapes=shapes)
                    queues.append(q)
                    assert shards is not None
                    q.set_number_of_shards(shards)

                    if p.use_per_host_infeed:

                        # TODO(ylc/zhifengc): Add this to a policy module and test it.
                        def TPUOrdinalFunction(shard_index_in_host):
                            device_assignment = py_utils.GetTpuDeviceAssignment(
                            )
                            if device_assignment:
                                # We put both enqueue/dequeue ops at core 0 in each replica.
                                replica = device_assignment.lookup_replicas(
                                    task_id, 0)[shard_index_in_host]  # pylint: disable=cell-var-from-loop
                                return device_assignment.tpu_ordinal(
                                    replica=replica)
                            else:
                                return shard_index_in_host

                        input_ops = q.split_inputs_and_generate_enqueue_ops(
                            batch.Flatten(),
                            placement_function=lambda x: host_device,  # pylint: disable=cell-var-from-loop
                            tpu_ordinal_function=TPUOrdinalFunction)
                    else:
                        input_ops = q.split_inputs_and_generate_enqueue_ops(
                            batch.Flatten(),
                            device_assignment=py_utils.GetTpuDeviceAssignment(
                            ))

                    input_ops_list += input_ops
            tf.logging.info('input_ops_list %s', input_ops_list)
            tpu_infeed_op = tf.group(*input_ops_list)
        self._made_tpu_infeed = True
        # Let trainer.py use multiple threads to drive the infeed op.
        for _ in range(p.tpu_infeed_parallelism):
            tf.add_to_collection(py_utils.ENQUEUE_OPS, tpu_infeed_op)

        # For executor-driven multiple programs, we need more fine-grained
        # access rather than using a single global graph collection.
        self.tpu_infeed_op = tpu_infeed_op

        with tf.device(tf.tpu.core(0)):
            tensors = queues[0].generate_dequeue_op()
        return batch.Pack(tensors)
Example #12
0
def AddToPruningCollections(weight,
                            mask,
                            threshold,
                            gradient=None,
                            old_weight=None,
                            old_old_weight=None):
    """Add mask, threshold, and weight vars to their respective collections."""
    if mask not in tf.get_collection(pruning.MASK_COLLECTION):
        tf.add_to_collection(pruning.WEIGHT_COLLECTION, weight)
        tf.add_to_collection(pruning.MASK_COLLECTION, mask)
        tf.add_to_collection(pruning.THRESHOLD_COLLECTION, threshold)

        # Add gradient, old_weight, and old_old_weight to collections approximating
        # gradient and hessian, where old_weight is the weight tensor one step
        # before and old_old_weight is the weight tensor two steps before.
        if gradient is not None:
            assert old_weight is not None
            assert old_old_weight is not None
            tf.add_to_collection(pruning.WEIGHT_GRADIENT_COLLECTION, gradient)
            tf.add_to_collection(pruning.OLD_WEIGHT_COLLECTION, old_weight)
            tf.add_to_collection(pruning.OLD_OLD_WEIGHT_COLLECTION,
                                 old_old_weight)