def ModuleFn(training):
  """Defines the stub TF-hub module graph and registers its signature.

  Args:
    training: When truthy, an op that increments the 'count' variable is added
      to the `tf.GraphKeys.UPDATE_OPS` collection.
  """
  image_shape = [None, input_image_height, input_image_width, 3]
  image_data = tf.placeholder(shape=image_shape, dtype=tf.float32)

  # Flatten every image and linearly project it to output_feature_dim features.
  flat_dim = input_image_height * input_image_width * 3
  flat_images = tf.reshape(image_data, [-1, flat_dim])
  encoder_output = tf.compat.v1.layers.dense(flat_images, output_feature_dim)

  # Non-trainable 'count' variable that is only changed through an UPDATE_OP,
  # analogous to a batch-norm moving average that should be updated during
  # fine-tuning.
  count_var = tf.get_variable(
      'count', initializer=0, dtype=tf.int32, trainable=False)
  if training:
    increment_op = count_var.assign_add(1).op
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, increment_op)

  hub.add_signature(
      'default', inputs={'images': image_data}, outputs=encoder_output)
def _TpuEmbLookup(self) -> Dict[str, tf.Tensor]:
  """Looks up TPU embedding activations and records them for the current task.

  The activations are stored under the current task call scope in a dict kept
  as the single entry of the `py_utils.TPU_EMBEDDING_ACTIVATIONS` collection.

  Returns:
    A `py_utils.NestedMap` keyed by feature name. Features listed in
    `self._sequence_features` are returned unchanged; all others get an extra
    axis inserted at position 1.
  """
  activations = self._tpu_embedding.get_activations()
  task = py_utils.GetTaskCallScope()

  # The collection is either empty (first call) or holds exactly one dict
  # mapping task -> activations dict.
  registered = tf.get_collection(py_utils.TPU_EMBEDDING_ACTIVATIONS)
  if registered:
    # Subsequent call: extend the dictionary that already exists.
    activations_by_task = registered[0]
    activations_by_task[task] = activations
  else:
    activations_by_task = {task: activations}
    tf.add_to_collection(py_utils.TPU_EMBEDDING_ACTIVATIONS,
                         activations_by_task)

  result = py_utils.NestedMap()
  for feature, activation in activations.items():
    is_sequence = feature in self._sequence_features
    # Non-sequence embeddings get a "time" dimension of size 1.
    result[feature] = (
        activation if is_sequence else tf.expand_dims(activation, axis=[1]))
  return result
def Get(cls):
  """Returns the TpuEmbeddingCollection registered in the current graph.

  An instance is created and added to the `cls.GRAPH_COLLECTION_NAME`
  collection when none is registered yet.
  """
  registered = tf.get_collection(cls.GRAPH_COLLECTION_NAME)
  assert len(registered) <= 1

  if len(registered) != 1:
    # Nothing registered yet: create the singleton and remember it.
    instance = cls()
    tf.add_to_collection(cls.GRAPH_COLLECTION_NAME, instance)
    return instance

  tf.logging.info('TpuEmbeddingCollection singleton already exists, reusing')
  return registered[0]
def TpuEmbLookup(ids_map):
  """Looks up TPU embedding activations; `ids_map` is not used.

  The raw activations are appended to the
  `py_utils.TPU_EMBEDDING_ACTIVATIONS` collection.

  Returns:
    A `py_utils.NestedMap` keyed by feature name. Features listed in
    `self._sequence_features` are returned unchanged; all others get an extra
    axis inserted at position 1.
  """
  del ids_map
  activations = self._tpu_embedding.get_activations()
  tf.add_to_collection(py_utils.TPU_EMBEDDING_ACTIVATIONS, activations)

  result = py_utils.NestedMap()
  for feature, activation in activations.items():
    is_sequence = feature in self._sequence_features
    # Non-sequence embeddings get a "time" dimension of size 1.
    result[feature] = (
        activation if is_sequence else tf.expand_dims(activation, axis=[1]))
  return result
def apply_mask(x, scope=''):
  """Multiplies a weight tensor by its pruning mask.

  Args:
    x: Input weight tensor.
    scope: The current variable scope. Defaults to "".

  Returns:
    Tensor representing the masked weights.
  """
  mask = pruning_utils.weight_mask_variable(x, scope)
  threshold = pruning_utils.weight_threshold_variable(x, scope)

  # The masked weights live in the weights namescope so that the quantization
  # library can add quant ops more easily.
  masked_weights = tf.multiply(mask, x, _MASKED_WEIGHT_NAME)

  # Register each mask (and its companions) only once. This matters in
  # particular when masks are applied to RNN weight variables.
  already_registered = mask in tf.get_collection_ref(_MASK_COLLECTION)
  if not already_registered:
    registrations = (
        (_THRESHOLD_COLLECTION, threshold),
        (_MASK_COLLECTION, mask),
        (_MASKED_WEIGHT_COLLECTION, masked_weights),
        (_WEIGHT_COLLECTION, x),
    )
    for collection_name, item in registrations:
      tf.add_to_collection(collection_name, item)
  return masked_weights
def _CreateSaveable(self, opaque_params, input_dim, cell_dim, direction,
                    scope):
  """Creates a CuDNN LSTM saveable and registers it as a saveable object.

  Args:
    opaque_params: The opaque CuDNN parameter variable; its `name` is used to
      derive the saveable's name.
    input_dim: Passed through to the saveable constructor.
    cell_dim: Passed through to the saveable constructor.
    direction: `UNIDIR` selects `CuDNNLSTMSaveable`; any other value selects
      `BidiCuDNNLSTMSaveable`.
    scope: Passed through to the saveable constructor.

  Returns:
    The saveable that was added to `tf.GraphKeys.SAVEABLE_OBJECTS`.
  """
  saveable_name = opaque_params.name + '_saveable'
  if direction != UNIDIR:
    saveable = cudnn_rnn_utils.BidiCuDNNLSTMSaveable(
        opaque_params, cell_dim, input_dim, 'fwd', 'bak', scope,
        saveable_name)
  else:
    saveable = cudnn_rnn_utils.CuDNNLSTMSaveable(
        opaque_params, cell_dim, input_dim, 'rnn_cell', scope, saveable_name)
  tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable)
  return saveable
def _CreateLayerVariables(self):
  """Sets up the TPU embedding configuration for this layer.

  When `py_utils.use_tpu()` is true, this collects the table and feature
  configs from `self.tables`, adds the tables' load and retrieve ops to graph
  collections, and either reuses the TPUEmbedding object already stored in the
  `py_utils.TPU_EMBEDDING` collection or creates one and stores it there.

  NOTE(review): the block nesting below was reconstructed from a flattened
  source; confirm which statements belong under `if py_utils.use_tpu()`.
  """
  super()._CreateLayerVariables()
  p = self.params
  load_op_list = []
  retrieve_op_list = []
  # At the feature level, track which are associated
  # with "sequence embeddings".
  self._sequence_features = {}
  if py_utils.use_tpu():
    num_cores = self.cluster.params.worker.tpus_per_replica
    global_batch_size = (self.params.batch_size *
                         self.cluster.num_splits_per_client)
    table_to_config_dict = {}
    feature_to_config_dict = {}
    for table in self.tables:
      table_to_config_dict[table.table_name] = table.table_config
      load_op_list += table.load_op_list
      retrieve_op_list += table.retrieve_op_list
      for feature in table.input_keys:
        # A positive max_sequence_length marks the feature as a sequence
        # feature.
        if table.max_sequence_length > 0:
          self._sequence_features[feature] = True
        feature_to_config_dict[
            feature] = tpu_embedding_lib.FeatureConfig(
                table.table_name,
                max_sequence_length=table.max_sequence_length)
    tf.logging.info('adding load and retrieve ops to collection.')
    # Each list is added as a single collection entry.
    tf.add_to_collection(py_utils.TPU_EMBEDDING_LOAD_OPS, load_op_list)
    tf.add_to_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS,
                         retrieve_op_list)
    tpu_embedding_collection = tf.get_collection(
        py_utils.TPU_EMBEDDING)
    # At most one TPUEmbedding object may be registered in the collection.
    assert len(tpu_embedding_collection) <= 1
    if len(tpu_embedding_collection) == 1:
      tf.logging.info(
          'TPUEmbedding API singleton already exists, reusing')
      self._tpu_embedding = tpu_embedding_collection[0]
    else:
      mode = tpu_embedding_lib.TRAINING
      # The host count is read from the first table's params.
      device_config = tpu_embedding_lib.DeviceConfig(
          num_cores=num_cores,
          num_hosts=self.params.tables[0].num_tpu_hosts,
          job_name=self.cluster.params.worker.name)
      self._tpu_embedding = tpu_embedding_lib.TPUEmbedding(
          table_to_config_dict,
          feature_to_config_dict,
          global_batch_size,
          mode,
          master=None,
          pipeline_execution_with_tensor_core=(
              self.params.pipeline_execution_with_tensor_core),
          partition_strategy=p.partition_strategy,
          device_config=device_config)
      tf.add_to_collection(py_utils.TPU_EMBEDDING, self._tpu_embedding)
def _BuildTpuEmbeddingApi():
  """Builds the TPUEmbedding object and registers it in graph collections.

  Table and feature configs are gathered from `self.tables`. The resulting
  TPUEmbedding object, its dummy table variables, and the load / retrieve op
  lists are each added to their `py_utils.TPU_EMBEDDING*` collection.

  `self` and `p` are not defined in this function; presumably they come from
  the enclosing scope - TODO confirm.

  NOTE(review): the extent of the `tf.init_scope()` block was reconstructed
  from a flattened source; confirm that only the dummy-variable creation
  belongs inside it.
  """
  load_op_list = []
  retrieve_op_list = []
  num_cores = self.cluster.params.worker.tpus_per_replica
  global_batch_size = (self.params.batch_size *
                       self.cluster.num_splits_per_client)
  table_to_config_dict = {}
  feature_to_config_dict = {}
  for table in self.tables:
    table_to_config_dict[table.table_name] = table.table_config
    load_op_list += table.load_op_list
    retrieve_op_list += table.retrieve_op_list
    for feature in table.input_keys:
      feature_to_config_dict[
          feature] = tpu_embedding_lib.FeatureConfig(
              table.table_name,
              max_sequence_length=table.max_sequence_length)
  mode = tpu_embedding_lib.TRAINING
  # The host count is read from the first table's params.
  device_config = tpu_embedding_lib.DeviceConfig(
      num_cores=num_cores,
      num_hosts=self.params.tables[0].num_tpu_hosts,
      job_name=self.cluster.params.worker.name)
  tpu_embedding = tpu_embedding_lib.TPUEmbedding(
      table_to_config_dict,
      feature_to_config_dict,
      global_batch_size,
      mode,
      master=None,
      pipeline_execution_with_tensor_core=(
          self.params.pipeline_execution_with_tensor_core),
      partition_strategy=p.partition_strategy,
      device_config=device_config)
  with tf.init_scope():
    dummy_variables, dummy_variables_init = (
        tpu_embedding_gradient.create_dummy_table_variables(
            tpu_embedding))
  # The dummy variables' init op is appended to the load ops.
  load_op_list += [dummy_variables_init]
  tf.add_to_collection(py_utils.TPU_EMBEDDING, tpu_embedding)
  tf.add_to_collection(py_utils.TPU_EMBEDDING_DUMMY_VARS,
                       dummy_variables)
  tf.add_to_collection(py_utils.TPU_EMBEDDING_LOAD_OPS, load_op_list)
  tf.add_to_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS,
                       retrieve_op_list)
def CreateTpuFeeds(self):
  """Creates the TPU infeed queue from preprocessed batch.

  One infeed queue is built per infeed host (every TPU host when
  `p.use_per_host_infeed` is set, otherwise a single host). The grouped
  enqueue op is stored in `self._tpu_infeed_op` and added
  `p.tpu_infeed_parallelism` times to the `py_utils.ENQUEUE_OPS` collection.

  Returns:
    The last preprocessed batch structure packed with the tensors dequeued
    from the first queue.

  Raises:
    AssertionError: If the cluster reports no TPU hosts, if not running on
      TPU, if the infeed was already created, or if any batch tensor's shape
      is not fully defined.
  """
  p = self.params
  cluster = self.cluster
  num_tpu_hosts = cluster.num_tpu_hosts
  # Validate before dividing: a host count of zero must surface as this
  # descriptive assertion instead of a ZeroDivisionError on the next line.
  assert num_tpu_hosts > 0, ('num_tpu_hosts: %d' % num_tpu_hosts)
  num_cores_per_host = cluster.total_worker_devices // num_tpu_hosts
  tf.logging.info(
      'CreateTPUFeeds num_splits_per_client={} '
      'num_devices_per_split={} num_tpu_hosts={} use_per_host_infeed={}'.
      format(cluster.num_splits_per_client, cluster.num_devices_per_split,
             num_tpu_hosts, p.use_per_host_infeed))

  if (cluster.num_devices_per_split > num_cores_per_host and
      p.use_per_host_infeed):
    tf.logging.fatal(
        'Doesn\'t support per host infeed mode when '
        'num_devices_per_split({}) > num_cores_per_host({})'.format(
            cluster.num_devices_per_split, num_cores_per_host))
  num_infeed_hosts = num_tpu_hosts if p.use_per_host_infeed else 1

  with py_utils.outside_all_rewrites():
    assert py_utils.use_tpu()
    assert not self._made_tpu_infeed

    shards = tpu_function.get_tpu_context(
    ).number_of_shards // num_infeed_hosts
    tf.logging.info('shards {}'.format(shards))

    input_ops_list = []
    queues = []
    tpu_embedding_collection = tf.get_collection(py_utils.TPU_EMBEDDING)
    tpu_embedding = (
        tpu_embedding_collection[0] if tpu_embedding_collection else None)

    if num_tpu_hosts > 1 and tpu_embedding is not None:
      if not p.use_per_host_infeed:
        tf.logging.fatal(
            'TPU Embedding must be used with per_host_infeed with multiple '
            'TPU host topologies.')
    tpu_emb_input_keys = (
        list(tpu_embedding.feature_to_config_dict.keys())
        if tpu_embedding is not None else [])
    tf.logging.info('tpu_emb_input_keys: %r', tpu_emb_input_keys)

    batch = None
    for task_id in range(num_infeed_hosts):
      host_device = '/task:{}/device:CPU:0'.format(task_id)
      with tf.device(host_device):
        batch = self.GetPreprocessedInputBatch()
        if isinstance(batch, py_utils.NestedMap):
          # Hack: bucket_keys and xxx.bucket_keys are not needed on TPU.
          # Note that when MultiTaskData is used, bucket_keys will be at the
          # second level of the dictionary.
          batch = batch.FilterKeyVal(
              lambda k, _: not k.endswith('bucket_keys'))
        tf.logging.info('host_device: %s, batch: %r', host_device, batch)

        if tpu_embedding is not None:
          enqueue_dict_per_core = [
              {} for _ in range(tpu_embedding.num_cores_per_host)
          ]
          num_cores_per_host = tpu_embedding.num_cores_per_host
          for key in tpu_emb_input_keys:
            feat = batch[key]
            tpu_emb_feat_splitted = tf.split(feat, num_cores_per_host)
            for core, split in enumerate(tpu_emb_feat_splitted):
              # Dense to sparse. Note the assumption of a padding id.
              sample_indices = tf.where(tf.not_equal(split, -1))
              embedding_indices = tf.gather_nd(split, sample_indices)
              enqueue_data = tpu_embedding_lib.EnqueueData(
                  embedding_indices, sample_indices)
              enqueue_dict_per_core[core][key] = enqueue_data
          input_ops_list += tpu_embedding.generate_enqueue_ops(
              enqueue_dict_per_core)

        for k, x in batch.FlattenItems():
          assert x.shape.is_fully_defined(), (
              'Shape must be fully defined: %s: %s' % (k, x))
          # TODO(cwhipkey): if it's a string (or other type not supported on
          # TPU), drop it from feeding and on the other end add in an op that
          # fails if used.
        shapes = batch.Transform(lambda x: x.shape).Flatten()
        dtypes = batch.Transform(lambda x: x.dtype).Flatten()
        tf.logging.info('host_device: %s infeed shapes: %r', host_device,
                        shapes)
        tf.logging.info('host_device: %s infeed dtypes: %r', host_device,
                        dtypes)

        if p.use_partitioned_infeed_queue:
          device_assignment = py_utils.GetTpuDeviceAssignment()

          # From here on host_device refers to the device assignment's host
          # for replica 0, not the per-task CPU device chosen above.
          host_device = device_assignment.host_device(
              replica=0, job=tf.flags.FLAGS.tf_master)
          host_id = int(
              host_device.split('/task:')[1].split('/device:')[0])
          tf.logging.info('host_id: {} host_device: {}'.format(
              host_id, host_device))
          q = tpu_feed._PartitionedInfeedQueue(  # pylint: disable=protected-access
              number_of_tuple_elements=len(dtypes),
              device_assignment=device_assignment,
              host_id=host_id,
              input_partition_dims=[[p.num_partitions, 1] for _ in dtypes],
              tuple_types=dtypes,
              tuple_shapes=shapes)
        else:
          q = tpu_feed.InfeedQueue(tuple_types=dtypes, tuple_shapes=shapes)
          assert shards is not None
          q.set_number_of_shards(shards)

        queues.append(q)
        tf.logging.info('q=%r', q)

        if p.use_partitioned_infeed_queue:
          input_ops = q.generate_enqueue_ops([batch.Flatten()])
        elif p.use_per_host_infeed:

          # TODO(ylc/zhifengc): Add this to a policy module and test it.
          def TPUOrdinalFunction(shard_index_in_host):
            device_assignment = py_utils.GetTpuDeviceAssignment()
            if device_assignment:
              # We put both enqueue/dequeue ops at core 0 in each replica.
              replica = device_assignment.lookup_replicas(
                  task_id, 0)[shard_index_in_host]  # pylint: disable=cell-var-from-loop
              return device_assignment.tpu_ordinal(replica=replica)
            else:
              return shard_index_in_host

          input_ops = q.split_inputs_and_generate_enqueue_ops(
              batch.Flatten(),
              placement_function=lambda x: host_device,  # pylint: disable=cell-var-from-loop
              tpu_ordinal_function=TPUOrdinalFunction)
        else:
          input_ops = q.split_inputs_and_generate_enqueue_ops(
              batch.Flatten(),
              device_assignment=py_utils.GetTpuDeviceAssignment())

        input_ops_list += input_ops
    tf.logging.info('input_ops_list %s', input_ops_list)
    tpu_infeed_op = tf.group(*input_ops_list)
  self._made_tpu_infeed = True
  # Let trainer.py use multiple threads to drive the infeed op.
  for _ in range(p.tpu_infeed_parallelism):
    tf.add_to_collection(py_utils.ENQUEUE_OPS, tpu_infeed_op)

  self._tpu_infeed_op = tpu_infeed_op

  with tf.device(tf.tpu.core(0)):
    tensors = queues[0].generate_dequeue_op()
  return batch.Pack(tensors)
def _AddTpuEmbeddingSummaryTensor(name, value, weight=1.0):
  """Records a (name, value, weight) summary entry for TPU embeddings.

  Args:
    name: First element of the recorded tuple.
    value: Second element of the recorded tuple.
    weight: Converted with `tf.convert_to_tensor` before being recorded.
      Defaults to 1.0.
  """
  weight_tensor = tf.convert_to_tensor(weight)
  summary_entry = (name, value, weight_tensor)
  tf.add_to_collection(py_utils.TPU_EMBEDDING_SUMMARY_TENSORS, summary_entry)
def CreateTpuFeeds(self):
  """Creates the TPU infeed queue from preprocessed batch.

  One infeed queue is built per infeed host (every TPU host when
  `p.use_per_host_infeed` is set, otherwise a single host). The grouped
  enqueue op is stored in `self.tpu_infeed_op` and added
  `p.tpu_infeed_parallelism` times to the `py_utils.ENQUEUE_OPS` collection.

  Returns:
    The last preprocessed batch structure packed with the tensors dequeued
    from the first queue.

  Raises:
    AssertionError: If the cluster reports no TPU hosts, if not running on
      TPU, if the infeed was already created, or if any batch tensor's shape
      is not fully defined.
  """
  p = self.params
  cluster = self.cluster
  num_tpu_hosts = cluster.num_tpu_hosts
  # Validate before dividing: a host count of zero must surface as this
  # descriptive assertion instead of a ZeroDivisionError on the next line.
  assert num_tpu_hosts > 0, ('num_tpu_hosts: %d' % num_tpu_hosts)
  num_cores_per_host = cluster.total_worker_devices // num_tpu_hosts
  tf.logging.info('num_cores_per_host {}'.format(num_cores_per_host))
  tf.logging.info('num_devices_per_split {}'.format(
      cluster.num_devices_per_split))

  if (cluster.num_devices_per_split > num_cores_per_host and
      p.use_per_host_infeed):
    tf.logging.fatal(
        'Doesn\'t support per host infeed mode when '
        'num_devices_per_split({}) > num_cores_per_host({})'.format(
            cluster.num_devices_per_split, num_cores_per_host))
  num_infeed_hosts = num_tpu_hosts if p.use_per_host_infeed else 1

  with py_utils.outside_all_rewrites():
    assert py_utils.use_tpu()
    assert not self._made_tpu_infeed

    shards = tpu_function.get_tpu_context(
    ).number_of_shards // num_infeed_hosts
    input_ops_list = []
    queues = []
    tpu_embedding_collection = tf.get_collection(py_utils.TPU_EMBEDDING)
    tpu_embedding = (
        tpu_embedding_collection[0] if tpu_embedding_collection else None)

    tpu_emb_input_keys = (
        list(tpu_embedding.feature_to_config_dict.keys())
        if tpu_embedding is not None else [])
    tf.logging.info('tpu_emb_input_keys: %r', tpu_emb_input_keys)

    batch = None
    for task_id in range(num_infeed_hosts):
      host_device = '/task:{}/device:CPU:0'.format(task_id)
      with tf.device(host_device):
        batch = self.GetPreprocessedInputBatch()
        if 'bucket_keys' in batch:
          # Hack: bucket_keys are not needed on TPU.
          del batch['bucket_keys']
        tf.logging.info('host_device: %s, batch: %r', host_device, batch)

        if tpu_embedding is not None:
          enqueue_dict_per_core = [
              {} for _ in range(tpu_embedding.num_cores_per_host)
          ]
          num_cores_per_host = tpu_embedding.num_cores_per_host
          for key in tpu_emb_input_keys:
            feat = batch[key]
            tpu_emb_feat_splitted = tf.split(feat, num_cores_per_host)
            for core, split in enumerate(tpu_emb_feat_splitted):
              # Dense to sparse. Note the assumption of a padding id.
              sample_indices = tf.where(tf.not_equal(split, -1))
              embedding_indices = tf.gather_nd(split, sample_indices)
              enqueue_data = tpu_embedding_lib.EnqueueData(
                  embedding_indices, sample_indices)
              enqueue_dict_per_core[core][key] = enqueue_data
          input_ops_list += tpu_embedding.generate_enqueue_ops(
              enqueue_dict_per_core)

        for k, x in batch.FlattenItems():
          assert x.shape.is_fully_defined(), (
              'Shape must be fully defined: %s: %s' % (k, x))
          # TODO(cwhipkey): if it's a string (or other type not supported on
          # TPU), drop it from feeding and on the other end add in an op that
          # fails if used.
        shapes = batch.Transform(lambda x: x.shape).Flatten()
        dtypes = batch.Transform(lambda x: x.dtype).Flatten()
        tf.logging.info('host_device: %s infeed shapes: %r', host_device,
                        shapes)
        tf.logging.info('host_device: %s infeed dtypes: %r', host_device,
                        dtypes)
        q = tpu_feed.InfeedQueue(tuple_types=dtypes, tuple_shapes=shapes)
        queues.append(q)
        assert shards is not None
        q.set_number_of_shards(shards)

        if p.use_per_host_infeed:

          # TODO(ylc/zhifengc): Add this to a policy module and test it.
          def TPUOrdinalFunction(shard_index_in_host):
            device_assignment = py_utils.GetTpuDeviceAssignment()
            if device_assignment:
              # We put both enqueue/dequeue ops at core 0 in each replica.
              replica = device_assignment.lookup_replicas(
                  task_id, 0)[shard_index_in_host]  # pylint: disable=cell-var-from-loop
              return device_assignment.tpu_ordinal(replica=replica)
            else:
              return shard_index_in_host

          input_ops = q.split_inputs_and_generate_enqueue_ops(
              batch.Flatten(),
              placement_function=lambda x: host_device,  # pylint: disable=cell-var-from-loop
              tpu_ordinal_function=TPUOrdinalFunction)
        else:
          input_ops = q.split_inputs_and_generate_enqueue_ops(
              batch.Flatten(),
              device_assignment=py_utils.GetTpuDeviceAssignment())

        input_ops_list += input_ops
    tf.logging.info('input_ops_list %s', input_ops_list)
    tpu_infeed_op = tf.group(*input_ops_list)
  self._made_tpu_infeed = True
  # Let trainer.py use multiple threads to drive the infeed op.
  for _ in range(p.tpu_infeed_parallelism):
    tf.add_to_collection(py_utils.ENQUEUE_OPS, tpu_infeed_op)
  # For executor-driven multiple programs, we need more fine-grained
  # access rather than using a single global graph collection.
  self.tpu_infeed_op = tpu_infeed_op

  with tf.device(tf.tpu.core(0)):
    tensors = queues[0].generate_dequeue_op()
  return batch.Pack(tensors)
def AddToPruningCollections(weight,
                            mask,
                            threshold,
                            gradient=None,
                            old_weight=None,
                            old_old_weight=None):
  """Registers pruning variables in their collections, once per mask.

  Nothing is added when `mask` is already present in
  `pruning.MASK_COLLECTION`.

  Args:
    weight: Added to `pruning.WEIGHT_COLLECTION`.
    mask: Added to `pruning.MASK_COLLECTION`.
    threshold: Added to `pruning.THRESHOLD_COLLECTION`.
    gradient: Optional; when given, `old_weight` and `old_old_weight` must be
      given too, and all three are added to their collections.
    old_weight: The weight tensor one step before.
    old_old_weight: The weight tensor two steps before.
  """
  already_registered = mask in tf.get_collection(pruning.MASK_COLLECTION)
  if not already_registered:
    base_entries = (
        (pruning.WEIGHT_COLLECTION, weight),
        (pruning.MASK_COLLECTION, mask),
        (pruning.THRESHOLD_COLLECTION, threshold),
    )
    for collection_name, item in base_entries:
      tf.add_to_collection(collection_name, item)

    # gradient, old_weight and old_old_weight feed the collections used for
    # approximating the gradient and hessian.
    if gradient is not None:
      assert old_weight is not None
      assert old_old_weight is not None
      history_entries = (
          (pruning.WEIGHT_GRADIENT_COLLECTION, gradient),
          (pruning.OLD_WEIGHT_COLLECTION, old_weight),
          (pruning.OLD_OLD_WEIGHT_COLLECTION, old_old_weight),
      )
      for collection_name, item in history_entries:
        tf.add_to_collection(collection_name, item)