def _generate_enqueue_op(self, inputs, name_prefix, index, device=None,
                         tpu_ordinal=-1):
    """Generate a host-side Op to enqueue a tuple to the queue.

    If device is None the inputs are all required to have the same device
    specification, and the enqueue Op is colocated with inputs[0]. Otherwise
    the enqueue Op is placed on 'device'.

    Args:
      inputs: a list of Tensors with the types and shapes of the tuple
        elements.
      name_prefix: the base name for the Op.
      index: the shard index, used to uniquify the Op name.
      device: device to place the Op on, or None if it should be colocated
        with the inputs.
      tpu_ordinal: ordinal of the TPU device on the host to use for infeed if
        device is a CPU device. Should be set to -1 if device is a TPU
        device.

    Returns:
      An Op corresponding to a shard of infeed enqueued at the host, suitable
      for use within a replicated block.

    Raises:
      ValueError: if device is None and inputs do not all have the same
        device specification.
    """
    full_name = "%s/%d" % (name_prefix, index)
    shapes = [t.shape for t in inputs]
    if device is None:
        devices = [t.device for t in inputs]
        # Compare every tuple element's device against the first one.
        for i in range(1, self.number_of_tuple_elements):
            if devices[0] != devices[i]:
                # Interpolate the message here: passing the arguments
                # separately to ValueError leaves the "%d"/"%s" placeholders
                # unformatted in the reported error.
                raise ValueError(
                    "input devices for shard %d are %s, but should all be "
                    "the same" % (index, str(devices)))
        placement = ops.colocate_with(inputs[0])
    else:
        placement = ops.device(device)
    with placement:
        return tpu_ops.infeed_enqueue_tuple(
            inputs=inputs,
            shapes=shapes,
            name=full_name,
            device_ordinal=tpu_ordinal)
def build_infeed_from_input_specs(self, input_specs, execution_mode):
    """Create per-shard infeed placeholders and the ops that enqueue them.

    For each of the `self._strategy.num_towers` shards, one placeholder per
    entry of `input_specs` is created under a `/device:TPU:<shard>` scope
    nested inside a `/device:CPU:0` scope. A single `infeed_enqueue_tuple`
    op targeting device ordinal `<shard>` is then created under the
    `/device:CPU:0` scope.

    Args:
      input_specs: specs of the tensors to feed; each must expose `dtype`,
        `shape` and `name`.
      execution_mode: value interpolated into the enqueue op names.

    Returns:
      A `SizedInfeed` holding the enqueue ops and the per-shard placeholder
      lists.
    """
    enqueue_ops = []
    placeholders_by_shard = []

    for shard in range(self._strategy.num_towers):
        with ops.device('/device:CPU:0'):
            with ops.device('/device:TPU:%d' % shard):
                # Construct placeholders for each of the inputs.
                shard_placeholders = [
                    array_ops.placeholder(
                        dtype=spec.dtype,
                        shape=spec.shape,
                        name='infeed-enqueue-%s-%d' % (spec.name, shard))
                    for spec in input_specs
                ]
            placeholders_by_shard.append(shard_placeholders)

            spec_shapes = [spec.shape for spec in input_specs]
            enqueue_name = 'infeed-enqueue-%s-%d' % (execution_mode, shard)
            enqueue_ops.append(
                tpu_ops.infeed_enqueue_tuple(
                    shard_placeholders,
                    spec_shapes,
                    name=enqueue_name,
                    device_ordinal=shard))

    return SizedInfeed(
        infeed_ops=enqueue_ops,
        sharded_infeed_tensors=placeholders_by_shard)
def infeed_input(i):
    """Fetch the next element, split it in two and enqueue each half.

    The element comes from the enclosing scope's `iterator`. Half `j` is
    enqueued with device ordinal `j` and a declared shape of [1, 1, 1].

    Args:
      i: the current loop counter.

    Returns:
      `i + 1`, computed under a control dependency on both enqueue ops.
    """
    next_batch = iterator.get_next()
    halves = array_ops.split(next_batch, 2)

    enqueue_ops = []
    for ordinal in range(2):
        enqueue_ops.append(
            tpu_ops.infeed_enqueue_tuple(
                inputs=[halves[ordinal]],
                shapes=[[1, 1, 1]],
                device_ordinal=ordinal))

    # Tie the counter increment to the enqueues so they run first.
    with ops.control_dependencies(enqueue_ops):
        next_step = i + 1
    return next_step
def build_infeed_from_input_specs(self, input_specs, execution_mode):
    """Create one infeed enqueue op per shard from `self._get_next_ops`.

    `self._get_next_ops` is expected to hold one entry per tower. Entry
    `<shard>` is enqueued by an `infeed_enqueue_tuple` op created under the
    `/device:TPU:<shard>` scope.

    Args:
      input_specs: specs of the fed tensors; only `shape` is read here.
      execution_mode: value interpolated into the enqueue op names.

    Returns:
      A `SizedInfeed` holding the enqueue ops and `self._get_next_ops`.
    """
    per_shard_tensors = self._get_next_ops
    assert len(per_shard_tensors) == self._strategy.num_towers

    enqueue_ops = []
    for shard in range(self._strategy.num_towers):
        device_name = '/device:TPU:%d' % shard
        with ops.device(device_name):
            spec_shapes = [spec.shape for spec in input_specs]
            enqueue_name = 'infeed-enqueue-%s-%d' % (execution_mode, shard)
            enqueue = tpu_ops.infeed_enqueue_tuple(
                per_shard_tensors[shard], spec_shapes, name=enqueue_name)
            enqueue_ops.append(enqueue)

    return SizedInfeed(
        infeed_ops=enqueue_ops,
        sharded_infeed_tensors=per_shard_tensors)
def build_infeed_from_input_specs(self, input_specs, execution_mode):
    """Build the per-shard infeed enqueue ops for `self._get_next_ops`.

    One `infeed_enqueue_tuple` op is created per tower, under the matching
    `/device:TPU:<shard>` scope, and fed with entry `<shard>` of
    `self._get_next_ops`.

    Args:
      input_specs: specs of the fed tensors; only `shape` is read here.
      execution_mode: value interpolated into the enqueue op names.

    Returns:
      A `SizedInfeed` holding the enqueue ops and `self._get_next_ops`.
    """
    tensors_by_shard = self._get_next_ops
    # One tensor list is required for every tower.
    assert len(tensors_by_shard) == self._strategy.num_towers

    infeed_enqueues = []
    for shard_index in range(self._strategy.num_towers):
        with ops.device('/device:TPU:%d' % shard_index):
            shapes_for_shard = [spec.shape for spec in input_specs]
            op_name = 'infeed-enqueue-%s-%d' % (execution_mode, shard_index)
            infeed_enqueues.append(
                tpu_ops.infeed_enqueue_tuple(
                    tensors_by_shard[shard_index],
                    shapes_for_shard,
                    name=op_name))

    return SizedInfeed(
        infeed_ops=infeed_enqueues,
        sharded_infeed_tensors=tensors_by_shard)
def infeed_input(i):
    """Fetch iteration `i` from every feed and enqueue it core by core.

    Each feed returned by the enclosing scope's `feeds()` is asked for its
    value at `i`, and that value is indexed by core id. The values for one
    core across all feeds form the tuple enqueued with that core's device
    ordinal.

    Args:
      i: the current loop counter.

    Returns:
      `i + 1`, computed under a control dependency on all enqueue ops.
    """
    per_feed_values = [feed.get(i) for feed in feeds()]

    # Regroup from "per feed, per core" to "per core, per feed".
    per_core_values = []
    for core in range(self._num_cores_per_host):
        per_core_values.append(
            [feed_values[core] for feed_values in per_feed_values])

    enqueue_ops = [
        tpu_ops.infeed_enqueue_tuple(
            inputs=core_values, shapes=shapes, device_ordinal=core)
        for core, core_values in enumerate(per_core_values)
    ]

    with ops.control_dependencies(enqueue_ops):
        next_step = i + 1
    return next_step
def infeed_input(i):
    """Read step `i` of every feed and enqueue one tuple per core.

    Feeds come from the enclosing scope's `feeds()`. The value a feed
    returns for `i` is indexed by core id, and the per-core items of all
    feeds are enqueued together using the core id as device ordinal.

    Args:
      i: the current loop counter.

    Returns:
      `i + 1`, computed under a control dependency on all enqueue ops.
    """
    values_by_feed = [source.get(i) for source in feeds()]

    # Transpose: gather, for every core, its item from each feed.
    values_by_core = []
    for core_id in range(self._num_cores_per_host):
        core_tuple = [per_core[core_id] for per_core in values_by_feed]
        values_by_core.append(core_tuple)

    enqueues = [
        tpu_ops.infeed_enqueue_tuple(
            inputs=core_tuple, shapes=shapes, device_ordinal=core_id)
        for core_id, core_tuple in enumerate(values_by_core)
    ]

    with ops.control_dependencies(enqueues):
        incremented = i + 1
    return incremented
def enqueue_ops_fn():
    """Build the infeed enqueue ops for one iteration.

    Under the `self._host` device scope, `iterator.get_next()` is called
    once per core and flattened. Each call is made under control
    dependencies on all previously fetched tensors. One
    `infeed_enqueue_tuple` op is then created per core, using the core id
    as device ordinal.

    Returns:
      The list of enqueue ops, one per core.
    """
    ordering_deps = []
    inputs_by_core = []
    with ops.device(self._host):
        for _ in range(self._num_cores_per_host):
            # Use control dependencies to ensure a deterministic ordering.
            with ops.control_dependencies(ordering_deps):
                flat_inputs = nest.flatten(iterator.get_next())
            ordering_deps.extend(flat_inputs)
            inputs_by_core.append(flat_inputs)

    return [
        tpu_ops.infeed_enqueue_tuple(
            inputs=core_inputs, shapes=shapes, device_ordinal=core_id)
        for core_id, core_inputs in enumerate(inputs_by_core)
    ]
def build_infeed_from_input_specs(self, input_specs, execution_mode):
    """Create per-shard infeed placeholders and their enqueue ops.

    For each of the `self._strategy.num_towers` shards, and under the
    `/device:TPU:<shard>` scope, one placeholder is created per entry of
    `input_specs`, followed by one `infeed_enqueue_tuple` op that enqueues
    them.

    Args:
      input_specs: specs of the tensors to feed; each must expose `dtype`,
        `shape` and `name`.
      execution_mode: value interpolated into the enqueue op names.

    Returns:
      A `SizedInfeed` holding the enqueue ops and the per-shard placeholder
      lists.
    """
    enqueue_ops = []
    placeholders_by_shard = []

    for shard in range(self._strategy.num_towers):
        with ops.device('/device:TPU:%d' % shard):
            # Construct placeholders for each of the inputs.
            shard_placeholders = [
                array_ops.placeholder(
                    dtype=spec.dtype,
                    shape=spec.shape,
                    name='infeed-enqueue-%s-%d' % (spec.name, shard))
                for spec in input_specs
            ]
            placeholders_by_shard.append(shard_placeholders)

            spec_shapes = [spec.shape for spec in input_specs]
            enqueue_name = 'infeed-enqueue-%s-%d' % (execution_mode, shard)
            enqueue_ops.append(
                tpu_ops.infeed_enqueue_tuple(
                    shard_placeholders, spec_shapes, name=enqueue_name))

    return SizedInfeed(
        infeed_ops=enqueue_ops,
        sharded_infeed_tensors=placeholders_by_shard)
def _specialize_model(self, input_specs):
    """Specialize `self.model` (a Keras model) for the given input shapes.

    Sets the Keras learning phase from `self.execution_mode`, builds the TPU
    computation with `tpu.split_compile_and_replicate`, and creates, for
    every replica, the placeholders plus the infeed-enqueue and
    outfeed-dequeue ops used to exchange data with it.

    As a side effect `self._outfeed_spec` is reset and then filled in by
    `_model_fn` while the computation is being built.

    Args:
      input_specs: specs of the tensors sent through infeed; each must
        expose `dtype`, `shape` and `name`.

    Returns:
      A `TPUModelOp` bundling the compile op, the execute op, the per-shard
      infeed placeholders, the infeed enqueue ops and the outfeed dequeue
      ops.
    """
    # Re-create our input and output layers inside our subgraph.  They will be
    # attached to the true computation when we clone our model in `tpu_fn`.
    K.set_learning_phase(self.execution_mode == model_fn_lib.ModeKeys.TRAIN)

    # functools.partial and callable objects are not supported by tpu.rewrite
    def _model_fn():
        """Compute fit/eval/predict for the TPU."""
        is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN
        is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL
        is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT

        # During train/eval, we infeed our features as well as labels.
        if is_training or is_test:
            infeed_layers = (
                self.model._input_layers + self.model._output_layers)
        else:
            infeed_layers = self.model._input_layers

        # Generate our infeed operation to read features & labels.
        infeed_tensors = tpu_ops.infeed_dequeue_tuple(
            dtypes=[spec.dtype for spec in input_specs],
            shapes=[spec.shape for spec in input_specs],
            name='infeed-%s' % self.execution_mode)

        # The message is %-formatted here; a (format, args) tuple would be
        # reported raw, without interpolation.
        assert len(infeed_tensors) == len(infeed_layers), (
            'Infeed inputs did not match model: %s vs %s' %
            (infeed_layers, infeed_tensors))

        tpu_targets = []
        tpu_inputs = []

        # Sort infeed outputs into inputs and labels for calling our Keras
        # model.
        for tensor, layer in zip(infeed_tensors, infeed_layers):
            if layer in self.model._input_layers:
                tpu_inputs.append(
                    layers.Input(name=layer.name, tensor=tensor))
            if layer in self.model._output_layers:
                tpu_targets.append(tensor)

        # Call our model with our infeed inputs (re-using the weights).
        model_outputs = self.model(tpu_inputs)
        child_model = models.Model(inputs=tpu_inputs, outputs=model_outputs)

        if is_training or is_test:
            child_model.compile(
                optimizer=_replicated_optimizer(self.model.optimizer,
                                                self.num_replicas),
                loss=self.model.loss,
                loss_weights=self.model.loss_weights,
                metrics=self.model.metrics,
                weighted_metrics=self.model.weighted_metrics,
                target_tensors=tpu_targets,
            )

        # Compute our outfeed depending on the execution mode
        if is_training:
            child_model._make_train_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in child_model.train_function.outputs
            ]
            return [
                child_model.train_function.updates_op,
                tpu_ops.outfeed_enqueue_tuple(
                    child_model.train_function.outputs,
                    name='outfeed-enqueue-train')
            ]
        elif is_test:
            child_model._make_test_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in child_model.test_function.outputs
            ]
            return [
                tpu_ops.outfeed_enqueue_tuple(
                    child_model.test_function.outputs,
                    name='outfeed-enqueue-test')
            ]
        elif is_predict:
            child_model._make_predict_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in child_model.predict_function.outputs
            ]
            return [
                tpu_ops.outfeed_enqueue_tuple(
                    child_model.predict_function.outputs,
                    name='outfeed-enqueue-predict',
                )
            ]
        else:
            assert False, (
                'Unexpected execution mode: %s' % self.execution_mode)

    # Capture outfeed metadata computed during the rewrite.
    self._outfeed_spec = None

    # Generate out TPU operations using `tpu.split_compile_and_replicate`.
    # `compile_op` can be used to test the TPU model compiles before
    # execution.
    # `execute op` replicates `_model_fn` `num_replicas` times, with each
    # shard running on a different logical core.
    compile_op, execute_op = tpu.split_compile_and_replicate(
        _model_fn, inputs=[[]] * self.num_replicas)

    # Generate CPU side operations to enqueue features/labels and dequeue
    # outputs from the model call.
    infeed_op = []
    outfeed_op = []
    shard_infeed_tensors = []

    for shard_id in range(self.num_replicas):
        with ops.device('/device:TPU:%d' % shard_id):
            infeed_tensors = []
            for spec in input_specs:
                infeed_tensors.append(
                    array_ops.placeholder(
                        dtype=spec.dtype,
                        shape=spec.shape,
                        name='infeed-enqueue-%s-%d' % (spec.name, shard_id)))
            shard_infeed_tensors.append(infeed_tensors)

            infeed_op.append(
                tpu_ops.infeed_enqueue_tuple(
                    infeed_tensors, [spec.shape for spec in input_specs],
                    name='infeed-enqueue-%s-%d' % (self.execution_mode,
                                                   shard_id)))

            # `_model_fn` has populated `self._outfeed_spec` by now.
            outfeed_op.extend(
                tpu_ops.outfeed_dequeue_tuple(
                    dtypes=[spec.dtype for spec in self._outfeed_spec],
                    shapes=[spec.shape for spec in self._outfeed_spec],
                    name='outfeed-dequeue-%s-%d' % (self.execution_mode,
                                                    shard_id)))

    return TPUModelOp(
        compile_op,
        execute_op,
        infeed_tensors=shard_infeed_tensors,
        infeed_op=infeed_op,
        outfeed_op=outfeed_op)
def _specialize_model(self, input_specs):
    """Specialize `self.model` (a Keras model) for the given input shapes.

    Sets the Keras learning phase from `self.execution_mode`, builds the TPU
    computation with `tpu.split_compile_and_replicate` around a clone of
    `self.model`, and creates, for every tower, the placeholders plus the
    infeed-enqueue and outfeed-dequeue ops used to exchange data with it.

    As side effects, `self._cloned_model` is set to the clone and
    `self._outfeed_spec` is reset and then filled in by `_model_fn` while
    the computation is being built.

    Args:
      input_specs: specs of the tensors sent through infeed; each must
        expose `dtype`, `shape` and `name`.

    Returns:
      A `TPUModelOp` bundling the compile op, the execute op, the per-shard
      infeed placeholders, the infeed enqueue ops and the outfeed dequeue
      ops.
    """
    # Re-create our input and output layers inside our subgraph.  They will be
    # attached to the true computation when we clone our model in `tpu_fn`.
    K.set_learning_phase(self.execution_mode == model_fn_lib.ModeKeys.TRAIN)

    # functools.partial and callable objects are not supported by tpu.rewrite
    def _model_fn():
        """Compute fit/eval/predict for the TPU."""
        is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN
        is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL
        is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT

        # During train/eval, we infeed our features as well as labels.
        if is_training or is_test:
            infeed_layers = (
                self.model._input_layers + self.model._output_layers)
        else:
            infeed_layers = self.model._input_layers

        # Generate our infeed operation to read features & labels.
        infeed_tensors = tpu_ops.infeed_dequeue_tuple(
            dtypes=[spec.dtype for spec in input_specs],
            shapes=[spec.shape for spec in input_specs],
            name='infeed-%s' % self.execution_mode)

        # The message is %-formatted here; a (format, args) tuple would be
        # reported raw, without interpolation.
        assert len(infeed_tensors) == len(infeed_layers), (
            'Infeed inputs did not match model: %s vs %s' %
            (infeed_layers, infeed_tensors))

        tpu_targets = []
        tpu_input_map = {}

        # Sort infeed outputs into inputs and labels for calling our Keras
        # model.
        for tensor, layer in zip(infeed_tensors, infeed_layers):
            if layer in self.model._input_layers:
                tpu_input_map[layer.name] = tensor
            if layer in self.model._output_layers:
                tpu_targets.append(tensor)

        # Clone our CPU model, running within the TPU device context.
        with TPURewriteContext(tpu_input_map):
            self._cloned_model = models.clone_model(self.model)

        # Create a copy of the optimizer for this graph.
        if isinstance(self.model.optimizer, keras_optimizers.TFOptimizer):
            cloned_optimizer = keras_optimizers.TFOptimizer(
                self.model.optimizer.optimizer)
        else:
            logging.info('Cloning %s %s',
                         self.model.optimizer.__class__.__name__,
                         self._optimizer_config)
            cloned_optimizer = self.model.optimizer.__class__.from_config(
                self._optimizer_config)

        if is_training or is_test:
            self._cloned_model.compile(
                optimizer=_replicated_optimizer(cloned_optimizer),
                loss=self.model.loss,
                loss_weights=self.model.loss_weights,
                metrics=self.model.metrics,
                weighted_metrics=self.model.weighted_metrics,
                target_tensors=tpu_targets,
            )

        # Compute our outfeed depending on the execution mode
        if is_training:
            self._cloned_model._make_train_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in self._cloned_model.train_function.outputs
            ]
            return [
                self._cloned_model.train_function.updates_op,
                tpu_ops.outfeed_enqueue_tuple(
                    self._cloned_model.train_function.outputs,
                    name='outfeed-enqueue-train')
            ]
        elif is_test:
            self._cloned_model._make_test_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in self._cloned_model.test_function.outputs
            ]
            return [
                tpu_ops.outfeed_enqueue_tuple(
                    self._cloned_model.test_function.outputs,
                    name='outfeed-enqueue-test')
            ]
        elif is_predict:
            self._cloned_model._make_predict_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in self._cloned_model.predict_function.outputs
            ]
            return [
                tpu_ops.outfeed_enqueue_tuple(
                    self._cloned_model.predict_function.outputs,
                    name='outfeed-enqueue-predict',
                )
            ]
        else:
            assert False, (
                'Unexpected execution mode: %s' % self.execution_mode)

    # Capture outfeed metadata computed during the rewrite.
    self._outfeed_spec = None

    # Generate out TPU operations using `tpu.split_compile_and_replicate`.
    # `compile_op` can be used to test the TPU model compiles before
    # execution.
    # `execute op` replicates `_model_fn` `num_replicas` times, with each
    # shard running on a different logical core.
    compile_op, execute_op = tpu.split_compile_and_replicate(
        _model_fn, inputs=[[]] * self._strategy.num_towers)

    # Generate CPU side operations to enqueue features/labels and dequeue
    # outputs from the model call.
    infeed_op = []
    outfeed_op = []
    shard_infeed_tensors = []

    for shard_id in range(self._strategy.num_towers):
        with ops.device('/device:TPU:%d' % shard_id):
            infeed_tensors = []
            for spec in input_specs:
                infeed_tensors.append(
                    array_ops.placeholder(
                        dtype=spec.dtype,
                        shape=spec.shape,
                        name='infeed-enqueue-%s-%d' % (spec.name, shard_id)))
            shard_infeed_tensors.append(infeed_tensors)

            infeed_op.append(
                tpu_ops.infeed_enqueue_tuple(
                    infeed_tensors, [spec.shape for spec in input_specs],
                    name='infeed-enqueue-%s-%d' % (self.execution_mode,
                                                   shard_id)))

            # `_model_fn` has populated `self._outfeed_spec` by now.
            outfeed_op.extend(
                tpu_ops.outfeed_dequeue_tuple(
                    dtypes=[spec.dtype for spec in self._outfeed_spec],
                    shapes=[spec.shape for spec in self._outfeed_spec],
                    name='outfeed-dequeue-%s-%d' % (self.execution_mode,
                                                    shard_id)))

    return TPUModelOp(
        compile_op,
        execute_op,
        infeed_tensors=shard_infeed_tensors,
        infeed_op=infeed_op,
        outfeed_op=outfeed_op)
def generate_enqueue_ops(self, per_host_sharded_inputs):
    """Build the host-side Ops that enqueue the partitioned inputs.

    `per_host_sharded_inputs` holds one list of Tensors per replica:
    `per_host_sharded_inputs[i]` is the tuple fed to replica i, and its j-th
    element is partitioned by `self._input_partition_dims[j]`.

    For example, if an element is the 2-D Tensor
      [[A, B, C, D],
       [E, F, G, H]]
    and its partition dims are [2, 4], it is partitioned and flattened into
      [A, B, C, D, E, F, G, H]
    and the pieces are fed to the logical core ids
      [0, 1, 2, 3, 4, 5, 6, 7]
    respectively.

    Args:
      per_host_sharded_inputs: a list of lists of Tensors. The length of the
        outer list determines the number of shards. Each inner list
        indicates the types and shapes of the tuples in the corresponding
        shard.

    Returns:
      A list of host-side Ops, one for each shard, that when executed
      together will enqueue a full-size element of infeed.

    Raises:
      ValueError: if the queue configuration has previously been frozen and
        the shapes of the elements of sharded_inputs are not compatible with
        the frozen configuration; or if the shapes of the elements of
        sharded_inputs don't form a consistent unsharded tuple; or if the
        elements of a tuple have different device constraints; or if the
        partition dims are invalid.
      TypeError: if the queue configuration has previously been frozen and
        the types of the elements of sharded_inputs are not compatible with
        the frozen configuration; or if the types of the elements of
        sharded_inputs don't form a consistent unsharded tuple.
    """
    self.set_configuration_from_sharded_input_tensors(
        per_host_sharded_inputs)
    replica_count = len(per_host_sharded_inputs)
    tuple_size = len(per_host_sharded_inputs[0])

    # One partition spec is required per tuple element.
    assert len(self._input_partition_dims) == tuple_size

    enqueue_ops = []
    for replica_index in range(replica_count):
        replica_inputs = per_host_sharded_inputs[replica_index]
        partition_dims = nest.flatten_up_to(replica_inputs,
                                            self._input_partition_dims)
        part_iterators = [
            iter(self._partition_or_replicate_on_host(tensor, dims))
            for tensor, dims in zip(replica_inputs, partition_dims)
        ]

        core_count = self._device_assignment.num_cores_per_replica
        for logical_core in xrange(core_count):
            # Places different partitions to different logic cores.
            replicas_on_host = self._device_assignment.lookup_replicas(
                self._host_id, logical_core)
            replica_id = replicas_on_host[replica_index]
            ordinal = self._device_assignment.tpu_ordinal(
                replica=replica_id, logical_core=logical_core)

            # Advance every element's iterator once; exhausted iterators
            # contribute nothing to this core.
            next_parts = [next(parts, None) for parts in part_iterators]
            core_inputs = [part for part in next_parts if part is not None]
            if not core_inputs:
                continue

            enqueue_ops.append(
                tpu_ops.infeed_enqueue_tuple(
                    inputs=core_inputs,
                    shapes=[part.shape for part in core_inputs],
                    name="enqueue/replica_{0}/input_{1}".format(
                        replica_index, logical_core),
                    device_ordinal=ordinal))
    return enqueue_ops
def _specialize_model(self, input_specs):
    """Specialize `self.model` (a Keras model) for the given input shapes.

    Sets the Keras learning phase from `self.execution_mode`, rewrites
    `_model_fn` for the TPU with `tpu.rewrite`, initializes variables in the
    Keras session, and creates the placeholders plus the infeed-enqueue and
    outfeed-dequeue ops under the `/device:TPU:0` scope.

    As side effects, the model optimizer's `iterations` is pointed at the
    global step and `self._outfeed_spec` is reset and then filled in by
    `_model_fn` during the rewrite.

    Args:
      input_specs: specs of the tensors sent through infeed; each must
        expose `dtype`, `shape` and `name`.

    Returns:
      A `CompiledTPUOp` built from the TPU execute op, the infeed
      placeholders, the infeed enqueue op and the outfeed dequeue op.
    """
    # Re-create our input and output layers inside our subgraph.  They will be
    # attached to the true computation when we clone our model in `tpu_fn`.
    K.set_learning_phase(self.execution_mode == model_fn_lib.ModeKeys.TRAIN)

    # functools.partial and callable objects are not supported by tpu.rewrite
    def _model_fn():
        """Compute fit/eval/predict for the TPU."""
        is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN
        is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL
        is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT

        # During train/eval, we infeed our features as well as labels.
        if is_training or is_test:
            infeed_layers = (
                self.model._input_layers + self.model._output_layers)
        else:
            infeed_layers = self.model._input_layers

        # Generate our infeed operation to read features & labels.
        infeed_tensors = tpu_ops.infeed_dequeue_tuple(
            dtypes=[spec.dtype for spec in input_specs],
            shapes=[spec.shape for spec in input_specs],
            name='infeed-%s' % self.execution_mode)

        # The message is %-formatted here; a (format, args) tuple would be
        # reported raw, without interpolation.
        assert len(infeed_tensors) == len(infeed_layers), (
            'Infeed inputs did not match model: %s vs %s' %
            (infeed_layers, infeed_tensors))

        tpu_targets = []
        tpu_inputs = []

        # Sort infeed outputs into inputs and labels for calling our Keras
        # model.
        for tensor, layer in zip(infeed_tensors, infeed_layers):
            if layer in self.model._input_layers:
                tpu_inputs.append(
                    layers.Input(name=layer.name, tensor=tensor))
            if layer in self.model._output_layers:
                tpu_targets.append(tensor)

        optimizer = self.model.optimizer
        optimizer.iterations = training_util.get_or_create_global_step()

        # Call our model with our infeed inputs (re-using the weights).
        model_outputs = self.model(tpu_inputs)
        child_model = models.Model(inputs=tpu_inputs, outputs=model_outputs)
        if is_training or is_test:
            child_model.compile(
                optimizer=self.model.optimizer,
                loss=self.model.loss,
                loss_weights=self.model.loss_weights,
                metrics=self.model.metrics,
                weighted_metrics=self.model.weighted_metrics,
                target_tensors=tpu_targets,
            )

        # Compute our outfeed depending on the execution mode
        if is_training:
            child_model._make_train_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in child_model.train_function.outputs
            ]
            return [
                child_model.train_function.updates_op,
                # Name spelled like its test/predict counterparts.
                tpu_ops.outfeed_enqueue_tuple(
                    child_model.train_function.outputs,
                    name='outfeed-enqueue-train')
            ]
        elif is_test:
            child_model._make_test_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in child_model.test_function.outputs
            ]
            return [
                tpu_ops.outfeed_enqueue_tuple(
                    child_model.test_function.outputs,
                    name='outfeed-enqueue-test')
            ]
        elif is_predict:
            child_model._make_predict_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in child_model.predict_function.outputs
            ]
            return [
                tpu_ops.outfeed_enqueue_tuple(
                    child_model.predict_function.outputs,
                    name='outfeed-enqueue-predict',
                )
            ]
        else:
            assert False, (
                'Unexpected execution mode: %s' % self.execution_mode)

    # Capture outfeed metadata computed during the rewrite.
    self._outfeed_spec = None

    tpu_execute_op = tpu.rewrite(_model_fn)

    K._initialize_variables(K.get_session())  # pylint: disable=protected-access

    # Generate CPU side operations to enqueue features/labels and dequeue
    # outputs from the model call.
    with ops.device('/device:TPU:0'):
        infeed_tensors = []
        for spec in input_specs:
            infeed_tensors.append(
                array_ops.placeholder(
                    dtype=spec.dtype,
                    shape=spec.shape,
                    name='infeed-enqueue-%s' % spec.name))

        infeed_op = tpu_ops.infeed_enqueue_tuple(
            infeed_tensors, [spec.shape for spec in input_specs],
            name='infeed-enqueue-%s' % self.execution_mode)

        # `_model_fn` has populated `self._outfeed_spec` by now.
        outfeed_op = tpu_ops.outfeed_dequeue_tuple(
            dtypes=[spec.dtype for spec in self._outfeed_spec],
            shapes=[spec.shape for spec in self._outfeed_spec],
            name='outfeed-dequeue-%s' % self.execution_mode)

    return CompiledTPUOp(tpu_execute_op, infeed_tensors, infeed_op,
                         outfeed_op)
def _specialize_model(self, input_specs):
    """Specialize `self.model` (a Keras model) for the given input shapes.

    Sets the Keras learning phase from `self.execution_mode`, rewrites
    `_model_fn` for the TPU with `tpu.rewrite`, and creates the placeholders
    plus the infeed-enqueue and outfeed-dequeue ops under the
    `/device:TPU:0` scope.

    As a side effect `self._outfeed_spec` is reset and then filled in by
    `_model_fn` during the rewrite.

    Args:
      input_specs: specs of the tensors sent through infeed; each must
        expose `dtype`, `shape` and `name`.

    Returns:
      A `CompiledTPUOp` built from the TPU execute op, the infeed
      placeholders, the infeed enqueue op and the outfeed dequeue op.
    """
    # Re-create our input and output layers inside our subgraph.  They will be
    # attached to the true computation when we clone our model in `tpu_fn`.
    K.set_learning_phase(self.execution_mode == model_fn_lib.ModeKeys.TRAIN)

    # functools.partial and callable objects are not supported by tpu.rewrite
    def _model_fn():
        """Compute fit/eval/predict for the TPU."""
        is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN
        is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL
        is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT

        # During train/eval, we infeed our features as well as labels.
        if is_training or is_test:
            infeed_layers = (
                self.model._input_layers + self.model._output_layers)
        else:
            infeed_layers = self.model._input_layers

        # Generate our infeed operation to read features & labels.
        infeed_tensors = tpu_ops.infeed_dequeue_tuple(
            dtypes=[spec.dtype for spec in input_specs],
            shapes=[spec.shape for spec in input_specs],
            name='infeed-%s' % self.execution_mode)

        # The message is %-formatted here; a (format, args) tuple would be
        # reported raw, without interpolation.
        assert len(infeed_tensors) == len(infeed_layers), (
            'Infeed inputs did not match model: %s vs %s' %
            (infeed_layers, infeed_tensors))

        tpu_targets = []
        tpu_inputs = []

        # Sort infeed outputs into inputs and labels for calling our Keras
        # model.
        for tensor, layer in zip(infeed_tensors, infeed_layers):
            if layer in self.model._input_layers:
                tpu_inputs.append(
                    layers.Input(name=layer.name, tensor=tensor))
            if layer in self.model._output_layers:
                tpu_targets.append(tensor)

        # Call our model with our infeed inputs (re-using the weights).
        model_outputs = self.model(tpu_inputs)
        child_model = models.Model(inputs=tpu_inputs, outputs=model_outputs)
        if is_training or is_test:
            child_model.compile(
                optimizer=self.model.optimizer,
                loss=self.model.loss,
                loss_weights=self.model.loss_weights,
                metrics=self.model.metrics,
                weighted_metrics=self.model.weighted_metrics,
                target_tensors=tpu_targets,
            )

        # Compute our outfeed depending on the execution mode
        if is_training:
            child_model._make_train_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in child_model.train_function.outputs
            ]
            return [
                child_model.train_function.updates_op,
                # Name spelled like its test/predict counterparts.
                tpu_ops.outfeed_enqueue_tuple(
                    child_model.train_function.outputs,
                    name='outfeed-enqueue-train')
            ]
        elif is_test:
            child_model._make_test_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in child_model.test_function.outputs
            ]
            return [
                tpu_ops.outfeed_enqueue_tuple(
                    child_model.test_function.outputs,
                    name='outfeed-enqueue-test')
            ]
        elif is_predict:
            child_model._make_predict_function()
            self._outfeed_spec = [
                tensor_spec.TensorSpec(tensor.shape, tensor.dtype,
                                       tensor.name)
                for tensor in child_model.predict_function.outputs
            ]
            return [
                tpu_ops.outfeed_enqueue_tuple(
                    child_model.predict_function.outputs,
                    name='outfeed-enqueue-predict',
                )
            ]
        else:
            assert False, (
                'Unexpected execution mode: %s' % self.execution_mode)

    # Capture outfeed metadata computed during the rewrite.
    self._outfeed_spec = None

    tpu_execute_op = tpu.rewrite(_model_fn)

    # Generate CPU side operations to enqueue features/labels and dequeue
    # outputs from the model call.
    with ops.device('/device:TPU:0'):
        infeed_tensors = []
        for spec in input_specs:
            infeed_tensors.append(
                array_ops.placeholder(
                    dtype=spec.dtype,
                    shape=spec.shape,
                    name='infeed-enqueue-%s' % spec.name))

        infeed_op = tpu_ops.infeed_enqueue_tuple(
            infeed_tensors, [spec.shape for spec in input_specs],
            name='infeed-enqueue-%s' % self.execution_mode)

        # `_model_fn` has populated `self._outfeed_spec` by now.
        outfeed_op = tpu_ops.outfeed_dequeue_tuple(
            dtypes=[spec.dtype for spec in self._outfeed_spec],
            shapes=[spec.shape for spec in self._outfeed_spec],
            name='outfeed-dequeue-%s' % self.execution_mode)

    return CompiledTPUOp(tpu_execute_op, infeed_tensors, infeed_op,
                         outfeed_op)
def generate_enqueue_ops(self, per_host_sharded_inputs):
    """Build the host-side Ops that enqueue the partitioned inputs.

    `per_host_sharded_inputs` holds one list of Tensors per replica:
    `per_host_sharded_inputs[i]` is the tuple fed to replica i, and its j-th
    element is partitioned by `self._input_partition_dims[j]`.

    For example, if an element is the 2-D Tensor
      [[A, B, C, D],
       [E, F, G, H]]
    and its partition dims are [2, 4], it is partitioned and flattened into
      [A, B, C, D, E, F, G, H]
    and the pieces are fed to the logical core ids
      [0, 1, 2, 3, 4, 5, 6, 7]
    respectively.

    Args:
      per_host_sharded_inputs: a list of lists of Tensors. The length of the
        outer list determines the number of shards. Each inner list
        indicates the types and shapes of the tuples in the corresponding
        shard.

    Returns:
      A list of host-side Ops, one for each shard, that when executed
      together will enqueue a full-size element of infeed.

    Raises:
      ValueError: if the queue configuration has previously been frozen and
        the shapes of the elements of sharded_inputs are not compatible with
        the frozen configuration; or if the shapes of the elements of
        sharded_inputs don't form a consistent unsharded tuple; or if the
        elements of a tuple have different device constraints; or if the
        partition dims are invalid.
      TypeError: if the queue configuration has previously been frozen and
        the types of the elements of sharded_inputs are not compatible with
        the frozen configuration; or if the types of the elements of
        sharded_inputs don't form a consistent unsharded tuple.
    """
    self.set_configuration_from_sharded_input_tensors(
        per_host_sharded_inputs)
    replica_count = len(per_host_sharded_inputs)
    tuple_size = len(per_host_sharded_inputs[0])

    # One partition spec is required per tuple element.
    assert len(self._input_partition_dims) == tuple_size

    enqueue_ops = []
    for replica_index in range(replica_count):
        replica_inputs = per_host_sharded_inputs[replica_index]
        partition_dims = nest.flatten_up_to(replica_inputs,
                                            self._input_partition_dims)
        part_iterators = [
            iter(self._partition_or_replicate_on_host(tensor, dims))
            for tensor, dims in zip(replica_inputs, partition_dims)
        ]

        core_count = self._device_assignment.num_cores_per_replica
        for core_index in xrange(core_count):
            # Places different partitions to different logic cores.
            logical_core = self._get_logical_core(core_index)
            replicas_on_host = self._device_assignment.lookup_replicas(
                self._host_id, logical_core)
            replica_id = replicas_on_host[replica_index]
            ordinal = self._device_assignment.tpu_ordinal(
                replica=replica_id, logical_core=logical_core)

            # Advance every element's iterator once; exhausted iterators
            # contribute nothing to this core.
            next_parts = [next(parts, None) for parts in part_iterators]
            core_inputs = [part for part in next_parts if part is not None]
            if not core_inputs:
                continue

            # The op name uses the loop index, not the mapped logical core.
            enqueue_ops.append(
                tpu_ops.infeed_enqueue_tuple(
                    inputs=core_inputs,
                    shapes=[part.shape for part in core_inputs],
                    name="enqueue/replica_{0}/input_{1}".format(
                        replica_index, core_index),
                    device_ordinal=ordinal))
    return enqueue_ops