def set_weights(distribution_strategy, dist_model, weights):
    """Copies the original model's weights into the replicated model.

    `weights` is a flat list ordered layer by layer; each layer of
    `dist_model` consumes as many entries as it has weights. In eager mode
    every variable is assigned immediately. Otherwise the assign ops are
    unwrapped through the strategy, collected, and run together in one
    Keras session call.

    Args:
        distribution_strategy: DistributionStrategy used to distribute
            training and validation.
        dist_model: The replicated models on the different devices.
        weights: The weights of the original model.
    """
    pending_assigns = []
    offset = 0
    for layer in dist_model.layers:
        layer_vars = layer.weights
        count = len(layer_vars)
        # Pair each variable of this layer with its slice of the flat list.
        for variable, value in zip(layer_vars, weights[offset:offset + count]):
            if not ops.executing_eagerly_outside_functions():
                pending_assigns.append(
                    distribution_strategy.unwrap(variable.assign(value)))
            else:
                variable.assign(value)
        offset += count

    if ops.executing_eagerly_outside_functions():
        return
    K.get_session(pending_assigns).run(pending_assigns)
def setup_tpu_session(master):
    """Initializes and returns a Keras/TF session connected to the TPU `master`.

    The session is created with `isolate_session_state=True`, registered as
    the Keras session, and then used to run the TPU system initialization op.

    Args:
        master: Target passed to the session constructor.

    Returns:
        The newly created session.
    """
    isolated_config = config_pb2.ConfigProto(isolate_session_state=True)
    session = tf_session.Session(target=master, config=isolated_config)
    K.set_session(session)
    keras_session = K.get_session()
    keras_session.run(tpu.initialize_system())
    return session
def _get_var_for_numpy(distribution_strategy, input_array):
    """Creates a variable and assigns the value of the numpy array to it.

    The variable is created zero-filled on the CPU device returned by
    `get_cpu_device`, initialized, and then filled slice by slice along the
    first (batch) axis so that no single session call carries much more than
    64 MB of data.

    Args:
        distribution_strategy: The DistributionStrategy used to compile the
            model.
        input_array: The input numpy array whose value will be assigned to
            the variable we create.

    Returns:
        The variable to which we will copy the value of the input numpy
        array.
    """
    with ops.device(get_cpu_device(distribution_strategy)):
        # Create and initialize a variable on the CPU device. This is the CPU
        # device of the host in the case of TPUDistributionStrategy.
        input_var = variables.VariableV1(
            array_ops.zeros(input_array.shape, input_array.dtype),
            trainable=False,
            use_resource=True)
    K.get_session().run(input_var.initializer)

    # Create a placeholder for the numpy array input slices. We copy the value
    # of the input numpy array to the variable in slices of size 64 MB to
    # avoid running into memory issues or RPC message limits.
    start_placeholder = array_ops.placeholder(dtypes.int64, ())
    end_placeholder = array_ops.placeholder(dtypes.int64, ())
    slice_placeholder = array_ops.placeholder(input_var.dtype)
    assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
        slice_placeholder)

    # If each batch element is > 64 MB, then we copy each batch element
    # individually. Otherwise, the slices will be < 128 MB. There might be
    # padding which might mean that the slices are 128 MB even if the size of
    # the tensor allocated is less than 128 MB.
    # This formula gives slices with size:
    # ceil(64 MB / byte size per batch element) bytes.
    # Using ceil() guarantees we get a number >= 1.

    # Calculate the size of each batch element. Clamp to at least one byte so
    # that a zero-sized trailing dimension cannot cause a division by zero.
    byte_size_per_batch_element = max(
        np.prod(input_array.shape[1:]) * input_var.dtype.size, 1)

    # Calculate number of elements we want to copy per slice. `np.ceil`
    # returns a float, which is not a valid slice index, so convert to int.
    # The float() forces true division regardless of the division mode.
    batch_size_per_slice = int(
        np.ceil(float(64 << 20) / byte_size_per_batch_element))

    # Copy slices of the above size starting at 0, except the last slice will
    # be smaller.
    start = 0
    limit = input_array.shape[0]
    while start < limit:
        end = min(start + batch_size_per_slice, limit)
        K.get_session().run(
            assign_slice_op,
            feed_dict={
                start_placeholder: start,
                end_placeholder: end,
                slice_placeholder: input_array[start:end],
            })
        start = end

    return input_var
def _init_writer(self):
    """Creates `self.writer` for the current execution mode.

    In eager mode a v2 summary file writer is created. In graph mode a
    `FileWriter` is created, and it also receives the Keras session graph
    when `self.write_graph` is set.
    """
    if context.executing_eagerly():
        writer = summary_ops_v2.create_file_writer(self.log_dir)
    elif not self.write_graph:
        writer = tf_summary.FileWriter(self.log_dir)
    else:
        writer = tf_summary.FileWriter(self.log_dir, K.get_session().graph)
    self.writer = writer
def set_model(self, model):
    """Sets Keras model and creates summary ops.

    Stores `model` and the current Keras session on the callback. When
    `histogram_freq` is set and no merged summary op exists yet, it registers
    for every layer: a histogram per weight, optionally an image per weight
    (`write_images`), optionally a gradient histogram per trainable weight
    (`write_grads`), and a histogram of the layer output when the layer has
    one. Finally all summaries are merged into `self.merged` and the writer
    is created, with the session graph when `write_graph` is set.

    Args:
        model: The Keras model to attach to. Its `layers`, `optimizer` and
            `total_loss` are read while building the summaries.
    """
    self.model = model
    self.sess = K.get_session()
    # only make histogram summary op if it hasn't already been made
    if self.histogram_freq and self.merged is None:
        for layer in self.model.layers:
            for weight in layer.weights:
                # ':' is replaced with '_' before the name is used as a
                # summary tag.
                mapped_weight_name = weight.name.replace(':', '_')
                tf_summary.histogram(mapped_weight_name, weight)
                if self.write_images:
                    # Bring the weight into a 4-D layout before it is handed
                    # to the image summary; the branch taken depends on the
                    # rank left after squeezing.
                    w_img = array_ops.squeeze(weight)
                    shape = K.int_shape(w_img)
                    if len(shape) == 2:  # dense layer kernel case
                        if shape[0] > shape[1]:
                            w_img = array_ops.transpose(w_img)
                            shape = K.int_shape(w_img)
                        w_img = array_ops.reshape(w_img,
                                                  [1, shape[0], shape[1], 1])
                    elif len(shape) == 3:  # convnet case
                        if K.image_data_format() == 'channels_last':
                            # switch to channels_first to display
                            # every kernel as a separate image
                            w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                            shape = K.int_shape(w_img)
                        w_img = array_ops.reshape(
                            w_img, [shape[0], shape[1], shape[2], 1])
                    elif len(shape) == 1:  # bias case
                        w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
                    else:
                        # not possible to handle 3D convnets etc.
                        continue

                    # Sanity check: 4-D with a last dimension of 1, 3 or 4.
                    shape = K.int_shape(w_img)
                    assert len(shape) == 4 and shape[-1] in [1, 3, 4]
                    tf_summary.image(mapped_weight_name, w_img)

            if self.write_grads:
                for weight in layer.trainable_weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    grads = model.optimizer.get_gradients(model.total_loss,
                                                          weight)

                    # Gradients whose type is named 'IndexedSlices' are
                    # replaced by their `.values` before being summarized.
                    def is_indexed_slices(grad):
                        return type(grad).__name__ == 'IndexedSlices'

                    grads = [grad.values if is_indexed_slices(grad) else grad
                             for grad in grads]
                    tf_summary.histogram('{}_grad'.format(mapped_weight_name),
                                         grads)

            if hasattr(layer, 'output'):
                tf_summary.histogram('{}_out'.format(layer.name),
                                     layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
        self.writer = self._writer_class(self.log_dir, self.sess.graph)
    else:
        self.writer = self._writer_class(self.log_dir)
def __call__(self, inputs):
    """Runs the TPU model on `inputs` and returns the outfeed outputs.

    The inputs are split into shards, an input specification is derived from
    the first shard, and the model ops specialized for that set of shapes
    are looked up (or built, cached and compile-tested on a cache miss).
    The infeed, execute and outfeed ops are then run in the Keras session.

    Args:
        inputs: List of input arrays. In TRAIN and EVAL mode, entries beyond
            the model's feed inputs and targets are dropped.

    Returns:
        The first `len(outfeed_outputs) // self.num_replicas` outfeed
        outputs.
    """
    assert isinstance(inputs, list)

    # Strip sample weight from inputs
    takes_targets = self.execution_mode in (model_fn_lib.ModeKeys.TRAIN,
                                            model_fn_lib.ModeKeys.EVAL)
    if takes_targets:
        input_tensors = self.model._feed_inputs + self.model._feed_targets
        inputs = inputs[:len(input_tensors)]
    else:
        input_tensors = self.model._feed_inputs

    shard_inputs = self._split_tensors(inputs)
    del inputs  # To avoid accident usage.

    # Compute an input specification (used to generate infeed enqueue and
    # dequeue operations). We use the shape from our input array and the
    # dtype from our model. A user may pass in a float64 for a float32
    # input: for model compatibility we still must generate a float32 infeed.
    # We use the shape and dtype from the first shard to compute the input
    # metadata (`input_specs`); all replicas have the same type and shape.
    first_shard = shard_inputs[0]
    input_specs = [
        tensor_spec.TensorSpec(ary.shape, tensor.dtype,
                               _valid_name(tensor.name))
        for tensor, ary in zip(input_tensors, first_shard)
    ]

    # XLA requires every operation in the graph has a fixed shape. To
    # handle varying batch sizes we recompile a new sub-graph for each
    # unique input shape.
    shape_key = tuple(tuple(spec.shape.as_list()) for spec in input_specs)

    if shape_key not in self._compilation_cache:
        logging.info('New input shapes; (re-)compiling: mode=%s, %s',
                     self.execution_mode, input_specs)
        new_tpu_model_ops = self._specialize_model(input_specs)
        # The ops are cached before the compile test runs.
        self._compilation_cache[shape_key] = new_tpu_model_ops
        self._test_model_compiles(new_tpu_model_ops)

    tpu_model_ops = self._compilation_cache[shape_key]

    # Map every infeed tensor of every shard to the value fed for it.
    infeed_dict = {}
    for infeed_tensors, shard_values in zip(tpu_model_ops.infeed_tensors,
                                            shard_inputs):
        infeed_dict.update(zip(infeed_tensors, shard_values))

    session = K.get_session()
    fetches = [
        tpu_model_ops.infeed_op,
        tpu_model_ops.execute_op,
        tpu_model_ops.outfeed_op,
    ]
    _, _, outfeed_outputs = session.run(fetches, infeed_dict)

    # TODO(xiejw): Decide how to reduce outputs, or just discard all but first.
    keep = len(outfeed_outputs) // self.num_replicas
    return outfeed_outputs[:keep]
def tpu_session(self):
    """Yields a TPU session and sets it as the default Keras session.

    The Keras session that was active on entry is restored afterwards, also
    when the code consuming the yielded session raises.

    Yields:
        `self._session`, installed both as the Keras session and as the
        TensorFlow default session.
    """
    with self._graph.as_default():
        default_session = K.get_session()
        # N.B. We have to call `K.set_session()` AND set our session as the
        # TF default. `K.get_session()` surprisingly does not return the value
        # supplied by K.set_session otherwise.
        K.set_session(self._session)
        try:
            with self._session.as_default():
                yield self._session
        finally:
            # Without the `finally`, an exception raised at the `yield` would
            # leave the TPU session installed as the Keras session.
            K.set_session(default_session)
def setup_tpu_session(tpu_name_or_address):
    """Initializes and returns a Keras/TF session connected to the TPU.

    A `TPUClusterResolver` built from `tpu_name_or_address` supplies the
    session target and, when available, the cluster spec. The session is
    registered with Keras and then used to run the TPU initialization op.

    Args:
        tpu_name_or_address: A string that is either the name of the Cloud
            TPU, the grpc address of the Cloud TPU, or (Googlers only) the
            BNS name of the Cloud TPU. If tpu_name_or_address is None, the
            TPUClusterResolver will examine the environment to determine a
            potential Cloud TPU to use.

    Returns:
        A `tf.Session`.
    """
    resolver = tpu_cluster_resolver.TPUClusterResolver(tpu_name_or_address)
    cluster_spec = resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(isolate_session_state=True)
    session = tf_session.Session(
        target=resolver.master(), config=session_config)

    if cluster_spec:
        cluster_def = cluster_spec.as_cluster_def()
        session.cluster_def.CopyFrom(cluster_def)

    K.set_session(session)
    keras_session = K.get_session()
    keras_session.run(tpu.initialize_system())
    return session
def _test_model_compiles(self, tpu_model_ops):
    """Verifies that the given TPUModelOp can be compiled via XLA.

    Runs `tpu_model_ops.compile_op` in the Keras session and parses the
    result as a `CompilationResultProto`.

    Args:
        tpu_model_ops: Object exposing the `compile_op` to run.

    Raises:
        RuntimeError: If the compilation result carries an error message.
    """
    logging.info('Started compiling')
    # `time.clock()` was removed in Python 3.8 (and measured CPU time on
    # Unix); wall-clock time is what "time elapsed" should report.
    start_time = time.time()

    result = K.get_session().run(tpu_model_ops.compile_op)
    proto = tpu_compilation_result.CompilationResultProto()
    proto.ParseFromString(result)
    if proto.status_error_message:
        raise RuntimeError('Compilation failed: {}'.format(
            proto.status_error_message))

    end_time = time.time()
    logging.info('Finished compiling. Time elapsed: %s secs',
                 end_time - start_time)
def shutdown_tpu_session(session=None):
    """Shuts down the TPU system attached to `session`.

    This should be called to cleanly shut down the TPU system before the
    client exits.

    Args:
        session: Session to shutdown, or None to use the default Keras
            session.

    Returns:
        None.
    """
    target_session = K.get_session() if session is None else session
    target_session.run(tpu.shutdown_system())
def _experimental_fit_loop(
        model,
        iterator,
        epochs=100,
        initial_epoch=0,
        steps_per_epoch=None):
    """fit function when using TPU DistributionStrategy for training.

    Builds a distributed train step, copies the original model's weights to
    the replicated model, runs the training op for the requested number of
    epochs and steps, and finally copies the weights back.

    Arguments:
        model: Keras Model instance.
        iterator: Iterator that returns inputs and targets
        epochs: Number of times to iterate over the data
        initial_epoch: Epoch at which to start training
            (useful for resuming a previous training run)
        steps_per_epoch: Total number of steps (batches of samples)
            before declaring one epoch finished and starting the
            next epoch. Must not be `None`.

    Returns:
        Returns `None`.

    Raises:
        AssertionError: if `steps_per_epoch` is `None`.
    """
    current_strategy = model._distribution_strategy

    # TODO(priyag): Add validation that shapes are fully defined for TPU case.

    # TODO(priyag, sourabhbajaj): This should be moved into a callback instead.
    K.get_session().run(current_strategy.initialize())

    def _per_device_train_function(model):
        model._make_train_function()
        return (model.train_function.inputs,
                model.train_function.outputs,
                model.train_function.updates_op,
                model.train_function.session_kwargs)

    # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
    K.set_learning_phase(1)

    def step_fn(ctx, inputs, targets):
        """Clones the model and calls make_train_function."""
        # TODO(priyag, sourabhbajaj): Should cache this keyed on input shapes.
        clone_model_on_towers(
            model,
            current_strategy,
            make_callback_model=True,
            inputs=inputs,
            targets=targets)

        (grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args) = current_strategy.call_for_each_tower(
             _per_device_train_function, model._grouped_model)
        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args, with_loss_tensor=True)
        combined_fn = K.Function(
            all_inputs, all_outputs,
            updates=all_updates,
            name='distributed_train_function',
            **all_session_args)

        # TODO(priyag, sourabhbajaj): Perhaps the aggregation type needs to be
        # something else for different outputs.
        out_labels = model.metrics_names or []
        for label, output in zip(out_labels, combined_fn.outputs):
            ctx.set_last_step_output(
                label, output,
                aggregation=distribute_lib.get_loss_reduction())

        # TODO(priyag, sourabhbajaj): Ignoring these things from the
        # combined_fn: feed_dict, session kwargs, run options, run_metadata
        # for now. These should be handled appropriately
        return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
        initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    with current_strategy.scope():
        # TODO(priyag, sourabhbajaj): Adjust steps_per_run appropriately based
        # on steps_per_epoch and number of epochs.
        ctx = current_strategy.run_steps_on_dataset(
            step_fn, iterator, iterations=current_strategy.steps_per_run,
            initial_loop_values=initial_loop_values)

    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    # Copy the weights from the original model to each of the replicated
    # models.
    orig_model_weights = model.get_weights()
    with current_strategy.scope():
        distributed_model = current_strategy.unwrap(model._grouped_model)[0]
        distributed_training_utils.set_weights(
            current_strategy, distributed_model, orig_model_weights)

    assert steps_per_epoch is not None

    # TODO(priyag, sourabhbajaj): Add callbacks support.
    # TODO(priyag, sourabhbajaj): Add validation.
    for epoch in range(initial_epoch, epochs):
        for step_index in range(
                0, steps_per_epoch, current_strategy.steps_per_run):
            try:
                _, outs = K.get_session().run([train_op, output_tensors])
                # TODO(priyag, sourabhbajaj): Remove this logging in favor of
                # proper summaries through callbacks.
                print('Epoch: {}, step_index: {}, loss: {}'.format(
                    epoch, step_index, outs['loss']))
                for label, out in outs.items():
                    print(label, ': ', out)
            except errors.OutOfRangeError:
                # The batch count is passed as a logging argument. The former
                # `'...' % steps_per_epoch * epochs` formatted the message
                # with `steps_per_epoch` and then repeated the whole string
                # `epochs` times, because `%` and `*` share precedence.
                logging.warning(
                    'Your dataset iterator ran out of data; '
                    'interrupting training. Make sure that your dataset '
                    'can generate at least `steps_per_epoch * epochs` '
                    'batches (in this case, %d batches).',
                    steps_per_epoch * epochs)
                # NOTE: this only leaves the step loop; the epoch loop goes on.
                break

    # Copy the weights back from the replicated model to the original model.
    with current_strategy.scope():
        updated_weights = current_strategy.unwrap(
            model._grouped_model)[0].get_weights()
        model.set_weights(updated_weights)

    K.get_session().run(current_strategy.finalize())
default="model_data/yolo_anchors.txt") parser.add_argument('-m', '--model', type=str, help='Load a pretrained model from h5 file', default="model_data/yolo.h5") parser.add_argument('-i', '--image_file', type=str, help='Test image file', default="images/test.jpg") args = parser.parse_args() if __name__ == '__main__': sess = K.get_session() class_names = read_classes(args.classes) anchors = read_anchors(args.anchors) image_shape = tf.constant([720., 1280.]) yolo_model = load_model(args.model) yolo_outputs = yolo_head(yolo_model.output, anchors, len(class_names)) boxes, scores, classes = yolo_eval(yolo_outputs, image_shape) image, image_data = preprocess_image("images/" + args.image_file, model_image_size=(608, 608)) out_scores, out_boxes, out_classes = sess.run([scores, boxes, classes], feed_dict={ yolo_model.input: image_data,
def on_epoch_end(self, epoch, logs=None):
    """Checks if summary ops should run next epoch, logs scalar summaries.

    Writes the epoch logs as custom summaries, removes the merged histogram
    summary op from the model's test function, and, every `embeddings_freq`
    epochs, feeds `embeddings_data` through the model in batches to update
    and checkpoint the embedding variables.

    Args:
        epoch: Index of the epoch that just ended.
        logs: Dict of metric name to value for this epoch. `None` is treated
            as an empty dict.

    Raises:
        ValueError: If `embeddings_freq` is set but no `embeddings_data`
            was provided.
    """
    # `logs` defaults to None; fall back to an empty dict so the
    # comprehension below does not fail on `None.items()`.
    logs = logs or {}

    # don't output batch_size and
    # batch number as TensorBoard summaries
    logs = {('epoch_' + k): v
            for k, v in logs.items()
            if k not in ['batch', 'size', 'num_steps']}
    if self.update_freq == 'epoch':
        step = epoch
    else:
        step = self._samples_seen
    self._write_custom_summaries(step, logs)

    # pop the histogram summary op after each epoch
    if self.histogram_freq:
        # pylint: disable=protected-access
        if self.merged in self.model.test_function.fetches:
            self.model.test_function.fetches.remove(self.merged)
        if self.merged in self.model.test_function.fetch_callbacks:
            self.model.test_function.fetch_callbacks.pop(self.merged)
        # pylint: enable=protected-access

    if self.embeddings_data is None and self.embeddings_freq:
        raise ValueError('To visualize embeddings, embeddings_data must '
                         'be provided.')

    if self.embeddings_freq and self.embeddings_data is not None:
        if epoch % self.embeddings_freq == 0:
            # We need a second forward-pass here because we're passing
            # the `embeddings_data` explicitly. This design allows to pass
            # arbitrary data as `embeddings_data` and results from the fact
            # that we need to know the size of the `tf.Variable`s which
            # hold the embeddings in `set_model`. At this point, however,
            # the `validation_data` is not yet set.
            embeddings_data = self.embeddings_data
            n_samples = embeddings_data[0].shape[0]
            i = 0
            sess = K.get_session()
            while i < n_samples:
                # The last batch may be shorter than `batch_size`.
                step = min(self.batch_size, n_samples - i)
                batch = slice(i, i + step)

                if isinstance(self.model.input, list):
                    feed_dict = {
                        model_input: embeddings_data[idx][batch]
                        for idx, model_input in enumerate(self.model.input)
                    }
                else:
                    feed_dict = {self.model.input: embeddings_data[0][batch]}

                feed_dict.update({self.batch_id: i, self.step: step})

                # Only feed the learning phase when it is not a plain int.
                if not isinstance(K.learning_phase(), int):
                    feed_dict[K.learning_phase()] = False

                sess.run(self.assign_embeddings, feed_dict=feed_dict)
                self.saver.save(sess,
                                os.path.join(self.log_dir,
                                             'keras_embedding.ckpt'),
                                epoch)

                i += self.batch_size
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
    """Fit loop for training with TPU tf.distribute.Strategy.

    Arguments:
        model: Keras Model instance.
        dataset: Dataset that returns inputs and targets
        epochs: Number of times to iterate over the data
        verbose: Integer, Verbosity mode, 0, 1 or 2
        callbacks: List of callbacks to be called during training
        initial_epoch: Epoch at which to start training
            (useful for resuming a previous training run)
        steps_per_epoch: Total number of steps (batches of samples)
            before declaring one epoch finished and starting the
            next epoch. It is used in arithmetic below, so a value is
            required even though the default is `None`.
        val_dataset: Dataset for validation data.
        validation_steps: Number of steps to run validation for
            (only if doing validation from data tensors).
            Ignored with the default value of `None`.
        validation_freq: Only relevant if validation data is provided.
            Integer or `collections.abc.Container` instance (e.g. list,
            tuple, etc.). If an integer, specifies how many training epochs
            to run before a new validation run is performed, e.g.
            `validation_freq=2` runs validation every 2 epochs. If a
            Container, specifies the epochs on which to run validation, e.g.
            `validation_freq=[1, 2, 10]` runs validation at the end of the
            1st, 2nd, and 10th epochs.

    Returns:
        `model.history`.

    Raises:
        ValueError: in case of invalid arguments.
    """
    mode = ModeKeys.TRAIN

    current_strategy = model._distribution_strategy
    iteration_value = min(steps_per_epoch,
                          current_strategy.extended.steps_per_run)
    # Kept in a variable so the number of steps per run can be re-assigned
    # between session calls (see the loop below).
    steps_per_run = K.variable(
        value=iteration_value,
        dtype='int32',
        name='steps_per_run')

    # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
    iterator = dist_utils.get_iterator(dataset, current_strategy)

    scope = dist_utils.distributed_scope(
        strategy=current_strategy, learning_phase=1)
    scope.__enter__()

    out_labels = model.metrics_names or []

    step_fn = _make_train_step_fn(model, ModeKeys.TRAIN, current_strategy,
                                  out_labels)

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for m in model._get_training_eval_metrics():
        tensor = m.result()
        initial_loop_values[m.name] = array_ops.zeros(tensor.shape,
                                                      tensor.dtype)

    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn, iterator, iterations=steps_per_run,
        initial_loop_values=initial_loop_values)
    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    do_validation = bool(validation_steps)

    if model._compile_distribution:
        dist_utils._copy_weights_to_distributed_model(model, mode)

    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=do_validation,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        verbose=verbose,
        count_mode='steps',
        mode=mode)

    # Calculate the steps each time on the device.
    steps_to_run = (
        [current_strategy.extended.steps_per_run] *
        (steps_per_epoch // current_strategy.extended.steps_per_run))
    if steps_per_epoch % current_strategy.extended.steps_per_run:
        steps_to_run.append(
            steps_per_epoch % current_strategy.extended.steps_per_run)
    target_steps = len(steps_to_run)

    callbacks._call_begin_hook(mode)

    initial_epoch = model._maybe_load_initial_epoch_from_ckpt(
        initial_epoch, mode)

    for epoch in range(initial_epoch, epochs):
        dist_utils._reset_metrics(model)
        callbacks.on_epoch_begin(epoch)
        epoch_logs = {}
        step_index = 0
        prev_step_count = None
        current_step = 0
        while current_step < target_steps:
            step_count = steps_to_run[current_step]
            batch_logs = {
                'batch': step_index,
                'size': 1,
                'num_steps': step_count
            }
            callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
            # Only re-assign the variable when the step count changes.
            if prev_step_count is None or step_count != prev_step_count:
                K.get_session().run(steps_per_run.assign(step_count))
                prev_step_count = step_count
            try:
                _, outputs = K.batch_get_value([train_op, output_tensors])
            except errors.OutOfRangeError:
                # The batch count is passed as a logging argument. The former
                # `'...' % steps_per_epoch * epochs` formatted the message
                # with `steps_per_epoch` and then repeated the whole string
                # `epochs` times, because `%` and `*` share precedence.
                logging.warning(
                    'Your dataset iterator ran out of data; '
                    'interrupting training. Make sure that your dataset '
                    'can generate at least `steps_per_epoch * epochs` '
                    'batches (in this case, %d batches).',
                    steps_per_epoch * epochs)
                break

            batch_logs.update(outputs)
            callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
            step_index = step_index + step_count
            current_step += 1

            if callbacks.model.stop_training:
                break

        if (do_validation and
                training_utils.should_run_validation(validation_freq, epoch)):
            logging.info('Running validation at fit epoch: %s', epoch)

            if model._compile_distribution:
                # Since we create a new clone from the original model we need
                # to copy the weights back to the original model before we
                # can run validation.
                dist_utils._copy_weights_to_original_model(
                    model, ModeKeys.TRAIN)

            val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
                model,
                val_dataset,
                steps=validation_steps,
                verbose=verbose,
                callbacks=callbacks)
            if not isinstance(val_outs, list):
                val_outs = [val_outs]
            # Same labels assumed.
            for label, val_out in zip(out_labels, val_outs):
                epoch_logs['val_' + label] = val_out

        callbacks.on_epoch_end(epoch, epoch_logs)
        if callbacks.model.stop_training:
            break
    callbacks._call_end_hook(mode)

    if model._compile_distribution:
        # Copy the weights back from the replicated model to the original
        # model.
        dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN)
    scope.__exit__(None, None, None)
    return model.history
def _set_weights_v1(self, weights):
    """Feeds `weights` to the placeholder tensors and runs the assign op.

    The i-th entry of `weights` is fed to `self._placeholder_tensors[i]`.

    Args:
        weights: Sequence of values, one per placeholder tensor.
    """
    placeholders = self._placeholder_tensors
    feed_dict = {
        placeholders[position]: value
        for position, value in enumerate(weights)
    }
    session = backend.get_session()
    session.run(self._assign_op, feed_dict)
def auc(y_true, y_pred):
    """Returns the AUC metric tensor for `y_true` and `y_pred`.

    Takes the element at index 1 of the result of `tf.metrics.auc` and runs
    the local-variable initializer in the Keras session before returning.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted values.

    Returns:
        The element at index 1 of `tf.metrics.auc(y_true, y_pred)`.
    """
    metric_parts = tf.metrics.auc(y_true, y_pred)
    auc_tensor = metric_parts[1]
    keras_session = K.get_session()
    keras_session.run(tf.local_variables_initializer())
    return auc_tensor
def experimental_predict_loop(model, iterator, verbose=0, steps=None):
    """Predict loop for predicting with TPU DistributionStrategy.

    Arguments:
        model: Keras Model instance.
        iterator: Iterator for input data.
        verbose: Integer, Verbosity mode 0 or 1. Any value >= 1 shows a
            progress bar.
        steps: Total number of steps (batches of samples)
            before declaring `_predict_loop` finished. Must not be `None`.

    Returns:
        Array of predictions (if the model has a single output)
        or list of arrays of predictions
        (if the model has multiple outputs).

    Raises:
        AssertionError: if `steps` is `None`.
    """
    current_strategy = model._distribution_strategy
    scope = current_strategy.scope()
    scope.__enter__()

    # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
    K.set_learning_phase(0)

    def _per_device_predict_function(model):
        model._make_predict_function()
        return (model.predict_function.inputs,
                model.predict_function.outputs,
                model.predict_function.updates_op,
                model.predict_function.session_kwargs)

    def step_fn(ctx, inputs):
        """Clones the model and calls make_predict_function."""
        if model._compile_distribution:
            clone_model_on_replicas(
                model,
                current_strategy,
                make_callback_model=False,
                inputs=inputs,
                mode=ModeKeys.PREDICT)
        else:
            _build_distributed_network(
                model, current_strategy, inputs, mode=ModeKeys.PREDICT)

        (grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args) = (
             current_strategy.extended.call_for_each_replica(
                 _per_device_predict_function,
                 args=(model._distributed_model_predict,)))

        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)

        combined_fn = K.function(
            all_inputs, all_outputs,
            updates=all_updates,
            name='distributed_predict_function',
            **all_session_args)

        for label, output in zip(model.output_names, combined_fn.outputs):
            ctx.set_last_step_output(label, output)

        return combined_fn.updates_op

    # Add initial dummy values for outputs.
    initial_loop_values = {}
    batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
    for name, tensor in zip(model.output_names, model.outputs):
        # TODO(priyag): This is a workaround as we do not know the batch
        # dimension of the model's output at this point.
        shape = tensor_shape.TensorShape(tensor.shape.dims)
        shape.dims = [batch_dimension] + shape.dims[1:]
        initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

    with current_strategy.scope():
        # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add
        # outfeed.
        ctx = current_strategy.extended.experimental_run_steps_on_iterator(
            step_fn, iterator, iterations=1,
            initial_loop_values=initial_loop_values)

    predict_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    # Must use the same condition as the `progbar.update` call below. With
    # the former `verbose == 1`, a call with `verbose=2` reached the update
    # without a progress bar and raised NameError.
    if verbose >= 1:
        progbar = Progbar(target=steps)

    if model._compile_distribution:
        with current_strategy.scope():
            _copy_weights_to_distributed_model(
                model, model._distributed_model_predict)

    with current_strategy.scope():
        _reset_metrics(model, model._distributed_model_predict)

    assert steps is not None
    # Since we do not know how many samples we will see, we cannot
    # pre-allocate the returned Numpy arrays. Instead, we store one array per
    # batch seen and concatenate them upon returning.
    unconcatenated_outs = [[] for _ in model.outputs]
    for step in range(steps):
        _, batch_outs = K.get_session().run([predict_op, output_tensors])
        # TODO(priyag): maybe need to unwrap the outputs first for
        # MirroredStrategy.
        for i, label in enumerate(model.output_names):
            unconcatenated_outs[i].extend(batch_outs[label])
        if verbose >= 1:
            progbar.update(step + 1)

    scope.__exit__(None, None, None)

    if len(unconcatenated_outs) == 1:
        return np.concatenate(unconcatenated_outs[0], axis=0)
    return [
        np.concatenate(unconcatenated_outs[i], axis=0)
        for i in range(len(unconcatenated_outs))
    ]
def _restore_updates(self):
    """Recreates a dict of updates from the layer's weights.

    Returns:
        A dict mapping each name in `self.state_variables` to the value
        obtained by running that variable in the Keras session.
    """
    return {
        name: K.get_session().run(var)
        for name, var in self.state_variables.items()
    }
def _eval(self, tensor):
    """Returns the value of `tensor`.

    Under the v1 APIs the tensor is run in the Keras session; otherwise its
    `numpy()` method is used.
    """
    if not self.use_v1_apis:
        return tensor.numpy()
    return K.get_session().run(tensor)
import tensorflow as tf
import tensorflow.python.keras.backend as K
from tensorflow.python.framework.graph_util import convert_variables_to_constants
from keras.models import load_model


def freeze_graph(session, keep_var_names=None, output_names=None,
                 clear_devices=True):
    """Returns a copy of the session graph with variables turned to constants.

    Args:
        session: Session whose graph is frozen.
        keep_var_names: Names of variables that must NOT be frozen, or None
            to freeze every global variable.
        output_names: Names of the output nodes to keep. The names of all
            global variables are appended to them. The list passed by the
            caller is not modified.
        clear_devices: If true, the device of every node in the graph
            definition is cleared before freezing.

    Returns:
        The graph definition produced by `convert_variables_to_constants`.
    """
    graph = session.graph
    with graph.as_default():
        global_var_names = [
            v.op.name for v in tf.compat.v1.global_variables()
        ]
        freeze_var_names = list(
            set(global_var_names).difference(keep_var_names or []))
        # Build a new list instead of `+=`, which extended the caller's list
        # in place whenever `output_names` was passed.
        output_names = list(output_names or []) + global_var_names
        input_graph_def = graph.as_graph_def()
        if clear_devices:
            for node in input_graph_def.node:
                node.device = ''
        frozen_graph = convert_variables_to_constants(
            sess=session,
            input_graph_def=input_graph_def,
            output_node_names=output_names,
            variable_names_whitelist=freeze_var_names)
        return frozen_graph


# NOTE(review): the model is loaded through standalone `keras` while the
# session comes from `tensorflow.python.keras.backend` - confirm that both
# refer to the same session/graph.
model = load_model('Image_Classifier.h5')
# NOTE(review): learning phase 1 is the training phase - confirm this is
# intended for an exported graph.
K.set_learning_phase(1)
frozen_graph = freeze_graph(K.get_session(),
                            output_names=[model.output.op.name])
tf.io.write_graph(frozen_graph, '.', 'Image_Classifier.pb', False)
def _get_available_devices():
    """Returns the names of the devices listed by the backend session."""
    session_devices = backend.get_session().list_devices()
    device_names = []
    for device in session_devices:
        device_names.append(device.name)
    return device_names
overwrite=True, include_optimizer=True, save_format=None ) logger.info("Save the history") with open(args.output_history_file, 'wb') as file: pickle.dump(training_history.history, file) test_inputs = [X_test, test_preds] logger.info("Fit the model") predictions = model.predict(test_inputs) logger.error("Compute uncertainty metrics") logger.error("Compute mu pred. entropy") error, mu_entropy, pred_y = sess.run(predict_cross_entropy(y_test, test_preds)) logger.error("Compute variation ratios") voted_pred = K.get_session().run(voting(predictions)) logger.error("Compute beta pred. entropy") sampling_entropy_gal = sess.run(predict_dirichlet_entropy_gal(predictions)) logger.error("Compute rejection measures") rejection_measures = np.array( [list(get_rejection_measures(pred_y, np.argmax(y_test, axis=1), np.argsort(sampling_entropy_gal), rejection_point)) for rejection_point in range(1, pred_y.shape[0] - 10)]) rejection_measures_baseline = np.array( [list(get_rejection_measures(pred_y, np.argmax(y_test, axis=1), np.argsort(mu_entropy), rejection_point)) for rejection_point in range(1, pred_y.shape[0] - 10)]) rejection_measures_voting = np.array( [list(get_rejection_measures(pred_y, np.argmax(y_test, axis=1), np.argsort(voted_pred), rejection_point))
model.save(current_output_local+'/my_checkpoint_test', overwrite=True) # creates a HDF5 file 'my_model.h5' #del model # deletes the existing model #model = load_model('output/my_model.h5') ######## save method two https://jovianlin.io/saving-loading-keras-models/ # Save the weights model.save_weights(current_output_local+'model_weights.h5', overwrite=True) # Save the model architecture with open(current_output_local+'model_architecture.json', 'w') as f: f.write(model.to_json()) ####### ############save method three################# saver = tf.train.Saver() sess = backend.get_session() saver.save(sess, current_output_local) model.save(current_output_local+'my_model.h5') #print(model.get_weights()) ############ ###does not like to reopen old ## del model ## del saver ## ## model = load_model('keras_model.hdf5') ## ## saver = tf.train.Saver() ## sess = backend.get_session()
if data_augmentation: checkpoint_dir += '_aug/' # necessary !!! tf.compat.v1.disable_eager_execution() last_model = os.listdir(checkpoint_dir)[-1] chosen_model = 'Epoch_439_model.hp5' # chosen model = last_model save_pb = True if save_pb: h5_path = checkpoint_dir + chosen_model model = tf.keras.models.load_model(h5_path, compile=False) # save pb with K.get_session() as sess: output_names = [out.op.name for out in model.outputs] input_graph_def = sess.graph.as_graph_def() for node in input_graph_def.node: node.device = "" graph = graph_util.remove_training_nodes(input_graph_def) graph_frozen = graph_util.convert_variables_to_constants( sess, graph, output_names) tf.io.write_graph(graph_frozen, checkpoint_dir, 'model.pb', as_text=False) logging.info("save pb successfully!") # Load Frozen graph pb_file = checkpoint_dir + 'model.pb'
def _experimental_test_loop(model, iterator, verbose=0, steps=None,
                            initialize_finalize_strategy=True):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1. A progress bar is shown only
          when this is exactly 1.
      steps: Total number of steps (batches of samples) to evaluate. Must not
          be `None`; this is asserted before the evaluation loop runs.
      initialize_finalize_strategy: Should the strategy initialize and finalize
          functions be called.

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs. Every value is the per-step output
      averaged over `steps`.
  """
  current_strategy = model._distribution_strategy
  if initialize_finalize_strategy:
    K.get_session().run(current_strategy.initialize())

  def _per_device_eval_function(model):
    model._make_eval_function()
    return (model._eval_function.inputs, model._eval_function.outputs,
            model._eval_function.updates_op,
            model._eval_function.session_kwargs)

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def step_fn(ctx, inputs, targets):
    """Clones the model and calls make_eval_function."""
    # TODO(priyag, sourabhbajaj): The model gets cloned every time
    # fit/test/predict is called. We should look into caching this keyed on
    # input shapes.
    clone_model_on_replicas(
        model,
        current_strategy,
        make_callback_model=False,
        inputs=inputs,
        targets=targets,
        mode=_Mode.TEST)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_replica(
         _per_device_eval_function, args=(model._grouped_model_test,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_test_function',
        **all_session_args)

    for label, output in zip(model.metrics_names, combined_fn.outputs):
      if label == 'loss':
        aggregation = distribute_lib.get_loss_reduction()
      else:
        # We aggregate all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        aggregation = variable_scope.VariableAggregation.MEAN
      ctx.set_last_step_output(label, output, aggregation)

    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag): Use steps_per_run when we use new metrics as they will
    # allow handling metric computation at each step using variables.
    ctx = current_strategy.run_steps_on_dataset(
        step_fn, iterator, iterations=1,
        initial_loop_values=initial_loop_values)

  test_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  # The progress bar is created and updated under the same condition. The
  # previous code created it for `verbose == 1` but updated it for
  # `verbose >= 1`, which raised NameError for verbose=2.
  progbar = Progbar(target=steps) if verbose == 1 else None

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(model._grouped_model_test)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)

  assert steps is not None
  outs = [0.] * len(model.metrics_names)
  for step in range(steps):
    _, batch_outs = K.get_session().run([test_op, output_tensors])
    for i, label in enumerate(model.metrics_names):
      outs[i] += batch_outs[label]
    if progbar is not None:
      progbar.update(step + 1)
  # Turn the accumulated sums into per-step means.
  for i in range(len(outs)):
    outs[i] /= (steps)

  if initialize_finalize_strategy:
    K.get_session().run(current_strategy.finalize())

  if len(outs) == 1:
    return outs[0]
  return outs
def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None,
                                  callbacks=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1. A progress bar is shown only
          when this is exactly 1.
      steps: Total number of steps (batches of samples) to run. Must not be
          `None`; this is asserted before the prediction loop runs.
      callbacks: List of callbacks, forwarded to `cbks.configure_callbacks`.

  Returns:
      Array of predictions (if the model has a single output) or list
      of arrays of predictions
      (if the model has multiple outputs).
  """
  dataset_fully_shaped = (
      distributed_training_utils.is_dataset_shape_fully_defined(dataset))
  padding_handler = None
  if not dataset_fully_shaped:
    # TODO(hongjunchoi): Investigate whether operations from
    # PartialBatchPaddingHandler are unnecessarily pruned out
    # during graph optimization.
    padding_handler = padding_util.PartialBatchPaddingHandler(
        model._feed_output_shapes)
    batched_dataset = input_lib._get_batched_dataset(dataset)
    batch_size, _, prefetch_buffer = input_lib._get_batched_dataset_attributes(
        batched_dataset)
    padding_handler.padded_batch_size = batch_size
    padding_handler.padding_mask = dataset.reduce(
        padding_handler.padding_mask, padding_handler.update_mask)

    dataset = dataset.map(padding_handler.pad_batch)
    dataset = dataset.apply(batching.unbatch())
    # Upon this point, it is guaranteed that the dataset does not
    # have partial batches. Thus, we set `drop_remainder=True` to
    # get static shape information about the elements in the dataset.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    if prefetch_buffer is not None:
      dataset = dataset.prefetch(prefetch_buffer)

  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)

  def _per_device_predict_function(model):
    model._make_predict_function()
    return (model.predict_function.inputs, model.predict_function.outputs,
            model.predict_function.updates_op,
            model.predict_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_predict_function."""
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, ModeKeys.PREDICT, inputs=inputs)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, ModeKeys.PREDICT, inputs)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_predict_function,
         args=(model._distributed_model_predict,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Entering the scope through `with` guarantees it is exited even when graph
  # construction or a prediction step raises. The previous manual
  # `__enter__()` / `__exit__()` pair left the scope active on any error.
  with distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0):
    # Add initial dummy values for outputs.
    initial_loop_values = {}
    batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
    for name, tensor in zip(model.output_names, model.outputs):
      # TODO(priyag): This is a workaround as we do not know the batch
      # dimension of the model's output at this point.
      shape = tensor_shape.TensorShape(tensor.shape.dims)
      shape.dims = [batch_dimension] + shape.dims[1:]
      initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn, iterator, iterations=1,
        initial_loop_values=initial_loop_values)

    predict_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    # The progress bar is created and updated under the same condition. The
    # previous code created it for `verbose == 1` but updated it for
    # `verbose >= 1`, which raised NameError for verbose=2.
    progbar = Progbar(target=steps) if verbose == 1 else None

    if model._compile_distribution:
      distributed_training_utils._copy_weights_to_distributed_model(
          model, ModeKeys.PREDICT)

    distributed_training_utils._reset_metrics(model)

    # NOTE(review): the configured callback list is not used again in this
    # function; the call is kept for whatever setup it performs -- confirm
    # whether begin/end hooks are meant to be invoked here.
    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=False,
        epochs=1,
        steps_per_epoch=steps,
        verbose=verbose,
        count_mode='steps',
        mode=ModeKeys.PREDICT)

    assert steps is not None
    # Since we do not know how many samples we will see, we cannot pre-allocate
    # the returned Numpy arrays. Instead, we store one array per batch seen
    # and concatenate them upon returning.
    unconcatenated_outs = [[] for _ in model.outputs]
    for step in range(steps):
      _, batch_outs = K.get_session().run([predict_op, output_tensors])
      # TODO(priyag): maybe need to unwrap the outputs first for
      # MirroredStrategy.
      for i, label in enumerate(model.output_names):
        unconcatenated_outs[i].extend(batch_outs[label])
      if progbar is not None:
        progbar.update(step + 1)

  if len(unconcatenated_outs) == 1:
    prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
  else:
    prediction_result = [
        np.concatenate(out, axis=0) for out in unconcatenated_outs
    ]

  # Strip the rows that were added as padding for partial batches.
  if padding_handler:
    prediction_result = padding_handler.apply_mask(prediction_result)

  return prediction_result
def load(path, compile=True, options=None):  # pylint: disable=redefined-builtin
  """Loads Keras objects from a SavedModel.

  Keras layers and models stored in the SavedModel come back as Keras objects;
  everything else is loaded as a regular trackable object (same as
  `tf.saved_model.load`).

  Currently, Keras saving/loading only retains the Keras object's weights,
  losses, and call function.

  The loaded model can be re-compiled, but the original optimizer, compiled
  loss functions, and metrics are not retained. This is temporary, and
  `model.save` will soon be able to serialize compiled models.

  Args:
    path: Path to SavedModel.
    compile: If true, compile the model after loading it.
    options: Optional `tf.saved_model.LoadOptions` object that specifies
      options for loading from SavedModel.

  Returns:
    Object loaded from SavedModel.

  Raises:
    IOError: if the Keras metadata file exists but cannot be decoded.
  """
  # TODO(kathywu): Add saving/loading of optimizer, compiled losses and metrics.
  # TODO(kathywu): Add code to load from objects that contain all endpoints

  # Look for metadata file or parse the SavedModel
  keras_metadata = saved_metadata_pb2.SavedMetadata()
  first_meta_graph = loader_impl.parse_saved_model(path).meta_graphs[0]
  object_graph = first_meta_graph.object_graph_def
  metadata_file = os.path.join(path, constants.SAVED_METADATA_PATH)

  if not gfile.Exists(metadata_file):
    # No dedicated metadata file: fall back to the legacy metadata reader.
    logging.warning('SavedModel saved prior to TF 2.5 detected when loading '
                    'Keras model. Please ensure that you are saving the model '
                    'with model.save() or tf.keras.models.save_model(), *NOT* '
                    'tf.saved_model.save(). To confirm, there should be a file '
                    'named "keras_metadata.pb" in the SavedModel directory.')
    _read_legacy_metadata(object_graph, keras_metadata)
  else:
    try:
      with gfile.GFile(metadata_file, 'rb') as metadata_stream:
        serialized = metadata_stream.read()
      keras_metadata.ParseFromString(serialized)
    except message.DecodeError as e:
      raise IOError('Cannot parse keras metadata {}: {}.'
                    .format(metadata_file, str(e)))

  if not keras_metadata.nodes:
    # When there are no Keras objects, return the results from the core loader
    return tf_load.load(path, options=options)

  # Recreate layers and metrics using the info stored in the metadata.
  keras_loader = KerasObjectLoader(keras_metadata, object_graph)
  keras_loader.load_layers(compile=compile)

  # Generate a dictionary of all loaded nodes, keyed by their object path.
  nodes_to_load = {'root': None}
  nodes_to_load.update(
      (keras_loader.get_path(node_id), node)
      for node_id, node in keras_loader.loaded_nodes.items())
  loaded = tf_load.load_partial(path, nodes_to_load, options=options)

  # Finalize the loaded layers and remove the extra tracked dependencies.
  keras_loader.finalize_objects()
  keras_loader.del_tracking()

  model = loaded['root']

  # pylint: disable=protected-access
  if compile and isinstance(model, training_lib.Model):
    # TODO(kathywu): Use compiled objects from SavedModel, instead of
    # creating new objects from the training config.
    training_config = model._serialized_attributes['metadata'].get(
        'training_config')
    if training_config is None:
      logging.warning('No training configuration found in save file, so the '
                      'model was *not* compiled. Compile it manually.')
    else:
      compile_kwargs = saving_utils.compile_args_from_training_config(
          training_config)
      model.compile(**compile_kwargs)
      saving_utils.try_build_compiled_arguments(model)
  # pylint: enable=protected-access

  # Force variables and resources to initialize.
  if not context.executing_eagerly():
    session = backend.get_session()  # Variables are initialized by this call.
    session.run(ops.get_collection(ops.GraphKeys.TABLE_INITIALIZERS))

  return model
def _run(self, op): if self.use_v1_apis: K.get_session().run(op)
def initialize_iterator(iterator, distribution_strategy):
  """Builds the iterator's init op and runs it when not executing eagerly.

  Args:
    iterator: Object whose `initialize()` result is grouped into one op.
    distribution_strategy: Strategy whose scope the op is created in.
  """
  with distribution_strategy.scope():
    initializer = control_flow_ops.group(iterator.initialize())
    if context.executing_eagerly():
      return
    session = K.get_session((initializer,))
    session.run(initializer)
model.add(layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh')) assert model.output_shape == (None, 28, 28, 1) return model generator = make_generator_model() noise = tf.random.normal([1, 100]) generated_image = generator(noise, training=False) your_session = K.get_session() array = generated_image[0, :, :, 0].eval(session=your_session) plt.imshow(array, cmap='gray') plt.show() def discriminator_model(): model = tf.keras.Sequential() model.add(layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same', input_shape=[28, 28, 1])) model.add(layers.LeakyReLU())
def experimental_test_loop(model, iterator, verbose=0, steps=None):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1. A progress bar is shown only
          when this is exactly 1.
      steps: Total number of steps (batches of samples) to evaluate. Must not
          be `None`; this is asserted before the evaluation loop runs.

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs. The first entry (loss) is averaged
      over `steps`; the others hold the value from the last step.
  """
  current_strategy = model._distribution_strategy

  def _per_device_eval_function(model):
    model._make_eval_function()
    return (model._eval_function.inputs, model._eval_function.outputs,
            model._eval_function.updates_op,
            model._eval_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_eval_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      clone_model_on_replicas(model, current_strategy,
                              make_callback_model=False, inputs=inputs,
                              targets=targets, mode=ModeKeys.TEST)
    else:
      _build_distributed_network(model, current_strategy, inputs,
                                 targets, mode=ModeKeys.TEST)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_eval_function, args=(model._distributed_model_test,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_test_function',
        **all_session_args)

    for label, output in zip(model.metrics_names, combined_fn.outputs):
      if label == 'loss':
        reduce_op = distribute_lib.get_loss_reduction()
      else:
        # We reduce all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    return combined_fn.updates_op

  # Entering the scope through `with` guarantees it is exited even when graph
  # construction or an evaluation step raises. The previous manual
  # `__enter__()` / `__exit__()` pair left the scope active on any error.
  with current_strategy.scope():
    # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
    K.set_learning_phase(0)

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
      tensor = model._all_stateful_metrics_tensors[name]
      initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    with current_strategy.scope():
      # TODO(priyag): Use steps_per_run when we use new metrics as they will
      # allow handling metric computation at each step using variables.
      ctx = current_strategy.extended.experimental_run_steps_on_iterator(
          step_fn, iterator, iterations=1,
          initial_loop_values=initial_loop_values)

    test_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    # The progress bar is created and updated under the same condition. The
    # previous code created it for `verbose == 1` but updated it for
    # `verbose >= 1`, which raised NameError for verbose=2.
    progbar = Progbar(target=steps) if verbose == 1 else None

    if model._compile_distribution:
      with current_strategy.scope():
        _copy_weights_to_distributed_model(model, model._distributed_model_test)

    with current_strategy.scope():
      _reset_metrics(model, model._distributed_model_test)

    assert steps is not None
    outs = [0.] * len(model.metrics_names)
    for step in range(steps):
      _, batch_outs = K.get_session().run([test_op, output_tensors])
      for i, label in enumerate(model.metrics_names):
        if i == 0:
          # Loss is stateless metrics.
          outs[i] += batch_outs[label]
        else:
          # For all stateful metrics, the aggregation is handled by mirrored
          # vars.
          outs[i] = batch_outs[label]
      if progbar is not None:
        progbar.update(step + 1)

  # Only the loss is a running sum that needs averaging. The previous guard
  # `len(outs) >= 0` was always true and indexed into an empty list when the
  # model reported no metric names.
  if outs:
    outs[0] /= (steps)

  if len(outs) == 1:
    return outs[0]
  return outs
def _experimental_fit_loop(model, iterator, epochs=100, verbose=1,
                           callbacks=None, initial_epoch=0,
                           steps_per_epoch=None):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Must be specified; `None` raises `ValueError`.

  Returns:
      `model.history`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  current_strategy = model._distribution_strategy

  # TODO(priyag): Add validation that shapes are fully defined for TPU case.

  K.get_session().run(current_strategy.initialize())

  def _per_device_train_function(model):
    model._make_train_function()
    return (model.train_function.inputs, model.train_function.outputs,
            model.train_function.updates_op,
            model.train_function.session_kwargs)

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(1)

  def step_fn(ctx, inputs, targets):
    """Clones the model and calls make_train_function."""
    # TODO(priyag, sourabhbajaj): The model gets cloned every time
    # fit/test/predict is called. We should look into caching this keyed on
    # input shapes.
    clone_model_on_towers(model, current_strategy, make_callback_model=True,
                          inputs=inputs, targets=targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_tower(
         _per_device_train_function, model._grouped_model)

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.Function(all_inputs, all_outputs,
                             updates=all_updates,
                             name='distributed_train_function',
                             **all_session_args)

    out_labels = model.metrics_names or []
    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        aggregation = distribute_lib.get_loss_reduction()
      else:
        # We aggregate all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        aggregation = variable_scope.VariableAggregation.MEAN
      ctx.set_last_step_output(label, output, aggregation)

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  if steps_per_epoch is None:
    raise ValueError(
        'steps_per_epoch should be specified in the fit call.')
  steps_per_run_var = K.variable(
      value=min(steps_per_epoch, current_strategy.steps_per_run),
      dtype='int32',
      name='steps_per_run_var')

  with current_strategy.scope():
    ctx = current_strategy.run_steps_on_dataset(
        step_fn, iterator, iterations=steps_per_run_var,
        initial_loop_values=initial_loop_values)

  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      val_inputs=None,
      val_targets=None,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose)

  # TODO(priyag, sourabhbajaj): Add callbacks support for per step callback
  # TODO(priyag, sourabhbajaj): Add validation.

  # Calculate the steps each time on the device: as many full runs of
  # `steps_per_run` as fit in an epoch, plus one shorter run for the remainder.
  steps_to_run = [current_strategy.steps_per_run] * (
      steps_per_epoch // current_strategy.steps_per_run)
  if steps_per_epoch % current_strategy.steps_per_run:
    steps_to_run.append(steps_per_epoch % current_strategy.steps_per_run)

  callbacks.on_train_begin()
  for epoch in range(initial_epoch, epochs):
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    for step_count in steps_to_run:
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks.on_batch_begin(step_index, batch_logs)
      # Only reload the loop-count variable when the run length changes.
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run_var.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.get_session().run([train_op, output_tensors])
      except errors.OutOfRangeError:
        # The product is parenthesized on purpose: `%` and `*` share the same
        # precedence, so the unparenthesized form formatted `steps_per_epoch`
        # and then repeated the whole message `epochs` times.
        logging.warning(
            'Your dataset iterator ran out of data; '
            'interrupting training. Make sure that your dataset '
            'can generate at least `steps_per_epoch * epochs` '
            'batches (in this case, %d batches).' %
            (steps_per_epoch * epochs))
        break

      batch_logs.update(outputs)
      callbacks.on_batch_end(step_index, batch_logs)
      step_index = step_index + step_count
      if callbacks.model.stop_training:
        break

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks.on_train_end()

  # Copy the weights back from the replicated model to the original model.
  with current_strategy.scope():
    updated_weights = current_strategy.unwrap(
        model._grouped_model)[0].get_weights()
    model.set_weights(updated_weights)

  K.get_session().run(current_strategy.finalize())
  return model.history
def experimental_fit_loop(model, iterator, epochs=100, verbose=1,
                          callbacks=None, initial_epoch=0,
                          steps_per_epoch=None, val_iterator=None,
                          validation_steps=None):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Must be specified; `None` raises `ValueError`.
      val_iterator: Iterator for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Validation is skipped when this is `None` or 0.

  Returns:
      `model.history`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  current_strategy = model._distribution_strategy
  out_labels = model.metrics_names or []

  def _per_device_fit_function(model):
    model._make_fit_function()
    return (model._fit_function.inputs, model._fit_function.outputs,
            model._fit_function.updates_op,
            model._fit_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_fit_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      clone_model_on_replicas(model, current_strategy,
                              make_callback_model=True, inputs=inputs,
                              targets=targets, mode=ModeKeys.TRAIN)
    else:
      _build_distributed_network(model, current_strategy, inputs,
                                 targets, mode=ModeKeys.TRAIN)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_fit_function, args=(model._distributed_model_train,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_fit_function',
        **all_session_args)

    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        reduce_op = distribute_lib.get_loss_reduction()
      else:
        # We reduce all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately
    return combined_fn.updates_op

  # Entering the scope through `with` guarantees it is exited on every path.
  # The previous manual `__enter__()` / `__exit__()` pair left the scope
  # active whenever an exception escaped, including the `ValueError` raised
  # below for a missing `steps_per_epoch`.
  with current_strategy.scope():
    # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
    K.set_learning_phase(1)

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
      tensor = model._all_stateful_metrics_tensors[name]
      initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    if steps_per_epoch is None:
      raise ValueError('`steps_per_epoch` should be specified when calling '
                       '`fit` on the model.')
    steps_per_run = K.variable(
        value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
        dtype='int32',
        name='steps_per_run')

    with current_strategy.scope():
      ctx = current_strategy.extended.experimental_run_steps_on_iterator(
          step_fn, iterator, iterations=steps_per_run,
          initial_loop_values=initial_loop_values)

    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    do_validation = bool(validation_steps)

    if model._compile_distribution:
      with current_strategy.scope():
        _copy_weights_to_distributed_model(model,
                                           model._distributed_model_train)

    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=do_validation,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        verbose=verbose)

    # Calculate the steps each time on the device: as many full runs of
    # `steps_per_run` as fit in an epoch, plus one shorter run for the
    # remainder.
    steps_to_run = [current_strategy.extended.steps_per_run] * (
        steps_per_epoch // current_strategy.extended.steps_per_run)
    if steps_per_epoch % current_strategy.extended.steps_per_run:
      steps_to_run.append(
          steps_per_epoch % current_strategy.extended.steps_per_run)

    callbacks.on_train_begin()
    for epoch in range(initial_epoch, epochs):
      with current_strategy.scope():
        _reset_metrics(model, model._distributed_model_train)
      callbacks.on_epoch_begin(epoch)
      epoch_logs = {}
      step_index = 0
      prev_step_count = None
      for step_count in steps_to_run:
        batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
        callbacks.on_batch_begin(step_index, batch_logs)
        # Only reload the loop-count variable when the run length changes.
        if prev_step_count is None or step_count != prev_step_count:
          steps_per_run.load(step_count, K.get_session())
          prev_step_count = step_count
        try:
          _, outputs = K.get_session().run([train_op, output_tensors])
        except errors.OutOfRangeError:
          # The product is parenthesized on purpose: `%` and `*` share the
          # same precedence, so the unparenthesized form formatted
          # `steps_per_epoch` and then repeated the message `epochs` times.
          logging.warning('Your dataset iterator ran out of data; '
                          'interrupting training. Make sure that your dataset '
                          'can generate at least `steps_per_epoch * epochs` '
                          'batches (in this case, %d batches).' %
                          (steps_per_epoch * epochs))
          break

        batch_logs.update(outputs)
        callbacks.on_batch_end(step_index, batch_logs)
        step_index = step_index + step_count
        if callbacks.model.stop_training:
          break

      if do_validation:
        logging.info('Running validation at fit epoch: %s', epoch)

        if model._compile_distribution:
          # Since we create a new clone from the original model we need to copy
          # the weights back to the original model before we can run validation.
          with current_strategy.scope():
            _copy_weights_to_original_model(
                model, model._distributed_model_train, ModeKeys.TRAIN)

        val_outs = experimental_test_loop(  # pylint: disable=undefined-variable
            model, val_iterator, steps=validation_steps, verbose=verbose)
        if not isinstance(val_outs, list):
          val_outs = [val_outs]
        # Same labels assumed.
        for label, val_out in zip(out_labels, val_outs):
          epoch_logs['val_' + label] = val_out

      callbacks.on_epoch_end(epoch, epoch_logs)
      if callbacks.model.stop_training:
        break
    callbacks.on_train_end()

    if model._compile_distribution:
      # Copy the weights back from the replicated model to the original model.
      with current_strategy.scope():
        _copy_weights_to_original_model(model, model._distributed_model_train,
                                        ModeKeys.TRAIN)

  return model.history
def _experimental_test_loop(model, iterator, verbose=0, steps=None):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1. A progress bar is shown only
          when this is exactly 1.
      steps: Total number of steps (batches of samples) to evaluate. Must not
          be `None`; this is asserted before the evaluation loop runs.

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs. Every value is the per-step output
      averaged over `steps`.
  """
  current_strategy = model._distribution_strategy
  K.get_session().run(current_strategy.initialize())

  def _per_device_test_function(model):
    model._make_test_function()
    return (model.test_function.inputs, model.test_function.outputs,
            model.test_function.updates_op,
            model.test_function.session_kwargs)

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def step_fn(ctx, inputs, targets):
    """Clones the model and calls make_test_function."""
    # TODO(priyag, sourabhbajaj): The model gets cloned every time
    # fit/test/predict is called. We should look into caching this keyed on
    # input shapes.
    clone_model_on_towers(model, current_strategy, make_callback_model=False,
                          inputs=inputs, targets=targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_tower(
         _per_device_test_function, model._grouped_model)

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.Function(all_inputs, all_outputs,
                             updates=all_updates,
                             name='distributed_test_function',
                             **all_session_args)

    for label, output in zip(model.metrics_names, combined_fn.outputs):
      if label == 'loss':
        aggregation = distribute_lib.get_loss_reduction()
      else:
        # We aggregate all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        aggregation = variable_scope.VariableAggregation.MEAN
      ctx.set_last_step_output(label, output, aggregation)

    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag): Use steps_per_run when we use new metrics as they will
    # allow handling metric computation at each step using variables.
    ctx = current_strategy.run_steps_on_dataset(
        step_fn, iterator, iterations=1,
        initial_loop_values=initial_loop_values)

  test_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  # The progress bar is created and updated under the same condition. The
  # previous code created it for `verbose == 1` but updated it for
  # `verbose >= 1`, which raised NameError for verbose=2.
  progbar = Progbar(target=steps) if verbose == 1 else None

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)

  assert steps is not None
  outs = [0.] * len(model.metrics_names)
  for step in range(steps):
    _, batch_outs = K.get_session().run([test_op, output_tensors])
    for i, label in enumerate(model.metrics_names):
      outs[i] += batch_outs[label]
    if progbar is not None:
      progbar.update(step + 1)
  # Turn the accumulated sums into per-step means.
  for i in range(len(outs)):
    outs[i] /= (steps)

  K.get_session().run(current_strategy.finalize())

  if len(outs) == 1:
    return outs[0]
  return outs
def _init_session():
  """Fetches the Keras backend session, registers it, and returns it.

  Returns:
    The session object obtained from `backend.get_session()`.
  """
  from tensorflow.python.keras import backend as keras_backend

  session = keras_backend.get_session()
  # The returned graph is not used here; the call is kept as-is.
  tf.get_default_graph()
  set_session(session)
  return session
def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1. A progress bar is shown only
          when `verbose == 1`.
      steps: Total number of steps (batches of samples) before declaring
          `_predict_loop` finished. Must not be `None` (this is asserted
          before the prediction loop starts).

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
  current_strategy = model._distribution_strategy
  K.get_session().run(current_strategy.initialize())

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def _per_device_predict_function(model):
    model._make_predict_function()
    return (model.predict_function.inputs,
            model.predict_function.outputs,
            model.predict_function.updates_op,
            model.predict_function.session_kwargs)

  def step_fn(ctx, inputs, targets):
    """Clones the model and calls make_predict_function."""

    # TODO(anjalisridhar): Support predict input correctly as it will not
    # contain targets, only inputs.
    del targets

    # TODO(priyag, sourabhbajaj): The model gets cloned every time
    # fit/test/predict is called. We should look into caching this keyed on
    # input shapes.
    clone_model_on_towers(
        model,
        current_strategy,
        make_callback_model=False,
        inputs=inputs)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_tower(
         _per_device_predict_function, model._grouped_model)

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.Function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Add initial dummy values for outputs.
  initial_loop_values = {}
  batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
  for name, tensor in zip(model.output_names, model.outputs):
    # TODO(priyag): This is a workaround as we do not know the batch dimension
    # of the model's output at this point.
    shape = tensor_shape.TensorShape(tensor.shape.dims)
    shape.dims = [batch_dimension] + shape.dims[1:]
    initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
    ctx = current_strategy.run_steps_on_dataset(
        step_fn, iterator, iterations=1,
        initial_loop_values=initial_loop_values)

  predict_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  # The progress bar only exists for `verbose == 1`; the update below is
  # guarded on the bar itself so that other verbosity values cannot hit an
  # undefined `progbar`.
  progbar = Progbar(target=steps) if verbose == 1 else None

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)

  assert steps is not None
  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  unconcatenated_outs = [[] for _ in model.outputs]
  for step in range(steps):
    _, batch_outs = K.get_session().run([predict_op, output_tensors])
    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i, label in enumerate(model.output_names):
      unconcatenated_outs[i].extend(batch_outs[label])
    if progbar is not None:
      progbar.update(step + 1)

  K.get_session().run(current_strategy.finalize())

  if len(unconcatenated_outs) == 1:
    return np.concatenate(unconcatenated_outs[0], axis=0)
  return [np.concatenate(outs, axis=0) for outs in unconcatenated_outs]
global_step_tensor, 5000, 0.975, staircase=True) tf.summary.scalar('Learning_Rate', lr) optimizer = tf.train.AdamOptimizer(learning_rate=lr) apply_recons_op, apply_soft_op, avg_loss_recons, avg_loss_soft = create_parallel_optimization( create_wnet, iterator, optimizer, num_classes) with tf.name_scope('Loss'): tf.summary.scalar('Reconstruction_Loss', avg_loss_recons) tf.summary.scalar('Soft_N_Cut_Loss', avg_loss_soft) merged = tf.summary.merge_all() saver = tf.train.Saver() sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) K_B.set_session(sess) with K_B.get_session() as sess: train_writer = tf.summary.FileWriter(logdir, sess.graph) init = tf.global_variables_initializer() sess.run(init) if exists(checkpt_dir): if tf.train.latest_checkpoint(checkpt_dir) is not None: tf.logging.info('Loading Checkpoint from ' + tf.train.latest_checkpoint(checkpt_dir)) saver.restore(sess, tf.train.latest_checkpoint(checkpt_dir)) else: tf.logging.info('Training from Scratch - No Checkpoint found') # img_lab = np.expand_dims(cv2.cvtColor(img, cv2.COLOR_BGR2LAB), axis=0) i = 0
def on_epoch_end(self, epoch, logs=None):
  """Checks if summary ops should run next epoch, logs scalar summaries.

  Arguments:
      epoch: Integer epoch index. Used as the summary step when
          `self.update_freq == 'epoch'`, as the checkpoint step when saving
          embeddings, and to decide (via `epoch % self.embeddings_freq`)
          whether embeddings are written this epoch.
      logs: Optional dict of values to write as summaries. `None` is treated
          as an empty dict. The keys 'batch', 'size' and 'num_steps' are
          dropped and every remaining key is prefixed with 'epoch_'.

  Raises:
      ValueError: If `self.embeddings_freq` is set but `self.embeddings_data`
          is `None`.
  """
  # `logs` defaults to None; normalize it so the filtering below cannot fail
  # with an AttributeError.
  logs = logs or {}

  # don't output batch_size and
  # batch number as TensorBoard summaries
  logs = {('epoch_' + k): v
          for k, v in logs.items()
          if k not in ['batch', 'size', 'num_steps']}
  if self.update_freq == 'epoch':
    step = epoch
  else:
    step = self._samples_seen
  self._write_custom_summaries(step, logs)

  # pop the histogram summary op after each epoch
  if self.histogram_freq:
    # pylint: disable=protected-access
    if self.merged in self.model.test_function.fetches:
      self.model.test_function.fetches.remove(self.merged)
    if self.merged in self.model.test_function.fetch_callbacks:
      self.model.test_function.fetch_callbacks.pop(self.merged)
    # pylint: enable=protected-access

  if self.embeddings_data is None and self.embeddings_freq:
    raise ValueError('To visualize embeddings, embeddings_data must '
                     'be provided.')

  if self.embeddings_freq and self.embeddings_data is not None:
    if epoch % self.embeddings_freq == 0:
      # We need a second forward-pass here because we're passing
      # the `embeddings_data` explicitly. This design allows to pass
      # arbitrary data as `embeddings_data` and results from the fact
      # that we need to know the size of the `tf.Variable`s which
      # hold the embeddings in `set_model`. At this point, however,
      # the `validation_data` is not yet set.

      embeddings_data = self.embeddings_data
      n_samples = embeddings_data[0].shape[0]
      i = 0
      sess = K.get_session()
      while i < n_samples:
        # Size of the current slice; the last slice may be shorter than
        # `self.batch_size`.
        step = min(self.batch_size, n_samples - i)
        batch = slice(i, i + step)

        if isinstance(self.model.input, list):
          feed_dict = {
              model_input: embeddings_data[idx][batch]
              for idx, model_input in enumerate(self.model.input)
          }
        else:
          feed_dict = {self.model.input: embeddings_data[0][batch]}

        feed_dict.update({self.batch_id: i, self.step: step})

        if not isinstance(K.learning_phase(), int):
          feed_dict[K.learning_phase()] = False

        sess.run(self.assign_embeddings, feed_dict=feed_dict)
        # The checkpoint is written after every slice, as in the original.
        self.saver.save(sess,
                        os.path.join(self.log_dir, 'keras_embedding.ckpt'),
                        epoch)

        i += self.batch_size
def _get_available_devices():
  """Returns the `name` of every device listed by the current Keras session."""
  session_devices = K.get_session().list_devices()
  return [device.name for device in session_devices]
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1. A progress bar is shown only
          when `verbose == 1`.
      steps: Total number of steps (batches of samples) to evaluate. Must not
          be `None` (this is asserted before the evaluation loop starts).
      callbacks: List of callbacks to be called during training

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs. The first entry (the loss) is the
      sum over all steps divided by `steps`; the remaining entries are the
      values fetched on the last step.
  """
  mode = ModeKeys.TEST
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset,
                                                     current_strategy)

  # The scope is entered manually here and exited just before the results are
  # post-processed at the end of this function.
  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  def _per_device_eval_function(model):
    model._make_eval_function()
    return (model._eval_function.inputs, model._eval_function.outputs,
            model._eval_function.updates_op,
            model._eval_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_eval_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode=mode, inputs=inputs, targets=targets)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_eval_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.TEST),))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_test_function',
        **all_session_args)

    for label, output in zip(model.metrics_names, combined_fn.outputs):
      if label == 'loss':
        reduce_op = ds_reduce_util.ReduceOp.SUM
      else:
        # We reduce all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  # TODO(priyag): Use steps_per_run when we use new metrics as they will
  # allow handling metric computation at each step using variables.
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=1,
      initial_loop_values=initial_loop_values)

  test_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  # The progress bar only exists for `verbose == 1`; the update below is
  # guarded on the bar itself so that other verbosity values cannot hit an
  # undefined `progbar`.
  progbar = Progbar(target=steps) if verbose == 1 else None

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  distributed_training_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=ModeKeys.TEST)
  callbacks._call_begin_hook(mode)

  assert steps is not None
  outs = [0.] * len(model.metrics_names)
  for step in range(steps):
    batch_logs = {'batch': step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
    _, batch_outs = K.get_session().run([test_op, output_tensors])
    for i, label in enumerate(model.metrics_names):
      if i == 0:
        # Loss is stateless metrics.
        outs[i] += batch_outs[label]
      else:
        # For all stateful metrics, the aggregation is handled by mirrored vars.
        outs[i] = batch_outs[label]

    batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
    callbacks._call_batch_hook(mode, 'end', step, batch_logs)
    if progbar is not None:
      progbar.update(step + 1)

  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)

  # Only the loss (index 0) is accumulated across steps, so only it is
  # averaged. Guard on non-empty `outs` so an empty metrics list does not
  # raise an IndexError.
  if outs:
    outs[0] /= (steps)

  if len(outs) == 1:
    return outs[0]
  return outs
def initialize_iterator(iterator, distribution_strategy):
  """Builds the iterator's grouped init op and runs it when not eager.

  Arguments:
      iterator: Object whose `initialize()` result is grouped into one op.
      distribution_strategy: Strategy whose `scope()` is active while the
          init op is built and run.
  """
  with distribution_strategy.scope():
    initializer = control_flow_ops.group(iterator.initialize())
    if context.executing_eagerly():
      return
    session = K.get_session((initializer,))
    session.run(initializer)
def set_model(self, model):
  """Sets Keras model and creates summary ops.

  Stores `model` and the current Keras session on `self`, builds histogram
  (and optionally image / gradient / layer-output) summaries when
  `self.histogram_freq` is set and no merged op exists yet, creates the
  summary writer, and, when both `self.embeddings_freq` and
  `self.embeddings_data` are set, creates the embedding variables, their
  assign ops, a saver, and the projector configuration.

  Arguments:
      model: Keras model to attach to this callback.

  Raises:
      ImportError: If `tensorboard.plugins.projector` cannot be imported while
          setting up embedding visualization.
  """
  self.model = model
  self.sess = K.get_session()
  # only make histogram summary op if it hasn't already been made
  if self.histogram_freq and self.merged is None:
    for layer in self.model.layers:
      for weight in layer.weights:
        mapped_weight_name = weight.name.replace(':', '_')
        tf_summary.histogram(mapped_weight_name, weight)
        if self.write_images:
          w_img = array_ops.squeeze(weight)
          shape = K.int_shape(w_img)
          if len(shape) == 2:  # dense layer kernel case
            if shape[0] > shape[1]:
              w_img = array_ops.transpose(w_img)
              shape = K.int_shape(w_img)
            w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
          elif len(shape) == 3:  # convnet case
            if K.image_data_format() == 'channels_last':
              # switch to channels_first to display
              # every kernel as a separate image
              w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
              shape = K.int_shape(w_img)
            w_img = array_ops.reshape(w_img,
                                      [shape[0], shape[1], shape[2], 1])
          elif len(shape) == 1:  # bias case
            w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
          else:
            # not possible to handle 3D convnets etc.
            continue

          shape = K.int_shape(w_img)
          assert len(shape) == 4 and shape[-1] in [1, 3, 4]
          tf_summary.image(mapped_weight_name, w_img)

      if self.write_grads:

        # Defined once per layer rather than once per weight; it does not
        # depend on the loop variables.
        def is_indexed_slices(grad):
          return type(grad).__name__ == 'IndexedSlices'

        for weight in layer.trainable_weights:
          mapped_weight_name = weight.name.replace(':', '_')
          grads = model.optimizer.get_gradients(model.total_loss, weight)
          grads = [
              grad.values if is_indexed_slices(grad) else grad
              for grad in grads
          ]
          tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)

      if hasattr(layer, 'output'):
        if isinstance(layer.output, list):
          for i, output in enumerate(layer.output):
            tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
        else:
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
  self.merged = tf_summary.merge_all()

  if self.write_graph:
    self.writer = self._writer_class(self.log_dir, self.sess.graph)
  else:
    self.writer = self._writer_class(self.log_dir)

  # If both embedding_freq and embeddings_data are available, we will
  # visualize embeddings.
  if self.embeddings_freq and self.embeddings_data is not None:
    self.embeddings_data = standardize_input_data(self.embeddings_data,
                                                  model.input_names)

    # If embedding_layer_names are not provided, get all of the embedding
    # layers from the model.
    embeddings_layer_names = self.embeddings_layer_names
    if not embeddings_layer_names:
      embeddings_layer_names = [
          layer.name
          for layer in self.model.layers
          if type(layer).__name__ == 'Embedding'
      ]

    self.assign_embeddings = []
    embeddings_vars = {}

    self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
    self.step = step = array_ops.placeholder(dtypes.int32)

    for layer in self.model.layers:
      if layer.name in embeddings_layer_names:
        embedding_input = self.model.get_layer(layer.name).output
        embedding_size = np.prod(embedding_input.shape[1:])
        embedding_input = array_ops.reshape(embedding_input,
                                            (step, int(embedding_size)))
        shape = (self.embeddings_data[0].shape[0], int(embedding_size))
        embedding = variables.Variable(
            array_ops.zeros(shape), name=layer.name + '_embedding')
        embeddings_vars[layer.name] = embedding
        batch = state_ops.assign(embedding[batch_id:batch_id + step],
                                 embedding_input)
        self.assign_embeddings.append(batch)

    self.saver = saver.Saver(list(embeddings_vars.values()))

    # Create embeddings_metadata dictionary
    if isinstance(self.embeddings_metadata, str):
      embeddings_metadata = {
          layer_name: self.embeddings_metadata
          for layer_name in embeddings_vars
      }
    else:
      # If embedding_metadata is already a dictionary
      embeddings_metadata = self.embeddings_metadata

    try:
      from tensorboard.plugins import projector
    except ImportError:
      raise ImportError('Failed to import TensorBoard. Please make sure that '
                        'TensorBoard integration is complete.')

    # TODO(psv): Add integration tests to test embedding visualization
    # with TensorBoard callback. We are unable to write a unit test for this
    # because TensorBoard dependency assumes TensorFlow package is installed.
    config = projector.ProjectorConfig()
    for layer_name, tensor in embeddings_vars.items():
      embedding = config.embeddings.add()
      embedding.tensor_name = tensor.name

      if (embeddings_metadata is not None and
          layer_name in embeddings_metadata):
        embedding.metadata_path = embeddings_metadata[layer_name]

    projector.visualize_embeddings(self.writer, config)
def _experimental_fit_loop(
    model,
    iterator,
    epochs=100,
    verbose=1,
    callbacks=None,
    initial_epoch=0,
    steps_per_epoch=None,
    val_iterator=None,
    validation_steps=None):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Required; a `ValueError` is raised when it is `None`.
      val_iterator: Iterator for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Validation is skipped when this is falsy.

  Returns:
      `model.history`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  current_strategy = model._distribution_strategy

  K.get_session().run(current_strategy.initialize())

  def _per_device_fit_function(model):
    model._make_fit_function()
    return (model._fit_function.inputs, model._fit_function.outputs,
            model._fit_function.updates_op, model._fit_function.session_kwargs)

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(1)
  out_labels = model.metrics_names or []

  def step_fn(ctx, inputs, targets):
    """Clones the model and calls make_fit_function."""
    # TODO(priyag, sourabhbajaj): The model gets cloned every time
    # fit/test/predict is called. We should look into caching this keyed on
    # input shapes.
    clone_model_on_replicas(
        model,
        current_strategy,
        make_callback_model=True,
        inputs=inputs,
        targets=targets,
        mode=_Mode.TRAIN)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_replica(
         _per_device_fit_function, args=(model._grouped_model_train,))
    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs,
         grouped_updates, grouped_session_args)
    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_fit_function',
        **all_session_args)

    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        aggregation = distribute_lib.get_loss_reduction()
      else:
        # We aggregate all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        aggregation = variable_scope.VariableAggregation.MEAN
      ctx.set_last_step_output(label, output, aggregation)

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  if steps_per_epoch is None:
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model.')
  steps_per_run = K.variable(
      value=min(steps_per_epoch, current_strategy.steps_per_run),
      dtype='int32',
      name='steps_per_run')

  with current_strategy.scope():
    ctx = current_strategy.run_steps_on_dataset(
        step_fn, iterator, iterations=steps_per_run,
        initial_loop_values=initial_loop_values)

  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(model._grouped_model_train)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      val_inputs=None,
      val_targets=None,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose)

  # Calculate the steps each time on the device.
  steps_to_run = [current_strategy.steps_per_run] * (
      steps_per_epoch // current_strategy.steps_per_run)
  if steps_per_epoch % current_strategy.steps_per_run:
    steps_to_run.append(steps_per_epoch % current_strategy.steps_per_run)

  callbacks.on_train_begin()
  for epoch in range(initial_epoch, epochs):
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    for step_count in steps_to_run:
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks.on_batch_begin(step_index, batch_logs)
      # Only reload the on-device step count when it actually changes.
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.get_session().run([train_op, output_tensors])
      except errors.OutOfRangeError:
        # The batch count is passed as a logging argument. The previous
        # `msg % steps_per_epoch * epochs` formatted only `steps_per_epoch`
        # and then repeated the whole message `epochs` times.
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).',
                        steps_per_epoch * epochs)
        break

      batch_logs.update(outputs)
      callbacks.on_batch_end(step_index, batch_logs)
      step_index = step_index + step_count
      if callbacks.model.stop_training:
        break

    if do_validation:
      logging.info('Running validation at fit epoch: %s', epoch)

      # Since we create a new clone from the original model we need to copy
      # the weights back to the original model before we can run validation.
      with current_strategy.scope():
        updated_weights = current_strategy.unwrap(
            model._grouped_model_train)[0].get_weights()
        model.set_weights(updated_weights)

      val_outs = _experimental_test_loop(
          model, val_iterator, steps=validation_steps, verbose=verbose,
          initialize_finalize_strategy=False)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks.on_train_end()

  # Copy the weights back from the replicated model to the original model.
  with current_strategy.scope():
    updated_weights = current_strategy.unwrap(
        model._grouped_model_train)[0].get_weights()
    model.set_weights(updated_weights)

  K.get_session().run(current_strategy.finalize())
  return model.history
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. First passed through
          `training_utils.infer_steps_for_dataset`; if the result is still
          `None`, each epoch runs until the iterator raises
          `OutOfRangeError`, which is only allowed when the strategy's
          `steps_per_run` is 1.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Validation is skipped when this is falsy.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      `model.history`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  steps_per_epoch = training_utils.infer_steps_for_dataset(
      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
  if (current_strategy.extended.steps_per_run != 1 and
      steps_per_epoch is None):
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model with TPUStrategy when '
                     '`steps_per_run` != 1 .')

  # The scope is entered manually here and exited just before returning.
  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  def _per_device_fit_function(model):
    model._make_fit_function()
    return (model._fit_function.inputs, model._fit_function.outputs,
            model._fit_function.updates_op, model._fit_function.session_kwargs)

  out_labels = model.metrics_names or []

  def step_fn(ctx, inputs):
    """Clones the model and calls make_fit_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode, inputs=inputs, targets=targets)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_fit_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.TRAIN),))
    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs,
         grouped_updates, grouped_session_args)
    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_fit_function',
        **all_session_args)

    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        reduce_op = ds_reduce_util.ReduceOp.SUM
      else:
        # We reduce all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  use_steps = steps_per_epoch is not None
  if use_steps:
    iteration_value = min(steps_per_epoch,
                          current_strategy.extended.steps_per_run)
  else:
    iteration_value = current_strategy.extended.steps_per_run

  steps_per_run = K.variable(
      value=iteration_value,
      dtype='int32',
      name='steps_per_run')
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)
  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  if use_steps:
    steps_to_run = ([current_strategy.extended.steps_per_run] *
                    (steps_per_epoch //
                     current_strategy.extended.steps_per_run))
    if steps_per_epoch % current_strategy.extended.steps_per_run:
      steps_to_run.append(
          steps_per_epoch % current_strategy.extended.steps_per_run)
    target_steps = len(steps_to_run)
  else:
    # Unknown epoch length: run until the iterator is exhausted.
    target_steps = np.inf

  callbacks._call_begin_hook(mode)
  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step] if use_steps else 1
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      # Only reload the on-device step count when it actually changes.
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.get_session().run([train_op, output_tensors])
      except errors.OutOfRangeError:
        if use_steps:
          # The batch count is passed as a logging argument. The previous
          # `msg % steps_per_epoch * epochs` formatted only `steps_per_epoch`
          # and then repeated the whole message `epochs` times.
          logging.warning('Your dataset iterator ran out of data; '
                          'interrupting training. Make sure that your dataset '
                          'can generate at least `steps_per_epoch * epochs` '
                          'batches (in this case, %d batches).',
                          steps_per_epoch * epochs)
        else:
          # The number of steps completed so far becomes the epoch length for
          # the remaining epochs, and the iterator is re-initialized.
          target_steps = current_step
          logging.info('Dataset iterator ran out of data. Inferring the '
                       'value of `steps_per_epoch` as %s .', target_steps)
          distributed_training_utils.initialize_iterator(iterator,
                                                         current_strategy)
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Builds a one-iteration distributed predict op via
  `current_strategy.run_steps_on_dataset`, copies the weights of `model` to
  the replicated model, runs the op `steps` times in the Keras session and
  concatenates what was collected for every output.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1. A progress bar is shown only
        when `verbose == 1`.
      steps: Total number of steps (batches of samples) to run. Must not be
        `None`.

  Returns:
      Array of predictions (if the model has a single output) or list
      of arrays of predictions (if the model has multiple outputs).

  Raises:
      AssertionError: if `steps` is `None`.
  """
  current_strategy = model._distribution_strategy
  K.get_session().run(current_strategy.initialize())

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def _per_device_predict_function(replica_model):
    # Build the predict function on the replica and expose its pieces so they
    # can be merged into one combined function below.
    replica_model._make_predict_function()
    return (replica_model.predict_function.inputs,
            replica_model.predict_function.outputs,
            replica_model.predict_function.updates_op,
            replica_model.predict_function.session_kwargs)

  def step_fn(ctx, *inputs):
    """Clones the model and calls make_predict_function."""

    # TODO(priyag, sourabhbajaj): The model gets cloned every time
    # fit/test/predict is called. We should look into caching this keyed on
    # input shapes.
    clone_model_on_replicas(
        model,
        current_strategy,
        make_callback_model=False,
        inputs=inputs,
        mode=_Mode.PREDICT)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_replica(
         _per_device_predict_function,
         args=(model._grouped_model_predict,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    # Register every model output under its name so it can be fetched from
    # `ctx.last_step_outputs` after the step has run.
    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Add initial dummy values for outputs.
  initial_loop_values = {}
  batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
  for name, tensor in zip(model.output_names, model.outputs):
    # TODO(priyag): This is a workaround as we do not know the batch dimension
    # of the model's output at this point.
    shape = tensor_shape.TensorShape(tensor.shape.dims)
    shape.dims = [batch_dimension] + shape.dims[1:]
    initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
    ctx = current_strategy.run_steps_on_dataset(
        step_fn,
        iterator,
        iterations=1,
        initial_loop_values=initial_loop_values)

  predict_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  # The progress bar only exists for `verbose == 1`; every later use checks
  # for `None` so that other verbosity values cannot hit an unbound name.
  progbar = Progbar(target=steps) if verbose == 1 else None

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(
        model._grouped_model_predict)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)

  assert steps is not None
  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  unconcatenated_outs = [[] for _ in model.outputs]
  for step in range(steps):
    _, batch_outs = K.get_session().run([predict_op, output_tensors])
    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i, label in enumerate(model.output_names):
      unconcatenated_outs[i].extend(batch_outs[label])
    if progbar is not None:
      progbar.update(step + 1)

  K.get_session().run(current_strategy.finalize())

  if len(unconcatenated_outs) == 1:
    return np.concatenate(unconcatenated_outs[0], axis=0)
  return [np.concatenate(outs, axis=0) for outs in unconcatenated_outs]
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1. A progress bar is shown only
        when `verbose == 1`.
      steps: Total number of steps (batches of samples) to evaluate. The value
        is first passed through `training_utils.infer_steps_for_dataset`; if
        it is still `None` afterwards, evaluation runs until the iterator
        raises `OutOfRangeError`.
      callbacks: List of callbacks to be called during evaluation.

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
  mode = ModeKeys.TEST
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  steps = training_utils.infer_steps_for_dataset(
      dataset, steps, steps_name='steps')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  # Use the scope as a context manager so it is also exited when graph
  # construction or a session run raises.
  with scope:

    def _per_device_eval_function(replica_model):
      replica_model._make_eval_function()
      return (replica_model._eval_function.inputs,
              replica_model._eval_function.outputs,
              replica_model._eval_function.updates_op,
              replica_model._eval_function.session_kwargs)

    def step_fn(ctx, inputs):
      """Clones the model and calls make_eval_function."""
      inputs, targets = inputs
      if model._compile_distribution:
        distributed_training_utils.clone_model_on_replicas(
            model, current_strategy, mode=mode, inputs=inputs,
            targets=targets)
      else:
        distributed_training_utils._build_distributed_network(
            model, current_strategy, mode, inputs, targets)

      (grouped_inputs, grouped_outputs, grouped_updates,
       grouped_session_args) = current_strategy.extended.call_for_each_replica(
           _per_device_eval_function,
           args=(distributed_training_utils.get_distributed_model(
               model, ModeKeys.TEST),))

      (all_inputs, all_outputs, all_updates,
       all_session_args) = distributed_training_utils.unwrap_values(
           current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
           grouped_session_args)

      combined_fn = K.function(
          all_inputs,
          all_outputs,
          updates=all_updates,
          name='distributed_test_function',
          **all_session_args)

      for label, output in zip(model.metrics_names, combined_fn.outputs):
        if label == 'loss':
          reduce_op = ds_reduce_util.ReduceOp.SUM
        else:
          # We reduce all other metrics using mean for now. This is temporary
          # workaround until new metrics are in place.
          reduce_op = ds_reduce_util.ReduceOp.MEAN
        ctx.set_last_step_output(label, output, reduce_op)

      return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
      tensor = model._all_stateful_metrics_tensors[name]
      initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    # TODO(priyag): Use steps_per_run when we use new metrics as they will
    # allow handling metric computation at each step using variables.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=1,
        initial_loop_values=initial_loop_values)

    test_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    # The progress bar only exists for `verbose == 1`; every later use checks
    # for `None` so that other verbosity values (e.g. 2, forwarded from
    # `fit`) cannot hit an unbound name.
    progbar = Progbar(target=steps) if verbose == 1 else None

    if model._compile_distribution:
      distributed_training_utils._copy_weights_to_distributed_model(
          model, mode)

    distributed_training_utils._reset_metrics(model)

    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=False,
        epochs=1,
        steps_per_epoch=steps,
        verbose=verbose,
        count_mode='steps',
        mode=ModeKeys.TEST)
    callbacks._call_begin_hook(mode)

    outs = [0.] * len(model.metrics_names)
    # With an unknown step count, loop until the iterator is exhausted.
    target_steps = steps if steps is not None else np.inf

    current_step = 0
    while current_step < target_steps:
      batch_logs = {'batch': current_step, 'size': 1}
      callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
      try:
        _, batch_outs = K.get_session().run([test_op, output_tensors])
      except errors.OutOfRangeError:
        if steps is not None:
          # Parenthesized so both literal pieces belong to one expression.
          warning_msg = (
              'Make sure that your dataset can generate at least '
              '`steps` batches (in this case, {} batches).'.format(steps))
        else:
          warning_msg = 'Number of steps ran: {} steps'.format(current_step)

        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting evaluation. ' + warning_msg)
        # Only the steps that actually ran count towards the loss average.
        target_steps = current_step
        break

      for i, label in enumerate(model.metrics_names):
        if i == 0:
          # Loss is stateless metrics.
          outs[i] += batch_outs[label]
        else:
          # For all stateful metrics, the aggregation is handled by mirrored
          # vars.
          outs[i] = batch_outs[label]

      batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
      callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
      if progbar is not None:
        progbar.update(current_step + 1)
      current_step += 1

    callbacks._call_end_hook(mode)

  # The loss was summed over the executed steps; turn it into an average.
  # An empty `outs` (no metric names) has no loss entry to divide.
  if outs:
    outs[0] /= target_steps

  if len(outs) == 1:
    return outs[0]
  return outs
def main(argv):
  """Builds, trains and evaluates a Keras ResNet-50 model.

  The model is converted to a TPU model when `FLAGS.use_tpu` is set. Without
  `FLAGS.data` it is trained and evaluated on one batch of random data;
  otherwise it is trained and evaluated through `imagenet_input.ImageNetInput`
  and, when `HAS_H5PY` is true, its weights are saved.

  Args:
    argv: Command-line arguments; not read by this function.
  """
  logging.info('Building Keras ResNet-50 model')
  net = tf.keras.applications.resnet50.ResNet50(
      include_top=True,
      weights=None,
      input_tensor=None,
      input_shape=None,
      pooling=None,
      classes=NUM_CLASSES)

  # NOTE(review): `session_master` is assigned but not read anywhere in this
  # function; it is kept so the `resolver.master()` call still happens.
  session_master = ''
  if FLAGS.use_tpu:
    logging.info('Converting from CPU to TPU model.')
    cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu)
    tpu_strategy = tf.contrib.tpu.TPUDistributionStrategy(cluster_resolver)
    net = tf.contrib.tpu.keras_to_tpu_model(net, strategy=tpu_strategy)
    session_master = cluster_resolver.master()

  logging.info('Compiling model.')
  sgd = tf.keras.optimizers.SGD(
      lr=BASE_LEARNING_RATE, momentum=0.9, nesterov=True)
  net.compile(
      optimizer=sgd,
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])

  fit_callbacks = [LearningRateBatchScheduler(schedule=learning_rate_schedule)]
  if FLAGS.model_dir:
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir)
    fit_callbacks.append(tensorboard)

  # Synthetic-data path: train and evaluate on one random batch, then stop.
  if FLAGS.data is None:
    image_shape = (BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3)
    fake_images = np.random.randn(*image_shape).astype(np.float32)
    fake_labels = np.random.randint(
        NUM_CLASSES, size=BATCH_SIZE, dtype=np.int32)
    logging.info('Training model using synthetica data.')
    net.fit(
        fake_images,
        fake_labels,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=fit_callbacks)
    logging.info('Evaluating the model on synthetic data.')
    net.evaluate(fake_images, fake_labels, verbose=0)
    return

  # Real-data path.
  train_input = imagenet_input.ImageNetInput(
      is_training=True,
      use_bfloat16=FLAGS.use_bfloat16,
      data_dir=FLAGS.data,
      per_core_batch_size=PER_CORE_BATCH_SIZE)
  logging.info('Training model using real data in directory "%s".',
               FLAGS.data)
  net.fit(
      train_input.input_fn,
      epochs=EPOCHS,
      steps_per_epoch=TRAINING_STEPS_PER_EPOCH,
      callbacks=fit_callbacks)

  logging.info('Evaluating the model on the validation dataset.')
  if not FLAGS.eval_top_5_accuracy:
    eval_input = imagenet_input.ImageNetInput(
        is_training=False,
        use_bfloat16=FLAGS.use_bfloat16,
        data_dir=FLAGS.data,
        per_core_batch_size=PER_CORE_BATCH_SIZE)
    score = net.evaluate(eval_input.input_fn, steps=EVAL_STEPS, verbose=1)
  else:
    logging.info('Evaluating top 1 and top 5 accuracy using a Python '
                 'generator.')
    # We feed the inputs from a Python generator, so we need to build a single
    # batch for all of the cores, which will be split on TPU.
    eval_input = imagenet_input.ImageNetInput(
        is_training=False,
        use_bfloat16=FLAGS.use_bfloat16,
        data_dir=FLAGS.data,
        per_core_batch_size=BATCH_SIZE)
    eval_batches = eval_input.evaluation_generator(K.get_session())
    score = eval_utils.multi_top_k_accuracy(net, eval_batches, EVAL_STEPS)
  print('Evaluation score', score)

  if HAS_H5PY:
    # Fall back to /tmp when no model directory was configured.
    output_dir = FLAGS.model_dir if FLAGS.model_dir else '/tmp'
    weights_file = os.path.join(output_dir, WEIGHTS_TXT)
    logging.info('Save weights into %s', weights_file)
    net.save_weights(weights_file, overwrite=True)
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. The value is first passed through
          `training_utils.infer_steps_for_dataset`; if it is still `None`
          afterwards, every epoch runs one step at a time until the iterator
          raises `OutOfRangeError`, and the number of steps that ran is used
          as the step count for the following epochs.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for. Validation is
          only performed when this value is truthy.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      The `model.history` object.

  Raises:
      ValueError: if `steps_per_epoch` is unknown while the strategy's
          `steps_per_run` is not 1.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  steps_per_epoch = training_utils.infer_steps_for_dataset(
      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
  if (current_strategy.extended.steps_per_run != 1 and
      steps_per_epoch is None):
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model with TPUStrategy when '
                     '`steps_per_run` != 1 .')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  # Use the scope as a context manager so it is also exited when graph
  # construction, a callback or a session run raises.
  with scope:
    out_labels = model.metrics_names or []

    step_fn = _make_step_fn(model, ModeKeys.TRAIN, current_strategy,
                            out_labels)

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
      tensor = model._all_stateful_metrics_tensors[name]
      initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    use_steps = steps_per_epoch is not None
    if use_steps:
      iteration_value = min(steps_per_epoch,
                            current_strategy.extended.steps_per_run)
    else:
      iteration_value = current_strategy.extended.steps_per_run

    # Backend variable holding the number of iterations per session run; it
    # is reloaded below whenever the next run needs a different count.
    steps_per_run = K.variable(
        value=iteration_value, dtype='int32', name='steps_per_run')
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=steps_per_run,
        initial_loop_values=initial_loop_values)
    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    do_validation = bool(validation_steps)

    if model._compile_distribution:
      distributed_training_utils._copy_weights_to_distributed_model(
          model, mode)

    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=do_validation,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        verbose=verbose,
        count_mode='steps',
        mode=mode)

    # Calculate the steps each time on the device.
    if use_steps:
      # Full-sized runs first, then one shorter run for the remainder.
      steps_to_run = (
          [current_strategy.extended.steps_per_run] *
          (steps_per_epoch // current_strategy.extended.steps_per_run))
      if steps_per_epoch % current_strategy.extended.steps_per_run:
        steps_to_run.append(
            steps_per_epoch % current_strategy.extended.steps_per_run)
      target_steps = len(steps_to_run)
    else:
      target_steps = np.inf

    callbacks._call_begin_hook(mode)
    for epoch in range(initial_epoch, epochs):
      distributed_training_utils._reset_metrics(model)
      callbacks.on_epoch_begin(epoch)
      epoch_logs = {}
      step_index = 0
      prev_step_count = None
      current_step = 0
      while current_step < target_steps:
        step_count = steps_to_run[current_step] if use_steps else 1
        batch_logs = {
            'batch': step_index,
            'size': 1,
            'num_steps': step_count
        }
        callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
        if prev_step_count is None or step_count != prev_step_count:
          steps_per_run.load(step_count, K.get_session())
          prev_step_count = step_count
        try:
          _, outputs = K.batch_get_value([train_op, output_tensors])
        except errors.OutOfRangeError:
          if use_steps:
            # The product is passed as a logging argument: with
            # `msg % steps_per_epoch * epochs` the `%` binds first and the
            # formatted message is then repeated `epochs` times.
            logging.warning(
                'Your dataset iterator ran out of data; '
                'interrupting training. Make sure that your dataset '
                'can generate at least `steps_per_epoch * epochs` '
                'batches (in this case, %d batches).',
                steps_per_epoch * epochs)
          else:
            target_steps = current_step
            logging.info(
                'Dataset iterator ran out of data. Inferring the '
                'value of `steps_per_epoch` as %s .', target_steps)
            distributed_training_utils.initialize_iterator(
                iterator, current_strategy)
          break

        batch_logs.update(outputs)
        callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
        step_index = step_index + step_count
        current_step += 1

        if callbacks.model.stop_training:
          break

      if (do_validation and
          training_utils.should_run_validation(validation_freq, epoch)):
        logging.info('Running validation at fit epoch: %s', epoch)

        if model._compile_distribution:
          # Since we create a new clone from the original model we need to
          # copy the weights back to the original model before we can run
          # validation.
          distributed_training_utils._copy_weights_to_original_model(
              model, ModeKeys.TRAIN)

        val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
            model,
            val_dataset,
            steps=validation_steps,
            verbose=verbose,
            callbacks=callbacks)
        if not isinstance(val_outs, list):
          val_outs = [val_outs]
        # Same labels assumed.
        for label, val_out in zip(out_labels, val_outs):
          epoch_logs['val_' + label] = val_out

      callbacks.on_epoch_end(epoch, epoch_logs)
      if callbacks.model.stop_training:
        break
    callbacks._call_end_hook(mode)

    if model._compile_distribution:
      # Copy the weights back from the replicated model to the original model.
      distributed_training_utils._copy_weights_to_original_model(
          model, ModeKeys.TRAIN)

  return model.history
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Required; `None` raises `ValueError`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for. Validation is
          only performed when this value is truthy.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      The `model.history` object.

  Raises:
      ValueError: if `steps_per_epoch` is `None`.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  # Use the scope as a context manager so it is also exited on error,
  # including the `ValueError` raised below for a missing `steps_per_epoch`.
  with scope:

    def _per_device_fit_function(replica_model):
      replica_model._make_fit_function()
      return (replica_model._fit_function.inputs,
              replica_model._fit_function.outputs,
              replica_model._fit_function.updates_op,
              replica_model._fit_function.session_kwargs)

    out_labels = model.metrics_names or []

    def step_fn(ctx, inputs):
      """Clones the model and calls make_fit_function."""
      inputs, targets = inputs
      if model._compile_distribution:
        distributed_training_utils.clone_model_on_replicas(
            model, current_strategy, mode, inputs=inputs, targets=targets)
      else:
        distributed_training_utils._build_distributed_network(
            model, current_strategy, mode, inputs, targets)

      (grouped_inputs, grouped_outputs, grouped_updates,
       grouped_session_args) = current_strategy.extended.call_for_each_replica(
           _per_device_fit_function,
           args=(distributed_training_utils.get_distributed_model(
               model, ModeKeys.TRAIN),))

      (all_inputs, all_outputs, all_updates,
       all_session_args) = distributed_training_utils.unwrap_values(
           current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
           grouped_session_args)

      combined_fn = K.function(
          all_inputs,
          all_outputs,
          updates=all_updates,
          name='distributed_fit_function',
          **all_session_args)

      for label, output in zip(out_labels, combined_fn.outputs):
        if label == 'loss':
          reduce_op = ds_reduce_util.ReduceOp.SUM
        else:
          # We reduce all other metrics using mean for now. This is temporary
          # workaround until new metrics are in place.
          reduce_op = ds_reduce_util.ReduceOp.MEAN
        ctx.set_last_step_output(label, output, reduce_op)

      # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
      # feed_dict, session kwargs, run options, run_metadata for now. These
      # should be handled appropriately
      return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
      tensor = model._all_stateful_metrics_tensors[name]
      initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    if steps_per_epoch is None:
      raise ValueError('`steps_per_epoch` should be specified when calling '
                       '`fit` on the model.')
    # Backend variable holding the number of iterations per session run; it
    # is reloaded below whenever the next run needs a different count.
    steps_per_run = K.variable(
        value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
        dtype='int32',
        name='steps_per_run')

    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=steps_per_run,
        initial_loop_values=initial_loop_values)

    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    do_validation = bool(validation_steps)

    if model._compile_distribution:
      distributed_training_utils._copy_weights_to_distributed_model(
          model, mode)

    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=do_validation,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        verbose=verbose,
        count_mode='steps',
        mode=mode)

    # Calculate the steps each time on the device: full-sized runs first,
    # then one shorter run for the remainder.
    steps_to_run = [current_strategy.extended.steps_per_run] * (
        steps_per_epoch // current_strategy.extended.steps_per_run)
    if steps_per_epoch % current_strategy.extended.steps_per_run:
      steps_to_run.append(
          steps_per_epoch % current_strategy.extended.steps_per_run)

    callbacks._call_begin_hook(mode)
    for epoch in range(initial_epoch, epochs):
      distributed_training_utils._reset_metrics(model)
      callbacks.on_epoch_begin(epoch)
      epoch_logs = {}
      step_index = 0
      prev_step_count = None
      for step_count in steps_to_run:
        batch_logs = {
            'batch': step_index,
            'size': 1,
            'num_steps': step_count
        }
        callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
        if prev_step_count is None or step_count != prev_step_count:
          steps_per_run.load(step_count, K.get_session())
          prev_step_count = step_count
        try:
          _, outputs = K.get_session().run([train_op, output_tensors])
        except errors.OutOfRangeError:
          # The product is passed as a logging argument: with
          # `msg % steps_per_epoch * epochs` the `%` binds first and the
          # formatted message is then repeated `epochs` times.
          logging.warning(
              'Your dataset iterator ran out of data; '
              'interrupting training. Make sure that your dataset '
              'can generate at least `steps_per_epoch * epochs` '
              'batches (in this case, %d batches).',
              steps_per_epoch * epochs)
          break

        batch_logs.update(outputs)
        callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
        step_index = step_index + step_count
        if callbacks.model.stop_training:
          break

      if (do_validation and
          training_utils.should_run_validation(validation_freq, epoch)):
        logging.info('Running validation at fit epoch: %s', epoch)

        if model._compile_distribution:
          # Since we create a new clone from the original model we need to
          # copy the weights back to the original model before we can run
          # validation.
          distributed_training_utils._copy_weights_to_original_model(
              model, ModeKeys.TRAIN)

        val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
            model,
            val_dataset,
            steps=validation_steps,
            verbose=verbose,
            callbacks=callbacks)
        if not isinstance(val_outs, list):
          val_outs = [val_outs]
        # Same labels assumed.
        for label, val_out in zip(out_labels, val_outs):
          epoch_logs['val_' + label] = val_out

      callbacks.on_epoch_end(epoch, epoch_logs)
      if callbacks.model.stop_training:
        break
    callbacks._call_end_hook(mode)

    if model._compile_distribution:
      # Copy the weights back from the replicated model to the original model.
      distributed_training_utils._copy_weights_to_original_model(
          model, ModeKeys.TRAIN)

  return model.history
def save_graph(self, path):
  """Writes the Keras backend graph as text to a file named "model" in `path`.

  Args:
    path: Directory passed to `tf.train.write_graph` as the log directory.
  """
  # NOTE(review): the session handle is not used by the write below; the
  # lookup is kept so the same backend calls are made as before. Confirm
  # whether it is actually required.
  _session = K.get_session()
  tf.train.write_graph(
      graph_or_graph_def=K.get_graph(),
      logdir=path,
      name="model",
      as_text=True)
def experimental_tpu_predict_loop(model,
                                  dataset,
                                  verbose=0,
                                  steps=None,
                                  callbacks=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  When the dataset's shape is not fully defined, batches are padded through a
  `padding_util.PartialBatchPaddingHandler` and re-batched with
  `drop_remainder=True`; the handler's mask is applied to the predictions
  before they are returned.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1. A progress bar is shown only
        when `verbose == 1`.
      steps: Total number of steps (batches of samples) to run. Must not be
        `None`.
      callbacks: List of callbacks to be called during prediction.

  Returns:
      Array of predictions (if the model has a single output) or list
      of arrays of predictions (if the model has multiple outputs).

  Raises:
      AssertionError: if `steps` is `None`.
  """
  mode = ModeKeys.PREDICT
  dataset_fully_shaped = (
      distributed_training_utils.is_dataset_shape_fully_defined(dataset))
  padding_handler = None
  if not dataset_fully_shaped:
    # TODO(hongjunchoi): Investigate whether operations from
    # PartialBatchPaddingHandler are unnecessarily pruned out
    # during graph optimization.
    padding_handler = padding_util.PartialBatchPaddingHandler(
        model._feed_output_shapes)
    batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset)
    padding_handler.padded_batch_size = batch_size
    padding_handler.padding_mask = dataset.reduce(
        padding_handler.padding_mask, padding_handler.update_mask)

    dataset = dataset.map(padding_handler.pad_batch)
    dataset = dataset.apply(batching.unbatch())
    # Upon this point, it is guaranteed that the dataset does not
    # have partial batches. Thus, we set `drop_remainder=True` to
    # get static shape information about the elements in the dataset.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    if prefetch_buffer is not None:
      dataset = dataset.prefetch(prefetch_buffer)

  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  # Use the scope as a context manager so it is also exited when graph
  # construction, a callback or a session run raises.
  with scope:

    def _per_device_predict_function(replica_model):
      replica_model._make_predict_function()
      return (replica_model.predict_function.inputs,
              replica_model.predict_function.outputs,
              replica_model.predict_function.updates_op,
              replica_model.predict_function.session_kwargs)

    def step_fn(ctx, inputs):
      """Clones the model and calls make_predict_function."""
      if model._compile_distribution:
        distributed_training_utils.clone_model_on_replicas(
            model, current_strategy, mode, inputs=inputs)
      else:
        distributed_training_utils._build_distributed_network(
            model, current_strategy, mode, inputs)

      (grouped_inputs, grouped_outputs, grouped_updates,
       grouped_session_args) = current_strategy.extended.call_for_each_replica(
           _per_device_predict_function,
           args=(distributed_training_utils.get_distributed_model(
               model, ModeKeys.PREDICT),))

      (all_inputs, all_outputs, all_updates,
       all_session_args) = distributed_training_utils.unwrap_values(
           current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
           grouped_session_args)

      combined_fn = K.function(
          all_inputs,
          all_outputs,
          updates=all_updates,
          name='distributed_predict_function',
          **all_session_args)

      for label, output in zip(model.output_names, combined_fn.outputs):
        ctx.set_last_step_output(label, output)

      return combined_fn.updates_op

    # Add initial dummy values for outputs.
    initial_loop_values = {}
    batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
    for name, tensor in zip(model.output_names, model.outputs):
      # TODO(priyag): This is a workaround as we do not know the batch
      # dimension of the model's output at this point.
      shape = tensor_shape.TensorShape(tensor.shape.dims)
      shape.dims = [batch_dimension] + shape.dims[1:]
      initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=1,
        initial_loop_values=initial_loop_values)

    predict_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    # The progress bar only exists for `verbose == 1`; every later use checks
    # for `None` so that other verbosity values cannot hit an unbound name.
    progbar = Progbar(target=steps) if verbose == 1 else None

    if model._compile_distribution:
      distributed_training_utils._copy_weights_to_distributed_model(
          model, mode)

    distributed_training_utils._reset_metrics(model)

    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=False,
        epochs=1,
        steps_per_epoch=steps,
        verbose=verbose,
        count_mode='steps',
        mode=mode)
    callbacks._call_begin_hook(mode)

    assert steps is not None
    # Since we do not know how many samples we will see, we cannot
    # pre-allocate the returned Numpy arrays. Instead, we store one array per
    # batch seen and concatenate them upon returning.
    unconcatenated_outs = [[] for _ in model.outputs]
    for step in range(steps):
      batch_logs = {'batch': step, 'size': 1}
      callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
      _, batch_outs = K.get_session().run([predict_op, output_tensors])
      # TODO(priyag): maybe need to unwrap the outputs first for
      # MirroredStrategy.
      for i, label in enumerate(model.output_names):
        unconcatenated_outs[i].extend(batch_outs[label])
      batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
      callbacks._call_batch_hook(mode, 'end', step, batch_logs)
      if progbar is not None:
        progbar.update(step + 1)

    callbacks._call_end_hook(mode)

  if len(unconcatenated_outs) == 1:
    prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
  else:
    prediction_result = [
        np.concatenate(outs, axis=0) for outs in unconcatenated_outs
    ]

  # Only set when the dataset had to be padded above.
  if padding_handler:
    prediction_result = padding_handler.apply_mask(prediction_result)

  return prediction_result
def _get_var_for_numpy(distribution_strategy, input_array):
    """Creates a variable and assigns the value of the numpy array to it.

    The variable is created (zero-initialized) on the device returned by
    `get_cpu_device` for the given strategy. The numpy array is then copied
    into it along the first (batch) axis in slices of roughly 64 MB, to avoid
    running into memory issues or RPC message limits.

    Args:
      distribution_strategy: The DistributionStrategy used to compile the
        model.
      input_array: The input numpy array whose value will be assigned to the
        variable we create. It is indexed along axis 0, so it must have at
        least one dimension.

    Returns:
      The variable to which the value of the input numpy array was copied.
    """
    # Target payload per copy: 64 MB.
    slice_byte_budget = 64 << 20

    with ops.device(get_cpu_device(distribution_strategy)):
        # Create the variable on the CPU device. This is the CPU device of the
        # host in the case of TPUDistributionStrategy.
        input_var = variables.VariableV1(
            array_ops.zeros(input_array.shape, input_array.dtype),
            trainable=False,
            use_resource=True)

    # Fetch the session once and reuse it for the initializer and for every
    # slice copy below instead of looking it up on each iteration.
    session = K.get_session()
    session.run(input_var.initializer)

    # Placeholders describing which rows [start, end) of the variable are
    # overwritten and the numpy data that is written there.
    start_placeholder = array_ops.placeholder(dtypes.int64, ())
    end_placeholder = array_ops.placeholder(dtypes.int64, ())
    slice_placeholder = array_ops.placeholder(input_var.dtype)
    assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
        slice_placeholder)

    num_batch_elements = input_array.shape[0]

    # Size in bytes of a single batch element (one entry along axis 0).
    byte_size_per_batch_element = (
        np.prod(input_array.shape[1:]) * input_var.dtype.size)

    # Number of batch elements copied per slice:
    #   ceil(64 MB / byte size per batch element).
    # If each batch element is > 64 MB, this is 1 and elements are copied
    # individually; otherwise each slice is < 128 MB. There might be padding
    # which might mean that the slices are 128 MB even if the size of the
    # tensor allocated is less than 128 MB. Using ceil() guarantees a
    # number >= 1.
    if byte_size_per_batch_element > 0:
        batch_size_per_slice = int(
            np.ceil(slice_byte_budget / byte_size_per_batch_element))
    else:
        # Zero-sized batch elements (a trailing dimension of 0) carry no
        # data. Dividing by zero here would yield inf and make int() raise
        # OverflowError, so copy everything in one slice instead.
        batch_size_per_slice = max(num_batch_elements, 1)

    # Copy slices of the above size starting at 0; the last slice may be
    # smaller.
    for start in range(0, num_batch_elements, batch_size_per_slice):
        end = min(start + batch_size_per_slice, num_batch_elements)
        session.run(
            assign_slice_op,
            feed_dict={
                start_placeholder: start,
                end_placeholder: end,
                slice_placeholder: input_array[start:end],
            })

    return input_var