def predict(self, model, x, batch_size=None, verbose=0, steps=None, callbacks=None, **kwargs): """Predict loop for Distribution Strategies.""" dist_utils.validate_inputs(x=x, y=None) batch_size, steps = self._process_batch_and_step_size( model, x, batch_size, steps, ModeKeys.PREDICT) batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) dataset = model._distribution_standardize_user_data( x, batch_size=batch_size, allow_partial_batch=True) if dist_utils.is_tpu_strategy(model._distribution_strategy): steps = training_utils.infer_steps_for_dataset(dataset, steps, steps_name='steps') if steps is None: raise ValueError( 'Number of steps could not be infered from the data, ' 'please pass the steps argument.') if not context.executing_eagerly(): return experimental_tpu_predict_loop(model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) return training_arrays.predict_loop(model, dataset, batch_size=batch_size, verbose=verbose, steps=steps, callbacks=callbacks)
def predict_distributed(model, x=None, batch_size=None, verbose=0, steps=None, callbacks=None): """Predict loop for Distribution Strategies.""" distributed_training_utils.validate_inputs(x, None, model._distribution_strategy, allow_partial_batch=True) first_x_value = nest.flatten(x)[0] if isinstance(first_x_value, np.ndarray): steps, batch_size = distributed_training_utils.get_input_params( model._distribution_strategy, first_x_value, steps, batch_size, mode=ModeKeys.PREDICT) batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) steps_name = 'steps' if isinstance(x, dataset_ops.DatasetV2): steps = training_utils.infer_steps_for_dataset(x, steps, steps_name=steps_name) dataset = model._distribution_standardize_user_data( x, batch_size=batch_size, check_steps=True, steps_name=steps_name, steps=steps, repeat=False, allow_partial_batch=True) if distributed_training_utils.is_tpu_strategy( model._distribution_strategy): return experimental_tpu_predict_loop(model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) else: return training_arrays.predict_loop(model, dataset, batch_size=batch_size, verbose=verbose, steps=steps, callbacks=callbacks)
def evaluate(self, model, x=None, y=None, batch_size=None, verbose=1, sample_weight=None, steps=None, callbacks=None, **kwargs): """Evaluate loop for Distribution Strategies.""" dist_utils.validate_inputs(x, y) batch_size, steps = dist_utils.process_batch_and_step_size( model._distribution_strategy, x, batch_size, steps, ModeKeys.TEST) batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) dataset = model._distribution_standardize_user_data( x, y, sample_weight=sample_weight, batch_size=batch_size, allow_partial_batch=True) if dist_utils.is_tpu_strategy(model._distribution_strategy): steps = training_utils.infer_steps_for_dataset(model, dataset, steps, steps_name='steps') if steps is None: raise ValueError( 'Number of steps could not be inferred from the data, ' 'please pass the steps argument.') if not context.executing_eagerly(): # Run TPU evaluation in a custom loop in graph mode. return experimental_tpu_test_loop(model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) return training_arrays_v1.test_loop(model, inputs=dataset, batch_size=batch_size, verbose=verbose, steps=steps, callbacks=callbacks)
def evaluate_distributed(model, x=None, y=None, batch_size=None, verbose=1, sample_weight=None, steps=None, callbacks=None): """Evaluate loop for Distribution Strategies.""" distributed_training_utils.validate_inputs(x, y, model._distribution_strategy) first_x_value = nest.flatten(x)[0] if isinstance(first_x_value, np.ndarray): steps, batch_size = distributed_training_utils.get_input_params( model._distribution_strategy, first_x_value, steps, batch_size) batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) steps_name = 'steps' if isinstance(x, dataset_ops.DatasetV2): steps = training_utils.infer_steps_for_dataset(x, steps, steps_name=steps_name) dataset = model._distribution_standardize_user_data( x, y, sample_weight=sample_weight, batch_size=batch_size, check_steps=True, steps_name=steps_name, steps=steps) if distributed_training_utils.is_tpu_strategy( model._distribution_strategy): return experimental_tpu_test_loop(model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) else: return training_arrays.test_loop(model, inputs=dataset, batch_size=batch_size, verbose=verbose, steps=steps, callbacks=callbacks)
def evaluate_distributed(model, x=None, y=None, batch_size=None, verbose=1, sample_weight=None, steps=None, callbacks=None): """Evaluate loop for Distribution Strategies.""" distributed_training_utils.validate_inputs(x, y, model._distribution_strategy) first_x_value = nest.flatten(x)[0] if isinstance(first_x_value, np.ndarray): steps, batch_size = distributed_training_utils.get_input_params( model._distribution_strategy, first_x_value, steps, batch_size) batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) steps_name = 'steps' if isinstance(x, dataset_ops.DatasetV2): steps = training_utils.infer_steps_for_dataset(x, steps, steps_name=steps_name) dataset = model._distribution_standardize_user_data( x, y, sample_weight=sample_weight, batch_size=batch_size, check_steps=True, steps_name=steps_name, steps=steps) if distributed_training_utils.is_tpu_strategy(model._distribution_strategy): return experimental_tpu_test_loop( model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) else: return training_arrays.test_loop( model, inputs=dataset, batch_size=batch_size, verbose=verbose, steps=steps, callbacks=callbacks)
def predict_distributed(model, x=None, batch_size=None, verbose=0, steps=None, callbacks=None): """Predict loop for Distribution Strategies.""" distributed_training_utils.validate_inputs( x, None, model._distribution_strategy, allow_partial_batch=True) first_x_value = nest.flatten(x)[0] if isinstance(first_x_value, np.ndarray): steps, batch_size = distributed_training_utils.get_input_params( model._distribution_strategy, first_x_value, steps, batch_size, mode=ModeKeys.PREDICT) batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) steps_name = 'steps' if isinstance(x, dataset_ops.DatasetV2): steps = training_utils.infer_steps_for_dataset(x, steps, steps_name=steps_name) dataset = model._distribution_standardize_user_data( x, batch_size=batch_size, check_steps=True, steps_name=steps_name, steps=steps, repeat=False, allow_partial_batch=True) if distributed_training_utils.is_tpu_strategy(model._distribution_strategy): return experimental_tpu_predict_loop( model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) else: return training_arrays.predict_loop( model, dataset, batch_size=batch_size, verbose=verbose, steps=steps, callbacks=callbacks)
def experimental_tpu_fit_loop(model, dataset, epochs=100, verbose=1, callbacks=None, initial_epoch=0, steps_per_epoch=None, val_dataset=None, validation_steps=None, validation_freq=1): """Fit loop for training with TPU tf.distribute.Strategy. Arguments: model: Keras Model instance. dataset: Dataset that returns inputs and targets epochs: Number of times to iterate over the data verbose: Integer, Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. val_dataset: Dataset for validation data. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. validation_freq: Only relevant if validation data is provided. Integer or `collections.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. Returns: Returns `None`. Raises: ValueError: in case of invalid arguments. """ mode = ModeKeys.TRAIN # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops. current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) steps_per_epoch = training_utils.infer_steps_for_dataset( dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch') scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=1) scope.__enter__() out_labels = model.metrics_names or [] step_fn = _make_train_step_fn(model, ModeKeys.TRAIN, current_strategy, out_labels) # Add initial dummy values for loss and other metric tensors. initial_loop_values = {} initial_loop_values['loss'] = constant_op.constant(1e7) for name in model.metrics_names[1:]: tensor = model._all_metrics_tensors[name] initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype) if steps_per_epoch is not None: iteration_value = min(steps_per_epoch, current_strategy.extended.steps_per_run) else: raise ValueError('Number of steps could not be infered from the data, ' 'please pass the steps_per_epoch argument.') steps_per_run = K.variable( value=iteration_value, dtype='int32', name='steps_per_run') ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=steps_per_run, initial_loop_values=initial_loop_values) train_op = ctx.run_op output_tensors = ctx.last_step_outputs do_validation = bool(validation_steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=verbose, count_mode='steps', mode=mode) # Calculate the steps each time on the device. steps_to_run = ([current_strategy.extended.steps_per_run] * (steps_per_epoch // current_strategy.extended.steps_per_run)) if steps_per_epoch % current_strategy.extended.steps_per_run: steps_to_run.append( steps_per_epoch % current_strategy.extended.steps_per_run) target_steps = len(steps_to_run) callbacks._call_begin_hook(mode) initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) for epoch in range(initial_epoch, epochs): distributed_training_utils._reset_metrics(model) callbacks.on_epoch_begin(epoch) epoch_logs = {} step_index = 0 prev_step_count = None current_step = 0 while current_step < target_steps: step_count = steps_to_run[current_step] batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count} callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs) if prev_step_count is None or step_count != prev_step_count: steps_per_run.load(step_count, K.get_session()) prev_step_count = step_count try: _, outputs = K.batch_get_value([train_op, output_tensors]) except errors.OutOfRangeError: logging.warning('Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your dataset ' 'can generate at least `steps_per_epoch * epochs` ' 'batches (in this case, %d batches).' % steps_per_epoch * epochs) break batch_logs.update(outputs) callbacks._call_batch_hook(mode, 'end', step_index, batch_logs) step_index = step_index + step_count current_step += 1 if callbacks.model.stop_training: break if (do_validation and training_utils.should_run_validation(validation_freq, epoch)): logging.info('Running validation at fit epoch: %s', epoch) if model._compile_distribution: # Since we create a new clone from the original model we need to copy # the weights back to the original model before we can run validation. distributed_training_utils._copy_weights_to_original_model( model, ModeKeys.TRAIN) val_outs = experimental_tpu_test_loop( # pylint: disable=undefined-variable model, val_dataset, steps=validation_steps, verbose=verbose, callbacks=callbacks) if not isinstance(val_outs, list): val_outs = [val_outs] # Same labels assumed. for label, val_out in zip(out_labels, val_outs): epoch_logs['val_' + label] = val_out callbacks.on_epoch_end(epoch, epoch_logs) if callbacks.model.stop_training: break callbacks._call_end_hook(mode) if model._compile_distribution: # Copy the weights back from the replicated model to the original model. distributed_training_utils._copy_weights_to_original_model( model, ModeKeys.TRAIN) scope.__exit__(None, None, None) return model.history
def model_iteration(model, inputs, targets=None, sample_weights=None, batch_size=None, epochs=1, verbose=1, callbacks=None, val_inputs=None, val_targets=None, val_sample_weights=None, shuffle=True, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1, mode=ModeKeys.TRAIN, validation_in_fit=False, prepared_feed_values_from_dataset=False, steps_name='steps', **kwargs): """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. Arguments: model: Keras Model instance. inputs: Either a list or dictionary of arrays, or a dataset instance. targets: List/dictionary of input arrays. sample_weights: Optional list of sample weight arrays. batch_size: Integer batch size or None if unknown. epochs: Number of times to iterate over the data verbose: 0, 1, or 2. Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch. Note that the progress bar is not particularly useful when logged to a file, so verbose=2 is recommended when not running interactively (eg, in a production environment). callbacks: List of callbacks to be called during training val_inputs: Either a list or dictionary of arrays, or a dataset instance. val_targets: List/dictionary of target arrays. val_sample_weights: Optional list of sample weight arrays. shuffle: Whether to shuffle the data at the beginning of each epoch concatenation of list the display names of the outputs of `f` and the list of display names of the outputs of `f_val`. initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. validation_freq: Only relevant if validation data is provided. Integer or `collections_abc.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. validation_in_fit: if true, then this method is invoked from within training iteration (for validation). In the case where `val_inputs` is a dataset, this flag indicates that its iterator and feed values are already created so should properly reuse resources. prepared_feed_values_from_dataset: if True, `inputs` is a list of feed tensors returned from `_prepare_feed_values` call on the validation dataset, so do not call it again on `inputs`. Should only be used for inline validation (i.e., only if `validation_in_fit` is also True). steps_name: The string name of the steps argument, either `steps`, `validation_steps`, or `steps_per_epoch`. Only used for error message formatting. **kwargs: Additional arguments for backwards compatibility. Returns: - In TRAIN mode: `History` object. - In TEST mode: Evaluation metrics. - In PREDICT mode: Outputs of the Model called on inputs. Raises: ValueError: in case of invalid arguments. """ # Backwards compatibility. if 'steps' in kwargs: steps_per_epoch = kwargs.pop('steps') if kwargs: raise TypeError('Unknown arguments: %s' % (kwargs, )) # In case we were passed a dataset, we extract symbolic tensors from it. reset_dataset_after_each_epoch = False input_iterator = None is_dataset = isinstance(inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)) # TODO(fchollet): consider moving `steps_per_epoch` inference to # _standardize_user_data and set reset_dataset_after_each_epoch as an # attribute on the dataset instance. if is_dataset: if steps_per_epoch is None: reset_dataset_after_each_epoch = True steps_per_epoch = training_utils.infer_steps_for_dataset( model, inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name) input_iterator = _get_iterator(inputs, model._distribution_strategy) # Enter tf.distribute.Strategy scope. if model._distribution_strategy: scope = distributed_training_utils.distributed_scope( strategy=model._distribution_strategy, learning_phase=(1 if mode == ModeKeys.TRAIN else 0)) scope.__enter__() use_steps = is_dataset or steps_per_epoch is not None do_validation = val_inputs is not None # Convert Eager Tensors to NumPy arrays to support batching/shuffling. inputs, targets, sample_weights = training_utils. \ convert_eager_tensors_to_numpy((inputs, targets, sample_weights)) # Prepare input data. inputs = input_iterator or inputs if validation_in_fit and prepared_feed_values_from_dataset: # When invoking validation in training loop, avoid creating iterator and # list of feed values for the same validation dataset multiple times (which # essentially would call `iterator.get_next()` that slows down execution and # leads to OOM errors eventually. ins = inputs else: ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode) # `ins` is a function when a distribute strategy is used in Eager mode. In # that case `is_dataset` is True. The code branches that have requirements # about the type of `ins` do not trigger in the distributed case. if not is_dataset: num_samples_or_steps = _get_num_samples_or_steps( ins, batch_size, steps_per_epoch) else: num_samples_or_steps = steps_per_epoch # Update sample_weight_mode of the model if sample_weights is specified by the # user. We need to call this function after we have a handle on the inputs # (both numpy arrays and datasets) in order to determine if the user has # specified sample_weights. _update_sample_weight_mode(model, mode, ins) # Get step function and loop type. As part of building the execution # function we recompile the metrics based on the updated # sample_weight_mode value. f = _make_execution_function(model, mode) # Prepare validation data. Hold references to the iterator and the input list # to properly reinitialize and reuse in multiple validation passes. val_iterator = None if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)): if validation_steps is None: # Because we pass an iterator feed instead of a Dataset to the eval # model_iteration() call, it will not trigger the dataset-input path # that determines the number of steps required. To avoid this issue, # set validation_steps here if validation_steps is None. validation_steps = training_utils.infer_steps_for_dataset( model, val_inputs, validation_steps, epochs=epochs, steps_name='validation_steps') val_iterator = _get_iterator(val_inputs, model._distribution_strategy) val_inputs = _prepare_feed_values(model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST) # Get num steps for printing. val_samples_or_steps = validation_steps else: # Get num samples for printing. val_samples_or_steps = val_inputs and nest.flatten( val_inputs)[0].shape[0] or None if mode == ModeKeys.TRAIN and verbose: _print_train_info(num_samples_or_steps, val_samples_or_steps, is_dataset) # Configure callbacks. count_mode = 'steps' if use_steps else 'samples' callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=num_samples_or_steps, verbose=0, # Handle ProgBarLogger separately in this loop. mode=mode) # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. progbar = training_utils.get_progbar(model, count_mode, mode != ModeKeys.PREDICT) progbar.params = callbacks.params progbar.params['verbose'] = verbose # Find beforehand arrays that need sparse-to-dense conversion. if issparse is not None and not use_steps: indices_for_conversion_to_dense = [] feed = _get_model_feed(model, mode) for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)): if issparse(input_data) and not K.is_sparse(feed_tensor): indices_for_conversion_to_dense.append(i) # Select aggregation method. if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator( use_steps, num_samples=None if steps_per_epoch else num_samples_or_steps, steps=steps_per_epoch) else: aggregator = training_utils.MetricsAggregator( use_steps, num_samples=None if steps_per_epoch else num_samples_or_steps, steps=steps_per_epoch) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model( model, mode) callbacks.model.stop_training = False callbacks._call_begin_hook(mode) progbar.on_train_begin() initial_epoch = model._maybe_load_initial_epoch_from_ckpt( initial_epoch, mode) for epoch in range(initial_epoch, epochs): if callbacks.model.stop_training: break # Setup work for each epoch epoch_logs = {} if mode != ModeKeys.PREDICT: # Collecting and resetting metrics has non-zero cost and will needlessly # slow down model.predict. model.reset_metrics() if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) progbar.on_epoch_begin(epoch, epoch_logs) if use_steps: # Step-wise loop. if steps_per_epoch is None: # Loop over dataset until `OutOfRangeError` is raised. target_steps = np.inf else: # Loop over dataset for the specified number of steps. target_steps = steps_per_epoch step = 0 while step < target_steps: batch_logs = {'batch': step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', step, batch_logs) progbar.on_batch_begin(step, batch_logs) # Get outputs. try: # `ins` can be callable in tf.distribute.Strategy + eager case. if not callable(ins) or ( model._distribution_strategy and not distributed_training_utils. is_distributing_by_cloning(model)): actual_inputs = ins else: actual_inputs = ins() batch_outs = f(actual_inputs) except errors.OutOfRangeError: if is_dataset: # The dataset passed by the user ran out of batches. # Now we know the cardinality of the dataset. # If steps_per_epoch was specified, then running out of data is # unexpected, so we stop training and inform the user. if steps_per_epoch: callbacks.model.stop_training = True logging.warning( 'Your dataset ran out of data; interrupting training. ' 'Make sure that your dataset can generate at least ' '`%s * epochs` batches (in this case, %d batches). ' 'You may need to use the repeat() function when ' 'building your dataset.' % (steps_name, steps_per_epoch * epochs)) elif step > 0: steps_per_epoch = step aggregator.steps = steps_per_epoch if mode == ModeKeys.TRAIN: progbar.params['steps'] = steps_per_epoch progbar.progbar.target = steps_per_epoch else: # We ran out of batches while the user passed an iterator (legacy). callbacks.model.stop_training = True logging.warning( 'Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your iterator ' 'can generate at least `%s * epochs` ' 'batches (in this case, %d batches). You may need to' 'use the repeat() function when building your ' 'dataset.' % (steps_name, steps_per_epoch * epochs)) break if not isinstance(batch_outs, list): batch_outs = [batch_outs] if model._distribution_strategy: batch_outs = distributed_training_utils._per_replica_aggregate_batch( model._distribution_strategy, batch_outs, model, mode) # Aggregate results. if step == 0: aggregator.create(batch_outs) aggregator.aggregate(batch_outs) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', step, batch_logs) progbar.on_batch_end(step, batch_logs) step += 1 if callbacks.model.stop_training: break else: # Sample-wise loop. index_array = np.arange(num_samples_or_steps) if shuffle == 'batch': index_array = training_utils.batch_shuffle( index_array, batch_size) elif shuffle: np.random.shuffle(index_array) batches = make_batches(num_samples_or_steps, batch_size) for batch_index, (batch_start, batch_end) in enumerate(batches): batch_ids = index_array[batch_start:batch_end] # Slice into a batch. if len(batches) == 1: # If we only have one batch, do not slice. This takes care of # composite tensors in non-Dataset modes; we currently don't support # slicing them. # TODO(b/133517906): Add slicing support. ins_batch = ins else: try: if ins and isinstance(ins[-1], int): # Do not slice the training phase flag. ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]] else: ins_batch = slice_arrays(ins, batch_ids) except TypeError: raise TypeError('TypeError while preparing batch. ' 'If using HDF5 input data, ' 'pass shuffle="batch".') # Sparse to dense conversion. if issparse is not None: for i in indices_for_conversion_to_dense: ins_batch[i] = ins_batch[i].toarray() # Callbacks batch_begin. batch_logs = {'batch': batch_index, 'size': len(batch_ids)} callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs) progbar.on_batch_begin(batch_index, batch_logs) # Get outputs. batch_outs = f(ins_batch) if not isinstance(batch_outs, list): batch_outs = [batch_outs] # Aggregate results. if batch_index == 0: aggregator.create(batch_outs) aggregator.aggregate(batch_outs, batch_start, batch_end) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs) progbar.on_batch_end(batch_index, batch_logs) if callbacks.model.stop_training: break aggregator.finalize() results = aggregator.results epoch_logs = cbks.make_logs(model, epoch_logs, results, mode) if len(results) == 1: results = results[0] # Run the test loop every `validation_freq` epochs during training. if (do_validation and training_utils.should_run_validation( validation_freq, epoch) and not callbacks.model.stop_training): if model._compile_distribution: # Since we create a new clone from the original model we need to copy # the weights back to the original model before we can run validation. distributed_training_utils._copy_weights_to_original_model( model, ModeKeys.TRAIN) val_results = model_iteration( model, val_inputs, targets=val_targets, sample_weights=val_sample_weights, batch_size=batch_size, steps_per_epoch=validation_steps, callbacks=callbacks, verbose=0, mode=ModeKeys.TEST, validation_in_fit=True, prepared_feed_values_from_dataset=(val_iterator is not None), steps_name='validation_steps') if not isinstance(val_results, list): val_results = [val_results] epoch_logs = cbks.make_logs(model, epoch_logs, val_results, mode, prefix='val_') if val_iterator and epoch < epochs - 1: _reinitialize_iterator(val_iterator, model._distribution_strategy) if mode == ModeKeys.TRAIN: # Epochs only apply to `fit`. callbacks.on_epoch_end(epoch, epoch_logs) progbar.on_epoch_end(epoch, epoch_logs) # Reinitialize dataset iterator for the next epoch. if reset_dataset_after_each_epoch and epoch < epochs - 1: _reinitialize_iterator(input_iterator, model._distribution_strategy) callbacks._call_end_hook(mode) if model._distribution_strategy: if model._compile_distribution: # TODO(priyag, psv): Copy back metrics to the original model as well? distributed_training_utils._copy_weights_to_original_model( model, mode) scope.__exit__(None, None, None) if mode == ModeKeys.TRAIN: return model.history return results
def fit(self, model, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0., validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1, **kwargs): """Fit loop for Distribution Strategies.""" dist_utils.validate_callbacks(input_callbacks=callbacks, optimizer=model.optimizer) dist_utils.validate_inputs(x, y) batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size( model._distribution_strategy, x, batch_size, steps_per_epoch, ModeKeys.TRAIN) batch_size = model._validate_or_infer_batch_size( batch_size, steps_per_epoch, x) dataset = model._distribution_standardize_user_data( x, y, sample_weight=sample_weight, class_weight=class_weight, batch_size=batch_size, validation_split=validation_split, shuffle=shuffle, epochs=epochs) if not dist_utils.is_distributing_by_cloning(model): with model._distribution_strategy.scope(): (dataset, _, _) = model._standardize_user_data( dataset, sample_weight=sample_weight, class_weight=class_weight, batch_size=batch_size, validation_split=validation_split, shuffle=shuffle) val_dataset = None if validation_data: val_x, val_y, val_sample_weights = training_utils.unpack_validation_data( validation_data) dist_utils.validate_inputs(val_x, val_y) _, validation_steps = dist_utils.process_batch_and_step_size( model._distribution_strategy, val_x, batch_size, validation_steps, ModeKeys.TEST) val_dataset = model._distribution_standardize_user_data( val_x, val_y, sample_weight=val_sample_weights, class_weight=None, batch_size=batch_size, validation_split=validation_split, shuffle=shuffle, allow_partial_batch=True) elif validation_split: raise ValueError('validation_split argument is not supported with ' 'distribution strategies.') if dist_utils.is_tpu_strategy(model._distribution_strategy): steps_per_epoch = training_utils.infer_steps_for_dataset( dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch') if steps_per_epoch is None: raise ValueError( 'Number of steps could not be inferred from the data, ' 'please pass the steps_per_epoch argument.') if not context.executing_eagerly(): # Run TPU training in a custom loop in graph mode. return experimental_tpu_fit_loop( model, dataset, epochs=epochs, verbose=verbose, callbacks=callbacks, val_dataset=val_dataset, initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch, validation_steps=validation_steps, validation_freq=validation_freq) return training_arrays.fit_loop(model, dataset, batch_size=batch_size, epochs=epochs, verbose=verbose, callbacks=callbacks, val_inputs=val_dataset, shuffle=shuffle, initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch, validation_steps=validation_steps, validation_freq=validation_freq, steps_name='steps_per_epoch')
def experimental_tpu_fit_loop(model, dataset, epochs=100, verbose=1, callbacks=None, initial_epoch=0, steps_per_epoch=None, val_dataset=None, validation_steps=None, validation_freq=1): """Fit loop for training with TPU DistributionStrategy. Arguments: model: Keras Model instance. dataset: Dataset that returns inputs and targets epochs: Number of times to iterate over the data verbose: Integer, Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. val_dataset: Dataset for validation data. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. validation_freq: Only relevant if validation data is provided. Integer or `collections.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. Returns: Returns `None`. Raises: ValueError: in case of invalid arguments. """ mode = ModeKeys.TRAIN # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops. current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) steps_per_epoch = training_utils.infer_steps_for_dataset( dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch') if (current_strategy.extended.steps_per_run != 1 and steps_per_epoch is None): raise ValueError('`steps_per_epoch` should be specified when calling ' '`fit` on the model with TPUStrategy when ' '`steps_per_run` != 1 .') scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=1) scope.__enter__() out_labels = model.metrics_names or [] step_fn = _make_step_fn(model, ModeKeys.TRAIN, current_strategy, out_labels) # Add initial dummy values for loss and other metric tensors. initial_loop_values = {} initial_loop_values['loss'] = constant_op.constant(1e7) for name in model.metrics_names[1:]: tensor = model._all_stateful_metrics_tensors[name] initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype) use_steps = steps_per_epoch is not None if use_steps: iteration_value = min(steps_per_epoch, current_strategy.extended.steps_per_run) else: iteration_value = current_strategy.extended.steps_per_run steps_per_run = K.variable( value=iteration_value, dtype='int32', name='steps_per_run') ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=steps_per_run, initial_loop_values=initial_loop_values) train_op = ctx.run_op output_tensors = ctx.last_step_outputs do_validation = bool(validation_steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=verbose, count_mode='steps', mode=mode) # Calculate the steps each time on the device. if use_steps: steps_to_run = ([current_strategy.extended.steps_per_run] * (steps_per_epoch // current_strategy.extended.steps_per_run)) if steps_per_epoch % current_strategy.extended.steps_per_run: steps_to_run.append( steps_per_epoch % current_strategy.extended.steps_per_run) target_steps = len(steps_to_run) else: target_steps = np.inf callbacks._call_begin_hook(mode) for epoch in range(initial_epoch, epochs): distributed_training_utils._reset_metrics(model) callbacks.on_epoch_begin(epoch) epoch_logs = {} step_index = 0 prev_step_count = None current_step = 0 while current_step < target_steps: step_count = steps_to_run[current_step] if use_steps else 1 batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count} callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs) if prev_step_count is None or step_count != prev_step_count: steps_per_run.load(step_count, K.get_session()) prev_step_count = step_count try: _, outputs = K.batch_get_value([train_op, output_tensors]) except errors.OutOfRangeError: if use_steps: logging.warning('Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your dataset ' 'can generate at least `steps_per_epoch * epochs` ' 'batches (in this case, %d batches).' % steps_per_epoch * epochs) else: target_steps = current_step logging.info('Dataset iterator ran out of data. Inferring the ' 'value of `steps_per_epoch` as %s .' % target_steps) distributed_training_utils.initialize_iterator(iterator, current_strategy) break batch_logs.update(outputs) callbacks._call_batch_hook(mode, 'end', step_index, batch_logs) step_index = step_index + step_count current_step += 1 if callbacks.model.stop_training: break if (do_validation and training_utils.should_run_validation(validation_freq, epoch)): logging.info('Running validation at fit epoch: %s', epoch) if model._compile_distribution: # Since we create a new clone from the original model we need to copy # the weights back to the original model before we can run validation. distributed_training_utils._copy_weights_to_original_model( model, ModeKeys.TRAIN) val_outs = experimental_tpu_test_loop( # pylint: disable=undefined-variable model, val_dataset, steps=validation_steps, verbose=verbose, callbacks=callbacks) if not isinstance(val_outs, list): val_outs = [val_outs] # Same labels assumed. for label, val_out in zip(out_labels, val_outs): epoch_logs['val_' + label] = val_out callbacks.on_epoch_end(epoch, epoch_logs) if callbacks.model.stop_training: break callbacks._call_end_hook(mode) if model._compile_distribution: # Copy the weights back from the replicated model to the original model. distributed_training_utils._copy_weights_to_original_model( model, ModeKeys.TRAIN) scope.__exit__(None, None, None) return model.history
def model_iteration(model, inputs, targets=None, sample_weights=None, batch_size=None, epochs=1, verbose=1, callbacks=None, val_inputs=None, val_targets=None, val_sample_weights=None, shuffle=True, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1, mode=ModeKeys.TRAIN, validation_in_fit=False, prepared_feed_values_from_dataset=False, steps_name='steps', **kwargs): """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. Arguments: model: Keras Model instance. inputs: Either a list or dictionary of arrays, or a dataset instance. targets: List/dictionary of input arrays. sample_weights: Optional list of sample weight arrays. batch_size: Integer batch size or None if unknown. epochs: Number of times to iterate over the data verbose: Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training val_inputs: Either a list or dictionary of arrays, or a dataset instance. val_targets: List/dictionary of target arrays. val_sample_weights: Optional list of sample weight arrays. shuffle: Whether to shuffle the data at the beginning of each epoch concatenation of list the display names of the outputs of `f` and the list of display names of the outputs of `f_val`. initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. validation_freq: Only relevant if validation data is provided. Integer or `collections.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. validation_in_fit: if true, then this method is invoked from within training iteration (for validation). In the case where `val_inputs` is a dataset, this flag indicates that its iterator and feed values are already created so should properly reuse resources. prepared_feed_values_from_dataset: if True, `inputs` is a list of feed tensors returned from `_prepare_feed_values` call on the validation dataset, so do not call it again on `inputs`. Should only be used for inline validation (i.e., only if `validation_in_fit` is also True). steps_name: The string name of the steps argument, either `steps`, `validation_steps`, or `steps_per_epoch`. Only used for error message formatting. **kwargs: Additional arguments for backwards compatibility. Returns: - In TRAIN mode: `History` object. - In TEST mode: Evaluation metrics. - In PREDICT mode: Outputs of the Model called on inputs. Raises: ValueError: in case of invalid arguments. """ # Backwards compatibility. if 'steps' in kwargs: steps_per_epoch = kwargs.pop('steps') if kwargs: raise TypeError('Unknown arguments: %s' % (kwargs,)) # In case we were passed a dataset, we extract symbolic tensors from it. reset_dataset_after_each_epoch = False input_iterator = None is_dataset = isinstance(inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)) # TODO(fchollet): consider moving `steps_per_epoch` inference to # _standardize_user_data and set reset_dataset_after_each_epoch as an # attribute on the dataset instance. if is_dataset: if steps_per_epoch is None: reset_dataset_after_each_epoch = True steps_per_epoch = training_utils.infer_steps_for_dataset( inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name) input_iterator = _get_iterator(inputs, model._distribution_strategy) if mode == ModeKeys.TRAIN: _print_train_info(inputs, val_inputs, steps_per_epoch, verbose) # Enter DistributionStrategy scope. if model._distribution_strategy: scope = distributed_training_utils.distributed_scope( strategy=model._distribution_strategy, learning_phase=(1 if mode == ModeKeys.TRAIN else 0)) scope.__enter__() # Get step function and loop type. f = _make_execution_function(model, mode) use_steps = is_dataset or steps_per_epoch is not None do_validation = val_inputs is not None # Convert Eager Tensors to NumPy arrays to support batching/shuffling. inputs, targets, sample_weights = training_utils. \ convert_eager_tensors_to_numpy((inputs, targets, sample_weights)) # Prepare input data. inputs = input_iterator or inputs if validation_in_fit and prepared_feed_values_from_dataset: # When invoking validation in training loop, avoid creating iterator and # list of feed values for the same validation dataset multiple times (which # essentially would call `iterator.get_next()` that slows down execution and # leads to OOM errors eventually. ins = inputs else: ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode) if not is_dataset: num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size, steps_per_epoch) else: num_samples_or_steps = steps_per_epoch # Prepare validation data. Hold references to the iterator and the input list # to properly reinitialize and reuse in multiple validation passes. val_iterator = None if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)): if validation_steps is None: # Because we pass an iterator feed instead of a Dataset to the eval # model_iteration() call, it will not trigger the dataset-input path # that determines the number of steps required. To avoid this issue, # set validation_steps here if validation_steps is None. validation_steps = training_utils.infer_steps_for_dataset( val_inputs, validation_steps, epochs=epochs, steps_name='validation_steps') val_iterator = _get_iterator(val_inputs, model._distribution_strategy) val_inputs = _prepare_feed_values( model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST) # Configure callbacks. count_mode = 'steps' if use_steps else 'samples' callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=num_samples_or_steps, verbose=0, # Handle ProgBarLogger separately in this loop. mode=mode) # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. progbar = training_utils.get_progbar(model, count_mode) progbar.params = callbacks.params progbar.params['verbose'] = verbose # Find beforehand arrays that need sparse-to-dense conversion. if issparse is not None and not use_steps: indices_for_conversion_to_dense = [] feed = _get_model_feed(model, mode) for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)): if issparse(input_data) and not K.is_sparse(feed_tensor): indices_for_conversion_to_dense.append(i) # Select aggregation method. if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator(use_steps, num_samples_or_steps) else: aggregator = training_utils.MetricsAggregator(use_steps, num_samples_or_steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) callbacks.model.stop_training = False callbacks._call_begin_hook(mode) progbar.on_train_begin() for epoch in range(initial_epoch, epochs): if callbacks.model.stop_training: break # Setup work for each epoch epoch_logs = {} model.reset_metrics() if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) progbar.on_epoch_begin(epoch, epoch_logs) if use_steps: # Step-wise loop. if steps_per_epoch is None: # Loop over dataset until `OutOfRangeError` is raised. target_steps = np.inf else: # Loop over dataset for the specified number of steps. target_steps = steps_per_epoch step = 0 while step < target_steps: batch_logs = {'batch': step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', step, batch_logs) progbar.on_batch_begin(step, batch_logs) # Get outputs. try: # `ins` can be callable in DistributionStrategy + eager case. actual_inputs = ins() if callable(ins) else ins batch_outs = f(actual_inputs) except errors.OutOfRangeError: if is_dataset: # The dataset passed by the user ran out of batches. # Now we know the cardinality of the dataset. # If steps_per_epoch was specified, then running out of data is # unexpected, so we stop training and inform the user. if steps_per_epoch: callbacks.model.stop_training = True logging.warning( 'Your dataset ran out of data; interrupting training. ' 'Make sure that your dataset can generate at least ' '`%s * epochs` batches (in this case, %d batches). ' 'You may need to use the repeat() function when ' 'building your dataset.' % (steps_name, steps_per_epoch * epochs)) elif step > 0: steps_per_epoch = step aggregator.num_samples_or_steps = steps_per_epoch if mode == ModeKeys.TRAIN: progbar.params['steps'] = steps_per_epoch progbar.progbar.target = steps_per_epoch else: # We ran out of batches while the user passed an iterator (legacy). callbacks.model.stop_training = True logging.warning( 'Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your iterator ' 'can generate at least `%s * epochs` ' 'batches (in this case, %d batches). You may need to' 'use the repeat() function when building your ' 'dataset.' % (steps_name, steps_per_epoch * epochs)) break if not isinstance(batch_outs, list): batch_outs = [batch_outs] if model._distribution_strategy: batch_outs = distributed_training_utils._per_device_aggregate_batch( batch_outs, model, mode) # Aggregate results. if step == 0: aggregator.create(batch_outs) aggregator.aggregate(batch_outs) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', step, batch_logs) progbar.on_batch_end(step, batch_logs) step += 1 if callbacks.model.stop_training: break else: # Sample-wise loop. index_array = np.arange(num_samples_or_steps) if shuffle == 'batch': index_array = training_utils.batch_shuffle(index_array, batch_size) elif shuffle: np.random.shuffle(index_array) batches = make_batches(num_samples_or_steps, batch_size) for batch_index, (batch_start, batch_end) in enumerate(batches): batch_ids = index_array[batch_start:batch_end] # Slice into a batch. try: if ins and isinstance(ins[-1], int): # Do not slice the training phase flag. ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]] else: ins_batch = slice_arrays(ins, batch_ids) except TypeError: raise TypeError('TypeError while preparing batch. ' 'If using HDF5 input data, ' 'pass shuffle="batch".') # Sparse to dense conversion. if issparse is not None: for i in indices_for_conversion_to_dense: ins_batch[i] = ins_batch[i].toarray() # Callbacks batch_begin. batch_logs = {'batch': batch_index, 'size': len(batch_ids)} callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs) progbar.on_batch_begin(batch_index, batch_logs) # Get outputs. batch_outs = f(ins_batch) if not isinstance(batch_outs, list): batch_outs = [batch_outs] # Aggregate results. if batch_index == 0: aggregator.create(batch_outs) aggregator.aggregate(batch_outs, batch_start, batch_end) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs) progbar.on_batch_end(batch_index, batch_logs) if callbacks.model.stop_training: break aggregator.finalize() results = aggregator.results epoch_logs = cbks.make_logs(model, epoch_logs, results, mode) if len(results) == 1: results = results[0] # Run the test loop every `validation_freq` epochs during training. if (do_validation and training_utils.should_run_validation(validation_freq, epoch) and not callbacks.model.stop_training): if model._compile_distribution: # Since we create a new clone from the original model we need to copy # the weights back to the original model before we can run validation. distributed_training_utils._copy_weights_to_original_model( model, ModeKeys.TRAIN) val_results = model_iteration( model, val_inputs, targets=val_targets, sample_weights=val_sample_weights, batch_size=batch_size, steps_per_epoch=validation_steps, callbacks=callbacks, verbose=0, mode=ModeKeys.TEST, validation_in_fit=True, prepared_feed_values_from_dataset=(val_iterator is not None), steps_name='validation_steps') if not isinstance(val_results, list): val_results = [val_results] epoch_logs = cbks.make_logs( model, epoch_logs, val_results, mode, prefix='val_') if val_iterator and epoch < epochs - 1: _reinitialize_iterator(val_iterator, model._distribution_strategy) if mode == ModeKeys.TRAIN: # Epochs only apply to `fit`. callbacks.on_epoch_end(epoch, epoch_logs) progbar.on_epoch_end(epoch, epoch_logs) # Reinitialize dataset iterator for the next epoch. if reset_dataset_after_each_epoch and epoch < epochs - 1: _reinitialize_iterator(input_iterator, model._distribution_strategy) callbacks._call_end_hook(mode) if model._distribution_strategy: if model._compile_distribution: # TODO(priyag, psv): Copy back metrics to the original model as well? distributed_training_utils._copy_weights_to_original_model(model, mode) scope.__exit__(None, None, None) if mode == ModeKeys.TRAIN: return model.history return results
def fit(self, model, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0., validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1, **kwargs): batch_size = model._validate_or_infer_batch_size( batch_size, steps_per_epoch, x) strategy = _get_distribution_strategy(model) batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size( strategy, x, batch_size, steps_per_epoch, ModeKeys.TRAIN) dist_utils.validate_callbacks(input_callbacks=callbacks, optimizer=model.optimizer) # Enter tf.distribute.Strategy scope. with strategy.scope(): training_data_adapter, validation_adapter = _process_training_inputs( model, x, y, batch_size=batch_size, sample_weights=sample_weight, class_weights=class_weight, validation_split=validation_split, steps_per_epoch=steps_per_epoch, shuffle=shuffle, validation_data=validation_data, validation_steps=validation_steps, distribution_strategy=strategy) total_samples = _get_total_number_of_samples(training_data_adapter) use_sample = total_samples is not None do_validation = (validation_adapter is not None) if not steps_per_epoch: steps_per_epoch = training_data_adapter.get_size() # tf.print('{} on {} steps.'.format(ModeKeys.TRAIN, steps_per_epoch)) training_context = TrainingContext() initial_epoch = model._maybe_load_initial_epoch_from_ckpt( initial_epoch, ModeKeys.TRAIN) training_dataset = training_data_adapter.get_dataset() # Raise an error if steps_per_epoch isn't specified but the dataset # is infinite. # TODO(scottzhu): This check should probably happen in the adapter training_utils.infer_steps_for_dataset( training_dataset, steps_per_epoch, steps_name='steps_per_epoch', epochs=0) training_dataset = strategy.experimental_distribute_dataset( training_dataset) _update_sample_weight_mode(model, ModeKeys.TRAIN, training_dataset) training_function = training_v2_utils._get_or_make_execution_function( model, ModeKeys.TRAIN) training_data_iter = None # Only recreate iterator when the data has a fixed length, which will be # fully consumed every epoch, or has a unknown length (dataset, generator) # and will be fully consumed (steps_per_epoch is None) recreate_training_iterator = (training_data_adapter.get_size() is not None or steps_per_epoch is None) if do_validation: if not validation_steps: validation_steps = validation_adapter.get_size() eval_function = training_v2_utils._get_or_make_execution_function( model, ModeKeys.TEST) eval_data_iter = None validation_dataset = validation_adapter.get_dataset() # Raise an error if validation_steps isn't specified but the validation # dataset is infinite. # TODO(scottzhu): This check should probably happen in the adapter training_utils.infer_steps_for_dataset( validation_dataset, validation_steps, steps_name='validation_steps', epochs=0) validation_dataset = strategy.experimental_distribute_dataset( validation_dataset) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=total_samples, count_mode='samples' if use_sample else 'steps', verbose=0, # Handle ProgBarLogger separately in this loop. mode=ModeKeys.TRAIN) with training_context.on_start(model, callbacks, use_sample, verbose, ModeKeys.TRAIN): # TODO(scottzhu): Handle TPUStrategy training loop for epoch in range(initial_epoch, epochs): if training_context.callbacks.model.stop_training: break # Training with training_context.on_epoch( epoch, ModeKeys.TRAIN) as epoch_logs: model.reset_metrics() if training_data_iter is None or recreate_training_iterator: if (training_data_iter is not None and distribution_strategy_context. has_strategy()): # TODO(kaftan): remove this when MultiDeviceIterator is a ## compositetensor (unless this is more efficient) training_data_iter._initializer # pylint: disable=pointless-statement else: training_data_iter = iter(training_dataset) training_result = run_one_epoch( model, training_data_iter, training_function, dataset_size=training_data_adapter.get_size(), batch_size=training_data_adapter.batch_size(), strategy=strategy, steps_per_epoch=steps_per_epoch, num_samples=total_samples, mode=ModeKeys.TRAIN, training_context=training_context, total_epochs=epochs) cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN) # Evaluation if (do_validation and training_utils.should_run_validation( validation_freq, epoch) and not callbacks.model.stop_training): if (eval_data_iter is not None and distribution_strategy_context. has_strategy()): # TODO(kaftan): remove this when MultiDeviceIterator is a ## compositetensor (unless this is more efficient) eval_data_iter._initializer # pylint: disable=pointless-statement else: eval_data_iter = iter(validation_dataset) val_total_samples = _get_total_number_of_samples( validation_adapter) eval_context = TrainingContext() with eval_context.on_start(model, callbacks, use_sample, verbose=0, mode=ModeKeys.TEST): with eval_context.on_epoch( epoch, ModeKeys.TEST): model.reset_metrics() eval_result = run_one_epoch( model, eval_data_iter, eval_function, dataset_size=validation_adapter. get_size(), batch_size=validation_adapter. batch_size(), strategy=strategy, steps_per_epoch=validation_steps, num_samples=val_total_samples, mode=ModeKeys.TEST, training_context=eval_context, total_epochs=1) cbks.make_logs(model, epoch_logs, eval_result, ModeKeys.TEST, prefix='val_') return model.history
def model_iteration(model, data, steps_per_epoch=None, epochs=1, verbose=1, callbacks=None, validation_data=None, validation_steps=None, validation_freq=1, class_weight=None, max_queue_size=10, workers=1, use_multiprocessing=False, shuffle=False, initial_epoch=0, mode=ModeKeys.TRAIN, batch_size=None, steps_name='steps', **kwargs): """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. Arguments: model: Keras Model instance. data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or `(x, y, sample_weights)`) or a generator or `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. epochs: Number of times to iterate over the data. verbose: Verbosity mode, 0, 1 or 2. callbacks: List of callbacks to be called during training. validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or `(x, y, sample_weights)`) or a generator or `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. validation_steps: Total number of steps (batches of samples) before declaring validation finished. validation_freq: Only relevant if validation data is provided. Integer or `collections.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. class_weight: Dictionary mapping class indices to a weight for the class. max_queue_size: Integer. Maximum size for the generator queue. If unspecified, `max_queue_size` will default to 10. workers: Integer. Maximum number of processes to spin up when using process-based threading. If unspecified, `workers` will default to 1. If 0, will execute the generator on the main thread. use_multiprocessing: Boolean. If `True`, use process-based threading. If unspecified, `use_multiprocessing` will default to `False`. Note that because this implementation relies on multiprocessing, you should not pass non-picklable arguments to the generator as they can't be passed easily to children processes. shuffle: Boolean. Whether to shuffle the order of the batches at the beginning of each epoch. Only used with instances of `Sequence` (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not `None`. initial_epoch: Epoch at which to start training (useful for resuming a previous training run). mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. batch_size: Integer batch size or None if unknown. Will only be used if `data` is in NumPy/Tensor format. steps_name: The string name of the steps argument, either `steps`, `validation_steps`, or `steps_per_epoch`. Only used for error message formatting. **kwargs: Additional arguments for backwards compatibility. `steps` is accepted as an alias for `steps_per_epoch`. Returns: - In TRAIN mode: `History` object. - In TEST mode: Evaluation metrics. - In PREDICT mode: Outputs of the Model called on inputs. Raises: ValueError: in case of invalid arguments. """ if 'steps' in kwargs: steps_per_epoch = kwargs['steps'] # Determine the number of steps per epoch and whether we should reset the # dataset at the end of each epoch. reset_dataset_after_each_epoch = False original_dataset = None is_dataset = isinstance(data, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)) if is_dataset: original_dataset = data if steps_per_epoch is None: reset_dataset_after_each_epoch = True steps_per_epoch = training_utils.infer_steps_for_dataset( data, steps_per_epoch, epochs=epochs, steps_name=steps_name) # Convert to a format that supports `next(generator)`. generator, steps_per_epoch = convert_to_generator_like( data, steps_per_epoch=steps_per_epoch, batch_size=batch_size, epochs=epochs - initial_epoch, shuffle=shuffle) do_validation = validation_data is not None is_sequence = isinstance(generator, data_utils.Sequence) _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers, steps_per_epoch, validation_data, validation_steps, mode, kwargs) batch_function = _make_execution_function(model, mode, class_weight=class_weight) # Create the queue for the generator. enqueuer = None if not is_dataset: generator, enqueuer = _make_enqueued_generator( generator, workers=workers, use_multiprocessing=use_multiprocessing, max_queue_size=max_queue_size, shuffle=shuffle) num_samples_or_steps, use_steps = _get_num_samples_or_steps( data, steps_per_epoch) count_mode = 'steps' if use_steps else 'samples' callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, epochs=epochs, steps_per_epoch=steps_per_epoch, batch_size=batch_size, samples=num_samples_or_steps, verbose=0, # Handle ProgBar as part of Callbacks once hooks are ready. mode=mode) # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. progbar = training_utils.get_progbar(model, count_mode) progbar.params = callbacks.params progbar.params['verbose'] = verbose if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator(True, steps_per_epoch) else: aggregator = training_utils.MetricsAggregator(True, steps_per_epoch) should_set_learning_phase = context.executing_eagerly( ) and model.run_eagerly if should_set_learning_phase: old_learning_phase = backend.learning_phase() backend.set_eager_learning_phase(1 if mode == ModeKeys.TRAIN else 0) callbacks.model.stop_training = False callbacks._call_begin_hook(mode) progbar.on_train_begin() for epoch in range(initial_epoch, epochs): if callbacks.model.stop_training: break # Setup work for each epoch. model.reset_metrics() epoch_logs = {} if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) progbar.on_epoch_begin(epoch, epoch_logs) if steps_per_epoch is None: # Loop over dataset until `OutOfRangeError` is raised. target_steps = np.inf else: # Loop over dataset for the specified number of steps. target_steps = steps_per_epoch step = 0 while step < target_steps: batch_data = _get_next_batch(generator, mode) if batch_data is None: if is_dataset: # The dataset passed by the user ran out of batches. # Now we know the cardinality of the dataset. # If steps_per_epoch was specified, then running out of data is # unexpected, so we stop training and inform the user. if steps_per_epoch: callbacks.model.stop_training = True logging.warning( 'Your dataset ran out of data; interrupting training. ' 'Make sure that your dataset can generate at least ' '`%s * epochs` batches (in this case, %d batches). ' 'You may need to use the repeat() function when ' 'building your dataset.' % (steps_name, steps_per_epoch * epochs)) elif step > 0: steps_per_epoch = step aggregator.num_samples_or_steps = steps_per_epoch if mode == ModeKeys.TRAIN: progbar.params['steps'] = steps_per_epoch progbar.progbar.target = steps_per_epoch else: # We ran out of batches while the user passed an iterator (legacy). callbacks.model.stop_training = True logging.warning( 'Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your iterator ' 'can generate at least `%s * epochs` ' 'batches (in this case, %d batches). You may need to' 'use the repeat() function when building your ' 'dataset.' % (steps_name, steps_per_epoch * epochs)) break
def experimental_tpu_fit_loop(model, dataset, epochs=100, verbose=1, callbacks=None, initial_epoch=0, steps_per_epoch=None, val_dataset=None, validation_steps=None, validation_freq=1): """Fit loop for training with TPU DistributionStrategy. Arguments: model: Keras Model instance. dataset: Dataset that returns inputs and targets epochs: Number of times to iterate over the data verbose: Integer, Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. val_dataset: Dataset for validation data. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. validation_freq: Only relevant if validation data is provided. Integer or `collections.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. Returns: Returns `None`. Raises: ValueError: in case of invalid arguments. """ mode = ModeKeys.TRAIN # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops. current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) steps_per_epoch = training_utils.infer_steps_for_dataset( dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch') if (current_strategy.extended.steps_per_run != 1 and steps_per_epoch is None): raise ValueError('`steps_per_epoch` should be specified when calling ' '`fit` on the model with TPUStrategy when ' '`steps_per_run` != 1 .') scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=1) scope.__enter__() def _per_device_fit_function(model): model._make_fit_function() return (model._fit_function.inputs, model._fit_function.outputs, model._fit_function.updates_op, model._fit_function.session_kwargs) out_labels = model.metrics_names or [] def step_fn(ctx, inputs): """Clones the model and calls make_fit_function.""" inputs, targets = inputs if model._compile_distribution: distributed_training_utils.clone_model_on_replicas( model, current_strategy, mode, inputs=inputs, targets=targets) else: distributed_training_utils._build_distributed_network( model, current_strategy, mode, inputs, targets) (grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args ) = current_strategy.extended.call_for_each_replica( _per_device_fit_function, args=(distributed_training_utils.get_distributed_model( model, ModeKeys.TRAIN), )) (all_inputs, all_outputs, all_updates, all_session_args) = distributed_training_utils.unwrap_values( current_strategy, grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args) combined_fn = K.function(all_inputs, all_outputs, updates=all_updates, name='distributed_fit_function', **all_session_args) for label, output in zip(out_labels, combined_fn.outputs): if label == 'loss': reduce_op = ds_reduce_util.ReduceOp.SUM else: # We reduce all other metrics using mean for now. This is temporary # workaround until new metrics are in place. reduce_op = ds_reduce_util.ReduceOp.MEAN ctx.set_last_step_output(label, output, reduce_op) # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn: # feed_dict, session kwargs, run options, run_metadata for now. These should # be handled appropriately return combined_fn.updates_op # Add initial dummy values for loss and other metric tensors. initial_loop_values = {} initial_loop_values['loss'] = constant_op.constant(1e7) for name in model.metrics_names[1:]: tensor = model._all_stateful_metrics_tensors[name] initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype) use_steps = steps_per_epoch is not None if use_steps: iteration_value = min(steps_per_epoch, current_strategy.extended.steps_per_run) else: iteration_value = current_strategy.extended.steps_per_run steps_per_run = K.variable(value=iteration_value, dtype='int32', name='steps_per_run') ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=steps_per_run, initial_loop_values=initial_loop_values) train_op = ctx.run_op output_tensors = ctx.last_step_outputs do_validation = bool(validation_steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model( model, mode) callbacks = cbks.configure_callbacks(callbacks, model, do_validation=do_validation, epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=verbose, count_mode='steps', mode=mode) # Calculate the steps each time on the device. if use_steps: steps_to_run = ( [current_strategy.extended.steps_per_run] * (steps_per_epoch // current_strategy.extended.steps_per_run)) if steps_per_epoch % current_strategy.extended.steps_per_run: steps_to_run.append(steps_per_epoch % current_strategy.extended.steps_per_run) target_steps = len(steps_to_run) else: target_steps = np.inf callbacks._call_begin_hook(mode) for epoch in range(initial_epoch, epochs): distributed_training_utils._reset_metrics(model) callbacks.on_epoch_begin(epoch) epoch_logs = {} step_index = 0 prev_step_count = None current_step = 0 while current_step < target_steps: step_count = steps_to_run[current_step] if use_steps else 1 batch_logs = { 'batch': step_index, 'size': 1, 'num_steps': step_count } callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs) if prev_step_count is None or step_count != prev_step_count: steps_per_run.load(step_count, K.get_session()) prev_step_count = step_count try: _, outputs = K.get_session().run([train_op, output_tensors]) except errors.OutOfRangeError: if use_steps: logging.warning( 'Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your dataset ' 'can generate at least `steps_per_epoch * epochs` ' 'batches (in this case, %d batches).' % steps_per_epoch * epochs) else: target_steps = current_step logging.info( 'Dataset iterator ran out of data. Inferring the ' 'value of `steps_per_epoch` as %s .' % target_steps) distributed_training_utils.initialize_iterator( iterator, current_strategy) break batch_logs.update(outputs) callbacks._call_batch_hook(mode, 'end', step_index, batch_logs) step_index = step_index + step_count current_step += 1 if callbacks.model.stop_training: break if (do_validation and training_utils.should_run_validation( validation_freq, epoch)): logging.info('Running validation at fit epoch: %s', epoch) if model._compile_distribution: # Since we create a new clone from the original model we need to copy # the weights back to the original model before we can run validation. distributed_training_utils._copy_weights_to_original_model( model, ModeKeys.TRAIN) val_outs = experimental_tpu_test_loop( # pylint: disable=undefined-variable model, val_dataset, steps=validation_steps, verbose=verbose, callbacks=callbacks) if not isinstance(val_outs, list): val_outs = [val_outs] # Same labels assumed. for label, val_out in zip(out_labels, val_outs): epoch_logs['val_' + label] = val_out callbacks.on_epoch_end(epoch, epoch_logs) if callbacks.model.stop_training: break callbacks._call_end_hook(mode) if model._compile_distribution: # Copy the weights back from the replicated model to the original model. distributed_training_utils._copy_weights_to_original_model( model, ModeKeys.TRAIN) scope.__exit__(None, None, None) return model.history
def fit_distributed(model, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0., validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1): """Fit loop for Distribution Strategies.""" distributed_training_utils.validate_callbacks(callbacks, model.optimizer) distributed_training_utils.validate_inputs( x, y, model._distribution_strategy) first_x_value = nest.flatten(x)[0] if isinstance(first_x_value, np.ndarray): # Until support for partial batch is implemented across all # functions and distribution strategy, we pass `mode` to selectively # relax the costraint to consume all the training samples. steps_per_epoch, batch_size = ( distributed_training_utils.get_input_params( model._distribution_strategy, first_x_value, steps_per_epoch, batch_size, mode=ModeKeys.TRAIN)) batch_size = model._validate_or_infer_batch_size( batch_size, steps_per_epoch, x) steps_name = 'steps_per_epoch' if isinstance(x, dataset_ops.DatasetV2): steps_per_epoch = training_utils.infer_steps_for_dataset( x, steps_per_epoch, steps_name=steps_name) dataset = model._distribution_standardize_user_data( x, y, sample_weight=sample_weight, class_weight=class_weight, batch_size=batch_size, check_steps=True, steps_name=steps_name, steps=steps_per_epoch, validation_split=validation_split, shuffle=shuffle) val_dataset = None if validation_data: val_x, val_y, val_sample_weights = model._unpack_validation_data( validation_data) distributed_training_utils.validate_inputs( val_x, val_y, model._distribution_strategy) first_valx_value = nest.flatten(val_x)[0] if isinstance(first_valx_value, np.ndarray): validation_steps, _ = distributed_training_utils.get_input_params( model._distribution_strategy, first_valx_value, validation_steps, batch_size) steps_name = 'validation_steps' if isinstance(val_x, dataset_ops.DatasetV2): validation_steps = training_utils.infer_steps_for_dataset( val_x, validation_steps, steps_name=steps_name) val_dataset = model._distribution_standardize_user_data( val_x, val_y, sample_weight=val_sample_weights, class_weight=None, batch_size=batch_size, check_steps=True, steps_name=steps_name, steps=validation_steps, validation_split=validation_split, shuffle=shuffle) elif validation_split: raise ValueError('validation_split argument is not supported with ' 'distribution strategies.') if distributed_training_utils.is_tpu_strategy(model._distribution_strategy): return experimental_tpu_fit_loop( model, dataset, epochs=epochs, verbose=verbose, callbacks=callbacks, val_dataset=val_dataset, initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch, validation_steps=validation_steps, validation_freq=1) else: return training_arrays.fit_loop( model, dataset, batch_size=batch_size, epochs=epochs, verbose=verbose, callbacks=callbacks, val_inputs=val_dataset, shuffle=shuffle, initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch, validation_steps=validation_steps, validation_freq=validation_freq)
def _model_iteration( self, model, mode, x=None, y=None, batch_size=None, verbose=1, sample_weight=None, steps=None, callbacks=None, **kwargs): batch_size = model._validate_or_infer_batch_size( batch_size, steps, x) strategy = _get_distribution_strategy(model) batch_size, steps = dist_utils.process_batch_and_step_size( strategy, x, batch_size, steps, mode) dist_utils.validate_callbacks(input_callbacks=callbacks, optimizer=model.optimizer) # Enter tf.distribute.Strategy scope. with dist_utils.distributed_scope( strategy=strategy, learning_phase=0): adapter = _process_inputs( model, x, y, batch_size=batch_size, sample_weights=sample_weight, steps=steps, distribution_strategy=strategy) if not steps: steps = adapter.get_size() # tf.print('{} on {} steps.'.format(ModeKeys.TRAIN, steps_per_epoch)) training_context = TrainingContext() dataset = adapter.get_dataset() # Raise an error if `steps` isn't specified but the dataset # is infinite. # TODO(scottzhu): This check should probably happen in the adapter training_utils.infer_steps_for_dataset( dataset, steps, steps_name='steps', epochs=0) dataset = strategy.experimental_distribute_dataset(dataset) _update_sample_weight_mode(model, mode, dataset) execution_function = training_v2_utils._get_or_make_execution_function( model, mode) data_iterator = iter(dataset) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=False, batch_size=batch_size, epochs=1, steps_per_epoch=steps, samples=None, verbose=0, # Handle ProgBarLogger separately in this loop. mode=mode) with training_context.on_start(model, callbacks, verbose, mode): # TODO(scottzhu): Handle TPUStrategy training loop with training_context.on_epoch(0, mode) as epoch_logs: model.reset_metrics() result = run_one_epoch( model, data_iterator, execution_function, dataset_size=adapter.get_size(), strategy=strategy, steps_per_epoch=steps, mode=mode, training_context=training_context, total_epochs=1) cbks.make_logs(model, epoch_logs, result, mode) if len(result) == 1: result = result[0] return result
def model_iteration(model, inputs, targets=None, sample_weights=None, batch_size=None, epochs=1, verbose=1, callbacks=None, val_inputs=None, val_targets=None, val_sample_weights=None, shuffle=True, initial_epoch=0, steps_per_epoch=None, validation_steps=None, mode=ModeKeys.TRAIN, validation_in_fit=False, steps_name='steps', **kwargs): """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. Arguments: model: Keras Model instance. inputs: Either a list or dictionary of arrays, or a dataset instance. targets: List/dictionary of input arrays. sample_weights: Optional list of sample weight arrays. batch_size: Integer batch size or None if unknown. epochs: Number of times to iterate over the data verbose: Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training val_inputs: Either a list or dictionary of arrays, or a dataset instance. val_targets: List/dictionary of target arrays. val_sample_weights: Optional list of sample weight arrays. shuffle: Whether to shuffle the data at the beginning of each epoch concatenation of list the display names of the outputs of `f` and the list of display names of the outputs of `f_val`. initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. validation_in_fit: DEPRECATED: if true, then this method is invoked from within training iteration (for validation). In this case, do not copy weights when using a tf.distribute.Strategy. The input is deprecated as it is not required if the user creates a distributed model under the distribution strategy scope rather than passing it to compile. steps_name: The string name of the steps argument, either `steps`, `validation_steps`, or `steps_per_epoch`. Only used for error message formatting. **kwargs: Additional arguments for backwards compatibility. Returns: - In TRAIN mode: `History` object. - In TEST mode: Evaluation metrics. - In PREDICT mode: Outputs of the Model called on inputs. Raises: ValueError: in case of invalid arguments. """ # Backwards compatibility. if 'steps' in kwargs: steps_per_epoch = kwargs.pop('steps') if kwargs: raise TypeError('Unknown arguments: %s' % (kwargs, )) # In case we were passed a dataset, we extract symbolic tensors from it. reset_dataset_after_each_epoch = False original_dataset = None is_dataset = isinstance(inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)) # TODO(fchollet): consider moving `steps_per_epoch` inference to # _standardize_user_data and set reset_dataset_after_each_epoch as an # attribute on the dataset instance. if is_dataset: original_dataset = inputs if steps_per_epoch is None: reset_dataset_after_each_epoch = True steps_per_epoch = training_utils.infer_steps_for_dataset( inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name) if mode == ModeKeys.TRAIN: _print_train_info(inputs, val_inputs, steps_per_epoch, verbose) # Enter DistributionStrategy scope. if model._distribution_strategy: scope = model._distribution_strategy.scope() scope.__enter__() # Get step function and loop type. f = _make_execution_function(model, mode) use_steps = is_dataset or steps_per_epoch is not None do_validation = val_inputs is not None # Convert Eager Tensors to NumPy arrays to support batching/shuffling. inputs, targets, sample_weights = training_utils. \ convert_eager_tensors_to_numpy((inputs, targets, sample_weights)) # Prepare input data. ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode) if not is_dataset: num_samples_or_steps = _get_num_samples_or_steps( ins, batch_size, steps_per_epoch) else: num_samples_or_steps = steps_per_epoch # Configure callbacks. count_mode = 'steps' if use_steps else 'samples' callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=num_samples_or_steps, verbose=0, # Handle ProgBarLogger separately in this loop. mode=mode) # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. progbar = training_utils.get_progbar(model, count_mode) progbar.params = callbacks.params progbar.params['verbose'] = verbose # Find beforehand arrays that need sparse-to-dense conversion. if issparse is not None and not use_steps: indices_for_conversion_to_dense = [] feed = _get_model_feed(model, mode) for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)): if issparse(input_data) and not K.is_sparse(feed_tensor): indices_for_conversion_to_dense.append(i) # Select aggregation method. if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator(use_steps, num_samples_or_steps) else: aggregator = training_utils.MetricsAggregator(use_steps, num_samples_or_steps) if model._compile_distribution and not validation_in_fit: distributed_training_utils._copy_weights_to_distributed_model( model, model._distributed_model) callbacks.model.stop_training = False callbacks._call_begin_hook(mode) progbar.on_train_begin() for epoch in range(initial_epoch, epochs): if callbacks.model.stop_training: break # Setup work for each epoch epoch_logs = {} model.reset_metrics() if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) progbar.on_epoch_begin(epoch, epoch_logs) if use_steps: # Step-wise loop. if steps_per_epoch is None: # Loop over dataset until `OutOfRangeError` is raised. target_steps = np.inf else: # Loop over dataset for the specified number of steps. target_steps = steps_per_epoch step = 0 while step < target_steps: batch_logs = {'batch': step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', step, batch_logs) progbar.on_batch_begin(step, batch_logs) # Get outputs. try: # `ins` can be callable in DistributionStrategy + eager case. actual_inputs = ins() if callable(ins) else ins batch_outs = f(actual_inputs) except errors.OutOfRangeError: if original_dataset is None: # We ran out of batches while the user passed an iterator (legacy). logging.warning( 'Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your iterator ' 'can generate at least `%s * epochs` ' 'batches (in this case, %d batches). You may need to' 'use the repeat() function when building your ' 'dataset.' % (steps_name, steps_per_epoch * epochs)) callbacks.model.stop_training = True else: # The dataset passed by the user ran out of batches. # Now we know the cardinality of the dataset. if step > 0: steps_per_epoch = step aggregator.num_samples_or_steps = steps_per_epoch progbar.params['steps'] = steps_per_epoch progbar.progbar.target = steps_per_epoch break if not isinstance(batch_outs, list): batch_outs = [batch_outs] if model._distribution_strategy: batch_outs = distributed_training_utils._per_device_aggregate_batch( batch_outs, model, mode) # Aggregate results. if step == 0: aggregator.create(batch_outs) aggregator.aggregate(batch_outs) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', step, batch_logs) progbar.on_batch_end(step, batch_logs) step += 1 if callbacks.model.stop_training: break else: # Sample-wise loop. index_array = np.arange(num_samples_or_steps) if shuffle == 'batch': index_array = training_utils.batch_shuffle( index_array, batch_size) elif shuffle: np.random.shuffle(index_array) batches = make_batches(num_samples_or_steps, batch_size) for batch_index, (batch_start, batch_end) in enumerate(batches): batch_ids = index_array[batch_start:batch_end] # Slice into a batch. try: if ins and isinstance(ins[-1], int): # Do not slice the training phase flag. ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]] else: ins_batch = slice_arrays(ins, batch_ids) except TypeError: raise TypeError('TypeError while preparing batch. ' 'If using HDF5 input data, ' 'pass shuffle="batch".') # Sparse to dense conversion. if issparse is not None: for i in indices_for_conversion_to_dense: ins_batch[i] = ins_batch[i].toarray() # Callbacks batch_begin. batch_logs = {'batch': batch_index, 'size': len(batch_ids)} callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs) progbar.on_batch_begin(batch_index, batch_logs) # Get outputs. batch_outs = f(ins_batch) if not isinstance(batch_outs, list): batch_outs = [batch_outs] # Aggregate results. if batch_index == 0: aggregator.create(batch_outs) aggregator.aggregate(batch_outs, batch_start, batch_end) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs) progbar.on_batch_end(batch_index, batch_logs) if callbacks.model.stop_training: break aggregator.finalize() results = aggregator.results epoch_logs = cbks.make_logs(model, epoch_logs, results, mode) if len(results) == 1: results = results[0] # Run the test loop every epoch during training. if do_validation and not callbacks.model.stop_training: val_results = model_iteration(model, val_inputs, targets=val_targets, sample_weights=val_sample_weights, batch_size=batch_size, steps_per_epoch=validation_steps, callbacks=callbacks, verbose=0, mode=ModeKeys.TEST, validation_in_fit=True, steps_name='validation_steps') if not isinstance(val_results, list): val_results = [val_results] epoch_logs = cbks.make_logs(model, epoch_logs, val_results, mode, prefix='val_') if mode == ModeKeys.TRAIN: # Epochs only apply to `fit`. callbacks.on_epoch_end(epoch, epoch_logs) progbar.on_epoch_end(epoch, epoch_logs) # Recreate dataset iterator for the next epoch. if reset_dataset_after_each_epoch and epoch < epochs - 1: ins = _prepare_feed_values(model, original_dataset, None, None, mode) callbacks._call_end_hook(mode) if model._distribution_strategy: if model._compile_distribution and not validation_in_fit: # TODO(priyag, psv): Copy back metrics to the original model as well? distributed_training_utils._copy_weights_to_original_model( model, model._distributed_model, mode) scope.__exit__(None, None, None) if mode == ModeKeys.TRAIN: return model.history return results
def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None, callbacks=None): """Predict loop for predicting with TPU DistributionStrategy. Arguments: model: Keras Model instance. dataset: Dataset for input data. verbose: Integer, Verbosity mode 0 or 1. steps: Total number of steps (batches of samples) before declaring `_predict_loop` finished. Ignored with the default value of `None`. callbacks: List of callbacks to be called during training Returns: Array of predictions (if the model has a single output) or list of arrays of predictions (if the model has multiple outputs). """ mode = ModeKeys.PREDICT steps = training_utils.infer_steps_for_dataset(dataset, steps, steps_name='steps') dataset_fully_shaped = (distributed_training_utils. is_dataset_shape_fully_defined(dataset)) padding_handler = None if not dataset_fully_shaped: # TODO(hongjunchoi): Investigate whether operations from # PartialBatchPaddingHandler are unnecessarily pruned out # during graph optimization. padding_handler = padding_util.PartialBatchPaddingHandler( model._feed_output_shapes) batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset) padding_handler.padded_batch_size = batch_size padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask, padding_handler.update_mask) dataset = dataset.map(padding_handler.pad_batch) dataset = dataset.apply(batching.unbatch()) # Upon this point, it is guaranteed that the dataset does not # have partial batches. Thus, we set `drop_remainder=True` to # get static shape information about the elements in the dataset. dataset = dataset.batch(batch_size, drop_remainder=True) if prefetch_buffer is not None: dataset = dataset.prefetch(prefetch_buffer) current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=0) scope.__enter__() out_labels = model.output_names step_fn = _make_step_fn(model, ModeKeys.PREDICT, current_strategy, out_labels) # Add initial dummy values for outputs. initial_loop_values = {} batch_dimension = distributed_training_utils.get_batch_dimension(iterator) for name, tensor in zip(model.output_names, model.outputs): # TODO(priyag): This is a workaround as we do not know the batch dimension # of the model's output at this point. shape = tensor_shape.TensorShape(tensor.shape.dims) shape.dims = [batch_dimension] + shape.dims[1:] initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype) # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed. ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=1, initial_loop_values=initial_loop_values) predict_op = ctx.run_op output_tensors = ctx.last_step_outputs if verbose == 1: progbar = Progbar(target=steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) distributed_training_utils._reset_metrics(model) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=False, epochs=1, steps_per_epoch=steps, verbose=verbose, count_mode='steps', mode=mode) callbacks._call_begin_hook(mode) # Since we do not know how many samples we will see, we cannot pre-allocate # the returned Numpy arrays. Instead, we store one array per batch seen # and concatenate them upon returning. unconcatenated_outs = [[] for _ in model.outputs] if steps is not None: target_steps = steps else: target_steps = np.inf current_step = 0 while current_step < target_steps: batch_logs = {'batch': current_step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) try: _, batch_outs = K.batch_get_value([predict_op, output_tensors]) except errors.OutOfRangeError: if steps is not None: warning_msg = 'Make sure that your dataset can generate at least ' '`steps` batches (in this case, {} batches).'.format(steps) else: warning_msg = 'Number of steps ran: {} steps'.format(current_step) logging.warning('Your dataset iterator ran out of data; ' 'interrupting evaluation. ' + warning_msg) break # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy. for i, label in enumerate(model.output_names): unconcatenated_outs[i].extend(batch_outs[label]) batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) if verbose >= 1: progbar.update(current_step + 1) current_step += 1 callbacks._call_end_hook(mode) scope.__exit__(None, None, None) if len(unconcatenated_outs) == 1: prediction_result = np.concatenate(unconcatenated_outs[0], axis=0) else: prediction_result = [ np.concatenate(unconcatenated_outs[i], axis=0) for i in range(len(unconcatenated_outs)) ] if padding_handler: prediction_result = padding_handler.apply_mask(prediction_result) return prediction_result
def experimental_tpu_test_loop(model, dataset, verbose=0, steps=None, callbacks=None): """Test loop for evaluating with TPU DistributionStrategy. Arguments: model: Keras Model instance. dataset: Dataset for input data. verbose: Integer, Verbosity mode 0 or 1. steps: Total number of steps (batches of samples) before declaring predictions finished. Ignored with the default value of `None`. callbacks: List of callbacks to be called during training Returns: Scalar loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute `model.metrics_names` will give you the display labels for the outputs. """ mode = ModeKeys.TEST current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) steps = training_utils.infer_steps_for_dataset(dataset, steps, steps_name='steps') scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=0) scope.__enter__() out_labels = model.metrics_names step_fn = _make_step_fn(model, ModeKeys.TEST, current_strategy, out_labels) # Add initial dummy values for loss and other metric tensors. initial_loop_values = {} initial_loop_values['loss'] = constant_op.constant(1e7) for name in model.metrics_names[1:]: tensor = model._all_stateful_metrics_tensors[name] initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype) # TODO(priyag): Use steps_per_run when we use new metrics as they will # allow handling metric computation at each step using variables. ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=1, initial_loop_values=initial_loop_values) test_op = ctx.run_op output_tensors = ctx.last_step_outputs if verbose == 1: progbar = Progbar(target=steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) distributed_training_utils._reset_metrics(model) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=False, epochs=1, steps_per_epoch=steps, verbose=verbose, count_mode='steps', mode=ModeKeys.TEST) callbacks._call_begin_hook(mode) outs = [0.] * len(model.metrics_names) if steps is not None: target_steps = steps else: target_steps = np.inf current_step = 0 while current_step < target_steps: batch_logs = {'batch': current_step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) try: _, batch_outs = K.batch_get_value([test_op, output_tensors]) except errors.OutOfRangeError: if steps is not None: warning_msg = 'Make sure that your dataset can generate at least ' '`steps` batches (in this case, {} batches).'.format(steps) else: warning_msg = 'Number of steps ran: {} steps'.format(current_step) logging.warning('Your dataset iterator ran out of data; ' 'interrupting evaluation. ' + warning_msg) target_steps = current_step break for i, label in enumerate(model.metrics_names): if i == 0: # Loss is stateless metrics. outs[i] += batch_outs[label] else: # For all stateful metrics, the aggregation is handled by mirrored vars. outs[i] = batch_outs[label] batch_logs = cbks.make_logs(model, batch_logs, outs, mode) callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) if verbose >= 1: progbar.update(current_step + 1) current_step += 1 callbacks._call_end_hook(mode) scope.__exit__(None, None, None) if len(outs) >= 0: outs[0] /= (target_steps) if len(outs) == 1: return outs[0] return outs
def experimental_tpu_test_loop(model, dataset, verbose=0, steps=None, callbacks=None): """Test loop for evaluating with TPU tf.distribute.Strategy. Arguments: model: Keras Model instance. dataset: Dataset for input data. verbose: Integer, Verbosity mode 0 or 1. steps: Total number of steps (batches of samples) before declaring predictions finished. Ignored with the default value of `None`. callbacks: List of callbacks to be called during training Returns: Scalar loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute `model.metrics_names` will give you the display labels for the outputs. """ mode = ModeKeys.TEST current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) steps = training_utils.infer_steps_for_dataset(dataset, steps, steps_name='steps') scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=0) scope.__enter__() out_labels = model.metrics_names def _test_step_fn(inputs): """A fn that returns output of single test step.""" if isinstance(inputs, (tuple, list)) and len(inputs) == 2: inputs, targets = inputs else: targets = None (distribution_strategy_context.get_replica_context().merge_call( _build_model, args=(model, mode, inputs, targets))) (_, outputs, updates, _) = ( _per_replica_execution_function( distributed_training_utils.get_distributed_model(model, mode), mode)) with ops.control_dependencies([updates]): return outputs test_input_data = iterator.get_next() per_replica_outputs = current_strategy.experimental_run_v2( _test_step_fn, args=(test_input_data,)) output_tensors = {} for label, output in zip(out_labels, per_replica_outputs): if label == 'loss': reduce_op = ds_reduce_util.ReduceOp.SUM else: # We reduce all other metrics using mean for now. This is temporary # workaround until new metrics are in place. reduce_op = ds_reduce_util.ReduceOp.MEAN output_tensors[label] = current_strategy.reduce(reduce_op, output, axis=None) test_op = control_flow_ops.group(list(output_tensors.values())) if verbose >= 1: progbar = Progbar(target=steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) distributed_training_utils._reset_metrics(model) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=False, epochs=1, steps_per_epoch=steps, verbose=verbose, count_mode='steps', mode=ModeKeys.TEST) callbacks._call_begin_hook(mode) outs = [0.] * len(model.metrics_names) if steps is not None: target_steps = steps else: raise ValueError('Number of steps could not be infered from the data, ' 'please pass the steps argument.') current_step = 0 while current_step < target_steps: batch_logs = {'batch': current_step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) try: _, batch_outs = K.batch_get_value([test_op, output_tensors]) except errors.OutOfRangeError: warning_msg = 'Make sure that your dataset can generate at least ' '`steps` batches (in this case, {} batches).'.format(steps) logging.warning('Your dataset iterator ran out of data; ' 'interrupting evaluation. ' + warning_msg) target_steps = current_step break for i, label in enumerate(model.metrics_names): if i == 0: # Loss is stateless metrics. outs[i] += batch_outs[label] else: # For all stateful metrics, the aggregation is handled by mirrored vars. outs[i] = batch_outs[label] batch_logs = cbks.make_logs(model, batch_logs, outs, mode) callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) if verbose == 1: progbar.update(current_step + 1) current_step += 1 if verbose >= 1: # Progress bar finishes at the end. progbar.update(target_steps) callbacks._call_end_hook(mode) scope.__exit__(None, None, None) if len(outs) >= 0: outs[0] /= (target_steps) if len(outs) == 1: return outs[0] return outs
def _model_iteration( self, model, mode, x=None, y=None, batch_size=None, verbose=1, sample_weight=None, steps=None, callbacks=None, max_queue_size=10, workers=1, use_multiprocessing=False, **kwargs): batch_size = model._validate_or_infer_batch_size( batch_size, steps, x) strategy = _get_distribution_strategy(model) batch_size, steps = dist_utils.process_batch_and_step_size( strategy, x, batch_size, steps, mode) dist_utils.validate_callbacks(input_callbacks=callbacks, optimizer=model.optimizer) # Enter tf.distribute.Strategy scope. with strategy.scope(): adapter = _process_inputs( model, mode, x, y, batch_size=batch_size, sample_weights=sample_weight, steps=steps, distribution_strategy=strategy, max_queue_size=max_queue_size, workers=workers, use_multiprocessing=use_multiprocessing) total_samples = _get_total_number_of_samples(adapter) use_sample = total_samples is not None dataset = adapter.get_dataset() if not steps: # Raise an error if `steps` isn't specified but the dataset # is infinite. steps = adapter.get_size() or training_utils.infer_steps_for_dataset( model, dataset, steps, steps_name='steps') # tf.print('{} on {} steps.'.format(ModeKeys.TRAIN, steps_per_epoch)) training_context = TrainingContext() dataset = strategy.experimental_distribute_dataset(dataset) execution_function = training_v2_utils._get_or_make_execution_function( model, mode) data_iterator = iter(dataset) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=False, batch_size=batch_size, epochs=1, steps_per_epoch=steps, samples=use_sample, count_mode='samples' if use_sample else 'steps', verbose=0, # Handle ProgBarLogger separately in this loop. mode=mode) with training_context.on_start( model, callbacks, use_sample, verbose, mode): with training_context.on_epoch(0, mode) as epoch_logs: model.reset_metrics() result = run_one_epoch( model, data_iterator, execution_function, dataset_size=adapter.get_size(), batch_size=adapter.batch_size(), strategy=strategy, steps_per_epoch=steps, num_samples=total_samples, mode=mode, training_context=training_context, total_epochs=1) cbks.make_logs(model, epoch_logs, result, mode) if len(result) == 1: result = result[0] return result
def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None, callbacks=None): """Predict loop for predicting with TPU tf.distribute.Strategy. Arguments: model: Keras Model instance. dataset: Dataset for input data. verbose: Integer, Verbosity mode 0 or 1. steps: Total number of steps (batches of samples) before declaring `_predict_loop` finished. Ignored with the default value of `None`. callbacks: List of callbacks to be called during training Returns: Array of predictions (if the model has a single output) or list of arrays of predictions (if the model has multiple outputs). """ mode = ModeKeys.PREDICT steps = training_utils.infer_steps_for_dataset(dataset, steps, steps_name='steps') dataset_fully_shaped = (distributed_training_utils. is_dataset_shape_fully_defined(dataset)) padding_handler = None if not dataset_fully_shaped: # TODO(hongjunchoi): Investigate whether operations from # PartialBatchPaddingHandler are unnecessarily pruned out # during graph optimization. padding_handler = padding_util.PartialBatchPaddingHandler( model._feed_output_shapes) batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset) padding_handler.padded_batch_size = batch_size padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask, padding_handler.update_mask) dataset = dataset.map(padding_handler.pad_batch) dataset = dataset.apply(batching.unbatch()) # Upon this point, it is guaranteed that the dataset does not # have partial batches. Thus, we set `drop_remainder=True` to # get static shape information about the elements in the dataset. dataset = dataset.batch(batch_size, drop_remainder=True) if prefetch_buffer is not None: dataset = dataset.prefetch(prefetch_buffer) current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=0) scope.__enter__() def _predict_step_fn(inputs): """A fn that returns output of single prediction step.""" (distribution_strategy_context.get_replica_context().merge_call( _build_model, args=(model, mode, inputs))) (_, outputs, updates, _) = ( _per_replica_execution_function( distributed_training_utils.get_distributed_model(model, mode), mode)) with ops.control_dependencies([updates]): return outputs # TODO(hongjunchoi): When numpy array is passed as an input to `predict()` # use numpy arrays directly to avoid cumulating unnecessary input pipeline # ops. predict_input_data = iterator.get_next() per_replica_outputs = current_strategy.experimental_run_v2( _predict_step_fn, args=(predict_input_data,)) output_tensors = distributed_training_utils.flatten_per_replica_values( current_strategy, per_replica_outputs) if verbose >= 1: progbar = Progbar(target=steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) distributed_training_utils._reset_metrics(model) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=False, epochs=1, steps_per_epoch=steps, verbose=verbose, count_mode='steps', mode=mode) callbacks._call_begin_hook(mode) # Since we do not know how many samples we will see, we cannot pre-allocate # the returned Numpy arrays. Instead, we store one array per batch seen # and concatenate them upon returning. num_model_outputs = len(model.output_names) unconcatenated_outs = [[] for _ in range(num_model_outputs)] if steps is not None: target_steps = steps else: raise ValueError('Number of steps could not be infered from the data, ' 'please pass the steps argument.') current_step = 0 while current_step < target_steps: batch_logs = {'batch': current_step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) try: predict_ops = control_flow_ops.group(output_tensors) _, batch_outs = K.batch_get_value([predict_ops, output_tensors]) except errors.OutOfRangeError: warning_msg = 'Make sure that your dataset can generate at least ' '`steps` batches (in this case, {} batches).'.format(steps) logging.warning('Your dataset iterator ran out of data; ' 'interrupting evaluation. ' + warning_msg) break # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy. for i in range(num_model_outputs): output_start_index = i * current_strategy.num_replicas_in_sync output_end_index = ( output_start_index + current_strategy.num_replicas_in_sync) single_model_output = batch_outs[output_start_index:output_end_index] unconcatenated_outs[i].extend(single_model_output) batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) if verbose == 1: progbar.update(current_step + 1) current_step += 1 if verbose >= 1: # Progress bar finishes at the end. progbar.update(current_step) callbacks._call_end_hook(mode) scope.__exit__(None, None, None) if len(unconcatenated_outs) == 1: prediction_result = np.concatenate(unconcatenated_outs[0], axis=0) else: prediction_result = [ np.concatenate(unconcatenated_outs[i], axis=0) for i in range(len(unconcatenated_outs)) ] if padding_handler: prediction_result = padding_handler.apply_mask(prediction_result) return prediction_result
def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None, callbacks=None): """Predict loop for predicting with TPU DistributionStrategy. Arguments: model: Keras Model instance. dataset: Dataset for input data. verbose: Integer, Verbosity mode 0 or 1. steps: Total number of steps (batches of samples) before declaring `_predict_loop` finished. Ignored with the default value of `None`. callbacks: List of callbacks to be called during training Returns: Array of predictions (if the model has a single output) or list of arrays of predictions (if the model has multiple outputs). """ mode = ModeKeys.PREDICT steps = training_utils.infer_steps_for_dataset(dataset, steps, steps_name='steps') dataset_fully_shaped = (distributed_training_utils. is_dataset_shape_fully_defined(dataset)) padding_handler = None if not dataset_fully_shaped: # TODO(hongjunchoi): Investigate whether operations from # PartialBatchPaddingHandler are unnecessarily pruned out # during graph optimization. padding_handler = padding_util.PartialBatchPaddingHandler( model._feed_output_shapes) batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset) padding_handler.padded_batch_size = batch_size padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask, padding_handler.update_mask) dataset = dataset.map(padding_handler.pad_batch) dataset = dataset.apply(batching.unbatch()) # Upon this point, it is guaranteed that the dataset does not # have partial batches. Thus, we set `drop_remainder=True` to # get static shape information about the elements in the dataset. dataset = dataset.batch(batch_size, drop_remainder=True) if prefetch_buffer is not None: dataset = dataset.prefetch(prefetch_buffer) current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=0) scope.__enter__() def _predict_step_fn(inputs): """A fn that returns output of single prediction step.""" (distribution_strategy_context.get_replica_context().merge_call( _build_model, args=(model, mode, inputs))) (_, outputs, updates, _) = ( _per_device_execution_function( distributed_training_utils.get_distributed_model(model, mode), mode)) with ops.control_dependencies([updates]): return outputs # TODO(hongjunchoi): When numpy array is passed as an input to `predict()` # use numpy arrays directly to avoid cumulating unnecessary input pipeline # ops. predict_input_data = iterator.get_next() per_replica_outputs = current_strategy.experimental_run_v2( _predict_step_fn, args=(predict_input_data,)) output_tensors = distributed_training_utils.flatten_perdevice_values( current_strategy, per_replica_outputs) if verbose >= 1: progbar = Progbar(target=steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) distributed_training_utils._reset_metrics(model) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=False, epochs=1, steps_per_epoch=steps, verbose=verbose, count_mode='steps', mode=mode) callbacks._call_begin_hook(mode) # Since we do not know how many samples we will see, we cannot pre-allocate # the returned Numpy arrays. Instead, we store one array per batch seen # and concatenate them upon returning. num_model_outputs = len(model.output_names) unconcatenated_outs = [[] for _ in range(num_model_outputs)] if steps is not None: target_steps = steps else: raise ValueError('Number of steps could not be infered from the data, ' 'please pass the steps argument.') current_step = 0 while current_step < target_steps: batch_logs = {'batch': current_step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) try: predict_ops = control_flow_ops.group(output_tensors) _, batch_outs = K.batch_get_value([predict_ops, output_tensors]) except errors.OutOfRangeError: warning_msg = 'Make sure that your dataset can generate at least ' '`steps` batches (in this case, {} batches).'.format(steps) logging.warning('Your dataset iterator ran out of data; ' 'interrupting evaluation. ' + warning_msg) break # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy. for i in range(num_model_outputs): output_start_index = i * current_strategy.num_replicas_in_sync output_end_index = ( output_start_index + current_strategy.num_replicas_in_sync) single_model_output = batch_outs[output_start_index:output_end_index] unconcatenated_outs[i].extend(single_model_output) batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) if verbose == 1: progbar.update(current_step + 1) current_step += 1 if verbose >= 1: # Progress bar finishes at the end. progbar.update(current_step) callbacks._call_end_hook(mode) scope.__exit__(None, None, None) if len(unconcatenated_outs) == 1: prediction_result = np.concatenate(unconcatenated_outs[0], axis=0) else: prediction_result = [ np.concatenate(unconcatenated_outs[i], axis=0) for i in range(len(unconcatenated_outs)) ] if padding_handler: prediction_result = padding_handler.apply_mask(prediction_result) return prediction_result
def model_iteration(model, data, steps_per_epoch=None, epochs=1, verbose=1, callbacks=None, validation_data=None, validation_steps=None, validation_freq=1, class_weight=None, max_queue_size=10, workers=1, use_multiprocessing=False, shuffle=False, initial_epoch=0, mode=ModeKeys.TRAIN, batch_size=None, steps_name='steps', **kwargs): """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. Arguments: model: Keras Model instance. data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or `(x, y, sample_weights)`) or a generator or `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. epochs: Number of times to iterate over the data. verbose: 0, 1, or 2. Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch. Note that the progress bar is not particularly useful when logged to a file, so verbose=2 is recommended when not running interactively (eg, in a production environment). callbacks: List of callbacks to be called during training. validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or `(x, y, sample_weights)`) or a generator or `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. validation_steps: Total number of steps (batches of samples) before declaring validation finished. validation_freq: Only relevant if validation data is provided. Integer or `collections.abc.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. class_weight: Dictionary mapping class indices to a weight for the class. max_queue_size: Integer. Maximum size for the generator queue. If unspecified, `max_queue_size` will default to 10. workers: Integer. Maximum number of processes to spin up when using process-based threading. If unspecified, `workers` will default to 1. If 0, will execute the generator on the main thread. use_multiprocessing: Boolean. If `True`, use process-based threading. If unspecified, `use_multiprocessing` will default to `False`. Note that because this implementation relies on multiprocessing, you should not pass non-picklable arguments to the generator as they can't be passed easily to children processes. shuffle: Boolean. Whether to shuffle the order of the batches at the beginning of each epoch. Only used with instances of `Sequence` (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not `None`. initial_epoch: Epoch at which to start training (useful for resuming a previous training run). mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. batch_size: Integer batch size or None if unknown. Will only be used if `data` is in NumPy/Tensor format. steps_name: The string name of the steps argument, either `steps`, `validation_steps`, or `steps_per_epoch`. Only used for error message formatting. **kwargs: Additional arguments for backwards compatibility. `steps` is accepted as an alias for `steps_per_epoch`. Returns: - In TRAIN mode: `History` object. - In TEST mode: Evaluation metrics. - In PREDICT mode: Outputs of the Model called on inputs. Raises: ValueError: in case of invalid arguments. """ if 'steps' in kwargs: steps_per_epoch = kwargs['steps'] # Determine the number of steps per epoch and whether we should reset the # dataset at the end of each epoch. reset_dataset_after_each_epoch = False original_dataset = None is_dataset = isinstance(data, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)) if is_dataset: original_dataset = data if steps_per_epoch is None: reset_dataset_after_each_epoch = True steps_per_epoch = training_utils.infer_steps_for_dataset( model, data, steps_per_epoch, epochs=epochs, steps_name=steps_name) # Convert to a format that supports `next(generator)`. generator, steps_per_epoch = convert_to_generator_like( data, steps_per_epoch=steps_per_epoch, batch_size=batch_size, epochs=epochs - initial_epoch, shuffle=shuffle) do_validation = validation_data is not None is_sequence = isinstance(generator, data_utils.Sequence) _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers, steps_per_epoch, validation_data, validation_steps, mode, kwargs) batch_function = _make_execution_function(model, mode, class_weight=class_weight) # Create the queue for the generator. enqueuer = None if not is_dataset: generator, enqueuer = _make_enqueued_generator( generator, workers=workers, use_multiprocessing=use_multiprocessing, max_queue_size=max_queue_size, shuffle=shuffle) num_samples_or_steps, use_steps = _get_num_samples_or_steps( data, steps_per_epoch) count_mode = 'steps' if use_steps else 'samples' callbacks = cbks.configure_callbacks(callbacks, model, do_validation=do_validation, epochs=epochs, steps_per_epoch=steps_per_epoch, batch_size=batch_size, samples=num_samples_or_steps, count_mode=count_mode, verbose=verbose, mode=mode) if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator(True, steps=steps_per_epoch) else: aggregator = training_utils.MetricsAggregator(True, steps=steps_per_epoch) should_set_learning_phase = context.executing_eagerly( ) and model.run_eagerly if should_set_learning_phase: learning_phase_scope = backend.eager_learning_phase_scope( 1 if mode == ModeKeys.TRAIN else 0) learning_phase_scope.__enter__() callbacks.model.stop_training = False callbacks._call_begin_hook(mode) initial_epoch = model._maybe_load_initial_epoch_from_ckpt( initial_epoch, mode) for epoch in range(initial_epoch, epochs): if callbacks.model.stop_training: break # Setup work for each epoch. model.reset_metrics() epoch_logs = {} if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) if steps_per_epoch is None: # Loop over dataset until `OutOfRangeError` is raised. target_steps = np.inf else: # Loop over dataset for the specified number of steps. target_steps = steps_per_epoch step = 0 while step < target_steps: batch_data = _get_next_batch(generator) if batch_data is None: if is_dataset: # The dataset passed by the user ran out of batches. # Now we know the cardinality of the dataset. # If steps_per_epoch was specified, then running out of data is # unexpected, so we stop training and inform the user. if steps_per_epoch: callbacks.model.stop_training = True logging.warning( 'Your dataset ran out of data; interrupting training. ' 'Make sure that your dataset can generate at least ' '`%s * epochs` batches (in this case, %d batches). ' 'You may need to use the repeat() function when ' 'building your dataset.' % (steps_name, steps_per_epoch * epochs)) elif step > 0: steps_per_epoch = step aggregator.steps = steps_per_epoch else: # We ran out of batches while the user passed an iterator (legacy). callbacks.model.stop_training = True logging.warning( 'Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your iterator ' 'can generate at least `%s * epochs` ' 'batches (in this case, %d batches). You may need to' 'use the repeat() function when building your ' 'dataset.' % (steps_name, steps_per_epoch * epochs)) break # `batch_size` used for validation data if validation # data is NumPy/EagerTensors. batch_size = int(nest.flatten(batch_data)[0].shape[0]) # Callbacks batch begin. batch_logs = {'batch': step, 'size': batch_size} callbacks._call_batch_hook(mode, 'begin', step, batch_logs) is_deferred = not model._is_compiled batch_outs = batch_function(*batch_data) if not isinstance(batch_outs, list): batch_outs = [batch_outs] if step == 0: aggregator.create(batch_outs) if is_deferred: # Set callbacks params. We do this here when model is compiled only # in the first iteration of this loop (deferred build scenario). cbks.set_callback_parameters( callbacks, model, do_validation=do_validation, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=num_samples_or_steps, verbose=verbose, mode=mode) # Aggregate results. aggregator.aggregate(batch_outs) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', step, batch_logs) step += 1 if callbacks.model.stop_training: break aggregator.finalize() results = aggregator.results epoch_logs = cbks.make_logs(model, epoch_logs, results, mode) if len(results) == 1: results = results[0] # Run the test loop every epoch during training. if (do_validation and training_utils.should_run_validation( validation_freq, epoch) and not callbacks.model.stop_training): val_results = model_iteration( model, validation_data, steps_per_epoch=validation_steps, batch_size=batch_size, class_weight=class_weight, workers=workers, use_multiprocessing=use_multiprocessing, max_queue_size=max_queue_size, callbacks=callbacks, verbose=verbose, mode=ModeKeys.TEST, steps_name='validation_steps') if not isinstance(val_results, list): val_results = [val_results] epoch_logs = cbks.make_logs(model, epoch_logs, val_results, mode, prefix='val_') if mode == ModeKeys.TRAIN: # Epochs only apply to `fit`. callbacks.on_epoch_end(epoch, epoch_logs) # Recreate dataset iterator for the next epoch. if reset_dataset_after_each_epoch and epoch < epochs - 1: generator = dataset_ops.make_one_shot_iterator(original_dataset) model._successful_loop_finish = True callbacks._call_end_hook(mode) if enqueuer is not None: enqueuer.stop() if should_set_learning_phase: learning_phase_scope.__exit__(None, None, None) if mode == ModeKeys.TRAIN: return model.history return results
def model_iteration(model, inputs, targets=None, sample_weights=None, batch_size=None, epochs=1, verbose=1, callbacks=None, val_inputs=None, val_targets=None, val_sample_weights=None, shuffle=True, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1, mode=ModeKeys.TRAIN, validation_in_fit=False, prepared_feed_values_from_dataset=False, steps_name='steps', **kwargs): """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. Arguments: model: Keras Model instance. inputs: Either a list or dictionary of arrays, or a dataset instance. targets: List/dictionary of input arrays. sample_weights: Optional list of sample weight arrays. batch_size: Integer batch size or None if unknown. epochs: Number of times to iterate over the data verbose: Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training val_inputs: Either a list or dictionary of arrays, or a dataset instance. val_targets: List/dictionary of target arrays. val_sample_weights: Optional list of sample weight arrays. shuffle: Whether to shuffle the data at the beginning of each epoch concatenation of list the display names of the outputs of `f` and the list of display names of the outputs of `f_val`. initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. validation_freq: Only relevant if validation data is provided. Integer or `collections.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. validation_in_fit: if true, then this method is invoked from within training iteration (for validation). In the case where `val_inputs` is a dataset, this flag indicates that its iterator and feed values are already created so should properly reuse resources. prepared_feed_values_from_dataset: if True, `inputs` is a list of feed tensors returned from `_prepare_feed_values` call on the validation dataset, so do not call it again on `inputs`. Should only be used for inline validation (i.e., only if `validation_in_fit` is also True). steps_name: The string name of the steps argument, either `steps`, `validation_steps`, or `steps_per_epoch`. Only used for error message formatting. **kwargs: Additional arguments for backwards compatibility. Returns: - In TRAIN mode: `History` object. - In TEST mode: Evaluation metrics. - In PREDICT mode: Outputs of the Model called on inputs. Raises: ValueError: in case of invalid arguments. """ # Backwards compatibility. if 'steps' in kwargs: steps_per_epoch = kwargs.pop('steps') if kwargs: raise TypeError('Unknown arguments: %s' % (kwargs,)) # In case we were passed a dataset, we extract symbolic tensors from it. reset_dataset_after_each_epoch = False input_iterator = None is_dataset = isinstance(inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)) # TODO(fchollet): consider moving `steps_per_epoch` inference to # _standardize_user_data and set reset_dataset_after_each_epoch as an # attribute on the dataset instance. if is_dataset: if steps_per_epoch is None: reset_dataset_after_each_epoch = True steps_per_epoch = training_utils.infer_steps_for_dataset( inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name) input_iterator = _get_iterator(inputs, model._distribution_strategy) if mode == ModeKeys.TRAIN: _print_train_info(inputs, val_inputs, steps_per_epoch, verbose) # Enter DistributionStrategy scope. if model._distribution_strategy: scope = distributed_training_utils.distributed_scope( strategy=model._distribution_strategy, learning_phase=(1 if mode == ModeKeys.TRAIN else 0)) scope.__enter__() # Get step function and loop type. f = _make_execution_function(model, mode) use_steps = is_dataset or steps_per_epoch is not None do_validation = val_inputs is not None # Convert Eager Tensors to NumPy arrays to support batching/shuffling. inputs, targets, sample_weights = training_utils. \ convert_eager_tensors_to_numpy((inputs, targets, sample_weights)) # Prepare input data. inputs = input_iterator or inputs if validation_in_fit and prepared_feed_values_from_dataset: # When invoking validation in training loop, avoid creating iterator and # list of feed values for the same validation dataset multiple times (which # essentially would call `iterator.get_next()` that slows down execution and # leads to OOM errors eventually. ins = inputs else: ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode) if not is_dataset: num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size, steps_per_epoch) else: num_samples_or_steps = steps_per_epoch # Prepare validation data. Hold references to the iterator and the input list # to properly reinitialize and reuse in multiple validation passes. val_iterator = None if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)): if validation_steps is None: # Because we pass an iterator feed instead of a Dataset to the eval # model_iteration() call, it will not trigger the dataset-input path # that determines the number of steps required. To avoid this issue, # set validation_steps here if validation_steps is None. validation_steps = training_utils.infer_steps_for_dataset( val_inputs, validation_steps, epochs=epochs, steps_name='validation_steps') val_iterator = _get_iterator(val_inputs, model._distribution_strategy) val_inputs = _prepare_feed_values( model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST) # Configure callbacks. count_mode = 'steps' if use_steps else 'samples' callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, val_inputs=val_inputs, val_targets=val_targets, val_sample_weights=val_sample_weights, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=num_samples_or_steps, verbose=0, # Handle ProgBarLogger separately in this loop. mode=mode) # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. progbar = training_utils.get_progbar(model, count_mode) progbar.params = callbacks.params progbar.params['verbose'] = verbose # Find beforehand arrays that need sparse-to-dense conversion. if issparse is not None and not use_steps: indices_for_conversion_to_dense = [] feed = _get_model_feed(model, mode) for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)): if issparse(input_data) and not K.is_sparse(feed_tensor): indices_for_conversion_to_dense.append(i) # Select aggregation method. if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator(use_steps, num_samples_or_steps) else: aggregator = training_utils.MetricsAggregator(use_steps, num_samples_or_steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) callbacks.model.stop_training = False callbacks._call_begin_hook(mode) progbar.on_train_begin() for epoch in range(initial_epoch, epochs): # Reset stateful metrics for m in model.stateful_metric_functions: m.reset_states() # Update callbacks callbacks.on_epoch_begin(epoch) epoch_logs = {} model.reset_metrics() if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) progbar.on_epoch_begin(epoch, epoch_logs) if use_steps: # Step-wise loop. if steps_per_epoch is None: # Loop over dataset until `OutOfRangeError` is raised. target_steps = np.inf else: # Loop over dataset for the specified number of steps. target_steps = steps_per_epoch step = 0 while step < target_steps: batch_logs = {'batch': step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', step, batch_logs) progbar.on_batch_begin(step, batch_logs) # Get outputs. try: outs = f(ins) except errors.OutOfRangeError: if is_dataset: # The dataset passed by the user ran out of batches. # Now we know the cardinality of the dataset. # If steps_per_epoch was specified, then running out of data is # unexpected, so we stop training and inform the user. if steps_per_epoch: callbacks.model.stop_training = True logging.warning( 'Your dataset ran out of data; interrupting training. ' 'Make sure that your dataset can generate at least ' '`%s * epochs` batches (in this case, %d batches). ' 'You may need to use the repeat() function when ' 'building your dataset.' % (steps_name, steps_per_epoch * epochs)) elif step > 0: steps_per_epoch = step aggregator.num_samples_or_steps = steps_per_epoch if mode == ModeKeys.TRAIN: progbar.params['steps'] = steps_per_epoch progbar.progbar.target = steps_per_epoch else: # We ran out of batches while the user passed an iterator (legacy). callbacks.model.stop_training = True logging.warning( 'Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your iterator ' 'can generate at least `%s * epochs` ' 'batches (in this case, %d batches). You may need to' 'use the repeat() function when building your ' 'dataset.' % (steps_name, steps_per_epoch * epochs)) break if not isinstance(batch_outs, list): batch_outs = [batch_outs] if model._distribution_strategy: batch_outs = distributed_training_utils._per_device_aggregate_batch( batch_outs, model, mode) # Aggregate results. if step == 0: aggregator.create(batch_outs) aggregator.aggregate(batch_outs) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', step, batch_logs) progbar.on_batch_end(step, batch_logs) step += 1 callbacks.on_batch_end(step_index, batch_logs) if callbacks.model.stop_training: break if do_validation: val_outs = test_loop( model, val_inputs, val_targets, sample_weights=val_sample_weights, steps=validation_steps, verbose=0) if not isinstance(val_outs, list): val_outs = [val_outs] # Same labels assumed. for l, o in zip(model.metrics_names, val_outs): epoch_logs['val_' + l] = o else: # Sample-wise fit loop. if shuffle == 'batch': index_array = training_utils.batch_shuffle(index_array, batch_size) elif shuffle: np.random.shuffle(index_array) batches = make_batches(num_train_samples, batch_size) for batch_index, (batch_start, batch_end) in enumerate(batches): batch_ids = index_array[batch_start:batch_end] try: if isinstance(ins[-1], int): # Do not slice the training phase flag. ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]] else: ins_batch = slice_arrays(ins, batch_ids) except TypeError: raise TypeError('TypeError while preparing batch. ' 'If using HDF5 input data, ' 'pass shuffle="batch".') batch_logs = {} batch_logs['batch'] = batch_index batch_logs['size'] = len(batch_ids) callbacks.on_batch_begin(batch_index, batch_logs) for i in indices_for_conversion_to_dense: ins_batch[i] = ins_batch[i].toarray() outs = f(ins_batch) if not isinstance(outs, list): outs = [outs] for l, o in zip(model.metrics_names, outs): batch_logs[l] = o callbacks.on_batch_end(batch_index, batch_logs) if callbacks.model.stop_training: break if batch_index == len(batches) - 1: # Last batch. if do_validation: val_outs = test_loop( model, val_inputs, val_targets, sample_weights=val_sample_weights, batch_size=batch_size, verbose=0) if not isinstance(val_outs, list): val_outs = [val_outs] # Same labels assumed. for l, o in zip(model.metrics_names, val_outs): epoch_logs['val_' + l] = o callbacks.on_epoch_end(epoch, epoch_logs) if callbacks.model.stop_training: break callbacks.on_train_end() return model.history
def experimental_tpu_test_loop(model, dataset, verbose=0, steps=None, callbacks=None): """Test loop for evaluating with TPU DistributionStrategy. Arguments: model: Keras Model instance. dataset: Dataset for input data. verbose: Integer, Verbosity mode 0 or 1. steps: Total number of steps (batches of samples) before declaring predictions finished. Ignored with the default value of `None`. callbacks: List of callbacks to be called during training Returns: Scalar loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute `model.metrics_names` will give you the display labels for the outputs. """ mode = ModeKeys.TEST current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) steps = training_utils.infer_steps_for_dataset(dataset, steps, steps_name='steps') scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=0) scope.__enter__() def _per_device_eval_function(model): model._make_eval_function() return (model._eval_function.inputs, model._eval_function.outputs, model._eval_function.updates_op, model._eval_function.session_kwargs) def step_fn(ctx, inputs): """Clones the model and calls make_eval_function.""" inputs, targets = inputs if model._compile_distribution: distributed_training_utils.clone_model_on_replicas( model, current_strategy, mode=mode, inputs=inputs, targets=targets) else: distributed_training_utils._build_distributed_network( model, current_strategy, mode, inputs, targets) (grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args ) = current_strategy.extended.call_for_each_replica( _per_device_eval_function, args=(distributed_training_utils.get_distributed_model( model, ModeKeys.TEST), )) (all_inputs, all_outputs, all_updates, all_session_args) = distributed_training_utils.unwrap_values( current_strategy, grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args) combined_fn = K.function(all_inputs, all_outputs, updates=all_updates, name='distributed_test_function', **all_session_args) for label, output in zip(model.metrics_names, combined_fn.outputs): if label == 'loss': reduce_op = ds_reduce_util.ReduceOp.SUM else: # We reduce all other metrics using mean for now. This is temporary # workaround until new metrics are in place. reduce_op = ds_reduce_util.ReduceOp.MEAN ctx.set_last_step_output(label, output, reduce_op) return combined_fn.updates_op # Add initial dummy values for loss and other metric tensors. initial_loop_values = {} initial_loop_values['loss'] = constant_op.constant(1e7) for name in model.metrics_names[1:]: tensor = model._all_stateful_metrics_tensors[name] initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype) # TODO(priyag): Use steps_per_run when we use new metrics as they will # allow handling metric computation at each step using variables. ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=1, initial_loop_values=initial_loop_values) test_op = ctx.run_op output_tensors = ctx.last_step_outputs if verbose == 1: progbar = Progbar(target=steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model( model, mode) distributed_training_utils._reset_metrics(model) callbacks = cbks.configure_callbacks(callbacks, model, do_validation=False, epochs=1, steps_per_epoch=steps, verbose=verbose, count_mode='steps', mode=ModeKeys.TEST) callbacks._call_begin_hook(mode) outs = [0.] * len(model.metrics_names) if steps is not None: target_steps = steps else: target_steps = np.inf current_step = 0 while current_step < target_steps: batch_logs = {'batch': current_step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) try: _, batch_outs = K.get_session().run([test_op, output_tensors]) except errors.OutOfRangeError: if steps is not None: warning_msg = 'Make sure that your dataset can generate at least ' '`steps` batches (in this case, {} batches).'.format(steps) else: warning_msg = 'Number of steps ran: {} steps'.format( current_step) logging.warning('Your dataset iterator ran out of data; ' 'interrupting evaluation. ' + warning_msg) target_steps = current_step break for i, label in enumerate(model.metrics_names): if i == 0: # Loss is stateless metrics. outs[i] += batch_outs[label] else: # For all stateful metrics, the aggregation is handled by mirrored vars. outs[i] = batch_outs[label] batch_logs = cbks.make_logs(model, batch_logs, outs, mode) callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) if verbose >= 1: progbar.update(current_step + 1) current_step += 1 callbacks._call_end_hook(mode) scope.__exit__(None, None, None) if len(outs) >= 0: outs[0] /= (target_steps) if len(outs) == 1: return outs[0] return outs
def model_iteration(model, data, steps_per_epoch=None, epochs=1, verbose=1, callbacks=None, validation_data=None, validation_steps=None, validation_freq=1, train_class_weight=None, val_class_weight=None, max_queue_size=10, workers=1, use_multiprocessing=False, shuffle=False, initial_epoch=0, mode=ModeKeys.TRAIN, batch_size=None, steps_name='steps', **kwargs): if 'steps' in kwargs: steps_per_epoch = kwargs['steps'] # Determine the number of steps per epoch and whether we should reset the # dataset at the end of each epoch. reset_dataset_after_each_epoch = False original_dataset = None is_dataset = isinstance(data, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)) if is_dataset: original_dataset = data if steps_per_epoch is None: reset_dataset_after_each_epoch = True steps_per_epoch = training_utils.infer_steps_for_dataset( data, steps_per_epoch, epochs=epochs, steps_name=steps_name) # Convert to a format that supports `next(generator)`. generator, steps_per_epoch = convert_to_generator_like( data, steps_per_epoch=steps_per_epoch, batch_size=batch_size, epochs=epochs - initial_epoch, shuffle=shuffle) do_validation = validation_data is not None is_sequence = isinstance(generator, data_utils.Sequence) _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers, steps_per_epoch, validation_data, validation_steps, mode, kwargs) # print(train_class_weight, 'before make execution') ###################################################################### batch_function = _make_execution_function( model, mode, train_class_weight=train_class_weight, val_class_weight=val_class_weight) ###################################################################### # Create the queue for the generator. enqueuer = None if not is_dataset: generator, enqueuer = _make_enqueued_generator( generator, workers=workers, use_multiprocessing=use_multiprocessing, max_queue_size=max_queue_size, shuffle=shuffle) num_samples_or_steps, use_steps = _get_num_samples_or_steps( data, steps_per_epoch) count_mode = 'steps' if use_steps else 'samples' callbacks = cbks.configure_callbacks(callbacks, model, do_validation=do_validation, epochs=epochs, steps_per_epoch=steps_per_epoch, batch_size=batch_size, samples=num_samples_or_steps, verbose=verbose, count_mode=count_mode, mode=mode) if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator(True, steps=steps_per_epoch) else: aggregator = training_utils.MetricsAggregator(True, steps=steps_per_epoch) should_set_learning_phase = context.executing_eagerly( ) and model.run_eagerly if should_set_learning_phase: learning_phase_scope = backend.eager_learning_phase_scope( 1 if mode == ModeKeys.TRAIN else 0) learning_phase_scope.__enter__() callbacks.model.stop_training = False callbacks._call_begin_hook(mode) print(initial_epoch, mode) # TODO: mode is a bug? # https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/python/keras/engine/training.py initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch) for epoch in range(initial_epoch, epochs): if callbacks.model.stop_training: break # Setup work for each epoch. model.reset_metrics() epoch_logs = {} if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) if steps_per_epoch is None: # Loop over dataset until `OutOfRangeError` is raised. target_steps = np.inf else: # Loop over dataset for the specified number of steps. target_steps = steps_per_epoch step = 0 while step < target_steps: batch_data = _get_next_batch(generator) if batch_data is None: if is_dataset: # The dataset passed by the user ran out of batches. # Now we know the cardinality of the dataset. # If steps_per_epoch was specified, then running out of data is # unexpected, so we stop training and inform the user. if steps_per_epoch: callbacks.model.stop_training = True logging.warning( 'Your dataset ran out of data; interrupting training. ' 'Make sure that your dataset can generate at least ' '`%s * epochs` batches (in this case, %d batches). ' 'You may need to use the repeat() function when ' 'building your dataset.' % (steps_name, steps_per_epoch * epochs)) elif step > 0: steps_per_epoch = step aggregator.steps = steps_per_epoch else: # We ran out of batches while the user passed an iterator (legacy). callbacks.model.stop_training = True logging.warning( 'Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your iterator ' 'can generate at least `%s * epochs` ' 'batches (in this case, %d batches). You may need to' 'use the repeat() function when building your ' 'dataset.' % (steps_name, steps_per_epoch * epochs)) break # `batch_size` used for validation data if validation # data is NumPy/EagerTensors. batch_size = int(nest.flatten(batch_data)[0].shape[0]) # Callbacks batch begin. batch_logs = {'batch': step, 'size': batch_size} callbacks._call_batch_hook(mode, 'begin', step, batch_logs) is_deferred = not model._is_compiled ###################################################### batch_outs = batch_function(*batch_data) ###################################################### if not isinstance(batch_outs, list): batch_outs = [batch_outs] if step == 0: aggregator.create(batch_outs) if is_deferred: # Set callbacks params. We do this here when model is compiled only # in the first iteration of this loop (deferred build scenario). cbks.set_callback_parameters( callbacks, model, do_validation=do_validation, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=num_samples_or_steps, verbose=verbose, mode=mode) # Aggregate results. aggregator.aggregate(batch_outs) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', step, batch_logs) step += 1 if callbacks.model.stop_training: break aggregator.finalize() results = aggregator.results epoch_logs = cbks.make_logs(model, epoch_logs, results, mode) if len(results) == 1: results = results[0] # Run the test loop every epoch during training. if (do_validation and training_utils.should_run_validation( validation_freq, epoch) and not callbacks.model.stop_training): ############################################################################ val_results = model_iteration( model, validation_data, steps_per_epoch=validation_steps, batch_size=batch_size, val_class_weight=val_class_weight, ######## HACK workers=workers, use_multiprocessing=use_multiprocessing, max_queue_size=max_queue_size, callbacks=callbacks, verbose=0, mode=ModeKeys.TEST, steps_name='validation_steps') ############################################################################ if not isinstance(val_results, list): val_results = [val_results] epoch_logs = cbks.make_logs(model, epoch_logs, val_results, mode, prefix='val_') if mode == ModeKeys.TRAIN: # Epochs only apply to `fit`. callbacks.on_epoch_end(epoch, epoch_logs) # Recreate dataset iterator for the next epoch. if reset_dataset_after_each_epoch and epoch < epochs - 1: generator = dataset_ops.make_one_shot_iterator(original_dataset) callbacks._call_end_hook(mode) if enqueuer is not None: enqueuer.stop() if should_set_learning_phase: learning_phase_scope.__exit__(None, None, None) if mode == ModeKeys.TRAIN: return model.history return results
def fit( self, model, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0., validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1, max_queue_size=10, workers=1, use_multiprocessing=False, **kwargs): batch_size = model._validate_or_infer_batch_size( batch_size, steps_per_epoch, x) strategy = _get_distribution_strategy(model) batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size( strategy, x, batch_size, steps_per_epoch, ModeKeys.TRAIN, validation_split=validation_split) dist_utils.validate_callbacks(input_callbacks=callbacks, optimizer=model.optimizer) # Enter tf.distribute.Strategy scope. with strategy.scope(): training_data_adapter, validation_adapter = _process_training_inputs( model, x, y, batch_size=batch_size, epochs=epochs, sample_weights=sample_weight, class_weights=class_weight, validation_split=validation_split, steps_per_epoch=steps_per_epoch, shuffle=shuffle, validation_data=validation_data, validation_steps=validation_steps, distribution_strategy=strategy, max_queue_size=max_queue_size, workers=workers, use_multiprocessing=use_multiprocessing) total_samples = _get_total_number_of_samples(training_data_adapter) use_sample = total_samples is not None do_validation = (validation_adapter is not None) recreate_training_iterator = ( training_data_adapter.should_recreate_iterator(steps_per_epoch)) if not steps_per_epoch: # TODO(b/139762795): Add step inference for when steps is None to # prevent end of sequence warning message. steps_per_epoch = training_data_adapter.get_size() # tf.print('{} on {} steps.'.format(ModeKeys.TRAIN, steps_per_epoch)) training_context = TrainingContext() training_dataset = training_data_adapter.get_dataset() # Raise an error if steps_per_epoch isn't specified but the dataset # is infinite. # TODO(scottzhu): This check should probably happen in the adapter inferred_steps = training_utils.infer_steps_for_dataset( model, training_dataset, steps_per_epoch, steps_name='steps_per_epoch', epochs=0) steps_per_epoch = ( inferred_steps if steps_per_epoch is None else steps_per_epoch) training_dataset = strategy.experimental_distribute_dataset( training_dataset) training_function = training_v2_utils._get_or_make_execution_function( model, ModeKeys.TRAIN) training_data_iter = None if do_validation: validation_dataset = validation_adapter.get_dataset() if not validation_steps: # Raise an error if validation_steps isn't specified but the # validation dataset is infinite. validation_steps = ( validation_adapter.get_size() or training_utils.infer_steps_for_dataset( model, validation_dataset, validation_steps, steps_name='validation_steps')) eval_function = training_v2_utils._get_or_make_execution_function( model, ModeKeys.TEST) eval_data_iter = None validation_dataset = strategy.experimental_distribute_dataset( validation_dataset) val_total_samples = _get_total_number_of_samples(validation_adapter) else: val_total_samples = None if verbose and (total_samples or steps_per_epoch): _print_train_info(total_samples, steps_per_epoch, val_total_samples, validation_steps) training_callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=total_samples or steps_per_epoch, count_mode='samples' if use_sample else 'steps', verbose=0, # Handle ProgBarLogger separately in this loop. mode=ModeKeys.TRAIN) with training_context.on_start(model, training_callbacks, use_sample, verbose, ModeKeys.TRAIN): initial_epoch = model._maybe_load_initial_epoch_from_ckpt( initial_epoch, ModeKeys.TRAIN) for epoch in range(initial_epoch, epochs): if training_context.callbacks.model.stop_training: break # Training with training_context.on_epoch(epoch, ModeKeys.TRAIN) as epoch_logs: model.reset_metrics() if training_data_iter is None or recreate_training_iterator: if (training_data_iter is not None and distribution_strategy_context.has_strategy()): # TODO(kaftan): remove this when MultiDeviceIterator is a ## compositetensor (unless this is more efficient) training_data_iter._initializer # pylint: disable=pointless-statement else: training_data_iter = iter(training_dataset) training_result = run_one_epoch( model, training_data_iter, training_function, dataset_size=training_data_adapter.get_size(), batch_size=training_data_adapter.batch_size(), strategy=strategy, steps_per_epoch=steps_per_epoch, num_samples=total_samples, mode=ModeKeys.TRAIN, training_context=training_context, total_epochs=epochs) cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN) # In the case of steps_per_epoch = None, the final cardinality will # be determined when the inputs are fully consumed (eg dataset or # generator). Update the steps_per_epoch to the new value. if (steps_per_epoch is None and training_context.progbar.progbar.target is not None): steps_per_epoch = training_context.progbar.progbar.target # Evaluation if (do_validation and training_utils.should_run_validation(validation_freq, epoch) and not training_callbacks.model.stop_training): if (eval_data_iter is not None and distribution_strategy_context.has_strategy()): # TODO(kaftan): remove this when MultiDeviceIterator is a ## compositetensor (unless this is more efficient) eval_data_iter._initializer # pylint: disable=pointless-statement else: eval_data_iter = iter(validation_dataset) validation_callbacks = cbks.configure_callbacks( training_callbacks, model, batch_size=batch_size, epochs=1, steps_per_epoch=validation_steps, samples=val_total_samples or validation_steps, count_mode='samples' if use_sample else 'steps', verbose=0, # Handle ProgBarLogger separately in this loop. mode=ModeKeys.TEST) eval_context = TrainingContext() with eval_context.on_start( model, validation_callbacks, use_sample, verbose=0, mode=ModeKeys.TEST): with eval_context.on_epoch(epoch, ModeKeys.TEST): model.reset_metrics() eval_result = run_one_epoch( model, eval_data_iter, eval_function, dataset_size=validation_adapter.get_size(), batch_size=validation_adapter.batch_size(), strategy=strategy, steps_per_epoch=validation_steps, num_samples=val_total_samples, mode=ModeKeys.TEST, training_context=eval_context, total_epochs=1) cbks.make_logs(model, epoch_logs, eval_result, ModeKeys.TEST, prefix='val_') return model.history
def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None, callbacks=None): """Predict loop for predicting with TPU DistributionStrategy. Arguments: model: Keras Model instance. dataset: Dataset for input data. verbose: Integer, Verbosity mode 0 or 1. steps: Total number of steps (batches of samples) before declaring `_predict_loop` finished. Ignored with the default value of `None`. callbacks: List of callbacks to be called during training Returns: Array of predictions (if the model has a single output) or list of arrays of predictions (if the model has multiple outputs). """ mode = ModeKeys.PREDICT steps = training_utils.infer_steps_for_dataset(dataset, steps, steps_name='steps') dataset_fully_shaped = ( distributed_training_utils.is_dataset_shape_fully_defined(dataset)) padding_handler = None if not dataset_fully_shaped: # TODO(hongjunchoi): Investigate whether operations from # PartialBatchPaddingHandler are unnecessarily pruned out # during graph optimization. padding_handler = padding_util.PartialBatchPaddingHandler( model._feed_output_shapes) batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes( dataset) padding_handler.padded_batch_size = batch_size padding_handler.padding_mask = dataset.reduce( padding_handler.padding_mask, padding_handler.update_mask) dataset = dataset.map(padding_handler.pad_batch) dataset = dataset.apply(batching.unbatch()) # Upon this point, it is guaranteed that the dataset does not # have partial batches. Thus, we set `drop_remainder=True` to # get static shape information about the elements in the dataset. dataset = dataset.batch(batch_size, drop_remainder=True) if prefetch_buffer is not None: dataset = dataset.prefetch(prefetch_buffer) current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=0) scope.__enter__() def _per_device_predict_function(model): model._make_predict_function() return (model.predict_function.inputs, model.predict_function.outputs, model.predict_function.updates_op, model.predict_function.session_kwargs) def step_fn(ctx, inputs): """Clones the model and calls make_predict_function.""" if model._compile_distribution: distributed_training_utils.clone_model_on_replicas( model, current_strategy, mode, inputs=inputs) else: distributed_training_utils._build_distributed_network( model, current_strategy, mode, inputs) (grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args ) = current_strategy.extended.call_for_each_replica( _per_device_predict_function, args=(distributed_training_utils.get_distributed_model( model, ModeKeys.PREDICT), )) (all_inputs, all_outputs, all_updates, all_session_args) = distributed_training_utils.unwrap_values( current_strategy, grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args) combined_fn = K.function(all_inputs, all_outputs, updates=all_updates, name='distributed_predict_function', **all_session_args) for label, output in zip(model.output_names, combined_fn.outputs): ctx.set_last_step_output(label, output) return combined_fn.updates_op # Add initial dummy values for outputs. initial_loop_values = {} batch_dimension = distributed_training_utils.get_batch_dimension(iterator) for name, tensor in zip(model.output_names, model.outputs): # TODO(priyag): This is a workaround as we do not know the batch dimension # of the model's output at this point. shape = tensor_shape.TensorShape(tensor.shape.dims) shape.dims = [batch_dimension] + shape.dims[1:] initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype) # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed. ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=1, initial_loop_values=initial_loop_values) predict_op = ctx.run_op output_tensors = ctx.last_step_outputs if verbose == 1: progbar = Progbar(target=steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model( model, mode) distributed_training_utils._reset_metrics(model) callbacks = cbks.configure_callbacks(callbacks, model, do_validation=False, epochs=1, steps_per_epoch=steps, verbose=verbose, count_mode='steps', mode=mode) callbacks._call_begin_hook(mode) # Since we do not know how many samples we will see, we cannot pre-allocate # the returned Numpy arrays. Instead, we store one array per batch seen # and concatenate them upon returning. unconcatenated_outs = [[] for _ in model.outputs] if steps is not None: target_steps = steps else: target_steps = np.inf current_step = 0 while current_step < target_steps: batch_logs = {'batch': current_step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) try: _, batch_outs = K.get_session().run([predict_op, output_tensors]) except errors.OutOfRangeError: if steps is not None: warning_msg = 'Make sure that your dataset can generate at least ' '`steps` batches (in this case, {} batches).'.format(steps) else: warning_msg = 'Number of steps ran: {} steps'.format( current_step) logging.warning('Your dataset iterator ran out of data; ' 'interrupting evaluation. ' + warning_msg) break # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy. for i, label in enumerate(model.output_names): unconcatenated_outs[i].extend(batch_outs[label]) batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) if verbose >= 1: progbar.update(current_step + 1) current_step += 1 callbacks._call_end_hook(mode) scope.__exit__(None, None, None) if len(unconcatenated_outs) == 1: prediction_result = np.concatenate(unconcatenated_outs[0], axis=0) else: prediction_result = [ np.concatenate(unconcatenated_outs[i], axis=0) for i in range(len(unconcatenated_outs)) ] if padding_handler: prediction_result = padding_handler.apply_mask(prediction_result) return prediction_result
def model_iteration(model, data, steps_per_epoch=None, epochs=1, verbose=1, callbacks=None, validation_data=None, validation_steps=None, validation_freq=1, class_weight=None, max_queue_size=10, workers=1, use_multiprocessing=False, shuffle=False, initial_epoch=0, mode=ModeKeys.TRAIN, batch_size=None, steps_name='steps', **kwargs): """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. Arguments: model: Keras Model instance. data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or `(x, y, sample_weights)`) or a generator or `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. epochs: Number of times to iterate over the data. verbose: Verbosity mode, 0, 1 or 2. callbacks: List of callbacks to be called during training. validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or `(x, y, sample_weights)`) or a generator or `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. validation_steps: Total number of steps (batches of samples) before declaring validation finished. validation_freq: Only relevant if validation data is provided. Integer or `collections.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. class_weight: Dictionary mapping class indices to a weight for the class. max_queue_size: Integer. Maximum size for the generator queue. If unspecified, `max_queue_size` will default to 10. workers: Integer. Maximum number of processes to spin up when using process-based threading. If unspecified, `workers` will default to 1. If 0, will execute the generator on the main thread. use_multiprocessing: Boolean. If `True`, use process-based threading. If unspecified, `use_multiprocessing` will default to `False`. Note that because this implementation relies on multiprocessing, you should not pass non-picklable arguments to the generator as they can't be passed easily to children processes. shuffle: Boolean. Whether to shuffle the order of the batches at the beginning of each epoch. Only used with instances of `Sequence` (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not `None`. initial_epoch: Epoch at which to start training (useful for resuming a previous training run). mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. batch_size: Integer batch size or None if unknown. Will only be used if `data` is in NumPy/Tensor format. steps_name: The string name of the steps argument, either `steps`, `validation_steps`, or `steps_per_epoch`. Only used for error message formatting. **kwargs: Additional arguments for backwards compatibility. `steps` is accepted as an alias for `steps_per_epoch`. Returns: - In TRAIN mode: `History` object. - In TEST mode: Evaluation metrics. - In PREDICT mode: Outputs of the Model called on inputs. Raises: ValueError: in case of invalid arguments. """ if 'steps' in kwargs: steps_per_epoch = kwargs['steps'] # Determine the number of steps per epoch and whether we should reset the # dataset at the end of each epoch. reset_dataset_after_each_epoch = False original_dataset = None is_dataset = isinstance(data, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)) if is_dataset: original_dataset = data if steps_per_epoch is None: reset_dataset_after_each_epoch = True steps_per_epoch = training_utils.infer_steps_for_dataset( data, steps_per_epoch, epochs=epochs, steps_name=steps_name) # Convert to a format that supports `next(generator)`. generator, steps_per_epoch = convert_to_generator_like( data, steps_per_epoch=steps_per_epoch, batch_size=batch_size, epochs=epochs - initial_epoch, shuffle=shuffle) do_validation = validation_data is not None is_sequence = isinstance(generator, data_utils.Sequence) _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers, steps_per_epoch, validation_data, validation_steps, mode, kwargs) batch_function = _make_execution_function( model, mode, class_weight=class_weight) # Create the queue for the generator. enqueuer = None if not is_dataset: generator, enqueuer = _make_enqueued_generator( generator, workers=workers, use_multiprocessing=use_multiprocessing, max_queue_size=max_queue_size, shuffle=shuffle) num_samples_or_steps, use_steps = _get_num_samples_or_steps( data, steps_per_epoch) count_mode = 'steps' if use_steps else 'samples' callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, epochs=epochs, steps_per_epoch=steps_per_epoch, batch_size=batch_size, samples=num_samples_or_steps, verbose=0, # Handle ProgBar as part of Callbacks once hooks are ready. mode=mode) # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. progbar = training_utils.get_progbar(model, count_mode) progbar.params = callbacks.params progbar.params['verbose'] = verbose if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator(True, steps_per_epoch) else: aggregator = training_utils.MetricsAggregator(True, steps_per_epoch) should_set_learning_phase = context.executing_eagerly() and model.run_eagerly if should_set_learning_phase: old_learning_phase = backend.learning_phase() backend.set_eager_learning_phase(1 if mode == ModeKeys.TRAIN else 0) callbacks.model.stop_training = False callbacks._call_begin_hook(mode) progbar.on_train_begin() initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) for epoch in range(initial_epoch, epochs): if callbacks.model.stop_training: break # Setup work for each epoch. model.reset_metrics() epoch_logs = {} if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) progbar.on_epoch_begin(epoch, epoch_logs) if steps_per_epoch is None: # Loop over dataset until `OutOfRangeError` is raised. target_steps = np.inf else: # Loop over dataset for the specified number of steps. target_steps = steps_per_epoch step = 0 while step < target_steps: batch_data = _get_next_batch(generator, mode) if batch_data is None: if is_dataset: # The dataset passed by the user ran out of batches. # Now we know the cardinality of the dataset. # If steps_per_epoch was specified, then running out of data is # unexpected, so we stop training and inform the user. if steps_per_epoch: callbacks.model.stop_training = True logging.warning( 'Your dataset ran out of data; interrupting training. ' 'Make sure that your dataset can generate at least ' '`%s * epochs` batches (in this case, %d batches). ' 'You may need to use the repeat() function when ' 'building your dataset.' % (steps_name, steps_per_epoch * epochs)) elif step > 0: steps_per_epoch = step aggregator.num_samples_or_steps = steps_per_epoch if mode == ModeKeys.TRAIN: progbar.params['steps'] = steps_per_epoch progbar.progbar.target = steps_per_epoch else: # We ran out of batches while the user passed an iterator (legacy). callbacks.model.stop_training = True logging.warning( 'Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your iterator ' 'can generate at least `%s * epochs` ' 'batches (in this case, %d batches). You may need to' 'use the repeat() function when building your ' 'dataset.' % (steps_name, steps_per_epoch * epochs)) break # `batch_size` used for validation data if validation # data is NumPy/EagerTensors. batch_size = int(nest.flatten(batch_data)[0].shape[0]) # Callbacks batch begin. batch_logs = {'batch': step, 'size': batch_size} callbacks._call_batch_hook(mode, 'begin', step, batch_logs) progbar.on_batch_begin(step, batch_logs) is_deferred = not model._is_compiled batch_outs = batch_function(*batch_data) if not isinstance(batch_outs, list): batch_outs = [batch_outs] if step == 0: aggregator.create(batch_outs) if is_deferred: # Set callbacks params. We do this here when model is compiled only # in the first iteration of this loop (deferred build scenario). cbks.set_callback_parameters( callbacks, model, do_validation=do_validation, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=num_samples_or_steps, verbose=verbose, mode=mode) progbar.params = callbacks.params progbar.params['verbose'] = verbose # Aggregate results. aggregator.aggregate(batch_outs) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', step, batch_logs) progbar.on_batch_end(step, batch_logs) step += 1 if callbacks.model.stop_training: break aggregator.finalize() results = aggregator.results epoch_logs = cbks.make_logs(model, epoch_logs, results, mode) if len(results) == 1: results = results[0] # Run the test loop every epoch during training. if (do_validation and training_utils.should_run_validation(validation_freq, epoch) and not callbacks.model.stop_training): val_results = model_iteration( model, validation_data, steps_per_epoch=validation_steps, batch_size=batch_size, class_weight=class_weight, workers=workers, use_multiprocessing=use_multiprocessing, max_queue_size=max_queue_size, callbacks=callbacks, verbose=0, mode=ModeKeys.TEST, steps_name='validation_steps') if not isinstance(val_results, list): val_results = [val_results] epoch_logs = cbks.make_logs( model, epoch_logs, val_results, mode, prefix='val_') if mode == ModeKeys.TRAIN: # Epochs only apply to `fit`. callbacks.on_epoch_end(epoch, epoch_logs) progbar.on_epoch_end(epoch, epoch_logs) # Recreate dataset iterator for the next epoch. if reset_dataset_after_each_epoch and epoch < epochs - 1: generator = dataset_ops.make_one_shot_iterator(original_dataset) callbacks._call_end_hook(mode) if enqueuer is not None: enqueuer.stop() if should_set_learning_phase: backend.set_eager_learning_phase(old_learning_phase) if mode == ModeKeys.TRAIN: return model.history return results
def experimental_tpu_test_loop(model, dataset, verbose=0, steps=None, callbacks=None): """Test loop for evaluating with TPU DistributionStrategy. Arguments: model: Keras Model instance. dataset: Dataset for input data. verbose: Integer, Verbosity mode 0 or 1. steps: Total number of steps (batches of samples) before declaring predictions finished. Ignored with the default value of `None`. callbacks: List of callbacks to be called during training Returns: Scalar loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute `model.metrics_names` will give you the display labels for the outputs. """ mode = ModeKeys.TEST current_strategy = model._distribution_strategy iterator = distributed_training_utils.get_iterator(dataset, current_strategy) steps = training_utils.infer_steps_for_dataset(dataset, steps, steps_name='steps') scope = distributed_training_utils.distributed_scope( strategy=current_strategy, learning_phase=0) scope.__enter__() out_labels = model.metrics_names def _test_step_fn(inputs): """A fn that returns output of single test step.""" inputs, targets = inputs (distribution_strategy_context.get_replica_context().merge_call( _build_model, args=(model, mode, inputs, targets))) (_, outputs, updates, _) = ( _per_device_execution_function( distributed_training_utils.get_distributed_model(model, mode), mode)) with ops.control_dependencies([updates]): return outputs test_input_data = iterator.get_next() per_replica_outputs = current_strategy.experimental_run_v2( _test_step_fn, args=(test_input_data,)) output_tensors = {} for label, output in zip(out_labels, per_replica_outputs): if label == 'loss': reduce_op = ds_reduce_util.ReduceOp.SUM else: # We reduce all other metrics using mean for now. This is temporary # workaround until new metrics are in place. reduce_op = ds_reduce_util.ReduceOp.MEAN output_tensors[label] = current_strategy.reduce(reduce_op, output) test_op = control_flow_ops.group(list(output_tensors.values())) if verbose >= 1: progbar = Progbar(target=steps) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) distributed_training_utils._reset_metrics(model) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=False, epochs=1, steps_per_epoch=steps, verbose=verbose, count_mode='steps', mode=ModeKeys.TEST) callbacks._call_begin_hook(mode) outs = [0.] * len(model.metrics_names) if steps is not None: target_steps = steps else: raise ValueError('Number of steps could not be infered from the data, ' 'please pass the steps argument.') current_step = 0 while current_step < target_steps: batch_logs = {'batch': current_step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) try: _, batch_outs = K.batch_get_value([test_op, output_tensors]) except errors.OutOfRangeError: warning_msg = 'Make sure that your dataset can generate at least ' '`steps` batches (in this case, {} batches).'.format(steps) logging.warning('Your dataset iterator ran out of data; ' 'interrupting evaluation. ' + warning_msg) target_steps = current_step break for i, label in enumerate(model.metrics_names): if i == 0: # Loss is stateless metrics. outs[i] += batch_outs[label] else: # For all stateful metrics, the aggregation is handled by mirrored vars. outs[i] = batch_outs[label] batch_logs = cbks.make_logs(model, batch_logs, outs, mode) callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) if verbose == 1: progbar.update(current_step + 1) current_step += 1 if verbose >= 1: # Progress bar finishes at the end. progbar.update(target_steps) callbacks._call_end_hook(mode) scope.__exit__(None, None, None) if len(outs) >= 0: outs[0] /= (target_steps) if len(outs) == 1: return outs[0] return outs