def _read_config_files(self, run_paths):
  configs = {}
  config_fpaths = {}
  for run_name, logdir in run_paths.items():
    config_fpath = os.path.join(logdir, PROJECTOR_FILENAME)
    if not file_io.file_exists(config_fpath):
      # Skip runs that have no config file.
      continue
    # Read the config file.
    file_content = file_io.read_file_to_string(config_fpath).decode('utf-8')
    config = ProjectorConfig()
    text_format.Merge(file_content, config)
    if not config.model_checkpoint_path:
      # See if you can find a checkpoint file in the logdir.
      ckpt_path = latest_checkpoint(logdir)
      if not ckpt_path:
        # Or in the parent of logdir.
        ckpt_path = latest_checkpoint(os.path.join(logdir, os.pardir))
        if not ckpt_path:
          logging.warning('Cannot find model checkpoint in %s', logdir)
          continue
      config.model_checkpoint_path = ckpt_path
    # Sanity check for the checkpoint file.
    if not file_io.file_exists(config.model_checkpoint_path):
      logging.warning('Checkpoint file %s not found',
                      config.model_checkpoint_path)
      continue
    configs[run_name] = config
    config_fpaths[run_name] = config_fpath
  return configs, config_fpaths
def testRecoverSession(self):
  # Create a checkpoint.
  checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
  try:
    gfile.DeleteRecursively(checkpoint_dir)
  except errors.OpError:
    pass  # Ignore if the directory does not exist.
  gfile.MakeDirs(checkpoint_dir)
  with ops.Graph().as_default():
    v = variables.Variable(1, name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    saver = saver_lib.Saver({"v": v})
    sess, initialized = sm.recover_session(
        "", saver=saver, checkpoint_dir=checkpoint_dir)
    self.assertFalse(initialized)
    sess.run(v.initializer)
    self.assertEqual(1, sess.run(v))
    saver.save(sess,
               os.path.join(checkpoint_dir, "recover_session_checkpoint"))
  self._test_recovered_variable(checkpoint_dir=checkpoint_dir)
  self._test_recovered_variable(
      checkpoint_filename_with_path=saver_lib.latest_checkpoint(
          checkpoint_dir))
  # Cannot set both checkpoint_dir and checkpoint_filename_with_path.
  with self.assertRaises(ValueError):
    self._test_recovered_variable(
        checkpoint_dir=checkpoint_dir,
        checkpoint_filename_with_path=saver_lib.latest_checkpoint(
            checkpoint_dir))
def _find_latest_checkpoint(dir_path):
  try:
    ckpt_path = latest_checkpoint(dir_path)
    if not ckpt_path:
      # Check the parent directory.
      ckpt_path = latest_checkpoint(os.path.join(dir_path, os.pardir))
    return ckpt_path
  except errors.NotFoundError:
    return None
def testEvalOpAndFinalOp(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(), 'eval_ops_and_final_ops')

  # Train a model for a single step to get a checkpoint.
  self._train_model(checkpoint_dir, num_steps=1)
  checkpoint_path = saver.latest_checkpoint(checkpoint_dir)

  # Create the model so we have something to restore.
  inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
  logistic_classifier(inputs)

  num_evals = 5
  final_increment = 9.0

  my_var = local_variable(0.0, name='MyVar')
  eval_ops = state_ops.assign_add(my_var, 1.0)
  final_ops = array_ops.identity(my_var) + final_increment

  final_hooks = [evaluation._StopAfterNEvalsHook(num_evals)]
  initial_hooks = list(final_hooks)
  final_ops_values = evaluation._evaluate_once(
      checkpoint_path=checkpoint_path,
      eval_ops=eval_ops,
      final_ops={'value': final_ops},
      hooks=final_hooks)
  self.assertEqual(final_ops_values['value'], num_evals + final_increment)
  self.assertEqual(initial_hooks, final_hooks)
def testEvaluateWithFiniteInputs(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(),
                                'evaluate_with_finite_inputs')

  # Train a model to completion:
  self._train_model(checkpoint_dir, num_steps=300)

  # Run evaluation. Inputs are fed through an input producer for one epoch.
  all_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
  all_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

  single_input, single_label = training.slice_input_producer(
      [all_inputs, all_labels], num_epochs=1)
  inputs, labels = training.batch([single_input, single_label], batch_size=6,
                                  allow_smaller_final_batch=True)

  logits = logistic_classifier(inputs)
  predictions = math_ops.round(logits)

  accuracy, update_op = metrics.accuracy(
      predictions=predictions, labels=labels)

  checkpoint_path = saver.latest_checkpoint(checkpoint_dir)

  final_ops_values = evaluation._evaluate_once(
      checkpoint_path=checkpoint_path,
      eval_ops=update_op,
      final_ops={'accuracy': accuracy,
                 'eval_steps': evaluation._get_or_create_eval_step()},
      hooks=[evaluation._StopAfterNEvalsHook(None)])
  self.assertTrue(final_ops_values['accuracy'] > .99)
  # Runs evaluation for 4 iterations. The first 2 evaluate a full batch of
  # 6 inputs each; the 3rd iteration evaluates the remaining 4 inputs, and
  # the last one triggers an error which stops evaluation.
  self.assertEqual(final_ops_values['eval_steps'], 4)
def graph_def_from_checkpoint(checkpoint_dir, output_node_names):
  """Converts checkpoint data to GraphDef.

  Reads the latest checkpoint data and produces a GraphDef in which the
  variables have been converted to constants.

  Args:
    checkpoint_dir: Path to the checkpoints.
    output_node_names: List of name strings for the result nodes of the
      graph.

  Returns:
    A GraphDef from the latest checkpoint.

  Raises:
    ValueError: if no checkpoint is found.
  """
  checkpoint_path = saver_lib.latest_checkpoint(checkpoint_dir)
  if checkpoint_path is None:
    raise ValueError('Could not find a checkpoint at: {0}.'
                     .format(checkpoint_dir))

  saver_for_restore = saver_lib.import_meta_graph(
      checkpoint_path + '.meta', clear_devices=True)
  with session.Session() as sess:
    saver_for_restore.restore(sess, checkpoint_path)
    graph_def = ops.get_default_graph().as_graph_def()
    output_graph_def = graph_util.convert_variables_to_constants(
        sess, graph_def, output_node_names)

  return output_graph_def
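# A minimal usage sketch for graph_def_from_checkpoint above. Hedged: the
# checkpoint directory '/tmp/model' and the output node name 'logits' are
# hypothetical placeholders, not values from this codebase; the frozen
# GraphDef is a protobuf, so it serializes with SerializeToString().
def _freeze_graph_example():
  frozen_graph_def = graph_def_from_checkpoint('/tmp/model', ['logits'])
  with open('/tmp/model/frozen_graph.pb', 'wb') as f:
    f.write(frozen_graph_def.SerializeToString())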
def export_fn(estimator, export_dir_base, checkpoint_path, eval_result=None):
  """Exports the given Estimator as a SavedModel.

  Args:
    estimator: the Estimator to export.
    export_dir_base: A string containing a directory to write the exported
      graph and checkpoints.
    checkpoint_path: The checkpoint path to export. If None (the default),
      the most recent checkpoint found within the model directory is chosen.
    eval_result: placeholder arg matching the call signature of
      ExportStrategy.

  Returns:
    The string path to the exported directory.
  """
  if not checkpoint_path:
    # TODO(b/67425018): switch to
    #    checkpoint_path = estimator.latest_checkpoint()
    # as soon as contrib is cleaned up and we can thus be sure that
    # estimator is a tf.estimator.Estimator and not a
    # tf.contrib.learn.Estimator
    checkpoint_path = saver.latest_checkpoint(estimator.model_dir)
  export_checkpoint_path, export_eval_result = best_model_selector.update(
      checkpoint_path, eval_result)

  if export_checkpoint_path and export_eval_result is not None:
    checkpoint_base = os.path.basename(export_checkpoint_path)
    export_dir = os.path.join(export_dir_base, checkpoint_base)
    return best_model_export_strategy.export(
        estimator, export_dir, export_checkpoint_path, export_eval_result)
  else:
    return ''
def _infer_model(self, input_fn, feed_fn=None, outputs=None,
                 as_iterable=False):
  # Check that the model has been trained.
  checkpoint_path = saver.latest_checkpoint(self._model_dir)
  if not checkpoint_path:
    raise NotFittedError("Couldn't find trained model at %s." %
                         self._model_dir)

  with ops.Graph().as_default() as g:
    random_seed.set_random_seed(self._config.tf_random_seed)
    contrib_framework.create_global_step(g)
    features = self._get_features_from_input_fn(input_fn)
    predictions = self._get_predict_ops(features)
    # If predictions is a single output, wrap it into a dict and remember
    # to unwrap it before returning.
    return_dict = isinstance(predictions, dict)
    if not return_dict:
      predictions = {'predictions': predictions}

    # Filter what to run predictions on, if outputs provided.
    if outputs:
      existing_keys = predictions.keys()
      predictions = {
          key: value
          for key, value in predictions.items() if key in outputs
      }
      if not predictions:
        raise ValueError('Expected to run at least one output from %s, '
                         'provided %s.' % (existing_keys, outputs))

    if as_iterable:
      return self._infer_model_as_iterable(
          checkpoint_path, predictions, feed_fn, return_dict)
    else:
      return self._infer_model_single(
          checkpoint_path, predictions, feed_fn, return_dict)
def test_recovery(self):
  logdir = _test_dir(self.get_temp_dir(), 'test_recovery')
  with ops.Graph().as_default():
    gstep = variables_lib.get_or_create_global_step()
    do_step = state_ops.assign_add(gstep, 1)
    scaffold = monitored_session.Scaffold()
    # Use a hook to save the model every step. It also saves it at the end.
    hooks = [
        basic_session_run_hooks.CheckpointSaverHook(
            logdir, save_steps=1, scaffold=scaffold)
    ]
    with monitored_session.MonitoredSession(
        session_creator=monitored_session.ChiefSessionCreator(
            scaffold, checkpoint_dir=logdir),
        hooks=hooks) as session:
      self.assertEqual(0, session.run(gstep))
      self.assertEqual(1, session.run(do_step))
      self.assertEqual(2, session.run(do_step))
    # A restart will find the checkpoint and recover automatically.
    with monitored_session.MonitoredSession(
        session_creator=monitored_session.ChiefSessionCreator(
            scaffold, checkpoint_dir=logdir)) as session:
      self.assertEqual(2, session.run(gstep))
    # A restart will find the checkpoint and recover automatically.
    with monitored_session.MonitoredSession(
        session_creator=monitored_session.ChiefSessionCreator(
            scaffold,
            checkpoint_filename_with_path=saver_lib.latest_checkpoint(
                logdir))) as session:
      self.assertEqual(2, session.run(gstep))
def export_estimator(estimator,
                     export_dir,
                     input_fn=_default_input_fn,
                     signature_fn=_generic_signature_fn,
                     default_batch_size=1,
                     exports_to_keep=None):
  """Exports inference graph into given dir.

  Args:
    estimator: Estimator to export.
    export_dir: A string containing a directory to write the exported graph
      and checkpoints.
    input_fn: Function that given `Tensor` of `Example` strings, parses it
      into features that are then passed to the model.
    signature_fn: Function that given `Tensor` of `Example` strings, `dict`
      of `Tensor`s for features and `dict` of `Tensor`s for predictions,
      returns default and named exporting signatures.
    default_batch_size: Default batch size of the `Example` placeholder.
    exports_to_keep: Number of exports to keep.
  """
  checkpoint_path = tf_saver.latest_checkpoint(estimator._model_dir)
  with ops.Graph().as_default() as g:
    contrib_variables.create_global_step(g)
    examples = array_ops.placeholder(dtype=dtypes.string,
                                     shape=[default_batch_size],
                                     name='input_example_tensor')
    features = input_fn(estimator, examples)
    predictions = estimator._get_predict_ops(features)
    default_signature, named_graph_signatures = signature_fn(
        examples, features, predictions)
    if exports_to_keep is not None:
      exports_to_keep = gc.largest_export_versions(exports_to_keep)
    _export_graph(g, _get_saver(), checkpoint_path, export_dir,
                  default_graph_signature=default_signature,
                  named_graph_signatures=named_graph_signatures,
                  exports_to_keep=exports_to_keep)
def create_session(self, checkpoint_dir):
  """Creates a MonitoredSession for this predictor."""
  checkpoint_path = saver.latest_checkpoint(checkpoint_dir)
  return training.MonitoredSession(
      session_creator=training.ChiefSessionCreator(
          checkpoint_filename_with_path=checkpoint_path,
          config=self._session_config()))
def every_n_step_end(self, step, outputs):
  super(ValidationMonitor, self).every_n_step_end(step, outputs)
  # TODO(mdan): The use of step below is probably misleading.
  # The code should probably use the step from the checkpoint, because
  # that's what is being evaluated.
  if self._estimator is None:
    raise ValueError("Missing call to set_estimator.")
  current_time = time.time()
  if (self._check_interval_secs is not None and
      self._last_checkpoint_check_time is not None and
      current_time - self._last_checkpoint_check_time <=
      self._check_interval_secs):
    logging.debug(
        "Skipping evaluation since less than %d seconds have passed since "
        "last check for a new checkpoint.", self._check_interval_secs)
    return False
  self._last_checkpoint_check_time = current_time

  # Check that we are not running evaluation on the same checkpoint.
  latest_path = saver_lib.latest_checkpoint(self._estimator.model_dir)
  if latest_path is None:
    logging.debug("Skipping evaluation since model has not been saved yet "
                  "at step %d.", step)
    return False
  if latest_path == self._latest_path:
    logging.debug("Skipping evaluation due to same checkpoint %s for step "
                  "%d as for step %d.", latest_path, step,
                  self._latest_path_step)
    return False
  self._latest_path = latest_path
  self._latest_path_step = step

  # Run evaluation and log it.
  validation_outputs = self._evaluate_estimator()
  stats = []
  for name in validation_outputs:
    stats.append("%s = %s" % (name, str(validation_outputs[name])))
  logging.info("Validation (step %d): %s", step, ", ".join(stats))

  # Early stopping logic.
  if self.early_stopping_rounds is not None:
    if self.early_stopping_metric not in validation_outputs:
      raise ValueError("Metric %s missing from outputs %s." %
                       (self.early_stopping_metric,
                        set(validation_outputs.keys())))
    current_value = validation_outputs[self.early_stopping_metric]
    if (self._best_value is None or
        (self.early_stopping_metric_minimize and
         current_value < self._best_value) or
        (not self.early_stopping_metric_minimize and
         current_value > self._best_value)):
      self._best_value = current_value
      self._best_metrics = copy.deepcopy(validation_outputs)
      self._best_value_step = step
    stop_now = (step - self._best_value_step >= self.early_stopping_rounds)
    if stop_now:
      logging.info("Stopping. Best step: {} with {} = {}.".format(
          self._best_value_step, self.early_stopping_metric,
          self._best_value))
      self._early_stopped = True
      return True
  return False
def testMultiEvalStepIncrements(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(), 'eval_ops_and_final_ops')

  # Train a model for a single step to get a checkpoint.
  self._train_model(checkpoint_dir, num_steps=1)
  checkpoint_path = saver.latest_checkpoint(checkpoint_dir)

  # Create the model so we have something to restore.
  inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
  logistic_classifier(inputs)

  num_evals = 6

  my_var = local_variable(0.0, name='MyVar')
  # In eval_ops we also increment the eval step one extra time per run, so
  # the hook sees two increments per evaluation and my_var is only updated
  # num_evals // 2 times before evaluation stops.
  eval_ops = [
      state_ops.assign_add(my_var, 1.0),
      state_ops.assign_add(
          evaluation._get_or_create_eval_step(), 1, use_locking=True)
  ]
  expect_eval_update_counts = num_evals // 2
  final_ops = array_ops.identity(my_var)

  final_ops_values = evaluation._evaluate_once(
      checkpoint_path=checkpoint_path,
      eval_ops=eval_ops,
      final_ops={'value': final_ops},
      hooks=[evaluation._StopAfterNEvalsHook(num_evals)])
  self.assertEqual(final_ops_values['value'], expect_eval_update_counts)
def testUsageGraph(self):
  """Expected usage when graph building."""
  with context.graph_mode():
    num_training_steps = 10
    checkpoint_directory = self.get_temp_dir()
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
    for training_continuation in range(3):
      with ops.Graph().as_default():
        network = MyNetwork()
        optimizer = adam.AdamOptimizer(0.001)
        root = checkpointable_utils.Checkpoint(
            optimizer=optimizer, network=network,
            global_step=training_util.get_or_create_global_step())
        input_value = constant_op.constant([[3.]])
        train_op = optimizer.minimize(
            network(input_value),
            global_step=root.global_step)
        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
        with self.test_session(graph=ops.get_default_graph()) as session:
          status = root.restore(save_path=checkpoint_path)
          status.initialize_or_restore(session=session)
          if checkpoint_path is None:
            self.assertEqual(0, training_continuation)
            with self.assertRaises(AssertionError):
              status.assert_consumed()
          else:
            status.assert_consumed()
          for _ in range(num_training_steps):
            session.run(train_op)
          root.save(file_prefix=checkpoint_prefix, session=session)
          self.assertEqual((training_continuation + 1) * num_training_steps,
                           session.run(root.global_step))
          self.assertEqual(training_continuation + 1,
                           session.run(root.save_counter))
def testAgnosticUsage(self):
  """Graph/eager agnostic usage."""
  # Does create garbage when executing eagerly due to ops.Graph() creation.
  num_training_steps = 10
  checkpoint_directory = self.get_temp_dir()
  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
  for training_continuation in range(3):
    with ops.Graph().as_default(), self.test_session(
        graph=ops.get_default_graph()):
      network = MyNetwork()
      optimizer = adam.AdamOptimizer(0.001)
      root = checkpointable_utils.Checkpoint(
          optimizer=optimizer, network=network,
          global_step=training_util.get_or_create_global_step())
      checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
      status = root.restore(save_path=checkpoint_path)
      input_value = constant_op.constant([[3.]])
      train_fn = functools.partial(
          optimizer.minimize,
          functools.partial(network, input_value),
          global_step=root.global_step)
      if context.in_graph_mode():
        train_fn = functools.partial(self.evaluate, train_fn())
      status.initialize_or_restore()
      for _ in range(num_training_steps):
        train_fn()
      root.save(file_prefix=checkpoint_prefix)
      self.assertEqual((training_continuation + 1) * num_training_steps,
                       self.evaluate(root.global_step))
      self.assertEqual(training_continuation + 1,
                       self.evaluate(root.save_counter))
def _infer_model(self, x=None, input_fn=None, feed_fn=None, batch_size=None):
  # Converts inputs into tf.DataFrame / tf.Series.
  batch_size = -1 if batch_size is None else batch_size
  if x is not None:
    input_fn, feed_fn = _get_predict_input_fn(x, None, batch_size)

  checkpoint_path = saver.latest_checkpoint(self._model_dir)
  with ops.Graph().as_default() as g:
    random_seed.set_random_seed(self._config.tf_random_seed)
    contrib_framework.create_global_step(g)
    features = self._get_features_from_input_fn(input_fn)
    predictions = self._get_predict_ops(features)
    return_dict = True
    if not isinstance(predictions, dict):
      predictions, return_dict = {'predictions': predictions}, False

    if feed_fn is None:
      preds = infer(checkpoint_path, predictions)
    else:
      preds = {}

      def _feed_fn():
        while True:
          yield feed_fn()

      outputs = graph_actions.run_feeds(
          output_dict=predictions,
          feed_dicts=_feed_fn(),
          restore_checkpoint_path=checkpoint_path)
      for key in predictions:
        preds[key] = np.concatenate(
            [output[key] for output in outputs], axis=0)

    if return_dict:
      return preds
    return preds['predictions']
def create_session(checkpoint_path, n_cpu_threads=-1):
  """Creates a MonitoredSession.

  Args:
    checkpoint_path (string): Path either to checkpoint directory or
      directly to a checkpoint file.
    n_cpu_threads (int): Number of CPU threads. If negative, we assume
      either GPU decoding or that all CPU cores can be used.

  Returns:
    A TensorFlow MonitoredSession.
  """
  try:
    if os.path.isdir(checkpoint_path):
      checkpoint_path = saver.latest_checkpoint(checkpoint_path)
    else:
      logging.info("%s is not a directory. Interpreting as direct "
                   "path to checkpoint..." % checkpoint_path)
    return training.MonitoredSession(
        session_creator=training.ChiefSessionCreator(
            checkpoint_filename_with_path=checkpoint_path,
            config=session_config(n_cpu_threads)))
  except tf.errors.NotFoundError:
    logging.fatal("Could not find all variables of the computation "
                  "graph in the T2T checkpoint file. This means that the "
                  "checkpoint does not correspond to the model specified in "
                  "SGNMT. Please double-check pred_src_vocab_size, "
                  "pred_trg_vocab_size, and all the t2t_* parameters. "
                  "Also make sure that the checkpoint exists and is "
                  "readable.")
    raise AttributeError("Could not initialize TF session.")
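# A usage sketch for create_session above. Hedged: '/tmp/t2t_model' is a
# hypothetical checkpoint directory, and `fetch_op` stands in for whatever
# decoder output tensor the caller built before creating the session;
# MonitoredSession supports the context-manager protocol, so `with` closes
# the session cleanly.
def _run_decoder_example(fetch_op):
  sess = create_session('/tmp/t2t_model', n_cpu_threads=4)
  with sess:
    return sess.run(fetch_op)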
def _evaluate_model(self, input_fn, steps, feed_fn=None, metrics=None,
                    name=''):
  if self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset'):
    return

  checkpoint_path = saver.latest_checkpoint(self._model_dir)
  eval_dir = os.path.join(self._model_dir,
                          'eval' if not name else 'eval_' + name)
  with ops.Graph().as_default() as g:
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step = contrib_framework.create_global_step(g)
    features, targets = input_fn()
    self._check_inputs(features, targets)
    eval_dict = self._get_eval_ops(
        features, targets,
        metrics if metrics is not None
        else self._get_default_metric_functions())
    update_op, eval_dict = self._extract_metric_update_ops(eval_dict)
    eval_results, _ = evaluate(graph=g,
                               output_dir=eval_dir,
                               checkpoint_path=checkpoint_path,
                               eval_dict=eval_dict,
                               update_op=update_op,
                               global_step_tensor=global_step,
                               supervisor_master=self._config.master,
                               feed_fn=feed_fn,
                               max_steps=steps)
    return eval_results
def _infer_model(self, x=None, input_fn=None, feed_fn=None, batch_size=None,
                 axis=None, proba=False):
  # Converts inputs into tf.DataFrame / tf.Series.
  batch_size = -1 if batch_size is None else batch_size
  if x is not None:
    input_fn, feed_fn = _get_predict_input_fn(x, batch_size)

  checkpoint_path = saver.latest_checkpoint(self._model_dir)
  with ops.Graph().as_default() as g:
    random_seed.set_random_seed(self._config.tf_random_seed)
    contrib_framework.create_global_step(g)
    features, _ = input_fn()
    predictions = self._get_predict_ops(features)
    if not isinstance(predictions, dict):
      predictions = {'predictions': predictions}
    # TODO(ipolosukhin): Support batching
    if feed_fn is None:
      return infer(checkpoint_path, predictions)
    preds = {}
    while True:
      try:
        feed_dict = feed_fn()
      except StopIteration:
        break
      if feed_dict is None:
        break
      outputs = infer(checkpoint_path, predictions, feed_dict=feed_dict)
      for key in outputs:
        if key not in preds:
          preds[key] = []
        preds[key].append(outputs[key])
    for key in preds:
      preds[key] = np.concatenate(preds[key], axis=0)
    return preds
def testDeferredRestorationUsageEager(self):
  """An idiomatic eager execution example."""
  num_training_steps = 10
  checkpoint_directory = self.get_temp_dir()
  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
  latest_object_graph = None  # Will be saved with the checkpoint eventually.
  for training_continuation in range(3):
    with ops.Graph().as_default():
      network = MyNetwork()
      optimizer = CheckpointableAdam(0.001)
      root = Root(optimizer=optimizer, network=network)
      checkpointable.restore(
          save_path=core_saver.latest_checkpoint(checkpoint_directory),
          root_checkpointable=root,
          object_graph_proto=latest_object_graph)
      for _ in range(num_training_steps):
        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
        input_value = constant_op.constant([[3.]])
        optimizer.minimize(
            lambda: network(input_value),  # pylint: disable=cell-var-from-loop
            global_step=root.global_step)
      latest_object_graph, _ = checkpointable.save(
          file_prefix=checkpoint_prefix,
          root_checkpointable=root)
      self.assertEqual((training_continuation + 1) * num_training_steps,
                       root.global_step.numpy())
def _restore_or_save_initial_ckpt(self, session):
  # Ideally this should be run in after_create_session but is not for the
  # following reason:
  # Currently there is no way of enforcing an order of running the
  # `SessionRunHooks`. Hence it is possible that the
  # `_DatasetInitializerHook` is run *after* this hook. That is troublesome
  # because
  # 1. If a checkpoint exists and this hook restores it, the initializer
  #    hook will override it.
  # 2. If no checkpoint exists, this hook will try to save an uninitialized
  #    iterator, which will result in an exception.
  #
  # As a temporary fix we enter the following implicit contract between this
  # hook and the _DatasetInitializerHook.
  # 1. The _DatasetInitializerHook initializes the iterator in the call to
  #    after_create_session.
  # 2. This hook saves the iterator on the first call to `before_run()`,
  #    which is guaranteed to happen after `after_create_session()` of all
  #    hooks have been run.

  # Check if there is an existing checkpoint. If so, restore from it.
  # pylint: disable=protected-access
  latest_checkpoint_path = saver_lib.latest_checkpoint(
      self._checkpoint_saver_hook._checkpoint_dir,
      latest_filename=self._latest_filename)
  if latest_checkpoint_path:
    self._checkpoint_saver_hook._get_saver().restore(
        session, latest_checkpoint_path)
  else:
    # The checkpoint saved here is the state at step "global_step".
    # Note: We do not save the GraphDef or MetaGraphDef here.
    global_step = session.run(
        self._checkpoint_saver_hook._global_step_tensor)
    self._checkpoint_saver_hook._save(session, global_step)
    self._checkpoint_saver_hook._timer.update_last_triggered_step(
        global_step)
def predict(self, input_fn, predict_keys=None, hooks=None,
            checkpoint_path=None):
  """Returns predictions for given features.

  Args:
    input_fn: Input function returning features which is a dictionary of
      string feature name to `Tensor` or `SparseTensor`. If it returns a
      tuple, the first item is extracted as features. Prediction continues
      until `input_fn` raises an end-of-input exception (`OutOfRangeError`
      or `StopIteration`).
    predict_keys: list of `str`, names of the keys to predict. It is used
      if the `EstimatorSpec.predictions` is a `dict`. If `predict_keys` is
      used, the rest of the predictions will be filtered from the
      dictionary. If `None`, returns all.
    hooks: List of `SessionRunHook` subclass instances. Used for callbacks
      inside the prediction call.
    checkpoint_path: Path of a specific checkpoint to predict. If `None`,
      the latest checkpoint in `model_dir` is used.

  Yields:
    Evaluated values of `predictions` tensors.

  Raises:
    ValueError: Could not find a trained model in model_dir.
    ValueError: if the batch length of predictions is not the same.
    ValueError: If there is a conflict between `predict_keys` and
      `predictions`. For example if `predict_keys` is not `None` but
      `EstimatorSpec.predictions` is not a `dict`.
  """
  hooks = _check_hooks_type(hooks)
  # Check that the model has been trained.
  if not checkpoint_path:
    checkpoint_path = saver.latest_checkpoint(self._model_dir)
  if not checkpoint_path:
    raise ValueError('Could not find trained model in model_dir: {}.'.format(
        self._model_dir))

  with ops.Graph().as_default() as g:
    random_seed.set_random_seed(self._config.tf_random_seed)
    training.create_global_step(g)
    features = self._get_features_from_input_fn(input_fn)
    estimator_spec = self._call_model_fn(features, None,
                                         model_fn_lib.ModeKeys.PREDICT)
    predictions = self._extract_keys(estimator_spec.predictions,
                                     predict_keys)
    with training.MonitoredSession(
        session_creator=training.ChiefSessionCreator(
            checkpoint_filename_with_path=checkpoint_path,
            scaffold=estimator_spec.scaffold,
            config=config_pb2.ConfigProto(allow_soft_placement=True)),
        hooks=hooks) as mon_sess:
      while not mon_sess.should_stop():
        preds_evaluated = mon_sess.run(predictions)
        if not isinstance(predictions, dict):
          for pred in preds_evaluated:
            yield pred
        else:
          for i in range(self._extract_batch_length(preds_evaluated)):
            yield {
                key: value[i]
                for key, value in six.iteritems(preds_evaluated)
            }
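# A usage sketch for predict above. Hedged: `est` and `my_input_fn` are
# hypothetical (a trained tf.estimator.Estimator and a prediction input_fn);
# predict() returns a generator, so results stream until input_fn raises an
# end-of-input exception.
def _collect_predictions(est, my_input_fn):
  return [pred for pred in est.predict(input_fn=my_input_fn)]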
def _save_first_checkpoint(keras_model, estimator, custom_objects,
                           keras_weights):
  """Saves the first checkpoint for the keras Estimator.

  Args:
    keras_model: an instance of compiled keras model.
    estimator: keras estimator.
    custom_objects: Dictionary for custom objects.
    keras_weights: A flat list of Numpy arrays for weights of given
      keras_model.
  """
  # Load weights and save to checkpoint if there is no checkpoint yet.
  latest_path = saver_lib.latest_checkpoint(estimator.model_dir)
  if not latest_path:
    with ops.Graph().as_default():
      random_seed.set_random_seed(estimator.config.tf_random_seed)
      training_util.create_global_step()
      model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN,
                                     keras_model, custom_objects)
      # Save to checkpoint.
      with session.Session(config=estimator._session_config) as sess:
        if keras_weights:
          model.set_weights(keras_weights)
        # Make update ops and initialize all variables.
        if not model.train_function:
          # pylint: disable=protected-access
          model._make_train_function()
          K._initialize_variables(sess)
          # pylint: enable=protected-access
        saver = saver_lib.Saver()
        saver.save(sess,
                   os.path.join(estimator.model_dir, 'keras_model.ckpt'))
def wait_for_new_checkpoint(checkpoint_dir,
                            last_checkpoint,
                            seconds_to_sleep=1,
                            timeout=None):
  """Waits until a new checkpoint file is found.

  Args:
    checkpoint_dir: The directory in which checkpoints are saved.
    last_checkpoint: The last checkpoint path used.
    seconds_to_sleep: The number of seconds to sleep for before looking for
      a new checkpoint.
    timeout: The maximum amount of time to wait. If left as `None`, then
      the process will wait indefinitely.

  Returns:
    A new checkpoint path, or None if the timeout was reached.
  """
  logging.info('Waiting for new checkpoint at %s', checkpoint_dir)
  stop_time = time.time() + timeout if timeout is not None else None
  while True:
    checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir)
    if checkpoint_path is None or checkpoint_path == last_checkpoint:
      if stop_time is not None and time.time() + seconds_to_sleep > stop_time:
        return None
      time.sleep(seconds_to_sleep)
    else:
      logging.info('Found new checkpoint at %s', checkpoint_path)
      return checkpoint_path
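# A continuous-evaluation loop built on wait_for_new_checkpoint above.
# Hedged: `evaluate_checkpoint` is a hypothetical callback and '/tmp/train'
# a hypothetical training directory; the loop exits once a 10-minute wait
# times out with no new checkpoint, which we take to mean training finished.
def _continuous_eval(evaluate_checkpoint, checkpoint_dir='/tmp/train'):
  last = None
  while True:
    last = wait_for_new_checkpoint(checkpoint_dir, last, timeout=600)
    if last is None:
      break  # Timed out: assume training has finished.
    evaluate_checkpoint(last)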
def latest_checkpoint(self):
  """Finds the filename of the latest saved checkpoint file in `model_dir`.

  Returns:
    The full path to the latest checkpoint or `None` if no checkpoint was
    found.
  """
  return saver.latest_checkpoint(self.model_dir)
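# A usage sketch for Estimator.latest_checkpoint above. Hedged: `est` is a
# hypothetical estimator instance; the None check mirrors the documented
# "no checkpoint was found" case.
def _report_latest(est):
  ckpt = est.latest_checkpoint()
  return 'no checkpoint yet' if ckpt is None else ckpt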
def _evaluate_model(self, input_fn, hooks=None, checkpoint_path=None,
                    name=''):
  """Evaluates the model using the training.evaluation library."""
  # Check that the model has been trained (if nothing has been set
  # explicitly).
  if not checkpoint_path:
    latest_path = saver.latest_checkpoint(self._model_dir)
    if not latest_path:
      raise ValueError('Could not find trained model in model_dir: {}.'
                       .format(self._model_dir))
    checkpoint_path = latest_path

  # Setup output directory.
  eval_dir = os.path.join(self._model_dir,
                          'eval' if not name else 'eval_' + name)

  with ops.Graph().as_default() as g:
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step_tensor = self._create_and_assert_global_step(g)
    features, labels = self._get_features_and_labels_from_input_fn(
        input_fn, model_fn_lib.ModeKeys.EVAL)
    estimator_spec = self._call_model_fn(
        features, labels, model_fn_lib.ModeKeys.EVAL)

    if model_fn_lib.LOSS_METRIC_KEY in estimator_spec.eval_metric_ops:
      raise ValueError(
          'Metric with name "%s" is not allowed, because Estimator ' %
          (model_fn_lib.LOSS_METRIC_KEY) +
          'already defines a default metric with the same name.')
    estimator_spec.eval_metric_ops[
        model_fn_lib.LOSS_METRIC_KEY] = metrics_lib.mean(estimator_spec.loss)

    update_op, eval_dict = _extract_metric_update_ops(
        estimator_spec.eval_metric_ops)

    if ops.GraphKeys.GLOBAL_STEP in eval_dict:
      raise ValueError(
          'Metric with name `global_step` is not allowed, because Estimator '
          'already defines a default metric with the same name.')
    eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor

    eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
        checkpoint_path=checkpoint_path,
        master=self._config.evaluation_master,
        scaffold=estimator_spec.scaffold,
        eval_ops=update_op,
        final_ops=eval_dict,
        hooks=hooks,
        config=self._session_config)

    _write_dict_to_summary(
        output_dir=eval_dir,
        dictionary=eval_results,
        current_global_step=eval_results[ops.GraphKeys.GLOBAL_STEP])

  return eval_results
def _export_estimator(estimator, export_dir, signature_fn, input_fn,
                      default_batch_size, exports_to_keep):
  input_fn = input_fn or _default_input_fn
  checkpoint_path = tf_saver.latest_checkpoint(estimator._model_dir)
  with ops.Graph().as_default() as g:
    contrib_variables.create_global_step(g)
    examples = array_ops.placeholder(dtype=dtypes.string,
                                     shape=[default_batch_size],
                                     name='input_example_tensor')
    features = input_fn(estimator, examples)
    predictions = estimator._get_predict_ops(features)

    # An explicit signature_fn takes priority.
    if signature_fn:
      default_signature, named_graph_signatures = signature_fn(
          examples, features, predictions)
    else:
      try:
        # Some estimators provide a target_column of known type.
        target_column = estimator._get_target_column()
        problem_type = target_column.problem_type

        if problem_type == layers.ProblemType.CLASSIFICATION:
          signature_fn = classification_signature_fn
        elif problem_type == layers.ProblemType.LINEAR_REGRESSION:
          signature_fn = regression_signature_fn
        elif problem_type == layers.ProblemType.LOGISTIC_REGRESSION:
          signature_fn = logistic_regression_signature_fn
        else:
          raise ValueError(
              'signature_fn must be provided because the TargetColumn is a '
              '%s, which does not have a standard problem type and so '
              'cannot use a standard export signature.' %
              type(target_column).__name__)

        default_signature, named_graph_signatures = (
            signature_fn(examples, features, predictions))
      except AttributeError:
        logging.warn(
            'Change warning: `signature_fn` will be required after '
            '2016-08-01.\n'
            'Using generic signatures for now. To maintain this behavior, '
            'pass:\n'
            '  signature_fn=export.generic_signature_fn\n'
            'Also consider passing a regression or classification signature; '
            'see cl/126430915 for an example.')
        default_signature, named_graph_signatures = generic_signature_fn(
            examples, features, predictions)

    if exports_to_keep is not None:
      exports_to_keep = gc.largest_export_versions(exports_to_keep)

    _export_graph(g, _get_saver(), checkpoint_path, export_dir,
                  default_graph_signature=default_signature,
                  named_graph_signatures=named_graph_signatures,
                  exports_to_keep=exports_to_keep)
def _read_vars(self, model_dir):
  """Returns (global_step, latest_feature)."""
  with ops.Graph().as_default() as g:
    ckpt_path = saver_lib.latest_checkpoint(model_dir)
    meta_filename = ckpt_path + '.meta'
    saver_lib.import_meta_graph(meta_filename)
    saver = saver_lib.Saver()
    with self.test_session(graph=g) as sess:
      saver.restore(sess, ckpt_path)
      return sess.run(ops.get_collection('my_vars'))
def _latest_checkpoints_changed(configs, run_path_pairs):
  """Returns true if the latest checkpoint has changed in any of the runs."""
  for run_name, logdir in run_path_pairs:
    if run_name not in configs:
      continue
    config = configs[run_name]
    if not config.model_checkpoint_path:
      continue
    # See if you can find a checkpoint file in the logdir.
    ckpt_path = latest_checkpoint(logdir)
    if not ckpt_path:
      # See if you can find a checkpoint in the parent of logdir.
      ckpt_path = latest_checkpoint(os.path.join(logdir, os.pardir))
      if not ckpt_path:
        continue
    if config.model_checkpoint_path != ckpt_path:
      return True
  return False
def _read_config_files(self, run_paths, logdir):
  # If there are no summary event files, the projector can still work,
  # thus treating the `logdir` as the model checkpoint directory.
  if not run_paths:
    run_paths['.'] = logdir

  configs = {}
  config_fpaths = {}
  for run_name, run_logdir in run_paths.items():
    config = ProjectorConfig()
    config_fpath = os.path.join(run_logdir, PROJECTOR_FILENAME)
    if file_io.file_exists(config_fpath):
      file_content = file_io.read_file_to_string(config_fpath).decode('utf-8')
      text_format.Merge(file_content, config)

    has_tensor_files = False
    for embedding in config.embeddings:
      if embedding.tensor_path:
        has_tensor_files = True
        break

    if not config.model_checkpoint_path:
      # See if you can find a checkpoint file in the logdir.
      ckpt_path = latest_checkpoint(run_logdir)
      if not ckpt_path:
        # Or in the parent of logdir.
        ckpt_path = latest_checkpoint(os.path.join(run_logdir, os.pardir))
      if not ckpt_path and not has_tensor_files:
        logging.warning('Cannot find model checkpoint in %s', run_logdir)
        continue
      if ckpt_path:
        config.model_checkpoint_path = ckpt_path

    # Sanity check for the checkpoint file.
    if (config.model_checkpoint_path and
        not checkpoint_exists(config.model_checkpoint_path)):
      logging.warning('Checkpoint file %s not found',
                      config.model_checkpoint_path)
      continue
    configs[run_name] = config
    config_fpaths[run_name] = config_fpath
  return configs, config_fpaths
def end(self, session=None):
  super(ExportMonitor, self).end(session=session)
  latest_path = saver_lib.latest_checkpoint(self._estimator.model_dir)
  if latest_path is None:
    logging.info("Skipping export at the end since model has not been "
                 "saved yet.")
    return
  try:
    self._last_export_dir = self._estimator.export(
        self.export_dir,
        exports_to_keep=self.exports_to_keep,
        signature_fn=self.signature_fn,
        input_fn=self._input_fn,
        default_batch_size=self._default_batch_size,
        input_feature_key=self._input_feature_key,
        use_deprecated_input_fn=self._use_deprecated_input_fn)
  except RuntimeError:
    logging.info("Skipping exporting for the same step.")
def _infer_model(self, x=None, input_fn=None, feed_fn=None, batch_size=None):
  # Converts inputs into tf.DataFrame / tf.Series.
  batch_size = -1 if batch_size is None else batch_size
  if x is not None:
    input_fn, feed_fn = _get_predict_input_fn(x, None, batch_size)

  checkpoint_path = saver.latest_checkpoint(self._model_dir)
  with ops.Graph().as_default() as g:
    random_seed.set_random_seed(self._config.tf_random_seed)
    contrib_framework.create_global_step(g)
    features = self._get_features_from_input_fn(input_fn)
    predictions = self._get_predict_ops(features)
    return_dict = True
    if not isinstance(predictions, dict):
      predictions, return_dict = {'predictions': predictions}, False

    if feed_fn is None:
      preds = infer(checkpoint_path, predictions)
    else:
      preds = {}
      while True:
        try:
          feed_dict = feed_fn()
        except StopIteration:
          break
        if feed_dict is None:
          break
        outputs = infer(checkpoint_path, predictions, feed_dict=feed_dict)
        for key in outputs:
          if key not in preds:
            preds[key] = []
          preds[key].append(outputs[key])
      for key in preds:
        preds[key] = np.concatenate(preds[key], axis=0)

    if return_dict:
      return preds
    return preds['predictions']
def inference_run(model, hparams, output_dir):
  # Build the model function.
  tf.logging.info("Build Model...")
  model_fn_inference = model_builder_inference(model, hparams=hparams)

  # Build the graph.
  tf.logging.info("Build Graph...")
  checkpoint_path = saver.latest_checkpoint(output_dir)
  if not checkpoint_path:
    raise NotFittedError("Couldn't find trained model at %s." % output_dir)

  with ops.Graph().as_default() as g:
    tf.train.create_global_step(g)
    inputs_ph = tf.placeholder(tf.int32, [None, None])  # [batch_size, length]
    features = {"inputs": inputs_ph}
    labels = None
    infer_ops = model_fn_inference(features, labels)  # (predictions, None, None)
    predictions = infer_ops[0]
    mon_sess = tf.train.MonitoredSession(
        session_creator=tf.train.ChiefSessionCreator(
            checkpoint_filename_with_path=checkpoint_path,
            config=session_config(
                gpu_mem_fraction=FLAGS.gpu_mem_fraction)))

  def predict_func(feed_fn=None):
    with ops.Graph().as_default():
      inputs = feed_fn["inputs"]
      feed = {inputs_ph: inputs}
      preds = mon_sess.run(predictions, feed)
      first_tensor = list(preds.values())[0]
      batch_length = first_tensor.shape[0]
      for i in range(batch_length):
        yield {key: value[i] for key, value in six.iteritems(preds)}

  tf.logging.info("Begin Decoding...")
  inference.decode_from_file(predict_func, hparams,
                             FLAGS.decode_from_file,
                             FLAGS.decode_to_file,
                             FLAGS.decode_batch_size,
                             FLAGS.decode_beam_size,
                             FLAGS.decode_return_beams)
def predict(output_path, separator=",", mode="w+"):
    print("Setting up inference subgraph")
    predict_input = tf.placeholder(dtype=tf.float32,
                                   shape=[None, WINDOW_SIZE, CHANNELS])
    batch_logits = inference(predict_input, is_training=False)
    predicted_probabilities = tf.nn.sigmoid(batch_logits)
    mean_prediction = tf.reduce_mean(predicted_probabilities)

    print("Restoring model from training with best validation accuracy")
    sess = tf.Session()
    saver = tf.train.Saver()
    checkpoint_file = latest_checkpoint(MODEL_DIR)
    print("Restoring the model from a checkpoint:\t%s" % checkpoint_file)
    saver.restore(sess, checkpoint_file)

    print("Predicting")
    with open(output_path, mode=mode) as file_stream:
        print("File", "Class", file=file_stream, sep=separator)
        for segment, file_name in generate_test_segment(DATA_ROOT, "test"):
            # `keep_prob` is assumed to be a module-level dropout
            # placeholder, like MODEL_DIR and WINDOW_SIZE above.
            predicted_probability = sess.run(
                mean_prediction,
                feed_dict={predict_input: segment, keep_prob: 1.})
            print(file_name, predicted_probability,
                  sep=separator, file=file_stream)
def create_session():
  """Creates a MonitoredSession for this predictor."""
  if not FLAGS.checkpoint_path:
    raise AttributeError("Please set --checkpoint_path")
  try:
    if os.path.isdir(FLAGS.checkpoint_path):
      checkpoint_path = saver.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path
      tf.logging.info("%s is not a directory. Interpreting as direct "
                      "path to checkpoint..." % checkpoint_path)
    return training.MonitoredSession(
        session_creator=training.ChiefSessionCreator(
            checkpoint_filename_with_path=checkpoint_path,
            config=session_config()))
  except tf.errors.NotFoundError:
    tf.logging.fatal("Could not find all variables of the computation "
                     "graph in the T2T checkpoint file. This means that "
                     "the checkpoint does not correspond to the specified "
                     "model.")
    raise AttributeError("Could not initialize TF session.")
def wait_for_new_checkpoint(checkpoint_dir, last_checkpoint,
                            seconds_to_sleep=1):
  """Waits until a new checkpoint file is found.

  Args:
    checkpoint_dir: The directory in which checkpoints are saved.
    last_checkpoint: The last checkpoint path used.
    seconds_to_sleep: The number of seconds to sleep for before looking for
      a new checkpoint.

  Returns:
    A new checkpoint path.
  """
  while True:
    checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir)
    if checkpoint_path == last_checkpoint:
      time.sleep(seconds_to_sleep)
    else:
      return checkpoint_path
def captcha2text(image_list, height=CAPTCHA_HEIGHT, width=CAPTCHA_WIDTH):
    if not isdir('./model'):
        print('Model directory does not exist.')
        return
    x = placeholder(float32, [None, height * width])
    keep_prob = placeholder(float32)
    y_conv = cnn_graph(x, keep_prob, (height, width))
    saver = Saver()
    with Session() as sess:
        saver.restore(sess, latest_checkpoint('./model/'))
        predict = argmax(
            reshape(y_conv, [-1, CAPTCHA_LEN, len(CAPTCHA_LIST)]), 2)
        vector_list = sess.run(
            predict, feed_dict={x: image_list, keep_prob: 1})
        vector_list = vector_list.tolist()
        text_list = [vec2text(vector) for vector in vector_list]
        return text_list
def every_n_step_end(self, step, unused_outputs):
  # Check that we are not running evaluation on the same checkpoint.
  latest_path = saver.latest_checkpoint(self._estimator.model_dir)
  if latest_path == self._latest_path:
    logging.info("Skipping evaluation due to same checkpoint %s for step %d "
                 "as for step %d.", latest_path, step,
                 self._latest_path_step)
    return False
  self._latest_path = latest_path
  self._latest_path_step = step

  # Run evaluation and log it.
  outputs = self._estimator.evaluate(x=self.x,
                                     y=self.y,
                                     input_fn=self.input_fn,
                                     batch_size=self.batch_size,
                                     metrics=self.metrics,
                                     name=self.name)
  stats = []
  for name in outputs:
    stats.append("%s = %s" % (name, str(outputs[name])))
  logging.info("Validation (step %d): %s", step, ", ".join(stats))

  # Early stopping logic.
  if self.early_stopping_rounds is not None:
    if (self._best_value is None or
        (self.early_stopping_metric_minimize and
         outputs[self.early_stopping_metric] < self._best_value) or
        (not self.early_stopping_metric_minimize and
         outputs[self.early_stopping_metric] > self._best_value)):
      self._best_value = outputs[self.early_stopping_metric]
      self._best_value_step = step
    stop_now = (step - self._best_value_step >= self.early_stopping_rounds)
    if stop_now:
      logging.info("Stopping. Best step: {} with {} = {}.".format(
          self._best_value_step, self.early_stopping_metric,
          self._best_value))
      self._early_stopped = True
      return True
  return False
def testUsageGraph(self):
  """Expected usage when graph building."""
  with context.graph_mode():
    num_training_steps = 10
    checkpoint_directory = self.get_temp_dir()
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
    latest_object_graph = None
    for training_continuation in range(3):
      with ops.Graph().as_default():
        network = MyNetwork()
        optimizer = CheckpointableAdam(0.001)
        root = Root(optimizer=optimizer, network=network)
        input_value = constant_op.constant([[3.]])
        train_op = optimizer.minimize(network(input_value),
                                      global_step=root.global_step)
        init_op = variables.global_variables_initializer()
        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
        with self.test_session(graph=ops.get_default_graph()) as session:
          if checkpoint_path is None:
            self.assertEqual(0, training_continuation)
            session.run(init_op)
            # Another alternative would be to run initializers automatically
            # if no checkpoint is being loaded. This would make deferred
            # loading a bit more useful with graph execution.
          else:
            checkpointable.restore(
                save_path=checkpoint_path,
                root_checkpointable=root,
                object_graph_proto=latest_object_graph,
                session=session)
          for _ in range(num_training_steps):
            session.run(train_op)
          latest_object_graph, _ = checkpointable.save(
              file_prefix=checkpoint_prefix,
              root_checkpointable=root,
              session=session)
          self.assertEqual((training_continuation + 1) * num_training_steps,
                           session.run(root.global_step))
def _evaluate_model(self, input_fn, steps, feed_fn=None, metrics=None,
                    name=''):
  # TODO(wicke): Remove this once Model and associated code are gone.
  if (hasattr(self._config, 'execution_mode') and
      self._config.execution_mode not in
      ('all', 'evaluate', 'eval_evalset')):
    return None, None

  # Check that the model has been trained.
  checkpoint_path = self._model_dir
  latest_path = saver.latest_checkpoint(checkpoint_path)
  if not latest_path:
    raise NotFittedError("Couldn't find trained model at %s." %
                         checkpoint_path)

  # Setup output directory.
  eval_dir = os.path.join(self._model_dir,
                          'eval' if not name else 'eval_' + name)

  with ops.Graph().as_default() as g:
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step = contrib_framework.create_global_step(g)
    features, targets = input_fn()
    self._check_inputs(features, targets)
    eval_dict = self._get_eval_ops(features, targets, metrics)
    update_op, eval_dict = self._extract_metric_update_ops(eval_dict)
    eval_results, current_global_step = graph_actions.evaluate(
        graph=g,
        output_dir=eval_dir,
        checkpoint_path=checkpoint_path,
        eval_dict=eval_dict,
        update_op=update_op,
        global_step_tensor=global_step,
        supervisor_master=self._config.master,
        feed_fn=feed_fn,
        max_steps=steps)

    return eval_results, current_global_step
def _infer_model(self, input_fn, feed_fn=None, outputs=None,
                 as_iterable=False):
  # Check that the model has been trained.
  checkpoint_path = saver.latest_checkpoint(self._model_dir)
  if not checkpoint_path:
    raise NotFittedError("Couldn't find trained model at %s." %
                         self._model_dir)

  with ops.Graph().as_default() as g:
    random_seed.set_random_seed(self._config.tf_random_seed)
    contrib_framework.create_global_step(g)
    features = self._get_features_from_input_fn(input_fn)
    predictions = self._get_predict_ops(features)
    # If predictions is a single output, wrap it into a dict and remember
    # to unwrap it before returning.
    return_dict = isinstance(predictions, dict)
    if not return_dict:
      predictions = {'predictions': predictions}

    # Filter what to run predictions on, if outputs provided.
    if outputs:
      existing_keys = predictions.keys()
      predictions = {
          key: value
          for key, value in predictions.items() if key in outputs
      }
      if not predictions:
        raise ValueError('Expected to run at least one output from %s, '
                         'provided %s.' % (existing_keys, outputs))

    if as_iterable:
      return self._infer_model_as_iterable(checkpoint_path, predictions,
                                           feed_fn, return_dict)
    else:
      return self._infer_model_single(checkpoint_path, predictions,
                                      feed_fn, return_dict)
def testDeferredRestorationUsageEager(self):
  """An idiomatic eager execution example."""
  num_training_steps = 10
  checkpoint_directory = self.get_temp_dir()
  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
  for training_continuation in range(3):
    network = MyNetwork()
    optimizer = adam.AdamOptimizer(0.001)
    root = checkpointable_utils.Checkpoint(
        optimizer=optimizer, network=network,
        optimizer_step=training_util.get_or_create_global_step())
    root.restore(core_saver.latest_checkpoint(checkpoint_directory))
    for _ in range(num_training_steps):
      # TODO(allenl): Use a Dataset and serialize/checkpoint it.
      input_value = constant_op.constant([[3.]])
      optimizer.minimize(
          lambda: network(input_value),  # pylint: disable=cell-var-from-loop
          global_step=root.optimizer_step)
    root.save(file_prefix=checkpoint_prefix)
    self.assertEqual((training_continuation + 1) * num_training_steps,
                     root.optimizer_step.numpy())
def create_session(self):
  """Creates a MonitoredSession for this predictor."""
  try:
    if os.path.isdir(self._checkpoint_dir):
      checkpoint_path = saver.latest_checkpoint(self._checkpoint_dir)
    else:
      checkpoint_path = self._checkpoint_dir
      logging.info("%s is not a directory. Interpreting as direct "
                   "path to checkpoint..." % checkpoint_path)
    return training.MonitoredSession(
        session_creator=training.ChiefSessionCreator(
            checkpoint_filename_with_path=checkpoint_path,
            config=self._session_config()))
  except tf.errors.NotFoundError:
    logging.fatal("Could not find all variables of the computation "
                  "graph in the T2T checkpoint file. This means that the "
                  "checkpoint does not correspond to the model specified in "
                  "SGNMT. Please double-check pred_src_vocab_size, "
                  "pred_trg_vocab_size, and all the t2t_* parameters.")
    raise AttributeError("Could not initialize TF session.")
def after_save(self, session, global_step_value):
  """Evaluates and exports the model after a checkpoint is created."""
  # Load and cache the path of the most recent checkpoint to avoid duplicate
  # searches on GCS.
  logging.info("Checking for checkpoint in %s", self._model_dir)
  latest_path = saver.latest_checkpoint(self._model_dir)
  if not latest_path:
    logging.warning("Skipping evaluation and export since model has not "
                    "been saved yet.")
  elif latest_path == self._latest_path:
    logging.warning("Skipping evaluation due to same latest checkpoint %s.",
                    latest_path)
  else:
    self._latest_path = latest_path
    self._eval_result = self._eval_fn(name="intermediate_export",
                                      checkpoint_path=latest_path)
    self._export_results = self._export_fn(self._eval_result,
                                           checkpoint_path=latest_path)
def testWithDefun(self):
  num_training_steps = 2
  checkpoint_directory = self.get_temp_dir()
  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
  for training_continuation in range(3):
    with ops.Graph().as_default(), self.test_session(
        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
      model = MyModel()
      # Don't actually train so we can test variable values.
      optimizer = adam.AdamOptimizer(0.)
      root = checkpointable_utils.Checkpoint(
          optimizer=optimizer, model=model,
          global_step=training_util.get_or_create_global_step())
      checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
      status = root.restore(save_path=checkpoint_path)

      def train_fn():

        @function.defun
        def _call_model(x):
          return model(x)

        with backprop.GradientTape() as tape:
          loss = _call_model(constant_op.constant([[3.]]))
        gradients = tape.gradient(loss, model.variables)
        return optimizer.apply_gradients(zip(gradients, model.variables),
                                         global_step=root.global_step)

      if not context.executing_eagerly():
        train_fn = functools.partial(self.evaluate, train_fn())
      status.initialize_or_restore()
      for _ in range(num_training_steps):
        train_fn()
      if training_continuation > 0:
        status.assert_consumed()
        self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
      else:
        self.evaluate(model.variables[0].assign([[42.]]))
      root.save(file_prefix=checkpoint_prefix)
      self.assertEqual((training_continuation + 1) * num_training_steps,
                       self.evaluate(root.global_step))
      self.assertEqual(training_continuation + 1,
                       self.evaluate(root.save_counter))
def _save_first_checkpoint(keras_model, custom_objects, config):
  """Save first checkpoint for the keras Estimator.

  Args:
    keras_model: an instance of compiled keras model.
    custom_objects: Dictionary for custom objects.
    config: Estimator config.

  Returns:
    The path where the keras model checkpoint is saved.
  """
  # Save the checkpoint into a subdirectory to allow warm start.
  keras_model_dir = os.path.join(config.model_dir, 'keras')
  # Load weights and save to checkpoint if there is no checkpoint yet.
  latest_path = saver_lib.latest_checkpoint(keras_model_dir)
  if not latest_path:
    keras_weights = None
    if _any_weight_initialized(keras_model):
      keras_weights = keras_model.get_weights()
    if not gfile.IsDirectory(keras_model_dir):
      gfile.MakeDirs(keras_model_dir)
    with ops.Graph().as_default():
      random_seed.set_random_seed(config.tf_random_seed)
      training_util.create_global_step()
      model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN,
                                     keras_model, custom_objects)
      # Save to checkpoint.
      with session.Session(config=config.session_config) as sess:
        if keras_weights:
          model.set_weights(keras_weights)
        # Make update ops and initialize all variables.
        if not model.train_function:
          # pylint: disable=protected-access
          model._make_train_function()
          K._initialize_variables(sess)
          # pylint: enable=protected-access
        saver = saver_lib.Saver()
        latest_path = os.path.join(keras_model_dir, 'keras_model.ckpt')
        saver.save(sess, latest_path)
  return latest_path
def testEvaluateWithFiniteInputs(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(),
                                'evaluate_with_finite_inputs')

  # Train a Model to completion:
  self._train_model(checkpoint_dir, num_steps=300)

  # Run evaluation. Inputs are fed through input producer for one epoch.
  all_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
  all_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

  single_input, single_label = training.slice_input_producer(
      [all_inputs, all_labels], num_epochs=1)
  inputs, labels = training.batch([single_input, single_label], batch_size=6,
                                  allow_smaller_final_batch=True)

  logits = logistic_classifier(inputs)
  predictions = math_ops.round(logits)

  accuracy, update_op = metrics.accuracy(predictions=predictions,
                                         labels=labels)

  checkpoint_path = saver.latest_checkpoint(checkpoint_dir)

  final_ops_values = evaluation._evaluate_once(
      checkpoint_path=checkpoint_path,
      eval_ops=update_op,
      final_ops={
          'accuracy': accuracy,
          'eval_steps': evaluation._get_or_create_eval_step()
      },
      hooks=[
          evaluation._StopAfterNEvalsHook(None),
      ])
  self.assertTrue(final_ops_values['accuracy'] > .99)
  # Runs evaluation for 4 iterations. The first 2 evaluate a full batch of
  # 6 inputs each; the 3rd iteration evaluates the remaining 4 inputs, and
  # the last one triggers an error which stops evaluation.
  self.assertEqual(final_ops_values['eval_steps'], 4)
def before_run(self, run_context):
  """Dumps graphs and loads a checkpoint if one exists.

  Called before each call to run().

  Args:
    run_context: A `SessionRunContext` object.

  Returns:
    A `SessionRunArgs` object containing the global step.
  """
  # We only write the graph and saver_def on the first call of before_run.
  # We cannot do this in begin, since we let other hooks change the graph
  # and add variables in begin. The graph is finalized after all begin
  # calls.
  if self._is_chief and self._first_call:
    training_util.write_graph(
        ops.get_default_graph().as_graph_def(add_shapes=True),
        self._checkpoint_dir,
        "graph.pbtxt")
    # Dump model details to "model_analysis.txt".
    dump_model_analysis(self._checkpoint_dir)
    # Dump model configs.
    graph = ops.get_default_graph()
    meta_graph_def = meta_graph.create_meta_graph_def(
        graph_def=graph.as_graph_def(add_shapes=True),
        saver_def=self._saver.saver_def)
    if self._summary_writer is not None:
      self._summary_writer.add_graph(graph)
      self._summary_writer.add_meta_graph(meta_graph_def)
    tf.logging.info("CheckpointSaverHook (before_run): dump graph...")
  checkpoint_path = saver_lib.latest_checkpoint(self._checkpoint_dir)
  if checkpoint_path and self._first_call:
    # Reload the model.
    self._saver.restore(run_context.session, checkpoint_path)
    gs = run_context.session.run(self._global_step)
    tf.logging.info(
        "CheckpointSaverHook (before_run): reloading models and reset "
        "global_step={}".format(gs))
    StepTimer.reset_init_triggered_step(gs)
  self._first_call = False
  self._timer.register_before_run()
  return tf.train.SessionRunArgs(self._global_step)
def end(self, session=None):
  super(ExportMonitor, self).end(session=session)
  latest_path = saver_lib.latest_checkpoint(self._estimator.model_dir)
  if latest_path is None:
    logging.info("Skipping export at the end since model has not been saved "
                 "yet.")
    return
  if isinstance(self._estimator, core_estimator.Estimator):
    raise ValueError(
        "ExportMonitor does not support `tf.estimator.Estimator`. "
        "Please pass an ExportStrategy to Experiment instead.")
  try:
    self._last_export_dir = self._estimator.export(
        self.export_dir,
        exports_to_keep=self.exports_to_keep,
        signature_fn=self.signature_fn,
        input_fn=self._input_fn,
        default_batch_size=self._default_batch_size,
        input_feature_key=self._input_feature_key,
        use_deprecated_input_fn=self._use_deprecated_input_fn)
  except RuntimeError:
    logging.info("Skipping exporting for the same step.")
def correlation_matrix(nb_batches, checkpoint_dir):
  """Computes logits and labels of the input posts and saves them as numpy
  files.

  Parameters:
    nb_batches: Number of batches to run through the model.
    checkpoint_dir: Checkpoint directory of the model saved during training.
  """
  with tf.Graph().as_default():
    config = _CONFIG.copy()
    config['mode'] = 'validation'
    model = DeepSentiment(config)

    # Load the trained model.
    checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir)
    scaffold = monitored_session.Scaffold(
        init_op=None, init_feed_dict=None, init_fn=None, saver=None)
    session_creator = monitored_session.ChiefSessionCreator(
        scaffold=scaffold,
        checkpoint_filename_with_path=checkpoint_path,
        master='',
        config=None)

    posts_logits = []
    posts_labels = []
    with monitored_session.MonitoredSession(  # Generate queue
        session_creator=session_creator, hooks=None) as session:
      for i in range(nb_batches):
        np_logits, np_labels = session.run([model.logits, model.labels])
        posts_logits.append(np_logits)
        posts_labels.append(np_labels)

  posts_logits, posts_labels = np.vstack(posts_logits), np.hstack(
      posts_labels)
  np.save('data/posts_logits.npy', posts_logits)
  np.save('data/posts_labels.npy', posts_labels)
  return posts_logits, posts_labels
def _testSaveRestoreFromTensorsUtility(self, start, break_range, stop):
    path = self._iterator_checkpoint_prefix()
    step = 0
    meta_filename = path + "-%d.meta" % step
    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))

    with ops.Graph().as_default() as g:
        iterator = (dataset_ops.Dataset.from_tensors(components)
                    .make_initializable_iterator())
        init_op = iterator.initializer
        get_next = iterator.get_next()
        saveable = iterator_ops.make_saveable_from_iterator(iterator)
        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
        for t in nest.flatten(get_next):
            ops.add_to_collection("get_next", t)
        saver = saver_lib.Saver()
        with self.test_session(graph=g) as sess:
            sess.run(init_op)
            for _ in range(start, break_range):
                result = sess.run(get_next)
                for component, result_component in zip(components, result):
                    self.assertAllEqual(component, result_component)
            saver.save(sess, path, step)

    with ops.Graph().as_default() as g:
        saver = saver_lib.import_meta_graph(meta_filename)
        with self.test_session(graph=g) as sess:
            get_next = nest.pack_sequence_as(
                ("a", "b", "c"), ops.get_collection("get_next"))
            saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
            for _ in range(break_range, stop):
                result = sess.run(get_next)
                for component, result_component in zip(components, result):
                    self.assertAllEqual(component, result_component)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
def testUsageGraph(self):
    """Expected usage when graph building."""
    with context.graph_mode():
        num_training_steps = 10
        checkpoint_directory = self.get_temp_dir()
        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
        for training_continuation in range(3):
            with ops.Graph().as_default():
                model = MyModel()
                optimizer = adam.AdamOptimizer(0.001)
                root = checkpointable_utils.Checkpoint(
                    optimizer=optimizer, model=model,
                    global_step=training_util.get_or_create_global_step())
                input_value = constant_op.constant([[3.]])
                train_op = optimizer.minimize(
                    model(input_value), global_step=root.global_step)
                checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
                with self.test_session(graph=ops.get_default_graph()) as session:
                    status = root.restore(save_path=checkpoint_path)
                    status.initialize_or_restore(session=session)
                    if checkpoint_path is None:
                        self.assertEqual(0, training_continuation)
                        with self.assertRaises(AssertionError):
                            status.assert_consumed()
                    else:
                        status.assert_consumed()
                    for _ in range(num_training_steps):
                        session.run(train_op)
                    root.save(file_prefix=checkpoint_prefix, session=session)
                    self.assertEqual(
                        (training_continuation + 1) * num_training_steps,
                        session.run(root.global_step))
                    self.assertEqual(training_continuation + 1,
                                     session.run(root.save_counter))
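# For contrast with the graph-mode test above, a hedged sketch of the same
# save/restore cycle when eager execution is enabled: no Graph or Session is
# needed, and restore happens eagerly. MyModel and the 0.001 Adam learning
# rate are reused from the test; the helper name is hypothetical:
def _example_eager_usage(checkpoint_directory):
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
    model = MyModel()
    optimizer = adam.AdamOptimizer(0.001)
    root = checkpointable_utils.Checkpoint(optimizer=optimizer, model=model)
    # Restores immediately if a checkpoint exists; a no-op on the first run.
    root.restore(core_saver.latest_checkpoint(checkpoint_directory))
    input_value = constant_op.constant([[3.]])
    # In eager mode, minimize() takes the loss as a callable.
    optimizer.minimize(lambda: model(input_value))
    root.save(file_prefix=checkpoint_prefix)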
def evaluate(eval_file, model_dir, summary_dir, train_steps):
    hp = hparam.create_hparam()
    eval_graph = tf.Graph()
    with eval_graph.as_default():
        input_features = HRAN.create_input_layer(
            mode=modekeys.EVAL, filename=eval_file, hp=hp)
        ppl = HRAN.impl(features=input_features, hp=hp, mode=modekeys.EVAL)

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1.0)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        # Restore the latest checkpoint and initialize local variables.
        saver = tf.train.Saver()
        checkpoint = saver_lib.latest_checkpoint(model_dir)
        saver.restore(sess=sess, save_path=checkpoint)
        sess.run(tf.local_variables_initializer())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        tf.logging.info('Begin evaluation')
        avg_ppl = 0.0  # Guard against returning an unbound name on early stop.
        try:
            total_ppl = 0.0
            eval_step = 0
            while not coord.should_stop():
                perplexity = sess.run(fetches=ppl)
                total_ppl += perplexity
                eval_step += 1
        except tf.errors.OutOfRangeError:
            # The input queue is exhausted after one pass over the eval file.
            avg_ppl = total_ppl / eval_step
            tf.logging.info('Finish evaluation. The perplexity is {}'.format(avg_ppl))
            write_to_summary(summary_dir, 'eval_ppl', avg_ppl, train_steps)
        finally:
            coord.request_stop()
            coord.join(threads)
        return avg_ppl
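# write_to_summary is called above but not defined in this excerpt; a minimal
# sketch, assuming it simply records one scalar under the given tag at the
# current training step so the eval perplexity shows up in TensorBoard:
def write_to_summary(summary_dir, tag, value, global_step):
    import tensorflow as tf
    writer = tf.summary.FileWriter(summary_dir)
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
    writer.add_summary(summary, global_step=global_step)
    writer.flush()
    writer.close()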
def evaluate_and_export(self):
    """Evaluate and (maybe) export the current model.

    Returns:
        Evaluation results. Returns `None` if the current round of evaluation
        is skipped.
    """
    latest_ckpt_path = saver.latest_checkpoint(self._estimator.model_dir)
    if not latest_ckpt_path:
        self._log_err_msg('Estimator is not trained yet. Will start an '
                          'evaluation when a checkpoint is ready.')
        return None

    if latest_ckpt_path == self._previous_ckpt_path:
        self._log_err_msg(
            'No new checkpoint ready for evaluation. Skip the current '
            'evaluation pass as evaluation results are expected to be the '
            'same for the same checkpoint.')
        return None

    eval_result = self._estimator.evaluate(
        input_fn=self._eval_spec.input_fn,
        steps=self._eval_spec.steps,
        name=self._eval_spec.name,
        checkpoint_path=latest_ckpt_path,
        hooks=self._eval_spec.hooks)

    if not eval_result:
        self._log_err_msg('Estimator evaluate returns empty result.')
        return None

    # TODO(b/65169058): Adds export once export strategies are moved.
    self._last_warning_time = 0
    self._previous_ckpt_path = latest_ckpt_path
    return eval_result
def _evaluate_model(self, input_fn, steps, feed_fn=None, metrics=None):
    if self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset'):
        return

    checkpoint_path = saver.latest_checkpoint(self._model_dir)
    eval_dir = os.path.join(self._model_dir, 'eval')
    with ops.Graph().as_default() as g:
        random_seed.set_random_seed(self._config.tf_random_seed)
        global_step = contrib_framework.create_global_step(g)
        features, targets = input_fn()
        self._check_inputs(features, targets)
        eval_dict = self._get_eval_ops(
            features, targets, metrics or self._get_default_metric_functions())
        eval_results, _ = evaluate(
            graph=g,
            output_dir=eval_dir,
            checkpoint_path=checkpoint_path,
            eval_dict=eval_dict,
            global_step_tensor=global_step,
            supervisor_master=self._config.master,
            feed_fn=feed_fn,
            max_steps=steps)
        return eval_results
def every_n_step_end(self, step, outputs):
    # Does the superclass hook do anything now?
    super(RegressionMonitor, self).every_n_step_end(step, outputs)
    # TODO(mdan): The use of step below is probably misleading.
    # The code should probably use the step from the checkpoint, because
    # that's what is being evaluated.
    if self._estimator is None:
        raise ValueError("Missing call to set_estimator.")

    # Check that we are not running evaluation on the same checkpoint.
    latest_path = saver_lib.latest_checkpoint(self._estimator.model_dir)
    if latest_path is None:
        logging.debug("Skipping evaluation since model has not been saved yet "
                      "at step %d.", step)
        return False
    if latest_path == self._latest_path:
        logging.debug("Skipping evaluation due to same checkpoint %s for step "
                      "%d as for step %d.", latest_path, step,
                      self._latest_path_step)
        return False
    self._latest_path = latest_path
    self._latest_path_step = step

    # Run evaluation and log it.
    stats = evaluate(self._estimator, self.x, self.y)
    print("Validation (step %d): AVG_ERR: %s %% MAX_ERR: %s %%" %
          (step, stats['relative_avg_err'] * 100,
           stats['relative_max_err'] * 100))

    if (step / 1000) % (self._criteria_check_interval / 1000) == 0:
        # Stop if we have not seen an improvement bigger than 0.01% over the
        # last 10k steps.
        if (stats['relative_avg_err'] * 100 >
                self._past_best_big_checkpoint -
                self._minimal_improvement_treshold):
            print("The relative average error is not improving. "
                  "Stopping after %d steps" % step)
            return True
        else:
            print("The relative average error improved from %s %% to %s %% "
                  "after 10k steps" % (self._past_best_big_checkpoint,
                                       stats['relative_avg_err'] * 100))
            self._past_best_big_checkpoint = stats['relative_avg_err'] * 100
    return False
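# The evaluate() helper used by the monitor above is not part of this excerpt;
# a minimal sketch under the assumption that it compares estimator predictions
# against the targets and reports relative errors under the two keys the
# monitor reads (the prediction mechanics here are an assumption):
def evaluate(estimator, x, y):
    import numpy as np
    predictions = np.asarray(list(estimator.predict(x))).reshape(np.shape(y))
    relative_err = np.abs((predictions - y) / y)
    return {
        'relative_avg_err': float(np.mean(relative_err)),
        'relative_max_err': float(np.max(relative_err)),
    }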
def _save_first_checkpoint(keras_model, estimator, custom_objects, keras_weights):
    """Save first checkpoint for the keras Estimator.

    Args:
        keras_model: an instance of compiled keras model.
        estimator: keras estimator.
        custom_objects: Dictionary for custom objects.
        keras_weights: A flat list of Numpy arrays for weights of given
            keras_model.
    """
    # Load weights and save to checkpoint if there is no checkpoint yet.
    latest_path = saver_lib.latest_checkpoint(estimator.model_dir)
    if not latest_path:
        with ops.Graph().as_default():
            random_seed.set_random_seed(estimator.config.tf_random_seed)
            training_util.create_global_step()
            model = _clone_and_build_model(
                model_fn_lib.ModeKeys.TRAIN, keras_model, custom_objects)
            if isinstance(model, models.Sequential):
                model = model.model
            # Save to checkpoint.
            with session.Session(config=estimator._session_config) as sess:
                model.set_weights(keras_weights)
                # Make update ops and initialize all variables.
                if not model.train_function:
                    # pylint: disable=protected-access
                    model._make_train_function()
                    K._initialize_variables(sess)
                    # pylint: enable=protected-access
                saver = saver_lib.Saver()
                saver.save(
                    sess, os.path.join(estimator.model_dir, 'keras_model.ckpt'))
def predict(self, input_fn, predict_keys=None, hooks=None, checkpoint_path=None):
    """Returns predictions for given features.

    Args:
        input_fn: Input function returning features which is a dictionary of
            string feature name to `Tensor` or `SparseTensor`. If it returns a
            tuple, first item is extracted as features. Prediction continues
            until `input_fn` raises an end-of-input exception
            (`OutOfRangeError` or `StopIteration`).
        predict_keys: list of `str`, name of the keys to predict. It is used
            if the `EstimatorSpec.predictions` is a `dict`. If `predict_keys`
            is used then the rest of the predictions will be filtered from the
            dictionary. If `None`, returns all.
        hooks: List of `SessionRunHook` subclass instances. Used for callbacks
            inside the prediction call.
        checkpoint_path: Path of a specific checkpoint to predict. If `None`,
            the latest checkpoint in `model_dir` is used.

    Yields:
        Evaluated values of `predictions` tensors.

    Raises:
        ValueError: Could not find a trained model in model_dir.
        ValueError: if the batch length of predictions is not the same.
        ValueError: If there is a conflict between `predict_keys` and
            `predictions`. For example if `predict_keys` is not `None` but
            `EstimatorSpec.predictions` is not a `dict`.
    """
    hooks = _check_hooks_type(hooks)
    # Check that the model has been trained.
    if not checkpoint_path:
        checkpoint_path = saver.latest_checkpoint(self._model_dir)
    if not checkpoint_path:
        raise ValueError('Could not find trained model in model_dir: {}.'.format(
            self._model_dir))

    with ops.Graph().as_default() as g:
        random_seed.set_random_seed(self._config.tf_random_seed)
        training.create_global_step(g)
        features = self._get_features_from_input_fn(input_fn)
        estimator_spec = self._call_model_fn(
            features, None, model_fn_lib.ModeKeys.PREDICT)
        predictions = self._extract_keys(estimator_spec.predictions, predict_keys)
        with training.MonitoredSession(
                session_creator=training.ChiefSessionCreator(
                    checkpoint_filename_with_path=checkpoint_path,
                    scaffold=estimator_spec.scaffold,
                    config=self._session_config),
                hooks=hooks) as mon_sess:
            while not mon_sess.should_stop():
                preds_evaluated = mon_sess.run(predictions)
                if not isinstance(predictions, dict):
                    for pred in preds_evaluated:
                        yield pred
                else:
                    for i in range(self._extract_batch_length(preds_evaluated)):
                        yield {
                            key: value[i]
                            for key, value in six.iteritems(preds_evaluated)
                        }
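# A hedged usage sketch for the predict() generator above; the estimator and
# the feature name 'x' are hypothetical placeholders. numpy_input_fn raises
# OutOfRangeError after one epoch, which is what terminates the yield loop --
# an input_fn built from plain constant tensors would iterate forever:
def _example_predict_usage(estimator):
    import numpy as np
    import tensorflow as tf
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': np.array([[1.0], [2.0]], dtype=np.float32)},
        num_epochs=1,
        shuffle=False)
    for prediction in estimator.predict(input_fn=predict_input_fn):
        print(prediction)  # One prediction per input example, not per batch.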
def export_savedmodel(self, export_dir_base, serving_input_receiver_fn,
                      assets_extra=None, as_text=False, checkpoint_path=None):
    """Exports inference graph as a SavedModel into the given dir.

    This method builds a new graph by first calling the
    serving_input_receiver_fn to obtain feature `Tensor`s, and then calling
    this `Estimator`'s model_fn to generate the model graph based on those
    features. It restores the given checkpoint (or, lacking that, the most
    recent checkpoint) into this graph in a fresh session. Finally it creates
    a timestamped export directory below the given export_dir_base, and writes
    a `SavedModel` into it containing a single `MetaGraphDef` saved from this
    session.

    The exported `MetaGraphDef` will provide one `SignatureDef` for each
    element of the export_outputs dict returned from the model_fn, named
    using the same keys. One of these keys is always
    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, indicating which
    signature will be served when a serving request does not specify one.
    For each signature, the outputs are provided by the corresponding
    `ExportOutput`s, and the inputs are always the input receivers provided
    by the serving_input_receiver_fn.

    Extra assets may be written into the SavedModel via the assets_extra
    argument. This should be a dict, where each key gives a destination path
    (including the filename) relative to the assets.extra directory. The
    corresponding value gives the full path of the source file to be copied.
    For example, the simple case of copying a single file without renaming it
    is specified as `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.

    Args:
        export_dir_base: A string containing a directory in which to create
            timestamped subdirectories containing exported SavedModels.
        serving_input_receiver_fn: A function that takes no argument and
            returns a `ServingInputReceiver`.
        assets_extra: A dict specifying how to populate the assets.extra
            directory within the exported SavedModel, or `None` if no extra
            assets are needed.
        as_text: whether to write the SavedModel proto in text format.
        checkpoint_path: The checkpoint path to export. If `None` (the
            default), the most recent checkpoint found within the model
            directory is chosen.

    Returns:
        The string path to the exported directory.

    Raises:
        ValueError: if no serving_input_receiver_fn is provided, no
            export_outputs are provided, or no checkpoint can be found.
    """
    if serving_input_receiver_fn is None:
        raise ValueError('serving_input_receiver_fn must be defined.')

    with ops.Graph().as_default() as g:
        training.create_global_step(g)
        random_seed.set_random_seed(self._config.tf_random_seed)
        serving_input_receiver = serving_input_receiver_fn()

        # Call the model_fn and collect the export_outputs.
        estimator_spec = self._call_model_fn(
            features=serving_input_receiver.features,
            labels=None,
            mode=model_fn_lib.ModeKeys.PREDICT)

        # Build the SignatureDefs from receivers and all outputs.
        signature_def_map = build_all_signature_defs(
            serving_input_receiver.receiver_tensors,
            estimator_spec.export_outputs)

        if not checkpoint_path:
            # Locate the latest checkpoint.
            checkpoint_path = saver.latest_checkpoint(self._model_dir)
        if not checkpoint_path:
            raise ValueError("Couldn't find trained model at %s." % self._model_dir)

        export_dir = get_timestamped_export_dir(export_dir_base)

        # TODO(soergel): Consider whether MonitoredSession makes sense here.
        with tf_session.Session() as session:
            saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
                sharded=True)
            saver_for_restore.restore(session, checkpoint_path)

            # TODO(b/36111876): replace legacy_init_op with main_op mechanism.
            # pylint: disable=protected-access
            local_init_op = (
                estimator_spec.scaffold.local_init_op or
                monitored_session.Scaffold._default_local_init_op())
            # pylint: enable=protected-access

            # Perform the export.
            builder = saved_model_builder.SavedModelBuilder(export_dir)
            builder.add_meta_graph_and_variables(
                session, [tag_constants.SERVING],
                signature_def_map=signature_def_map,
                assets_collection=ops.get_collection(
                    ops.GraphKeys.ASSET_FILEPATHS),
                legacy_init_op=local_init_op)
            builder.save(as_text)

        # Add the extra assets.
        if assets_extra:
            assets_extra_path = os.path.join(
                compat.as_bytes(export_dir), compat.as_bytes('assets.extra'))
            for dest_relative, source in assets_extra.items():
                dest_absolute = os.path.join(
                    compat.as_bytes(assets_extra_path),
                    compat.as_bytes(dest_relative))
                dest_path = os.path.dirname(dest_absolute)
                gfile.MakeDirs(dest_path)
                gfile.Copy(source, dest_absolute)

        return export_dir
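# A hedged usage sketch for export_savedmodel above: a serving_input_receiver_fn
# built from a raw feature placeholder. The feature name 'x' and its shape are
# hypothetical; `export` refers to the estimator export module
# (tf.estimator.export):
def _example_serving_input_receiver_fn():
    features = {
        'x': array_ops.placeholder(dtypes.float32, shape=[None, 1], name='x')
    }
    # Here the receiver tensors are the raw features themselves.
    return export.ServingInputReceiver(features, features)

# export_dir = estimator.export_savedmodel(
#     '/tmp/exports', _example_serving_input_receiver_fn)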
def _continuous_eval(self, input_fn, name, delay_secs, throttle_delay_secs,
                     evaluate_checkpoint_only_once=True,
                     continuous_eval_predicate_fn=None):
    """Run continuous eval.

    Runs infinite eval on the evaluation data set. This function starts
    evaluating after `delay_secs` seconds and then runs no more than one
    evaluation (with `self._eval_steps` steps each time) per
    `throttle_delay_secs`. If `train_steps` is not None, will return after
    global_step reaches `train_steps`.

    Args:
        input_fn: The input to use for this eval.
        name: A string appended to the folder name of evaluation results.
        delay_secs: Start evaluating after this many seconds. If None,
            defaults to self._eval_delay_secs.
        throttle_delay_secs: Do not re-evaluate unless the last evaluation
            was started at least this many seconds ago. If None, defaults to
            self._continuous_eval_throttle_secs.
        evaluate_checkpoint_only_once: Whether to skip evaluation of
            checkpoints that have already been evaluated. Default is `True`.
        continuous_eval_predicate_fn: A predicate function determining whether
            to continue eval after each iteration. `predicate_fn` takes the
            evaluation results as arguments. At the beginning of evaluation,
            the passed eval results will be None so it's expected that the
            predicate function handles that gracefully. When `predicate_fn`
            is not specified, continuous eval will run in an infinite loop
            (if `train_steps` is None) or exit once global step reaches
            `train_steps`.

    Raises:
        ValueError: if `continuous_eval_predicate_fn` is neither None nor
            callable.
    """
    if (continuous_eval_predicate_fn is not None and
            not callable(continuous_eval_predicate_fn)):
        raise ValueError(
            "`continuous_eval_predicate_fn` must be a callable, or None.")

    if delay_secs is None:
        delay_secs = self._eval_delay_secs
    if throttle_delay_secs is None:
        throttle_delay_secs = self._continuous_eval_throttle_secs

    if delay_secs:
        logging.info("Waiting %f secs before starting eval.", delay_secs)
        time.sleep(delay_secs)

    previous_path = None
    eval_result = None
    last_warning_time = 0
    while (not continuous_eval_predicate_fn or
           continuous_eval_predicate_fn(eval_result)):
        # Exit if we have already reached the number of steps to train.
        if self._has_training_stopped(eval_result):
            logging.info("Exiting continuous eval, global_step=%s >= "
                         "train_step=%s",
                         eval_result[ops.GraphKeys.GLOBAL_STEP],
                         self._train_steps)
            return

        start = time.time()

        error_msg = None
        latest_path = saver.latest_checkpoint(self._estimator.model_dir)
        if not latest_path:
            error_msg = ("Estimator is not fitted yet. "
                         "Will start an evaluation when a checkpoint is ready.")
        elif evaluate_checkpoint_only_once and latest_path == previous_path:
            error_msg = "No new checkpoint ready for evaluation."

        if error_msg:
            # Print a warning message every 10 mins.
            eval_result = {}
            if time.time() - last_warning_time > 600:
                logging.warning(error_msg)
                last_warning_time = time.time()
        else:
            eval_result = self._call_evaluate(
                input_fn=input_fn,
                steps=self._eval_steps,
                metrics=self._eval_metrics,
                name=name,
                checkpoint_path=latest_path,
                hooks=self._eval_hooks)
            # Ensure eval result is not None for the next round of evaluation.
            if not eval_result:
                eval_result = {}

            self._maybe_export(eval_result, checkpoint_path=latest_path)

            # Clear the warning timer and update the last evaluated checkpoint.
            last_warning_time = 0
            previous_path = latest_path

        duration = time.time() - start
        if duration < throttle_delay_secs:
            difference = throttle_delay_secs - duration
            logging.info("Waiting %f secs before starting next eval run.",
                         difference)
            time.sleep(difference)
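# _has_training_stopped is called at the top of the eval loop above but is not
# shown in this excerpt; a minimal sketch, assuming it compares the global
# step recorded in the last eval result against self._train_steps:
def _has_training_stopped(self, eval_result):
    if not eval_result:
        return False
    global_step = eval_result.get(ops.GraphKeys.GLOBAL_STEP)
    return (global_step is not None and self._train_steps is not None and
            global_step >= self._train_steps)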