def StartMultiplexerReloadingThread(multiplexer, path_to_run, load_interval):
  """Spawns a daemon thread that keeps reloading the given multiplexer.

  The thread calls `ReloadMultiplexer` right away and then again after each
  pause of `load_interval` seconds.

  Args:
    multiplexer: The `EventMultiplexer` to add runs to and reload.
    path_to_run: A dict mapping from paths to run names, where `None` as the
      run name is interpreted as a run name equal to the path.
    load_interval: How many seconds to wait after one load before starting
      the next load.

  Returns:
    A started `threading.Thread` that reloads the multiplexer.
  """
  # multiplexer.Reload() is deliberately not called on this thread: doing so
  # would make AddRunsFromDirectory block until every run has been loaded.
  for path in path_to_run:
    if not gcs.IsGCSPath(path):
      continue
    gcs.CheckIsSupported()
    logging.info(
        "Assuming %s is intended to be a Google Cloud Storage path because "
        "it starts with %s. If it isn't, prefix it with '/.' (i.e., use "
        "/.%s instead)", path, gcs.PATH_PREFIX, path)

  def _KeepReloading():
    """Reloads, sleeps for `load_interval` seconds, and repeats forever."""
    while True:
      ReloadMultiplexer(multiplexer, path_to_run)
      time.sleep(load_interval)

  reloader = threading.Thread(target=_KeepReloading)
  # Mark the thread as a daemon before starting it.
  reloader.daemon = True
  reloader.start()
  return reloader
def __exit__(self, exc_type, exc_val, exc_tb):
  """Disables the worker watchdog, then stops and joins this thread.

  The exception arguments are not inspected and nothing is returned, so an
  exception raised inside the `with` body is not suppressed.
  """
  logging.info('Disabling worker watchdog.')
  # The watchdog is reconfigured with a timeout of -1 ms.
  watchdog_off = event_pb2.WatchdogConfig(timeout_ms=-1,)
  heartbeat_request = event_pb2.WorkerHeartbeatRequest(
      watchdog_config=watchdog_off)
  self._worker_manager.configure(heartbeat_request)
  self._running = False
  self.join()
def evaluate(self, delay_secs=None):
  """Runs one evaluation pass over the evaluation data.

  Evaluation runs for `self._eval_steps` steps, or, if that is `None`, until
  the input is exhausted or another exception is raised. The pass starts
  after `delay_secs` seconds; when `delay_secs` is `None` the delay falls
  back to `self._eval_delay_secs`.

  Args:
    delay_secs: Start evaluating after this many seconds. If `None`, defaults
      to using `self._eval_delay_secs`.

  Returns:
    The result of the `evaluate` call to the `Estimator`.
  """
  wait_secs = self._eval_delay_secs if delay_secs is None else delay_secs
  if wait_secs:
    logging.info("Waiting %d secs before starting eval.", wait_secs)
    time.sleep(wait_secs)

  eval_kwargs = dict(
      input_fn=self._eval_input_fn,
      steps=self._eval_steps,
      metrics=self._eval_metrics,
      name="one_pass",
      hooks=self._eval_hooks)
  return self._call_evaluate(**eval_kwargs)
def testGeneratesStacktrace(self):
  """Checks that the child run of this script prints a stack trace."""
  if FLAGS.child:
    return

  # Re-run sys.argv[0] with --child=True, through the current interpreter
  # whenever its path is known.
  command = [sys.argv[0], '--child=True']
  if sys.executable:
    command.insert(0, sys.executable)
  child_process = subprocess.Popen(
      command,
      cwd=os.getcwd(),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE)

  # Collect stdout and stderr and simply append one to the other; neither the
  # timing nor the ordering of the messages matters for this test.
  child_stdout, child_stderr = child_process.communicate()
  child_output = child_stdout + child_stderr

  # Make sure the child process is dead before we proceed.
  child_process.wait()

  logging.info('Output from the child process:')
  logging.info(child_output)

  # Verify a stack trace is printed.
  self.assertIn(b'PyEval_EvalFrame', child_output)
def add_gradients_summaries(grads_and_vars):
  """Creates histogram and norm summaries for each available gradient.

  Pairs whose gradient is `None` produce no summaries; they are only logged.

  Args:
    grads_and_vars: A list of gradient to variable pairs (tuples).

  Returns:
    The list of created summaries.
  """
  summaries = []
  for grad, var in grads_and_vars:
    var_name = var.op.name
    if grad is None:
      logging.info('Var %s has no gradient', var_name)
      continue

    # For `IndexedSlices` the summaries are computed from the `values` field.
    is_sparse = isinstance(grad, ops.IndexedSlices)
    grad_values = grad.values if is_sparse else grad

    summaries.append(summary.histogram(var_name + '_gradient', grad_values))
    summaries.append(
        summary.scalar(var_name + '_gradient_norm',
                       clip_ops.global_norm([grad_values])))
  return summaries
def testCustomGradient(self):
  """Checks that gradients of a `Defun` come from its `grad_func`."""
  dtype = dtypes.float32

  @function.Defun(dtype, dtype, dtype)
  def XentLossGrad(logits, labels, dloss):
    dloss_column = array_ops.reshape(dloss, [-1, 1])
    dlogits = dloss_column * (nn_ops.softmax(logits) - labels)
    dlabels = array_ops.zeros_like(labels)
    # exp(dlogits) is returned on purpose so that this gradient can be told
    # apart from the "correct" one.
    return math_ops.exp(dlogits), dlabels

  @function.Defun(dtype, dtype, grad_func=XentLossGrad)
  def XentLoss(logits, labels):
    log_probs = math_ops.log(nn_ops.softmax(logits))
    return math_ops.reduce_sum(labels * log_probs, 1)

  graph = ops.Graph()
  with graph.as_default():
    logits = array_ops.placeholder(dtype)
    labels = array_ops.placeholder(dtype)
    loss = XentLoss(logits, labels)
    dlogits = gradients_impl.gradients([loss], [logits])

  logits_value = np.random.uniform(-10., 10., size=(4, 9)).astype(np.float32)
  exp_logits = np.exp(logits_value)
  prob = exp_logits / np.sum(exp_logits, 1, keepdims=1)
  labels_value = np.random.uniform(-10., 10., size=(4, 9)).astype(np.float32)

  for cfg in _OptimizerOptions():
    tf_logging.info("cfg = %s", cfg)
    with session.Session(graph=graph, config=cfg) as sess:
      feeds = {logits: logits_value, labels: labels_value}
      out, = sess.run(dlogits, feeds)
    self.assertAllClose(out, np.exp(prob - labels_value))
def setUpClass(cls):
  """Starts a gRPC worker subprocess and an in-process debug server."""
  memory_fraction = cls.PER_PROC_GPU_MEMORY_FRACTION
  gpu_memory_fraction_opt = "--gpu_memory_fraction=%f" % memory_fraction

  worker_port = portpicker.pick_unused_port()
  cluster_spec = "worker|localhost:%d" % worker_port
  tf_logging.info("cluster_spec: %s", cluster_spec)

  server_bin = test.test_src_dir_path("python/debug/grpc_tensorflow_server")
  cls.server_target = "grpc://localhost:%d" % worker_port

  worker_command = [
      server_bin,
      "--cluster_spec=%s" % cluster_spec,
      "--job_name=worker",
      "--task_id=0",
      gpu_memory_fraction_opt,
  ]
  cls.server_procs = {}
  # The worker's stdout/stderr are forwarded to this process's streams.
  cls.server_procs["worker"] = subprocess.Popen(
      worker_command, stdout=sys.stdout, stderr=sys.stderr)

  # Start debug server in-process, on separate thread.
  debug_server_info = grpc_debug_test_server.start_server_on_separate_thread(
      dump_to_filesystem=False)
  (cls.debug_server_port, cls.debug_server_url, _, cls.debug_server_thread,
   cls.debug_server) = debug_server_info
  tf_logging.info("debug server url: %s", cls.debug_server_url)

  gpu_options = config_pb2.GPUOptions(
      per_process_gpu_memory_fraction=cls.PER_PROC_GPU_MEMORY_FRACTION)
  cls.session_config = config_pb2.ConfigProto(gpu_options=gpu_options)
def testUnrollLSTM(self):
  """Checks the "cell", "loop" and "loop10" modes against "complete"."""

  def RunForward(mode, cfg=None):
    """Builds the forward graph for `mode` and runs one step of it."""
    tf_logging.info("mode = %s", mode)
    graph = ops.Graph()
    build_start = time.time()
    with graph.as_default():
      weights = self._Weights()
      inp = self._Input()
      output = self._BuildForward(weights, inp, mode)
    graph_def = graph.as_graph_def()
    build_end = time.time()
    tf_logging.info("time: %f txt size: %d gdef bin size: %d",
                    build_end - build_start,
                    len(str(graph_def)),
                    len(graph_def.SerializeToString()))
    with graph.as_default(), session.Session(config=cfg) as sess:
      return sess.run(output)

  expected = RunForward("complete")
  for cfg in _OptimizerOptions():
    tf_logging.info("cfg = %s", cfg)
    # All three modes are run before any of the results is compared.
    results = [RunForward(mode, cfg) for mode in ("cell", "loop", "loop10")]
    for actual in results:
      self.assertAllClose(expected, actual, rtol=1e-4)
def testUnrollLSTMGrad(self):
  """Checks gradients of "cell", "loop", "loop10" against "complete"."""

  def RunForwardBackward(mode, cfg=None):
    """Builds forward + gradient graph for `mode` and runs one step."""
    tf_logging.info("mode = %s", mode)
    graph = ops.Graph()
    build_start = time.time()
    with graph.as_default():
      weights = self._Weights()
      inp = self._Input()
      output = self._BuildForward(weights, inp, mode)
      # The loss is the sum of the squared forward outputs.
      loss = math_ops.reduce_sum(math_ops.square(output))
      weight_grads = gradients_impl.gradients([loss], [weights])
    graph_def = graph.as_graph_def()
    build_end = time.time()
    tf_logging.info("time: %f txt size: %d gdef bin size: %d",
                    build_end - build_start,
                    len(str(graph_def)),
                    len(graph_def.SerializeToString()))
    with graph.as_default(), session.Session(config=cfg) as sess:
      return sess.run(weight_grads)

  expected = RunForwardBackward("complete")
  for cfg in _OptimizerOptions():
    tf_logging.info("cfg = %s", cfg)
    # All three modes are run before any of the results is compared.
    results = [
        RunForwardBackward(mode, cfg) for mode in ("cell", "loop", "loop10")
    ]
    for actual in results:
      self.assertAllClose(expected, actual, rtol=1e-4, atol=1e-4)
def build_greedy_training(self, state, network_states):
  """Extracts features and advances a batch using the oracle path.

  Args:
    state: MasterState from the 'AdvanceMaster' op that advances the
      underlying master to this component.
    network_states: dictionary of component NetworkState objects

  Returns:
    A `(handle, cost, correct, total)` tuple:
      handle: final state handle after advancing
      cost: regularization cost, possibly associated with embedding matrices
      correct: since no gold path is available, a constant 0.
      total: since no gold path is available, a constant 0.
  """
  logging.info('Building component: %s', self.spec.name)
  stride = state.current_batch_size * self.training_beam_size

  with tf.variable_scope(self.name, reuse=True):
    fixed_result = fetch_differentiable_fixed_embeddings(self, state, stride)
    state.handle, fixed_embeddings = fixed_result

  linked_embeddings = []
  for linked_spec in self.spec.linked_feature:
    linked_embeddings.append(
        fetch_linked_embedding(self, network_states, linked_spec))

  with tf.variable_scope(self.name, reuse=True):
    tensors = self.network.create(
        fixed_embeddings, linked_embeddings, None, None, True, stride=stride)

  update_network_states(self, tensors, network_states, stride)

  cost = self.add_regularizer(tf.constant(0.))
  correct = tf.constant(0)
  total = tf.constant(0)
  return state.handle, cost, correct, total
def _extract_feature_ids(self, state, network_states, during_training):
  """Extracts feature IDs and advances a batch using the oracle path.

  Args:
    state: MasterState from the 'AdvanceMaster' op that advances the
      underlying master to this component.
    network_states: Dictionary of component NetworkState objects.
    during_training: Whether the graph is being constructed during training.

  Returns:
    state handle: Final state after advancing.
  """
  logging.info('Building component: %s', self.spec.name)

  # The stride uses the training beam size while training and the inference
  # beam size otherwise.
  if during_training:
    beam_size = self.training_beam_size
  else:
    beam_size = self.inference_beam_size
  stride = state.current_batch_size * beam_size

  with tf.variable_scope(self.name, reuse=True):
    state.handle, ids = extract_fixed_feature_ids(self, state, stride)

  with tf.variable_scope(self.name, reuse=True):
    tensors = self.network.create(
        ids, [], None, None, during_training, stride=stride)

  update_network_states(self, tensors, network_states, stride)
  return state.handle
def __init__(self, model_dir=None, config=None):
  """Initializes a BaseEstimator instance.

  Args:
    model_dir: Directory to save model parameters, graph, etc. This can also
      be used to load checkpoints from the directory into an estimator to
      continue training a previously saved model. When `None`, a fresh
      temporary directory is created and used.
    config: A RunConfig instance. When `None`, a default
      `BaseEstimator._Config()` is created.
  """
  # Model directory.
  if model_dir is None:
    model_dir = tempfile.mkdtemp()
    logging.warning('Using temporary folder as model directory: %s',
                    model_dir)
  self._model_dir = model_dir

  # Create a run configuration
  if config is not None:
    self._config = config
  else:
    self._config = BaseEstimator._Config()
    logging.warning('Using default config.')
  logging.info('Using config: %s', str(vars(self._config)))

  # Set device function depending if there are replicas or not.
  self._device_fn = _get_replica_device_setter(self._config)

  # Features and targets TensorSignature objects.
  # TODO(wicke): Rename these to something more descriptive
  self._features_info = None
  self._targets_info = None

  self._graph = None
def _infer_model_as_iterable(
    self, checkpoint_path, predictions, feed_fn, return_dict):
  """Yields individual predictions computed from a restored checkpoint.

  Batches come from `graph_actions.run_feeds_iter` and are unpacked into
  per-example results before being yielded.

  Args:
    checkpoint_path: Passed to `run_feeds_iter` as `restore_checkpoint_path`.
    predictions: Passed to `run_feeds_iter` as `output_dict`.
    feed_fn: Optional zero-argument callable invoked once per batch to produce
      the feed dict. It may raise `StopIteration` to end prediction. If
      `None`, `None` is fed for every batch.
    return_dict: If true, yield one dict per example with the same keys as
      the output batch; otherwise yield the entries of
      `output_batch['predictions']`.

  Yields:
    One prediction per example, in the form selected by `return_dict`.
  """
  if feed_fn is None:
    feed_dicts = itertools.repeat(None)
  else:
    def _feed_fn():
      """Yields `feed_fn()` results until `feed_fn` raises StopIteration."""
      while True:
        try:
          feed_dict = feed_fn()
        except StopIteration:
          # Under PEP 479 a StopIteration escaping a generator body is turned
          # into RuntimeError, so end the generator explicitly instead.
          return
        yield feed_dict
    feed_dicts = _feed_fn()

  try:
    for output_batch in graph_actions.run_feeds_iter(
        output_dict=predictions,
        feed_dicts=feed_dicts,
        restore_checkpoint_path=checkpoint_path):
      # Unpack batches into individual predictions
      if return_dict:
        # The batch length is read from the first output's leading dimension.
        batch_length = list(output_batch.values())[0].shape[0]
        for i in range(batch_length):
          yield {key: value[i] for key, value in output_batch.items()}
      else:
        for pred in output_batch['predictions']:
          yield pred

  except errors.OutOfRangeError:
    # We fall out of the above loop naturally if feed_fn raises StopIteration,
    # or we catch an OutOfRangeError if we've reached the end of inputs.
    logging.info('Reached end of inputs for predict_iter.')
def _write_dict_to_summary(output_dir, dictionary, current_global_step):
  """Writes a `dict` into summary file in given output directory.

  Entries whose value is `None` and the `'global_step'` entry are ignored.
  Values that are `float`/`np.float32` or `int`/`np.int32`/`np.int64` are
  written as simple values; any other value is skipped with a warning and
  adds nothing to the summary.

  Args:
    output_dir: `str`, directory to write the summary file in.
    dictionary: the `dict` to be written to summary file.
    current_global_step: `int`, the current global step.
  """
  logging.info('Saving dict for global step %d: %s', current_global_step,
               _dict_to_str(dictionary))
  summary_writer = writer_cache.FileWriterCache.get(output_dir)
  summary_proto = summary_pb2.Summary()
  for key, raw_value in dictionary.items():
    if raw_value is None or key == 'global_step':
      continue
    if isinstance(raw_value, (np.float32, float)):
      simple_value = float(raw_value)
    elif isinstance(raw_value, (np.int64, np.int32, int)):
      simple_value = int(raw_value)
    else:
      logging.warning(
          'Skipping summary for %s, must be a float, np.float32, np.int64, '
          'np.int32 or int.', key)
      continue
    # Only add the summary value once the type is known to be supported, so
    # that skipped keys do not leave an empty tagged entry behind.
    value = summary_proto.value.add()
    value.tag = key
    value.simple_value = simple_value
  summary_writer.add_summary(summary_proto, current_global_step)
  summary_writer.flush()
def _get_device_assignment(self):
  """Returns the TPU device assignment for the current master.

  Assignments are cached per master address in
  `self._lazy_device_assignment_dict`; a new one is computed and logged only
  when the cache has no entry for the master.
  """
  master = self._get_master_address()
  cached_assignment = self._lazy_device_assignment_dict.get(master)
  if cached_assignment is not None:
    return cached_assignment

  topology = self._get_tpu_system_metadata().topology
  assignment = tpu_device_assignment.device_assignment(
      topology,
      computation_shape=self._computation_shape,
      num_replicas=self.num_replicas)

  logging.info('num_cores_per_replica: %s',
               str(self._config.tpu_config.num_cores_per_replica))
  logging.info('computation_shape: %s', str(self._computation_shape))
  logging.info('num_replicas: %d', self.num_replicas)
  logging.info('device_assignment.topology.device_coordinates: %s',
               str(assignment.topology.device_coordinates))
  logging.info('device_assignment.core_assignment: %s',
               str(assignment.core_assignment))

  self._lazy_device_assignment_dict[master] = assignment
  return assignment
def wait_for_new_checkpoint(checkpoint_dir,
                            last_checkpoint,
                            seconds_to_sleep=1,
                            timeout=None):
  """Blocks until a checkpoint other than `last_checkpoint` shows up.

  Args:
    checkpoint_dir: The directory in which checkpoints are saved.
    last_checkpoint: The last checkpoint path used.
    seconds_to_sleep: The number of seconds to sleep for before looking for a
      new checkpoint.
    timeout: The maximum amount of time to wait. If left as `None`, then the
      process will wait indefinitely.

  Returns:
    a new checkpoint path, or None if the timeout was reached.
  """
  logging.info('Waiting for new checkpoint at %s', checkpoint_dir)
  deadline = None if timeout is None else time.time() + timeout
  while True:
    latest = tf_saver.latest_checkpoint(checkpoint_dir)
    if latest is not None and latest != last_checkpoint:
      logging.info('Found new checkpoint at %s', latest)
      return latest
    # Give up when the next sleep would end past the deadline.
    if deadline is not None and time.time() + seconds_to_sleep > deadline:
      return None
    time.sleep(seconds_to_sleep)
def _restore_checkpoint(self,
                        master,
                        saver=None,
                        checkpoint_dir=None,
                        checkpoint_filename_with_path=None,
                        wait_for_checkpoint=False,
                        max_wait_secs=7200,
                        config=None):
  """Creates a `Session`, and tries to restore a checkpoint.

  Args:
    master: `String` representation of the TensorFlow master to use.
    saver: A `Saver` object used to restore a model.
    checkpoint_dir: Path to the checkpoint files. The latest checkpoint in the
      dir will be used to restore.
    checkpoint_filename_with_path: Full file name path to the checkpoint file.
    wait_for_checkpoint: Whether to wait for checkpoint to become available.
    max_wait_secs: Maximum time to wait for checkpoints to become available.
    config: Optional `ConfigProto` proto used to configure the session.

  Returns:
    A pair (sess, is_restored) where 'is_restored' is `True` if
    the session could be restored, `False` otherwise.

  Raises:
    ValueError: If both checkpoint_dir and checkpoint_filename_with_path are
      set.
  """
  self._target = master

  # Validate the arguments before creating the session, so that an invalid
  # call does not leave behind an open `Session` nobody holds a reference to.
  if checkpoint_dir and checkpoint_filename_with_path:
    raise ValueError("Can not provide both checkpoint_dir and "
                     "checkpoint_filename_with_path.")

  sess = session.Session(self._target, graph=self._graph, config=config)

  # If either saver or checkpoint_* is not specified, cannot restore. Just
  # return.
  if not saver or not (checkpoint_dir or checkpoint_filename_with_path):
    return sess, False

  if checkpoint_filename_with_path:
    saver.restore(sess, checkpoint_filename_with_path)
    return sess, True

  # Waits up until max_wait_secs for checkpoint to become available.
  wait_time = 0
  ckpt = checkpoint_management.get_checkpoint_state(checkpoint_dir)
  while not ckpt or not ckpt.model_checkpoint_path:
    if wait_for_checkpoint and wait_time < max_wait_secs:
      logging.info("Waiting for checkpoint to be available.")
      time.sleep(self._recovery_wait_secs)
      wait_time += self._recovery_wait_secs
      ckpt = checkpoint_management.get_checkpoint_state(checkpoint_dir)
    else:
      # Not waiting (any longer): hand back the unrestored session.
      return sess, False

  # Loads the checkpoint.
  saver.restore(sess, ckpt.model_checkpoint_path)
  saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths)
  return sess, True
def fit(self, **kwargs):
  """Records a fake `fit` call and echoes its keyword arguments.

  A fake checkpoint is written, the call counter is incremented, and any
  `monitors` argument is stored on the instance.

  Args:
    **kwargs: Arbitrary keyword arguments of the call.

  Returns:
    A list of `(name, value)` pairs for every keyword argument, sorted by
    name.
  """
  self.fake_checkpoint()
  tf_logging.info('fit called with args: %s', kwargs)
  self.fit_count += 1
  if 'monitors' in kwargs:
    self.monitors = kwargs['monitors']
  return [(name, kwargs[name]) for name in sorted(kwargs)]
def testEval(self):
  """Checks evaluation accuracy for native TF and, on TRT >= 5, for TF-TRT."""
  if not is_tensorrt_enabled():
    return
  model_dir = test.test_src_dir_path('python/compiler/tensorrt/test/testdata')

  def _EvalAccuracy(use_trt):
    """Runs one evaluation and returns its 'accuracy' entry."""
    run_result = self._Run(
        is_training=False,
        use_trt=use_trt,
        batch_size=128,
        num_epochs=None,
        model_dir=model_dir)
    return run_result['accuracy']

  accuracy_tf_native = _EvalAccuracy(use_trt=False)
  logging.info('accuracy_tf_native: %f', accuracy_tf_native)
  self.assertAllClose(0.9662, accuracy_tf_native, rtol=3e-3, atol=3e-3)

  # The TF-TRT half only runs when the linked TensorRT major version is >= 5.
  if get_linked_tensorrt_version()[0] < 5:
    return

  accuracy_tf_trt = _EvalAccuracy(use_trt=True)
  logging.info('accuracy_tf_trt: %f', accuracy_tf_trt)
  self.assertAllClose(0.9675, accuracy_tf_trt, rtol=1e-3, atol=1e-3)
def _maybe_save_assets(assets_collection_to_add=None):
  """Adds every asset of the given collection to the meta graph.

  Args:
    assets_collection_to_add: The collection where the asset paths are setup.

  Returns:
    The list of filepaths to the assets in the assets collection. The list is
    empty when no collection is given.

  Raises:
    ValueError: Indicating an invalid filepath tensor.
  """
  if assets_collection_to_add is None:
    tf_logging.info("No assets to save.")
    return []

  source_filepaths = []
  # For each supplied asset, build the `AssetFile` proto and add it to the
  # graph collection with key `constants.ASSETS_KEY`.
  for asset_tensor in assets_collection_to_add:
    source_filepath = _asset_path_from_tensor(asset_tensor)
    if not source_filepath:
      raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)

    # Only the base name of the source path goes into the `AssetFile` proto.
    _add_asset_to_collection(os.path.basename(source_filepath), asset_tensor)
    source_filepaths.append(source_filepath)

  tf_logging.info("Assets added to graph.")
  return source_filepaths
def _save_and_write_assets(self, assets_collection_to_add=None):
  """Saves asset to the meta graph and writes asset files to disk.

  Asset files are copied into the `constants.ASSETS_DIRECTORY` subdirectory
  of the export directory, which is created if it does not exist yet.

  Args:
    assets_collection_to_add: The collection where the asset paths are setup.
  """
  asset_source_filepath_list = _maybe_save_assets(assets_collection_to_add)

  # Return if there are no assets to write. Emptiness is tested by
  # truthiness; comparing `len(...)` to 0 with `is` relied on an
  # implementation detail of small-int caching.
  if not asset_source_filepath_list:
    tf_logging.info("No assets to write.")
    return

  assets_destination_dir = os.path.join(
      compat.as_bytes(self._export_dir),
      compat.as_bytes(constants.ASSETS_DIRECTORY))

  if not file_io.file_exists(assets_destination_dir):
    file_io.recursive_create_dir(assets_destination_dir)

  # Copy each asset from source path to destination path.
  for asset_source_filepath in asset_source_filepath_list:
    asset_source_filename = os.path.basename(asset_source_filepath)

    asset_destination_filepath = os.path.join(
        compat.as_bytes(assets_destination_dir),
        compat.as_bytes(asset_source_filename))

    # Only copy the asset file to the destination if it does not already
    # exist. This is to ensure that an asset with the same name defined as
    # part of multiple graphs is only copied the first time.
    if not file_io.file_exists(asset_destination_filepath):
      file_io.copy(asset_source_filepath, asset_destination_filepath)

  tf_logging.info("Assets written to: %s", assets_destination_dir)
def __init__(self,
             checkpoint_dir,
             save_secs=None,
             save_steps=None,
             saver=None,
             checkpoint_basename="model.ckpt",
             scaffold=None):
  """Initialize CheckpointSaverHook monitor.

  Exactly one of `save_secs` and `save_steps` has to be given.

  Args:
    checkpoint_dir: `str`, base directory for the checkpoint files.
    save_secs: `int`, save every N secs.
    save_steps: `int`, save every N steps.
    saver: `Saver` object, used for saving.
    checkpoint_basename: `str`, base name for the checkpoint files.
    scaffold: `Scaffold`, use to get saver object.

  Raises:
    ValueError: If neither or both of `save_steps` and `save_secs` are set.
  """
  logging.info("Create CheckpointSaverHook.")
  self._saver = saver
  self._scaffold = scaffold

  self._checkpoint_dir = checkpoint_dir
  self._summary_writer = SummaryWriterCache.get(checkpoint_dir)
  self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)

  self._save_secs = save_secs
  self._save_steps = save_steps
  self._last_saved_time = None
  self._last_saved_step = None

  # The arguments are checked only after the attributes above are set.
  steps_given = save_steps is not None
  secs_given = save_secs is not None
  if not (steps_given or secs_given):
    raise ValueError("Either save_steps or save_secs should be provided")
  if steps_given and secs_given:
    raise ValueError("Can not provide both save_steps and save_secs.")
def train(self, delay_secs=None):
  """Fit the estimator using the training data.

  Train the estimator for `self._train_steps` steps, after waiting for
  `delay_secs` seconds. If `self._train_steps` is `None`, train forever.

  Args:
    delay_secs: Start training after this many seconds. If `None`, the delay
      is 5 seconds per task id (`FLAGS.task`, or 0 when that flag does not
      exist), capped at 60 seconds.

  Returns:
    The trained estimator.
  """
  if delay_secs is None:
    task_id = FLAGS.task if hasattr(FLAGS, "task") else 0
    delay_secs = min(60, task_id*5)

  if delay_secs:
    logging.info("Waiting %d secs before starting training.", delay_secs)
    time.sleep(delay_secs)

  fit_kwargs = dict(
      input_fn=self._train_input_fn,
      max_steps=self._train_steps,
      monitors=self._train_monitors)
  return self._estimator.fit(**fit_kwargs)
def add_saver(self):
  """Creates `self.saver`, a V1-format Saver over all global variables."""
  variables = list(tf.global_variables())
  variable_names = [variable.name for variable in variables]
  logging.info('Generating op to save variables:\n\t%s',
               '\n\t'.join(variable_names))
  self.saver = tf.train.Saver(
      var_list=variables, write_version=saver_pb2.SaverDef.V1)
def _restore_from_checkpoint(session, graph, checkpoint_path, saver=None):
  """Restores `session` from `checkpoint_path`.

  Args:
    session: The session to restore into.
    graph: Graph handed to `_make_saver` when no `saver` is given.
    checkpoint_path: Path of the checkpoint to restore from.
    saver: Optional saver to use. If falsy, one is made with `_make_saver`.
  """
  logging.info('Loading model from checkpoint: %s.', checkpoint_path)
  active_saver = saver or _make_saver(graph)
  if not active_saver:
    # Nothing is restored when no saver is available.
    logging.info('No variables found in graph, not creating Saver() object.')
    return
  active_saver.restore(session, checkpoint_path)
def __init__(
    self,
    params,
    device_assigner=None,
    variables=None,
    tree_variables_class=TreeTrainingVariables,
    tree_graphs=None,
    training=True,
    t_ops=training_ops,
    i_ops=inference_ops,
):
  """Builds the forest variables and one tree graph per tree.

  Args:
    params: Forest parameters; `params.num_trees` tree graphs are created.
    device_assigner: Optional device assigner. Defaults to a new
      `RandomForestDeviceAssigner`.
    variables: Optional forest variables, indexable by tree number. Defaults
      to a new `ForestTrainingVariables`.
    tree_variables_class: Class handed to `ForestTrainingVariables` when the
      variables are created here.
    tree_graphs: Optional class used to build each tree graph. Defaults to
      `RandomTreeGraphs`.
    training: Handed to `ForestTrainingVariables` when the variables are
      created here.
    t_ops: Object whose `Load()` result is given to every tree graph.
    i_ops: Object whose `Load()` result is given to every tree graph.
  """
  self.params = params
  self.device_assigner = device_assigner or RandomForestDeviceAssigner()
  logging.info("Constructing forest with params = ")
  logging.info(self.params.__dict__)

  if not variables:
    variables = ForestTrainingVariables(
        self.params,
        device_assigner=self.device_assigner,
        training=training,
        tree_variables_class=tree_variables_class,
    )
  self.variables = variables

  tree_graph_class = tree_graphs or RandomTreeGraphs
  trees = []
  for tree_index in range(self.params.num_trees):
    # `Load()` is called on both op modules once per tree.
    trees.append(
        tree_graph_class(self.variables[tree_index], self.params,
                         t_ops.Load(), i_ops.Load(), tree_index))
  self.trees = trees
def _get_model_dir(tf_config, model_dir):
  """Returns `model_dir` based user provided `tf_config` or `model_dir`.

  Args:
    tf_config: Mapping whose optional 'model_dir' entry is consulted.
    model_dir: The explicitly provided model directory, or `None`.

  Returns:
    `model_dir` if it is set, otherwise the 'model_dir' entry of `tf_config`
    (which may be `None`).

  Raises:
    ValueError: If either value is the empty string, or if both are set to
      different values.
  """
  # pylint: disable=g-explicit-bool-comparison

  # An empty string is falsy, which would otherwise lead to confusing error
  # messages: 'a or b' yields None when a is '' and b is None. `None` is a
  # valid model_dir but '' is not, so '' is rejected explicitly and with a
  # clear message.
  if model_dir == '':
    raise ValueError('model_dir should be non-empty.')

  config_model_dir = tf_config.get('model_dir')
  if config_model_dir == '':
    raise ValueError('model_dir in TF_CONFIG should be non-empty.')

  if not config_model_dir:
    return model_dir or config_model_dir

  if model_dir and model_dir != config_model_dir:
    raise ValueError(
        '`model_dir` provided in RunConfig construct, if set, '
        'must have the same value as the model_dir in TF_CONFIG. '
        'model_dir: {}\nTF_CONFIG["model_dir"]: {}.\n'.format(
            model_dir, config_model_dir))

  logging.info('Using model_dir in TF_CONFIG: %s', config_model_dir)
  return model_dir or config_model_dir
def RunTraining(self, hyperparam_config):
  """Builds a training graph for the config and runs one training step.

  Args:
    hyperparam_config: A `spec_pb2.GridPoint` handed to the `MasterBuilder`.
  """
  master_spec = self.LoadSpec('master_spec_link.textproto')

  self.assertTrue(isinstance(hyperparam_config, spec_pb2.GridPoint))

  # The two dummy gold sentences are parsed and fed as the serialized batch.
  reader_strings = []
  for sentence_text in (_DUMMY_GOLD_SENTENCE, _DUMMY_GOLD_SENTENCE_2):
    gold_doc = sentence_pb2.Sentence()
    text_format.Parse(sentence_text, gold_doc)
    reader_strings.append(gold_doc.SerializeToString())

  tf.logging.info('Generating graph with config: %s', hyperparam_config)
  with tf.Graph().as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)

    target = spec_pb2.TrainTarget()
    target.name = 'testTraining-all'
    train = builder.add_training_from_config(target)
    with self.test_session() as sess:
      logging.info('Initializing')
      sess.run(tf.global_variables_initializer())

      # Run one iteration of training and verify nothing crashes.
      logging.info('Training')
      feeds = {train['input_batch']: reader_strings}
      sess.run(train['run'], feed_dict=feeds)
def _initialize_local(self, num_gpus_per_worker):
  """Initialize internal devices for local training.

  Args:
    num_gpus_per_worker: Number of GPUs to replicate on. With 0 GPUs the
      local CPU is the only compute device.
  """
  self._worker_device = "/job:localhost"

  # One compute device string per replica: every GPU when GPUs are
  # available, otherwise just the local CPU.
  if num_gpus_per_worker > 0:
    device_names = [
        "/device:GPU:{}".format(gpu_index)
        for gpu_index in range(num_gpus_per_worker)
    ]
  else:
    device_names = [_LOCAL_CPU]
  self._compute_devices = [
      device_util.resolve(device_name) for device_name in device_names
  ]
  self._canonical_compute_device_set = set(self._compute_devices)

  # If there is only one GPU, put everything on that GPU. Otherwise, place
  # variables on CPU.
  single_gpu = num_gpus_per_worker == 1
  if single_gpu:
    assert len(self._compute_devices) == 1
  self._variable_device = _LOCAL_GPU_0 if single_gpu else _LOCAL_CPU
  self._parameter_devices = [_LOCAL_GPU_0] if single_gpu else [_LOCAL_CPU]

  self._is_chief = True
  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info(
      "ParameterServerStrategy with compute_devices = %r, "
      "variable_device = %r", self._compute_devices, self._variable_device)
def save(self, as_text=False):
  """Writes a `SavedModel` protocol buffer to disk.

  The export directory is created if it does not exist. The proto is written
  there either as text (`str` of the proto) or in serialized binary form.

  Args:
    as_text: Writes the SavedModel protocol buffer in text format to disk.

  Returns:
    The path to which the SavedModel protocol buffer was written.
  """
  export_dir = self._export_dir
  if not file_io.file_exists(export_dir):
    file_io.recursive_create_dir(export_dir)

  if as_text:
    filename = constants.SAVED_MODEL_FILENAME_PBTXT
    contents = str(self._saved_model)
  else:
    filename = constants.SAVED_MODEL_FILENAME_PB
    contents = self._saved_model.SerializeToString()

  path = os.path.join(compat.as_bytes(export_dir), compat.as_bytes(filename))
  file_io.write_string_to_file(path, contents)
  tf_logging.info("SavedModel written to: %s", path)

  return path
def __init__(self, model_fn, model_dir=None, config=None, params=None,
             warm_start_from=None):
  """Constructs an `Estimator` instance.

  See @{$estimators} for more information. To warm-start an `Estimator`:

  ```python
  estimator = tf.estimator.DNNClassifier(
      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
      hidden_units=[1024, 512, 256],
      warm_start_from="/path/to/checkpoint/dir")
  ```

  For more details on warm-start configuration, see
  @{tf.estimator.WarmStartSettings$WarmStartSettings}.

  Args:
    model_fn: Model function. Follows the signature:

      * Args:

        * `features`: This is the first item returned from the `input_fn`
               passed to `train`, `evaluate`, and `predict`. This should be a
               single `Tensor` or `dict` of same.
        * `labels`: This is the second item returned from the `input_fn`
               passed to `train`, `evaluate`, and `predict`. This should be a
               single `Tensor` or `dict` of same (for multi-head models). If
               mode is `ModeKeys.PREDICT`, `labels=None` will be passed. If
               the `model_fn`'s signature does not accept `mode`, the
               `model_fn` must still be able to handle `labels=None`.
        * `mode`: Optional. Specifies if this training, evaluation or
               prediction. See `ModeKeys`.
        * `params`: Optional `dict` of hyperparameters.  Will receive what
               is passed to Estimator in `params` parameter. This allows
               to configure Estimators from hyper parameter tuning.
        * `config`: Optional configuration object. Will receive what is passed
               to Estimator in `config` parameter, or the default `config`.
               Allows updating things in your model_fn based on configuration
               such as `num_ps_replicas`, or `model_dir`.

      * Returns:
        `EstimatorSpec`

    model_dir: Directory to save model parameters, graph, etc. This can
      also be used to load checkpoints from the directory into an estimator to
      continue training a previously saved model. If `PathLike` object, the
      path will be resolved. If `None`, the model_dir in `config` will be used
      if set. If both are set, they must be same. If both are `None`, a
      temporary directory will be used.
    config: Configuration object. Must be a `RunConfig`; if `None`, a default
      `RunConfig` is created.
    params: `dict` of hyper parameters that will be passed into `model_fn`.
            Keys are names of parameters, values are basic python types.
            A deep copy is stored on the instance.
    warm_start_from: Optional string filepath to a checkpoint to warm-start
                     from, or a `tf.estimator.WarmStartSettings` object to
                     fully configure warm-starting.  If the string filepath is
                     provided instead of a `WarmStartSettings`, then all
                     variables are warm-started, and it is assumed that
                     vocabularies and Tensor names are unchanged.

  Raises:
    RuntimeError: If eager execution is enabled.
    ValueError: parameters of `model_fn` don't match `params`.
    ValueError: if this is called via a subclass and if that class overrides
      a member of `Estimator`.
    ValueError: if `config` is not a `RunConfig`, if `model_dir` and
      `config.model_dir` are both set but differ, or if `model_fn` is `None`.
  """
  if context.in_eager_mode():
    raise RuntimeError(
        'Estimators are not supported when eager execution is enabled.')

  Estimator._assert_members_are_not_overridden(self)

  if config is None:
    self._config = run_config.RunConfig()
    logging.info('Using default config.')
  else:
    if not isinstance(config, run_config.RunConfig):
      # Wrap `config` in a 1-tuple: formatting a bare tuple with `%` would
      # raise TypeError instead of the intended ValueError.
      raise ValueError(
          'config must be an instance of RunConfig, but provided %s.' %
          (config,))
    self._config = config

  # Model directory.
  model_dir = compat_internal.path_to_str(model_dir)
  if (model_dir is not None) and (self._config.model_dir is not None):
    if model_dir != self._config.model_dir:
      # TODO(alanyee): remove this suppression after it is no longer needed
      # pylint: disable=g-doc-exception
      raise ValueError(
          "model_dir are set both in constructor and RunConfig, but with "
          "different values. In constructor: '{}', in RunConfig: "
          "'{}' ".format(model_dir, self._config.model_dir))
      # pylint: enable=g-doc-exception

  self._model_dir = model_dir or self._config.model_dir
  if self._model_dir is None:
    self._model_dir = tempfile.mkdtemp()
    logging.warning('Using temporary folder as model directory: %s',
                    self._model_dir)
  # Keep the config in sync with the directory that was finally chosen.
  if self._config.model_dir is None:
    self._config = self._config.replace(model_dir=self._model_dir)
  logging.info('Using config: %s', str(vars(self._config)))

  # Without an explicit session config, default to soft device placement.
  if self._config.session_config is None:
    self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
  else:
    self._session_config = self._config.session_config

  self._device_fn = _get_replica_device_setter(self._config)

  if model_fn is None:
    raise ValueError('model_fn must be provided to Estimator.')
  _verify_model_fn_args(model_fn, params)
  self._model_fn = model_fn
  self._params = copy.deepcopy(params or {})

  # pylint: disable=protected-access
  self._warm_start_settings = (
      warm_starting_util._get_default_warm_start_settings(warm_start_from))
def __init__(self,
             sequence_feature_columns,
             context_feature_columns=None,
             units=None,
             cell_type=USE_DEFAULT,
             rnn_cell_fn=None,
             return_sequences=False,
             model_dir=None,
             n_classes=2,
             weight_column=None,
             label_vocabulary=None,
             optimizer='Adagrad',
             loss_reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
             sequence_mask='sequence_mask',
             config=None):
  """Creates an `RNNClassifier`.

  A binary head is built when `n_classes` equals 2 and a multi-class head
  otherwise. When `return_sequences` is set, that head is wrapped in a
  `SequentialHeadWrapper`.

  Args:
    sequence_feature_columns: Iterable of `FeatureColumn`s describing the
      sequential input. Every item must be a sequence column (e.g.
      `sequence_numeric_column`) or be built from one (e.g. an
      `embedding_column` over a `sequence_categorical_column_*`).
    context_feature_columns: Iterable of `FeatureColumn`s describing
      contextual input, which is replicated and fed to the RNN at every
      timestep. They must derive from `DenseColumn` (e.g. `numeric_column`)
      and must not be the sequential variants.
    units: Iterable of integers, the number of hidden units of each RNN
      layer. When set, `cell_type` must be given too and `rnn_cell_fn` must
      be `None`.
    cell_type: A class producing a RNN cell, or one of the strings
      `'simple_rnn'`, `'lstm'` and `'gru'`. When set, `units` must be given
      too and `rnn_cell_fn` must be `None`.
    rnn_cell_fn: Function returning the RNN cell instance used to build the
      RNN, for advanced users who need more customization than `units` and
      `cell_type` offer. Cannot be combined with `units` or `cell_type`.
      Stacked RNNs require `tf.keras.layers.StackedRNNCells`.
    return_sequences: Boolean selecting between returning only the last
      output of the output sequence and returning the full sequence. When
      `True`, `weight_column` must be `None` or a string.
    model_dir: Directory where model parameters, graph, etc. are saved. It
      can also be used to load checkpoints into an estimator in order to
      continue training a previously saved model.
    n_classes: Number of label classes; must be > 1. Defaults to 2, i.e.
      binary classification.
    weight_column: A string, or a `NumericColumn` created by
      `tf.feature_column.numeric_column`, naming the feature that holds
      example weights. Weights down-weight or boost examples during training
      and are multiplied by each example's loss. A string is used as the key
      to fetch the weight tensor from `features`; for a `NumericColumn` the
      raw tensor is fetched with `weight_column.key` and
      `weight_column.normalizer_fn` is then applied to it.
    label_vocabulary: List of strings with the possible label values. When
      given, labels must be strings taken from this vocabulary. When omitted,
      labels are expected to be already encoded: integer or float within
      [0, 1] for `n_classes=2`, integers in {0, 1,..., n_classes-1} for
      `n_classes`>2. String labels without a vocabulary cause errors.
    optimizer: A `tf.Optimizer` instance or a string naming the optimizer
      type. Defaults to the Adagrad optimizer.
    loss_reduction: One of `tf.losses.Reduction` except `NONE`; how the
      training loss is reduced over the batch. Defaults to
      `SUM_OVER_BATCH_SIZE`.
    sequence_mask: Name of the sequence mask tensor. If that key is present
      in the features dictionary the provided tensor is used, otherwise the
      mask is computed from the length of the sequential features. The mask
      excludes padding steps from loss and metrics in training and
      evaluation, and is added to the predictions dictionary in prediction
      mode to flag padding steps.
    config: `RunConfig` object configuring the runtime settings.

  Note that a RNN cell has:
    - a `call` method.
    - a `state_size` attribute.
    - a `output_size` attribute.
    - a `get_initial_state` method.

  See the documentation on `tf.keras.layers.RNN` for more details.

  Raises:
    ValueError: If `units`, `cell_type`, and `rnn_cell_fn` are not
      compatible.
  """
  rnn_layer_fn = _make_rnn_layer_fn(
      rnn_cell_fn, units, cell_type, return_sequences=return_sequences)

  # Keyword arguments common to the binary and the multi-class head.
  shared_head_kwargs = dict(
      weight_column=weight_column,
      label_vocabulary=label_vocabulary,
      loss_reduction=loss_reduction)
  if n_classes == 2:
    head = binary_head_lib.BinaryClassHead(**shared_head_kwargs)
  else:
    head = multi_head_lib.MultiClassHead(
        n_classes=n_classes, **shared_head_kwargs)

  if return_sequences:
    logging.info(
        'Converting head to sequential head with '
        '`SequentialHeadWrapper` to allow sequential predictions.')
    head = seq_head_lib.SequentialHeadWrapper(
        head,
        sequence_length_mask=sequence_mask,
        feature_columns=weight_column)

  def _model_fn(features, labels, mode, config):
    """Forwards to `_rnn_model_fn` with this classifier's settings."""
    del config  # Unused.
    # The column iterables are converted to tuples on every call, exactly
    # where the model function is invoked.
    sequence_columns = tuple(sequence_feature_columns or [])
    context_columns = tuple(context_feature_columns or [])
    return _rnn_model_fn(
        features=features,
        labels=labels,
        mode=mode,
        head=head,
        rnn_layer_fn=rnn_layer_fn,
        sequence_feature_columns=sequence_columns,
        context_feature_columns=context_columns,
        return_sequences=return_sequences,
        optimizer=optimizer)

  super(RNNClassifier, self).__init__(
      model_fn=_model_fn, model_dir=model_dir, config=config)
def RunTest(self, run_params):
  """Runs the original and the inference graph and compares their results.

  Random inputs are drawn from a fixed seed, the original saved model is
  verified and run to get reference results, an inference saved model is
  obtained (via calibration, via `_GetInferGraph`, or by reusing the original
  saved model when `run_params.convert_online` is set), and its results are
  asserted to be close to the reference.

  Args:
    run_params: Parameters describing the test configuration to run.

  Returns:
    The value of `self.skipTest(...)` when `ShouldRunTest` says the test
    should not run; otherwise nothing.
  """
  should_run, skip_reason = self.ShouldRunTest(run_params)
  if not should_run:
    return self.skipTest(skip_reason)

  saved_model_dir = self._MakeSavedModel(run_params)

  def _MakeRandomInput(spec, np_shape):
    """Draws one random input of shape `np_shape` with `spec`'s dtype."""
    np_dtype = spec.dtype.as_numpy_dtype()
    # Multiply the input by some constant to avoid all zeros input for
    # integer types.
    if np.issubdtype(np_dtype, np.integer):
      scale = 10.0
    else:
      scale = 1.0
    # TODO(laigd): add debug options. E.g. we can set the input data to be
    # continuous natural numbers:
    # seq = np.arange(np.prod(np_shape))
    # seq.resize(np_shape)
    # current_inputs_data.append(scale * seq.astype(np_dtype))
    data = (scale * np.random.random_sample(np_shape)).astype(np_dtype)
    if run_params.is_v2:
      with ops.device("/GPU:0"):
        data = ops.convert_to_tensor(data)
    return data

  np.random.seed(12345)  # Fix the seed so the test is deterministic.
  input_specs = self._GetParamsCached().input_specs
  inputs_data = []
  for dim_list in self._GetParamsCached().input_dims:
    assert len(input_specs) == len(dim_list)
    inputs_data.append([
        _MakeRandomInput(spec, np_shape)
        for spec, np_shape in zip(input_specs, dim_list)
    ])

  # Verify original graph.
  self._VerifyGraphDef(run_params, saved_model_dir, saved_model_dir,
                       GraphState.ORIGINAL)

  # Run original graph without trt to get reference result.
  config_no_trt = self._GetConfigProto(run_params, GraphState.ORIGINAL)
  logging.info("Running original graph w/o trt, config:\n%s",
               str(config_no_trt))
  ref_result = self._RunGraph(run_params, saved_model_dir, inputs_data,
                              config_no_trt, GraphState.ORIGINAL)

  # Run calibration if necessary. A freshly produced inference saved model
  # is verified; the reused original one is not.
  verify_inference_graph = True
  if IsQuantizationWithCalibration(run_params):
    infer_saved_model_dir = self._GetCalibratedInferGraph(
        run_params, saved_model_dir, inputs_data)
  elif not run_params.convert_online:
    infer_saved_model_dir = self._GetInferGraph(run_params, saved_model_dir)
  else:
    infer_saved_model_dir = saved_model_dir
    verify_inference_graph = False
  if verify_inference_graph:
    self._VerifyGraphDef(run_params, saved_model_dir, infer_saved_model_dir,
                         GraphState.INFERENCE)

  # Run inference.
  infer_config = self._GetConfigProto(run_params, GraphState.INFERENCE)
  logging.info("Running final inference graph, config:\n%s",
               str(infer_config))
  result = self._RunGraph(run_params, infer_saved_model_dir, inputs_data,
                          infer_config, GraphState.INFERENCE)
  self.assertAllClose(
      ref_result,
      result,
      atol=self.ExpectedAbsoluteTolerance(run_params),
      rtol=self.ExpectedRelativeTolerance(run_params))
def stop_heartbeat():
  """Signals the heartbeat thread to stop and waits briefly for cleanup.

  `_heartbeat_timer` and `period` are not parameters; they are resolved from
  an outer scope when this function runs.
  """
  logging.info('Stopping the heartbeat thread')
  _heartbeat_timer.set()
  # Give the threads some time to clean up: a tenth of the period, but never
  # less than two seconds.
  cleanup_grace_secs = max(period // 10, 2)
  time.sleep(cleanup_grace_secs)
def start(period: int) -> threading.Event:
  """Launches the persistent thread that exchanges heartbeats between workers.

  If a heartbeat thread was already started, a warning is logged and the
  existing event is returned without starting another thread.

  Args:
    period: Heartbeat interval in seconds. Heartbeat timeout is set to the
      larger of `period` - 10 and 2s.

  Returns:
    A threading.Event object. Calling its set() method shuts the heartbeat
    service down gracefully. That is usually unnecessary, because an atexit
    handler stops the service on successful program exit; it is useful when
    atexit handlers are not invoked, such as when multiprocessing processes
    exit in tests.
  """
  global _heartbeat_timer
  if _heartbeat_timer is not None:
    logging.warning(
        'A heartbeat thread is already running, skipping this one.')
    return _heartbeat_timer

  task_id = api.client_id()
  num_tasks = api.num_clients()
  is_first_worker = task_id == 0

  # Worker 0 generates a random token. All other workers receive that token.
  if is_first_worker:
    token = np.random.randint(0, 2**16 - 1)  # reserve the other 16 bits
    initial_signal = np.full([num_tasks], token, dtype=np.int32)
  else:
    initial_signal = np.zeros([num_tasks], dtype=np.int32)
  logging.info('Initial heartbeat signal: %s', initial_signal)

  device = tf_device.DeviceSpec(
      job=api.job_name(),
      replica=0,
      task=task_id,
      device_type='CPU',
      device_index=0)
  collective_timeout = max(period - 10, 2)
  # Always use 0 for group and instance keys to reduce unnecessary
  # collective hangs and simplify failure analysis. This also avoid
  # collision with normal collectives.
  with ops.device(device):
    merged_signal = all_reduce(
        constant_op.constant(initial_signal),
        group_size=num_tasks,
        group_key=0,
        instance_key=0,
        timeout=collective_timeout).numpy()
  logging.info('Merged heartbeat signal %s', merged_signal)

  # The merged signal should have equal elements. If not, some worker(s) may
  # be out of sync, and we should terminate all workers.
  if is_first_worker:
    if not np.all(merged_signal == token):
      logging.fatal('Merged heartbeat signal has value != %d', token)
  else:
    distinct_values = set(merged_signal)
    if len(distinct_values) != 1:
      logging.fatal('Merged heartbeat signal has unequal elements')
    token = merged_signal[0]

  # On normal main process exit, set the timer to stop the heartbeat thread.
  _heartbeat_timer = threading.Event()

  def stop_heartbeat():
    """Requests the heartbeat thread to stop and waits for its cleanup."""
    logging.info('Stopping the heartbeat thread')
    _heartbeat_timer.set()
    # Give the threads some time to clean up.
    time.sleep(max(period // 10, 2))

  atexit.register(stop_heartbeat)

  # Start the persistent heartbeat thread.
  heartbeat_args = [
      period, _heartbeat_timer, token, num_tasks, task_id, device
  ]
  heartbeat_thread = threading.Thread(
      target=_heartbeat, args=heartbeat_args, daemon=True)
  heartbeat_thread.start()
  return _heartbeat_timer
def _train_model(self, input_fn, hooks, saving_listeners):
  """Builds the training graph and runs the training loop in a session.

  Inside a fresh graph this calls the model function in `TRAIN` mode,
  optionally warm-starts, makes sure a 'loss' summary and a saver exist,
  assembles worker and chief hooks, and then runs the train op in a
  `MonitoredTrainingSession` until the session asks to stop.

  Args:
    input_fn: Input function handed to
      `_get_features_and_labels_from_input_fn` in `TRAIN` mode.
    hooks: Hooks added to the worker hooks after the input hooks.
    saving_listeners: Listeners appended to the first `CheckpointSaverHook`;
      may be empty.

  Returns:
    The loss value fetched by the last executed training step, or `None` if
    the session requested a stop before any step ran.

  Raises:
    ValueError: If `saving_listeners` is non-empty but no
      `CheckpointSaverHook` is available to attach them to.
  """
  worker_hooks = []
  with ops.Graph().as_default() as graph, graph.device(self._device_fn):
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step_tensor = self._create_and_assert_global_step(graph)
    # pylint: disable=protected-access
    training_util._get_or_create_global_step_read()
    # pylint: enable=protected-access
    features, labels, input_hooks = (
        self._get_features_and_labels_from_input_fn(
            input_fn, model_fn_lib.ModeKeys.TRAIN))
    worker_hooks.extend(input_hooks)
    estimator_spec = self._call_model_fn(
        features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)

    if self._warm_start_settings:
      logging.info('Warm-starting with WarmStartSettings: %s',
                   self._warm_start_settings)
      # pylint: disable=protected-access
      warm_starting_util._warm_start(self._warm_start_settings)
      # pylint: enable=protected-access

    # Check if the user created a loss summary, and add one if they didn't.
    # We assume here that the summary is called 'loss'. If it is not, we will
    # make another one with the name 'loss' to ensure it shows up in the right
    # graph in TensorBoard.
    has_loss_summary = any(
        summary_tensor.op.name == 'loss'
        for summary_tensor in ops.get_collection(ops.GraphKeys.SUMMARIES))
    if not has_loss_summary:
      summary.scalar('loss', estimator_spec.loss)
    ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)

    worker_hooks.extend(hooks)
    nan_hook = training.NanTensorHook(estimator_spec.loss)
    logging_hook = training.LoggingTensorHook(
        {
            'loss': estimator_spec.loss,
            'step': global_step_tensor
        },
        every_n_iter=100)
    worker_hooks.extend([nan_hook, logging_hook])
    worker_hooks.extend(estimator_spec.training_hooks)

    # Register a default saver only when neither the scaffold nor the graph
    # collection already provides one.
    saver_available = (
        estimator_spec.scaffold.saver or
        ops.get_collection(ops.GraphKeys.SAVERS))
    if not saver_available:
      default_saver = training.Saver(
          sharded=True,
          max_to_keep=self._config.keep_checkpoint_max,
          keep_checkpoint_every_n_hours=(
              self._config.keep_checkpoint_every_n_hours),
          defer_build=True,
          save_relative_paths=True)
      ops.add_to_collection(ops.GraphKeys.SAVERS, default_saver)

    chief_hooks = []
    candidate_hooks = worker_hooks + list(estimator_spec.training_chief_hooks)
    saver_hooks = [
        hook for hook in candidate_hooks
        if isinstance(hook, training.CheckpointSaverHook)
    ]
    checkpointing_requested = (
        self._config.save_checkpoints_secs or
        self._config.save_checkpoints_steps)
    if checkpointing_requested and not saver_hooks:
      # No user-supplied saver hook: create one that only the chief runs.
      default_saver_hook = training.CheckpointSaverHook(
          self._model_dir,
          save_secs=self._config.save_checkpoints_secs,
          save_steps=self._config.save_checkpoints_steps,
          scaffold=estimator_spec.scaffold)
      chief_hooks = [default_saver_hook]
      saver_hooks = [chief_hooks[0]]

    if saving_listeners:
      if not saver_hooks:
        raise ValueError(
            'There should be a CheckpointSaverHook to use saving_listeners. '
            'Please set one of the RunConfig.save_checkpoints_steps or '
            'RunConfig.save_checkpoints_secs.')
      # It is expected to have one CheckpointSaverHook. If multiple, we pick
      # up the first one to add listener.
      # pylint: disable=protected-access
      saver_hooks[0]._listeners.extend(saving_listeners)
      # pylint: enable=protected-access

    all_chief_hooks = (
        tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks))
    with training.MonitoredTrainingSession(
        master=self._config.master,
        is_chief=self._config.is_chief,
        checkpoint_dir=self._model_dir,
        scaffold=estimator_spec.scaffold,
        hooks=worker_hooks,
        chief_only_hooks=all_chief_hooks,
        save_checkpoint_secs=0,  # Saving is handled by a hook.
        save_summaries_steps=self._config.save_summary_steps,
        config=self._session_config,
        log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
      loss = None
      fetches = [estimator_spec.train_op, estimator_spec.loss]
      while not mon_sess.should_stop():
        _, loss = mon_sess.run(fetches)
    return loss
def train(self,
          input_fn,
          hooks=None,
          steps=None,
          max_steps=None,
          saving_listeners=None):
  """Trains the model on the data produced by `input_fn`.

  Args:
    input_fn: A function that provides input data for training as
      minibatches. See
      @{$get_started/premade_estimators#create_input_functions} for more
      information. The function should construct and return one of the
      following:

        * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
          tuple (features, labels) with same constraints as below.
        * A tuple (features, labels): Where features is a `Tensor` or a
          dictionary of string feature name to `Tensor` and labels is a
          `Tensor` or a dictionary of string label name to `Tensor`. Both
          features and labels are consumed by `model_fn`. They should
          satisfy the expectation of `model_fn` from inputs.

    hooks: List of `SessionRunHook` subclass instances, used for callbacks
      inside the training loop.
    steps: Number of steps to train for in this call. With `None`, training
      continues forever or until input_fn generates the `OutOfRange` error
      or `StopIteration` exception. `steps` is incremental: calling
      train(steps=10) twice trains for 20 steps in total, fewer if
      `OutOfRange` or `StopIteration` occurs in the middle. Use `max_steps`
      instead for non-incremental behavior. If set, `max_steps` must be
      `None`.
    max_steps: Total number of steps to train the model for. With `None`,
      training continues forever or until input_fn generates the
      `OutOfRange` error or `StopIteration` exception. If set, `steps` must
      be `None`. Training stops early if `OutOfRange` or `StopIteration`
      occurs in the middle. Two calls to `train(steps=100)` mean 200
      training iterations, whereas two calls to `train(max_steps=100)` mean
      the second call does no iteration because the first one already did
      all 100 steps.
    saving_listeners: list of `CheckpointSaverListener` objects, used for
      callbacks that run immediately before or after checkpoint savings.

  Returns:
    `self`, for chaining.

  Raises:
    ValueError: If both `steps` and `max_steps` are not `None`.
    ValueError: If either `steps` or `max_steps` is <= 0.
  """
  steps_given = steps is not None
  max_steps_given = max_steps is not None

  if steps_given and max_steps_given:
    raise ValueError('Can not provide both steps and max_steps.')
  if steps_given and steps <= 0:
    raise ValueError('Must specify steps > 0, given: {}'.format(steps))

  if max_steps_given:
    if max_steps <= 0:
      raise ValueError(
          'Must specify max_steps > 0, given: {}'.format(max_steps))
    # Nothing to do when the checkpoint already reached the step budget.
    start_step = _load_global_step_from_checkpoint_dir(self._model_dir)
    if start_step >= max_steps:
      logging.info('Skipping training since max_steps has already saved.')
      return self

  train_hooks = _check_hooks_type(hooks)
  train_hooks.extend(self._convert_train_steps_to_hooks(steps, max_steps))
  checked_listeners = _check_listeners_type(saving_listeners)

  final_loss = self._train_model(input_fn, train_hooks, checked_listeners)
  logging.info('Loss for final step: %s.', final_loss)
  return self
def Quantize(graph,
             is_training,
             weight_bits=8,
             activation_bits=8,
             ema_decay=0.999,
             quant_delay=None,
             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
             scope=None):
  """Updates graph with quantization operations.

  Currently we quantize the following tensors:
  * Conv/MatMul: Quantize the weights if it matches.
  * Activation: Quantize the output if it matches.
  * Bypass/Post-activation Bypass: Quantize both input and output
    if it matches.

  Bypass quantization ops are created in the parent of the layer context (or
  of the post-activation bypass op name). When that name has no '/' and so
  no parent, the empty (root) context is used instead of failing.

  Args:
    graph: Graph to modify.
    is_training: Whether quantizing training graph or eval graph.
    weight_bits: Number of bits to use for quantizing weights.
    activation_bits: Number of bits to use for quantizing activations.
    ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
      quantization intervals for quantizing activations (see here about EMA:
      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
    quant_delay: (Optional, default None) Int, count of global steps for which
      to delay quantization.  This helps weights stabilize at the start of
      training.
    vars_collection: (Optional) Collection where to store the variables for
      quantization interval ends.
    scope: The scope to be transformed. If it's not None, only the ops which
      are in this scope will be transformed. A trailing '/' is appended when
      missing.

  Raises:
    ValueError: When quantization fails.
  """
  if scope and not scope.endswith('/'):
    scope += '/'

  input_to_ops_map = input_to_ops.InputToOps(graph)
  for layer_match in _FindLayersToQuantize(graph):
    # Quantize the weights.
    context = _GetContextFromOp(layer_match.layer_op)

    # If `scope` is given, only quantize it if the consumer of weights
    # (the layer op) is in the right scope.
    _InsertQuantOp(
        context,
        'weights_quant',
        layer_match.weight_tensor.op, [layer_match.layer_op],
        is_training,
        moving_avg=False,
        ema_decay=ema_decay,
        quant_delay=quant_delay,
        narrow_range=True,
        vars_collection=vars_collection,
        bits=weight_bits,
        consumer_scope=scope)

    # Quantize the activations.
    consumer_ops = input_to_ops_map.ConsumerOperations(
        layer_match.activation_op)
    add_context = context
    if layer_match.bypass_op:
      # The bypass add lives one level above the layer; a context without a
      # '/' has no parent, so fall back to the root context.
      # NOTE(review): assumes `_InsertQuantOp` accepts an empty context;
      # confirm against `_AddContextToName`.
      add_context = _ParentContext(context)

    # If `scope` is given, only quantize it if the producer of weights
    # (usually it's the layer op) is in the right scope.
    _InsertQuantOp(
        add_context,
        'act_quant',
        layer_match.activation_op,
        consumer_ops,
        is_training,
        moving_avg=True,
        ema_decay=ema_decay,
        quant_delay=quant_delay,
        vars_collection=vars_collection,
        bits=activation_bits,
        init_min=0.0,
        producer_scope=scope)

    # Quantize the inputs and output to the bypass (if it exists). The input to
    # the bypass is the bias add, and the output is the activation.
    if layer_match.bypass_op is not None:
      # If `scope` is given, only quantize it if the both the producer and the
      # consumer are in the right scope.
      _InsertQuantOp(
          context,
          'conv_quant',
          layer_match.bias_add_op, [layer_match.bypass_op],
          is_training,
          moving_avg=True,
          ema_decay=ema_decay,
          quant_delay=quant_delay,
          vars_collection=vars_collection,
          bits=activation_bits,
          producer_scope=scope,
          consumer_scope=scope)
      # Make sure the op following this isn't an activation. In which case, we
      # shouldn't quantize it, since the activation will be Fused into the
      # Add at inference time.
      consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op)
      if any(consumer.type in _ACTIVATION_TYPES for consumer in consumers):
        logging.info('Skipping %s, because its followed by an activation.',
                     layer_match.bypass_op.name)
      else:
        # Reuse the consumers looked up just above instead of querying the
        # map a second time for the same op.
        _InsertQuantOp(
            add_context,
            'add_quant',
            layer_match.bypass_op,
            consumers,
            is_training,
            moving_avg=True,
            ema_decay=ema_decay,
            quant_delay=quant_delay,
            vars_collection=vars_collection,
            bits=activation_bits,
            producer_scope=scope,
            consumer_scope=scope)

    # Quantize bypass ops that occur after the activation.
    if layer_match.post_activation_bypass_op is not None:
      # A top-level bypass op (no '/' in its name) gets the root context.
      post_activation_bypass_context = _ParentContext(
          layer_match.post_activation_bypass_op.name)
      # If `scope` is given, only quantize it if the producer is in the right
      # scope.
      # Make sure the op following this isn't an activation. In which case, we
      # shouldn't quantize it, since the activation will be Fused into the
      # Add at inference time.
      consumers = input_to_ops_map.ConsumerOperations(
          layer_match.post_activation_bypass_op)
      if any(consumer.type in _ACTIVATION_TYPES for consumer in consumers):
        logging.info('Skipping %s, because its followed by an activation.',
                     layer_match.post_activation_bypass_op.name)
      else:
        _InsertQuantOp(
            post_activation_bypass_context,
            'post_activation_bypass_quant',
            layer_match.post_activation_bypass_op,
            consumers,
            is_training,
            moving_avg=True,
            ema_decay=ema_decay,
            quant_delay=quant_delay,
            vars_collection=vars_collection,
            bits=activation_bits,
            producer_scope=scope)


def _ParentContext(name):
  """Returns `name` minus its last '/'-separated part, or '' if it has none."""
  parent_match = re.search(r'^(.*)/([^/]+)', name)
  if parent_match is None:
    return ''
  return parent_match.group(1)
def _initialize_multi_worker(self, cluster_resolver):
  """Initializes the object for multi-worker training.

  Normalizes the resolver's cluster spec, records this task's type and id,
  counts the workers and determines chief status, configures collective ops
  (and, in eager mode, enables them through a server definition), derives the
  local device list, builds the cross-device ops and finally delegates to
  `_initialize_single_worker`.

  Args:
    cluster_resolver: Resolver describing the cluster. This method reads its
      `cluster_spec()`, `task_type`, `task_id`, `rpc_layer` and
      `num_accelerators()`, plus `port` when the attribute exists.

  Raises:
    ValueError: If the resolver's `task_type` or `task_id` is `None`, or if
      the worker count computed for the cluster spec is zero.
  """
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id
  if task_type is None or task_id is None:
    raise ValueError(
        "When `cluster_spec` is given, you must also specify "
        "`task_type` and `task_id`.")
  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  self._num_workers = multi_worker_util.worker_count(
      cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError(
        "No `worker`, `chief` or `evaluator` tasks can be found "
        "in `cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  # Device string identifying this task; reused below for local devices.
  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(
      self._worker_device)

  # Configure collective ops unless running in local/standalone client mode.
  # The device filter restricts this task to its own job/task devices.
  if (ops.executing_eagerly_outside_functions() and
      not getattr(self, "_local_or_standalone_client_mode", False)):
    context.context().configure_collective_ops(
        collective_leader=multi_worker_util.collective_leader(
            cluster_spec, task_type, task_id),
        scoped_allocator_enabled_ops=("CollectiveReduce",),
        device_filters=("/job:%s/task:%d" % (task_type, task_id),))
    self._collective_ops_configured = True

  # Starting a std server in eager mode and in independent worker mode.
  # `_std_server_started` guards against doing this more than once.
  if (context.executing_eagerly() and
      not getattr(self, "_std_server_started", False) and
      not getattr(self, "_local_or_standalone_client_mode", False)):
    # Checking _local_or_standalone_client_mode as well because we should not
    # create the std server in standalone client mode.
    config_proto = copy.deepcopy(context.context().config)
    config_proto = self._update_config_proto(config_proto)

    # Fall back to port 0 when the resolver does not expose a `port`.
    if hasattr(cluster_resolver, "port"):
      port = cluster_resolver.port
    else:
      port = 0
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_spec.as_cluster_def(),
        default_session_config=config_proto,
        job_name=task_type,
        task_index=task_id,
        protocol=cluster_resolver.rpc_layer or "grpc",
        port=port)
    context.context().enable_collective_ops(server_def)
    self._std_server_started = True
    # The `ensure_initialized` is needed before calling
    # `context.context().devices()`.
    context.context().ensure_initialized()
    logging.info(
        "Enabled multi-worker collective ops with available devices: %r",
        context.context().devices())

  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
  # some cases.
  if isinstance(cluster_resolver, TFConfigClusterResolver):
    num_gpus = context.num_gpus()
  else:
    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

  # One device string per local GPU, or the bare worker device when there
  # are no GPUs.
  if num_gpus:
    local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                          for i in range(num_gpus))
  else:
    local_devices = (self._worker_device,)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  # The group size counts every local device on every worker, which relies
  # on the same-device-count assumption noted in the TODO above.
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      devices=local_devices,
      group_size=len(local_devices) * self._num_workers,
      collective_keys=self._collective_keys,
      communication=self._communication)
  # CrossDeviceOps for per host tensors.
  self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      devices=[self._worker_device],
      group_size=self._num_workers,
      collective_keys=self._collective_keys,
      communication=cross_device_ops_lib.CollectiveCommunication.RING,
  )
  super(CollectiveAllReduceExtended, self)._initialize_single_worker(
      local_devices)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  # Save the num_gpus_per_worker and rpc_layer for configure method.
  self._num_gpus_per_worker = num_gpus
  self._rpc_layer = cluster_resolver.rpc_layer
  self._warn_nccl_no_gpu()

  logging.info(
      "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
      "task_id = %r, num_workers = %r, local_devices = %r, "
      "communication = %s", cluster_spec.as_dict(), task_type, task_id,
      self._num_workers, local_devices, self._communication)
def _InsertQuantOp(context,
                   name,
                   producer,
                   consumers,
                   is_training,
                   moving_avg=True,
                   init_min=-6.0,
                   init_max=6.0,
                   bits=8,
                   ema_decay=0.999,
                   quant_delay=None,
                   vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
                   narrow_range=False,
                   producer_scope=None,
                   consumer_scope=None):
  """Places a quantization op between a producer op and its consumer ops.

  Nothing is inserted when the producer is outside `producer_scope`, when any
  consumer is outside `consumer_scope`, or when the producer's first output
  is already followed by a FakeQuant op.

  Args:
    context: Context where producer and consumer operations are nested.
    name: Name for the new quantization op within the context.
    producer: Producer operation of the pairs where quantization will be
      inserted.
    consumers: Consumer operations of the pairs.
    is_training: Whether quantizing training graph or eval graph.
    moving_avg: Specifies whether to use exponential moving average or just
      the last value seen.
    init_min: Starting minimum value for the new quantization op.
    init_max: Starting maximum value for the new quantization op.
    bits: Number of bits to use for quantization, must be between 2 and 8.
    ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
      quantization intervals for quantizing activations (see here about EMA:
      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
    quant_delay: (Optional, default None) Int, count of global steps for which
      to delay quantization.  This helps weights stabilize at the start of
      training.
    vars_collection: (Optional) Collection where to store the variables for
      quantization interval ends.
    narrow_range: Whether to use the narrow quantization range
      [1; 2^bits - 1] or wide range [0; 2^bits - 1].
    producer_scope: The restriction of producer scope. If not None, the new op
      will be inserted only when the producer is in this scope.
    consumer_scope: The restriction of consumer scope. If not None, the new op
      will be inserted only when all the consumers are in this scope.

  Raises:
    ValueError: When producer operation is not directly connected to the
      consumer operation.
  """
  if producer_scope and not producer.name.startswith(producer_scope):
    logging.info(
        '_InsertQuantOp ignores context="%s" name="%s" '
        'because producer "%s" is not in scope "%s"',
        context, name, producer.name, producer_scope)
    return

  if consumer_scope:
    scoped_consumers = []
    for candidate in consumers:
      if not candidate.name.startswith(consumer_scope):
        # A single out-of-scope consumer aborts the whole insertion.
        logging.info(
            '_InsertQuantOp context="%s" name="%s" ignores '
            'consumer "%s" because it is not in scope "%s"',
            context, name, candidate.name, consumer_scope)
        return
      scoped_consumers.append(candidate)
    consumers = scoped_consumers

  name_prefix = _AddContextToName(context, name)
  # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
  # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
  # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
  # breaks things later.
  current_name_scope = ops.get_name_scope()
  if current_name_scope:
    name_prefix = common.DropStringPrefix(name_prefix,
                                          current_name_scope + '/')

  producer_output = producer.outputs[0]
  # Prevent ops from being quantized multiple times. Bypass ops can sometimes
  # overlap between multiple matches, so we need to ensure that we don't
  # add duplicate FakeQuant operations.
  if _FollowedByFakeQuant(producer_output):
    return

  # Keyword arguments accepted by both quantization flavours.
  quantize_kwargs = dict(
      init_min=init_min,
      init_max=init_max,
      is_training=is_training,
      num_bits=bits,
      narrow_range=narrow_range,
      vars_collection=vars_collection,
      name_prefix=name_prefix)
  if moving_avg:
    quantized = quant_ops.MovingAvgQuantize(
        producer_output, ema_decay=ema_decay, **quantize_kwargs)
  else:
    quantized = quant_ops.LastValueQuantize(producer_output,
                                            **quantize_kwargs)

  if quant_delay and quant_delay > 0:
    # Pass the unquantized tensor through until `quant_delay` steps elapsed.
    activate_quant = math_ops.greater_equal(
        common.CreateOrGetQuantizationStep(),
        quant_delay,
        name=name_prefix + '/activate_quant')
    quantized = control_flow_ops.cond(
        activate_quant,
        lambda: quantized,
        lambda: producer_output,
        name=name_prefix + '/delayed_quant')

  if not consumers:
    return

  rerouted_count = common.RerouteTensor(
      quantized, producer_output, can_modify=consumers)
  # Some operations can have multiple output tensors going to the same
  # consumer. Since consumers is a set, we need to ensure that
  # tensors_modified_count is greater than or equal to the length of the set
  # of consumers.
  if rerouted_count < len(consumers):
    raise ValueError('No inputs quantized for ops: [%s]' % ', '.join(
        consumer_op.name for consumer_op in consumers))
def __init__(self,
             model_dir=None,
             tf_random_seed=None,
             save_summary_steps=100,
             save_checkpoints_steps=_USE_DEFAULT,
             save_checkpoints_secs=_USE_DEFAULT,
             session_config=None,
             keep_checkpoint_max=5,
             keep_checkpoint_every_n_hours=10000,
             log_step_count_steps=100,
             train_distribute=None,
             device_fn=None,
             protocol=None,
             eval_distribute=None,
             experimental_distribute=None,
             experimental_max_worker_delay_secs=None):
  """Creates a RunConfig from explicit arguments and `TF_CONFIG`.

  The distributed-training properties `cluster_spec`, `is_chief`, `master`,
  `num_worker_replicas`, `num_ps_replicas`, `task_id` and `task_type` are not
  constructor arguments. They are derived from the `TF_CONFIG` environment
  variable whenever it carries the pertinent information.

  `TF_CONFIG` holds a JSON object with two attributes, `cluster` and `task`:

  * `cluster` is the JSON serialized form of the Python dict accepted by
    `ClusterSpec` in `server_lib.py`. It maps task types (usually one of the
    `TaskType` enums) to a list of task addresses.
  * `task` has the attributes `type` and `index`, where `type` may be any of
    the task types present in `cluster`.

  When `TF_CONFIG` carries that information, the properties are set like so:

  * `cluster_spec` is parsed from `TF_CONFIG['cluster']` and defaults to {}.
    If present, its `chief` attribute must contain one and only one node.
  * `task_type` is `TF_CONFIG['task']['type']`. It must be set if
    `cluster_spec` is present; without a `cluster_spec` it must be `worker`
    (the default value).
  * `task_id` is `TF_CONFIG['task']['index']`. It must be set if
    `cluster_spec` is present; without a `cluster_spec` it must be 0 (the
    default value).
  * `master` is found by looking up `task_type` and `task_id` in the
    `cluster_spec`. Defaults to ''.
  * `num_ps_replicas` is the number of nodes listed under the `ps` attribute
    of `cluster_spec`. Defaults to 0.
  * `num_worker_replicas` is the number of nodes listed under the `worker`
    and `chief` attributes of `cluster_spec`. Defaults to 1.
  * `is_chief` is determined from `task_type` and `cluster`.

  A node whose `task_type` is `evaluator` is special: it is not part of the
  (training) `cluster_spec` and handles the distributed evaluation job.

  Example of non-chief node:
  ```
    cluster = {'chief': ['host0:2222'],
               'ps': ['host1:2222', 'host2:2222'],
               'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
    os.environ['TF_CONFIG'] = json.dumps(
        {'cluster': cluster,
         'task': {'type': 'worker', 'index': 1}})
    config = RunConfig()
    assert config.master == 'host4:2222'
    assert config.task_id == 1
    assert config.num_ps_replicas == 2
    assert config.num_worker_replicas == 4
    assert config.cluster_spec == server_lib.ClusterSpec(cluster)
    assert config.task_type == 'worker'
    assert not config.is_chief
  ```

  Example of chief node:
  ```
    cluster = {'chief': ['host0:2222'],
               'ps': ['host1:2222', 'host2:2222'],
               'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
    os.environ['TF_CONFIG'] = json.dumps(
        {'cluster': cluster,
         'task': {'type': 'chief', 'index': 0}})
    config = RunConfig()
    assert config.master == 'host0:2222'
    assert config.task_id == 0
    assert config.num_ps_replicas == 2
    assert config.num_worker_replicas == 4
    assert config.cluster_spec == server_lib.ClusterSpec(cluster)
    assert config.task_type == 'chief'
    assert config.is_chief
  ```

  Example of evaluator node (evaluator is not part of training cluster):
  ```
    cluster = {'chief': ['host0:2222'],
               'ps': ['host1:2222', 'host2:2222'],
               'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
    os.environ['TF_CONFIG'] = json.dumps(
        {'cluster': cluster,
         'task': {'type': 'evaluator', 'index': 0}})
    config = RunConfig()
    assert config.master == ''
    assert config.evaluator_master == ''
    assert config.task_id == 0
    assert config.num_ps_replicas == 0
    assert config.num_worker_replicas == 0
    assert config.cluster_spec == {}
    assert config.task_type == 'evaluator'
    assert not config.is_chief
  ```

  N.B.: When `save_checkpoints_steps` or `save_checkpoints_secs` is set,
  `keep_checkpoint_max` may need adjusting as well, especially in distributed
  training. For instance, `save_checkpoints_secs` of 60 combined with the
  default `keep_checkpoint_max` of 5 means a checkpoint is garbage collected
  after 5 minutes. In distributed training the evaluation job starts
  asynchronously and might fail to load or find the checkpoint because of
  that race.

  Args:
    model_dir: Directory where model parameters, graph, etc are saved. A
      `PathLike` object is resolved to a path. If `None`, a default value set
      by the Estimator is used.
    tf_random_seed: Random seed for TensorFlow initializers. Setting it
      allows consistency between reruns.
    save_summary_steps: Save summaries every this many steps.
    save_checkpoints_steps: Save checkpoints every this many steps. Can not
      be specified together with `save_checkpoints_secs`.
    save_checkpoints_secs: Save checkpoints every this many seconds. Can not
      be specified together with `save_checkpoints_steps`. Defaults to 600
      seconds when neither `save_checkpoints_steps` nor
      `save_checkpoints_secs` is passed to the constructor. If both are
      `None`, checkpoints are disabled.
    session_config: A ConfigProto used to set session parameters, or `None`.
    keep_checkpoint_max: The maximum number of recent checkpoint files to
      keep; older files are deleted as new ones are created. If `None` or 0,
      all checkpoint files are kept. Defaults to 5 (that is, the 5 most
      recent checkpoint files are kept.)
    keep_checkpoint_every_n_hours: Number of hours between each checkpoint to
      be saved. The default value of 10,000 hours effectively disables the
      feature.
    log_step_count_steps: The frequency, in number of global steps, at which
      the global step and the loss are logged during training. It also
      controls how often the global steps / s are logged (and written to
      summary) during training.
    train_distribute: An optional instance of
      `tf.contrib.distribute.DistributionStrategy`. If specified, Estimator
      distributes the user's model during training according to the policy
      of that strategy. Setting `experimental_distribute.train_distribute`
      is preferred.
    device_fn: A callable invoked for every `Operation`; it takes the
      `Operation` and returns the device string. If `None`, defaults to the
      device function returned by `tf.train.replica_device_setter` with
      round-robin strategy.
    protocol: An optional argument specifying the protocol used when starting
      the server. `None` means default to grpc.
    eval_distribute: An optional instance of
      `tf.contrib.distribute.DistributionStrategy`. If specified, Estimator
      distributes the user's model during evaluation according to the policy
      of that strategy. Setting `experimental_distribute.eval_distribute` is
      preferred.
    experimental_distribute: An optional
      `tf.contrib.distribute.DistributeConfig` object specifying
      DistributionStrategy-related configuration. `train_distribute` and
      `eval_distribute` can be passed as parameters to `RunConfig` or set in
      `experimental_distribute`, but not both.
    experimental_max_worker_delay_secs: An optional integer specifying the
      maximum time a worker should wait before starting. By default workers
      are started at staggered times, each delayed by up to 60 seconds. This
      is meant to reduce the risk of divergence, which can occur when many
      workers simultaneously update the weights of a randomly initialized
      model. Users who warm-start their models and train them for short
      durations (a few minutes or less) should consider reducing this
      default to improve training times.

  Raises:
    ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
      are set.
  """
  # Resolve the two checkpoint cadence knobs: an untouched knob becomes
  # `None`, and when neither was touched the 600-second default applies.
  steps_unset = save_checkpoints_steps == _USE_DEFAULT
  secs_unset = save_checkpoints_secs == _USE_DEFAULT
  if steps_unset and secs_unset:
    save_checkpoints_steps, save_checkpoints_secs = None, 600
  elif secs_unset:
    save_checkpoints_secs = None
  elif steps_unset:
    save_checkpoints_steps = None
  elif not (save_checkpoints_steps is None or save_checkpoints_secs is None):
    raise ValueError(_SAVE_CKPT_ERR)

  raw_tf_config = os.environ.get(_TF_CONFIG_ENV, '{}')
  tf_config = json.loads(raw_tf_config)
  if tf_config:
    logging.info('TF_CONFIG environment variable: %s', tf_config)

  model_dir_str = compat_internal.path_to_str(model_dir)
  model_dir = _get_model_dir(tf_config, model_dir_str)

  RunConfig._replace(
      self,
      allowed_properties_list=_DEFAULT_REPLACEABLE_LIST,
      model_dir=model_dir,
      tf_random_seed=tf_random_seed,
      save_summary_steps=save_summary_steps,
      save_checkpoints_steps=save_checkpoints_steps,
      save_checkpoints_secs=save_checkpoints_secs,
      session_config=session_config,
      keep_checkpoint_max=keep_checkpoint_max,
      keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
      log_step_count_steps=log_step_count_steps,
      train_distribute=train_distribute,
      device_fn=device_fn,
      protocol=protocol,
      eval_distribute=eval_distribute,
      experimental_distribute=experimental_distribute,
      experimental_max_worker_delay_secs=experimental_max_worker_delay_secs)

  def _is_non_tpu_strategy(strategy):
    # The class is matched by name, exactly as the inline check always did.
    return strategy and strategy.__class__.__name__ != 'TPUStrategy'

  # TODO(frankchn,priyag): Eventually use distributed coordinator for TPUs.
  if (_is_non_tpu_strategy(train_distribute) or
      _is_non_tpu_strategy(eval_distribute) or
      experimental_distribute):
    logging.info('Initializing RunConfig with distribution strategies.')
    distribute_coordinator_training.init_run_config(self, tf_config)
  else:
    self._init_distributed_setting_from_environment_var(tf_config)
    self._maybe_overwrite_session_config_for_distributed_training()
def __init__(self,
             dim=1000,
             num_ops=100,
             virtual_devices_per_gpu=None,
             device_probabilities=None):
  """Sets up the virtual device layout and the device selection table.

  Args:
    dim: Stored as `self._dim` and logged.
      NOTE(review): presumably the size of the tensors used by the test --
      confirm against the code that reads `_dim`.
    num_ops: Stored as `self._num_ops` and logged.
      NOTE(review): presumably how many ops the test creates -- confirm.
    virtual_devices_per_gpu: A list with one entry per visible GPU giving the
      number of virtual devices to create on it. Defaults to `[3]`.
    device_probabilities: Optional per-device probabilities, in the order of
      `self.devices`. They are copied and turned into a running (cumulative)
      sum. When `None`, every device gets the same probability.
  """
  self._dim = dim
  self._num_ops = num_ops
  self._virtual_devices_per_gpu = (
      [3] if virtual_devices_per_gpu is None else virtual_devices_per_gpu)
  self._visible_device_list = list(range(len(self._virtual_devices_per_gpu)))

  # One '/gpu:N' name per virtual device, listed after the CPU.
  virtual_gpu_total = sum(self._virtual_devices_per_gpu)
  self.devices = ['/cpu:0'] + [
      '/gpu:' + str(gpu_index) for gpu_index in range(virtual_gpu_total)
  ]
  self._num_devices = len(self.devices)

  # Each virtual device gets 2GB memory (1 << 11 MB).
  self._mem_limits_mb = [
      [1 << 11] * device_count
      for device_count in self._virtual_devices_per_gpu
  ]
  self.config = self._GetSessionConfig()

  device_total = self._num_devices
  if device_probabilities is None:
    # Each device gets same probability to be assigned an operation.
    uniform_share = 1.0 / device_total
    self._device_probabilities = [
        (position + 1) * uniform_share for position in range(device_total)
    ]
  else:
    # Copy the caller's values, then accumulate them in place.
    self._device_probabilities = list(device_probabilities)
    for position in range(1, device_total):
      self._device_probabilities[position] += (
          self._device_probabilities[position - 1])
  # To prevent rounding error causing problems, the last threshold is pushed
  # above 1.
  self._device_probabilities[device_total - 1] = 1.1

  logging.info('dim: %d', self._dim)
  logging.info('num_ops: %d', self._num_ops)
  logging.info('visible_device_list: %s', str(self._visible_device_list))
  logging.info('virtual_devices_per_gpu: %s',
               str(self._virtual_devices_per_gpu))
  logging.info('mem_limits: %s', str(self._mem_limits_mb))
  logging.info('devices: %s', str(self.devices))
  logging.info('config: %s', text_format.MessageToString(self.config))
  logging.info('device_probabilities: %s', str(self._device_probabilities))
def func():
  """Logs, at info level, that this function is running."""
  logging.info('func running')
def _initialize_local(self, cluster_resolver, devices=None):
  """Sets this object up for training on the local machine only.

  Args:
    cluster_resolver: The cluster resolver. Its accelerator count (unless it
      is a `TFConfigClusterResolver`) and its `rpc_layer` are read here.
    devices: Optional local devices to use. When empty or `None`, one device
      per detected GPU is used, or the CPU if no GPU is found.
  """
  self._is_chief = True
  self._num_workers = 1

  if ops.executing_eagerly_outside_functions():
    try:
      context.context().configure_collective_ops(
          scoped_allocator_enabled_ops=("CollectiveReduce",))
    except RuntimeError:
      logging.warning(
          "Collective ops is not configured at program startup. "
          "Some performance features may not be enabled.")
    # Recorded whether or not the configuration attempt succeeded.
    self._collective_ops_configured = True

  # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
  # some cases.
  gpu_count = (
      context.num_gpus()
      if isinstance(cluster_resolver, TFConfigClusterResolver)
      else cluster_resolver.num_accelerators().get("GPU", 0))

  # Explicit devices win; otherwise fall back to all GPUs, then to the CPU.
  if devices:
    chosen_devices = devices
  elif gpu_count:
    chosen_devices = tuple(
        ["/device:GPU:%d" % gpu_index for gpu_index in range(gpu_count)])
  else:
    chosen_devices = ("/device:CPU:0",)

  self._worker_device = device_util.canonicalize("/device:CPU:0")
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      devices=chosen_devices,
      group_size=len(chosen_devices),
      collective_keys=self._collective_keys,
      communication=self._communication)
  # CrossDeviceOps for per host tensors.
  self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      devices=[self._worker_device],
      group_size=self._num_workers,
      collective_keys=self._collective_keys,
      communication=cross_device_ops_lib.CollectiveCommunication.RING,
  )
  super(CollectiveAllReduceExtended, self)._initialize_single_worker(
      chosen_devices)

  self._cluster_spec = self._task_type = self._task_id = None

  # This is a mark to tell whether we are running with standalone client or
  # independent worker. Right now with standalone client, strategy object is
  # created as local strategy and then turn into multi-worker strategy via
  # configure call.
  self._local_or_standalone_client_mode = True

  # Save the num_gpus_per_worker and rpc_layer for configure method.
  self._num_gpus_per_worker = gpu_count
  self._rpc_layer = cluster_resolver.rpc_layer
  self._warn_nccl_no_gpu()

  logging.info(
      "Single-worker MultiWorkerMirroredStrategy with local_devices "
      "= %r, communication = %s", chosen_devices, self._communication)
def train_step(sess, train_op, global_step, train_step_kwargs):
  """Runs one gradient step and reports whether training should stop.

  Args:
    sess: The current session.
    train_op: An `Operation` that evaluates the gradients and returns the total
      loss.
    global_step: A `Tensor` representing the global training step.
    train_step_kwargs: A dictionary of keyword arguments. The keys looked at
      are 'should_trace', 'logdir', 'summary_writer', 'should_log' and
      'should_stop'.

  Returns:
    A pair of the total loss and a boolean indicating whether or not to stop
    training.

  Raises:
    ValueError: if 'should_trace' is in `train_step_kwargs` but `logdir` is not.
  """
  step_began = time.time()

  # Both stay None unless this particular step is selected for tracing.
  trace_options = None
  trace_metadata = None
  if 'should_trace' in train_step_kwargs:
    if 'logdir' not in train_step_kwargs:
      raise ValueError('logdir must be present in train_step_kwargs when '
                       'should_trace is present')
    if sess.run(train_step_kwargs['should_trace']):
      trace_options = config_pb2.RunOptions(
          trace_level=config_pb2.RunOptions.FULL_TRACE)
      trace_metadata = config_pb2.RunMetadata()

  total_loss, np_global_step = sess.run(
      [train_op, global_step],
      options=trace_options,
      run_metadata=trace_metadata)
  time_elapsed = time.time() - step_began

  if trace_metadata is not None:
    # Dump the collected step stats as a Chrome trace next to the other logs.
    chrome_trace = timeline.Timeline(
        trace_metadata.step_stats).generate_chrome_trace_format()
    trace_filename = os.path.join(train_step_kwargs['logdir'],
                                  'tf_trace-%d.json' % np_global_step)
    logging.info('Writing trace to %s', trace_filename)
    file_io.write_string_to_file(trace_filename, chrome_trace)
    if 'summary_writer' in train_step_kwargs:
      train_step_kwargs['summary_writer'].add_run_metadata(
          trace_metadata, 'run_metadata-%d' % np_global_step)

  if ('should_log' in train_step_kwargs and
      sess.run(train_step_kwargs['should_log'])):
    logging.info('global step %d: loss = %.4f (%.3f sec/step)',
                 np_global_step, total_loss, time_elapsed)

  # TODO(nsilberman): figure out why we can't put this into sess.run. The
  # issue right now is that the stop check depends on the global step. The
  # increment of global step often happens via the train op, which used
  # created using optimizer.apply_gradients.
  #
  # Since running `train_op` causes the global step to be incremented, one
  # would expected that using a control dependency would allow the
  # should_stop check to be run in the same session.run call:
  #
  #   with ops.control_dependencies([train_op]):
  #     should_stop_op = ...
  #
  # However, this actually seems not to work on certain platforms.
  should_stop = (
      sess.run(train_step_kwargs['should_stop'])
      if 'should_stop' in train_step_kwargs else False)

  return total_loss, should_stop
def convert_variables_to_constants_v2(func):
  """Freezes a function's graph by turning its variables into constants.

  TensorFlow 2.0 function for converting all Variable ops into Const ops holding
  the same values. This makes it possible to describe the network fully with a
  single GraphDef file, and allows the removal of a lot of ops related to
  loading and saving the variables. Grappler's function inlining optimization
  is run first so that a single subgraph is returned.

  The current implementation only works for graphs that do not contain any
  control flow or embedding related ops.

  Args:
    func: ConcreteFunction.

  Returns:
    GraphDef containing a simplified version of the original.

  Raises:
    ValueError: If the input of a ReadVariableOp, followed through any
      Identity ops, is not a Placeholder op.
  """
  # TODO(nupurgarg): Replace ResourceGather with Gather.
  # TODO(nupurgarg): Change attr for Variables in control flow and functions.
  graph_def = _run_inline_graph_optimization(func)

  def _node_name(tensor_name):
    # Drops the ":<output index>" suffix of a tensor name.
    return tensor_name.split(":")[0]

  # Identify the ReadVariableOps.
  nodes_by_name = {}
  for graph_node in graph_def.node:
    nodes_by_name[_node_name(graph_node.name)] = graph_node

  # TODO(b/125838789): Use `func.graph.captures`.
  # Get mapping from input name to variable value.
  captured = func.captured_inputs
  captured_tensors = func.inputs[-len(captured):]
  values_by_input = {}
  for variable in func.graph.variables:
    placeholder_tensor = captured_tensors[captured.index(variable.handle)]
    values_by_input[_node_name(placeholder_tensor.name)] = variable.numpy()

  identity_dtypes = {}
  placeholder_info = {}
  for graph_node in graph_def.node:
    if graph_node.op != "ReadVariableOp":
      continue
    # Get name of Placeholder op associated with ReadVariableOp. There can be
    # an Identity in between the ReadVariableOp and Placeholder. Store the
    # Identity ops with the associated dtypes.
    source_name = _node_name(graph_node.input[0])
    while nodes_by_name[source_name].op == "Identity":
      identity_dtypes[source_name] = graph_node.attr["dtype"]
      source_name = _node_name(nodes_by_name[source_name].input[0])
    if nodes_by_name[source_name].op != "Placeholder":
      raise ValueError("Cannot find the Placeholder op that is an input "
                       "to the ReadVariableOp.")
    # Build a map of Placeholder ops that are inputs to ReadVariableOps to the
    # variable's dtype and data.
    placeholder_info[source_name] = {
        "dtype": graph_node.attr["dtype"],
        "data": values_by_input[source_name],
    }

  # Reconstruct the graph with constants in place of variables.
  frozen_graph_def = graph_pb2.GraphDef()
  converted_count = 0
  for original in graph_def.node:
    rewritten = frozen_graph_def.node.add()
    node_name = original.name
    if node_name in placeholder_info:
      # Convert Placeholder ops that are inputs to ReadVariableOps into Const
      # ops.
      info = placeholder_info[node_name]
      dtype = info["dtype"]
      data = info["data"]
      rewritten.op = "Const"
      rewritten.name = node_name
      rewritten.attr["dtype"].CopyFrom(dtype)
      rewritten.attr["value"].tensor.CopyFrom(
          tensor_util.make_tensor_proto(
              data, dtype=dtype.type, shape=data.shape))
      converted_count += 1
    elif node_name in identity_dtypes:
      # Change the dtype for Identity ops that are inputs to ReadVariableOps.
      rewritten.CopyFrom(original)
      rewritten.attr["T"].CopyFrom(identity_dtypes[node_name])
    elif original.op == "ReadVariableOp":
      # Convert ReadVariableOps into Identity ops.
      rewritten.op = "Identity"
      rewritten.name = node_name
      rewritten.input.extend([original.input[0]])
      rewritten.attr["T"].CopyFrom(original.attr["dtype"])
      if "_class" in original.attr:
        rewritten.attr["_class"].CopyFrom(original.attr["_class"])
    else:
      rewritten.CopyFrom(original)

  logging.info("Converted %d variables to const ops.", converted_count)
  return frozen_graph_def
def after_run(self, run_context, run_values):
  """Requests a stop once the configured number of evaluations has run.

  Args:
    run_context: The run context; `request_stop()` is called on it when the
      number of completed evaluations reaches `self._num_evals`.
    run_values: The run values; `results['evals_completed']` holds the number
      of evaluations completed so far.
  """
  completed = run_values.results['evals_completed']
  if self._log_progress:
    logging.info('Evaluation [%d/%d]', completed, self._num_evals)
  reached_target = completed >= self._num_evals
  if reached_target:
    run_context.request_stop()
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a
      dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training,
      as measured by 'global_step': training will stop once global_step is
      greater than or equal to 'number_of_steps'. If the value is left as
      None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling
      `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list of
      them. If the argument is supplied, gradient updates will be synchronous.
      If left as `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.

  Returns:
    the value of the loss function after training, or `None` if the supervisor
    requested a stop before any training step was executed.

  Raises:
    ValueError: if `train_op` is empty, if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      not positive, if `sync_optimizer` contains something that is not a
      `tf.train.SyncReplicasOptimizer`, or if `summary_op`, `saver` or
      `trace_every_n_steps` is provided while `logdir` is `None`.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if logdir is None:
    if summary_op != _USE_DEFAULT:
      raise ValueError('Cannot provide summary_op because logdir=None')
    if saver is not None:
      raise ValueError('Cannot provide saver because logdir=None')
    if trace_every_n_steps is not None:
      raise ValueError('Cannot provide trace_every_n_steps because '
                       'logdir=None')

  # Normalize a single optimizer to a one-element list.
  if isinstance(sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer):
    sync_optimizer = [sync_optimizer]
  if sync_optimizer is not None and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  with graph.as_default():
    if global_step is None:
      global_step = variables.get_or_create_global_step()
    saver = saver or tf_saver.Saver()

    if sync_optimizer is not None:
      for opt in sync_optimizer:
        if not isinstance(opt, sync_replicas_optimizer.SyncReplicasOptimizer):
          raise ValueError(
              '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.')

    with ops.name_scope('init_ops'):
      if init_op == _USE_DEFAULT:
        init_op = tf_variables.global_variables_initializer()

      if ready_op == _USE_DEFAULT:
        ready_op = tf_variables.report_uninitialized_variables()

      if local_init_op == _USE_DEFAULT:
        local_init_op = control_flow_ops.group(
            tf_variables.local_variables_initializer(),
            lookup_ops.tables_initializer())

      if sync_optimizer is not None and isinstance(sync_optimizer, list):
        with ops.control_dependencies(
            [local_init_op] if local_init_op is not None else []):
          if is_chief:
            local_init_op = control_flow_ops.group(
                *[opt.chief_init_op for opt in sync_optimizer])
          else:
            local_init_op = control_flow_ops.group(
                *[opt.local_step_init_op for opt in sync_optimizer])
        ready_for_local_init_op = control_flow_ops.group(
            *[opt.ready_for_local_init_op for opt in sync_optimizer])
      else:
        ready_for_local_init_op = None

    if summary_op == _USE_DEFAULT:
      summary_op = summary.merge_all()

    if summary_writer == _USE_DEFAULT:
      summary_writer = supervisor.Supervisor.USE_DEFAULT

    if is_chief and sync_optimizer is not None:
      # Need to create these BEFORE the supervisor finalizes the graph:
      init_tokens_op = [opt.get_init_tokens_op() for opt in sync_optimizer]
      chief_queue_runner = [
          opt.get_chief_queue_runner() for opt in sync_optimizer
      ]

    if train_step_kwargs == _USE_DEFAULT:
      with ops.name_scope('train_step'):
        train_step_kwargs = {}

        if number_of_steps:
          should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
        else:
          should_stop_op = constant_op.constant(False)
        train_step_kwargs['should_stop'] = should_stop_op
        if log_every_n_steps > 0:
          train_step_kwargs['should_log'] = math_ops.equal(
              math_ops.mod(global_step, log_every_n_steps), 0)
        if is_chief and trace_every_n_steps is not None:
          train_step_kwargs['should_trace'] = math_ops.equal(
              math_ops.mod(global_step, trace_every_n_steps), 0)
          train_step_kwargs['logdir'] = logdir

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      ready_for_local_init_op=ready_for_local_init_op,
      ready_op=ready_op,
      summary_op=summary_op,
      summary_writer=summary_writer,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  if summary_writer is not None:
    train_step_kwargs['summary_writer'] = sv.summary_writer

  # Bound up front so that `return total_loss` cannot hit an unbound local
  # when the supervisor asks to stop before the first step runs.
  total_loss = None
  should_retry = True
  while should_retry:
    try:
      should_retry = False
      with sv.managed_session(
          master, start_standard_services=False,
          config=session_config) as sess:
        logging.info('Starting Session.')
        if is_chief:
          if logdir:
            sv.start_standard_services(sess)
        elif startup_delay_steps > 0:
          # `sys.maxsize` is used because `sys.maxint` does not exist in
          # Python 3.
          _wait_for_step(
              sess, global_step,
              min(startup_delay_steps, number_of_steps or sys.maxsize))
        threads = sv.start_queue_runners(sess)
        logging.info('Starting Queues.')
        if is_chief and sync_optimizer is not None:
          sv.start_queue_runners(sess, chief_queue_runner)
          sess.run(init_tokens_op)
        try:
          while not sv.should_stop():
            total_loss, should_stop = train_step_fn(
                sess, train_op, global_step, train_step_kwargs)
            if should_stop:
              logging.info('Stopping Training.')
              sv.request_stop()
              break
        except errors.OutOfRangeError:
          # OutOfRangeError is thrown when epoch limit per
          # tf.train.limit_epochs is reached.
          logging.info('Caught OutOfRangeError. Stopping Training.')
        if logdir and sv.is_chief:
          logging.info('Finished training! Saving model to disk.')
          sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
          sv.stop(threads, close_summary_writer=True)

    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      logging.info('Retrying training!')
      should_retry = True

  return total_loss
def print_hparams(self):
  """Logs the JSON form of `self._spec` at info level."""
  spec_json = self._spec.to_json()
  logging.info(spec_json)
def _evaluate_once(checkpoint_path,
                   master='',
                   scaffold=None,
                   eval_ops=None,
                   feed_dict=None,
                   final_ops=None,
                   final_ops_feed_dict=None,
                   hooks=None,
                   config=None):
  """Runs a single evaluation of the model stored at `checkpoint_path`.

  Within one evaluation, `eval_ops` is run repeatedly until the session is
  interrupted or asked to finish. The request to finish typically comes from a
  `tf.contrib.training.StopAfterNEvalsHook`, which makes `eval_ops` run the
  requested number of times.

  A caller may also pass `final_ops`: a single `Tensor`, a list of `Tensors`
  or a dictionary from names to `Tensors`. `final_ops` is evaluated exactly
  once after `eval_ops` has finished running, and the fetched values are
  returned. If `final_ops` is left as `None`, then `None` is returned.

  A `tf.contrib.training.SummaryAtEndHook` can be used to record summaries
  after the `eval_ops` have run. If `eval_ops` is `None`, the summaries run
  immediately after the model checkpoint has been restored.

  Note that `evaluate_once` creates a local variable used to track the number
  of evaluations run via `tf.contrib.training.get_or_create_eval_step`.
  Consequently, if a custom local init op is provided via a `scaffold`, the
  caller should ensure that the local init op also initializes the eval step.

  Args:
    checkpoint_path: The path to a checkpoint to use for evaluation.
    master: The BNS address of the TensorFlow master.
    scaffold: An tf.train.Scaffold instance for initializing variables and
      restoring variables. Note that `scaffold.init_fn` is used by the function
      to restore the checkpoint. If you supply a custom init_fn, then it must
      also take care of restoring the model from its checkpoint.
    eval_ops: A single `Tensor`, a list of `Tensors` or a dictionary of names
      to `Tensors`, which is run until the session is requested to stop,
      commonly done by a `tf.contrib.training.StopAfterNEvalsHook`. When a
      dictionary is passed, an 'update_eval_step' entry is added to it in
      place.
    feed_dict: The feed dictionary to use when executing the `eval_ops`.
    final_ops: A single `Tensor`, a list of `Tensors` or a dictionary of names
      to `Tensors`.
    final_ops_feed_dict: A feed dictionary to use when evaluating `final_ops`.
    hooks: List of `tf.train.SessionRunHook` callbacks which are run inside the
      evaluation loop.
    config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The fetched values of `final_ops` or `None` if `final_ops` is `None`.
  """
  eval_step = _get_or_create_eval_step()

  # Prepare the run hooks. A fresh list is used so that the FinalOpsHook
  # appended further down is not added to the caller's list.
  run_hooks = list(hooks) if hooks else []

  has_eval_ops = eval_ops is not None
  if has_eval_ops:
    # Every run of `eval_ops` also bumps the eval step by one.
    increment_eval_step = state_ops.assign_add(eval_step, 1, use_locking=True)

    if isinstance(eval_ops, dict):
      eval_ops['update_eval_step'] = increment_eval_step
    elif isinstance(eval_ops, (tuple, list)):
      eval_ops = list(eval_ops) + [increment_eval_step]
    else:
      eval_ops = [eval_ops, increment_eval_step]

    latest_eval_step = _get_latest_eval_step_value(eval_ops)

    for hook in run_hooks:
      if isinstance(hook, _StopAfterNEvalsHook):
        hook._set_evals_completed_tensor(latest_eval_step)  # pylint: disable=protected-access

  started_at = time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())
  logging.info('Starting evaluation at ' + started_at)

  # Prepare the session creator.
  session_creator = monitored_session.ChiefSessionCreator(
      scaffold=scaffold,
      checkpoint_filename_with_path=checkpoint_path,
      master=master,
      config=config)

  final_ops_hook = basic_session_run_hooks.FinalOpsHook(final_ops,
                                                        final_ops_feed_dict)
  run_hooks.append(final_ops_hook)

  with monitored_session.MonitoredSession(
      session_creator=session_creator, hooks=run_hooks) as session:
    if has_eval_ops:
      while not session.should_stop():
        session.run(eval_ops, feed_dict)

  finished_at = time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())
  logging.info('Finished evaluation at ' + finished_at)
  return final_ops_hook.final_ops_values
def _Run(self, is_training, use_trt, batch_size, num_epochs, model_dir):
    """Optionally trains the model, then evaluates it on the test split.

    Args:
      is_training: whether to train or evaluate the model. In training mode,
        quantization will be simulated where the quantize_and_dequantize_v2
        are placed.
      use_trt: if true, use TRT INT8 mode for evaluation, which will perform
        real quantization. Otherwise use native TensorFlow which will perform
        simulated quantization. Ignored if is_training is True.
      batch_size: batch size.
      num_epochs: how many epochs to train. Ignored if is_training is False.
      model_dir: where to save or load checkpoint.

    Returns:
      The Estimator evaluation result.
    """
    train_split, test_split = mnist.load_data()

    def _Normalize(image, label):
        """Casts to float32/int32, adds an axis at index 2, rescales to [-1, 1]."""
        image = math_ops.cast(image, dtypes.float32)
        image = array_ops.expand_dims(image, axis=2)
        image = 2.0 * (image / 255.0) - 1.0
        label = math_ops.cast(label, dtypes.int32)
        return image, label

    def _EvalInput():
        """Single pass over the test split, in order."""
        images, targets = test_split
        pipeline = data.Dataset.from_tensor_slices((images, targets))
        pipeline = pipeline.apply(
            data.experimental.map_and_batch(
                map_func=_Normalize,
                batch_size=batch_size,
                num_parallel_calls=8))
        pipeline = pipeline.repeat(count=1)
        features, labels = pipeline.make_one_shot_iterator().get_next()
        return features, labels

    def _TrainInput():
        """Shuffled passes over the training split, `num_epochs` times."""
        images, targets = train_split
        pipeline = data.Dataset.from_tensor_slices((images, targets))
        pipeline = pipeline.shuffle(2 * len(images))
        pipeline = pipeline.apply(
            data.experimental.map_and_batch(
                map_func=_Normalize,
                batch_size=batch_size,
                num_parallel_calls=8))
        pipeline = pipeline.repeat(count=num_epochs)
        features, labels = pipeline.make_one_shot_iterator().get_next()
        return features, labels

    def _Model(features, labels, mode):
        """Builds (training) or imports (evaluation) the graph plus its metrics."""
        if not is_training:
            # Evaluation runs on the graph def produced by `_GetGraphDef`.
            frozen_graph = self._GetGraphDef(use_trt, batch_size, model_dir)
            logits_out = importer.import_graph_def(
                frozen_graph,
                input_map={INPUT_NODE_NAME: features},
                return_elements=[OUTPUT_NODE_NAME + ':0'],
                name='')[0]
        else:
            logits_out = self._BuildGraph(features)

        loss = losses.sparse_softmax_cross_entropy(
            labels=labels, logits=logits_out)
        summary.scalar('loss', loss)

        classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out')
        accuracy = metrics.accuracy(
            labels=labels, predictions=classes_out, name='acc_op')
        # The summary records the second element of the accuracy pair.
        summary.scalar('accuracy', accuracy[1])

        if mode == ModeKeys.TRAIN:
            optimizer = AdamOptimizer(learning_rate=1e-2)
            train_op = optimizer.minimize(loss, global_step=get_global_step())
            return EstimatorSpec(mode, loss=loss, train_op=train_op)
        if mode == ModeKeys.EVAL:
            return EstimatorSpec(
                mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
        # Any other mode yields no spec.
        return None

    session_config = config_pb2.ConfigProto()
    session_config.gpu_options.allow_growth = True
    run_config = RunConfig(session_config=session_config)
    # No model directory is passed to the Estimator when only evaluating.
    estimator_dir = model_dir if is_training else None
    estimator = Estimator(
        model_fn=_Model, model_dir=estimator_dir, config=run_config)

    if is_training:
        estimator.train(_TrainInput)
    results = estimator.evaluate(_EvalInput)
    logging.info('accuracy: %s', str(results['accuracy']))
    return results
def main(dataset_dir, log_dir, tfrecord_filename):
    """Fine-tunes Inception-ResNet-v2 on a TFRecord dataset.

    Builds the training graph, restores every variable except the logits
    scopes from `init_ckpt/inception_resnet_v2.ckpt`, trains for a fixed
    number of epochs under a `tf.train.Supervisor`, and saves the final model
    to the supervisor's save path.

    Args:
      dataset_dir: Directory containing `labels.txt` (one `<id>:<name>` entry
        per line); also forwarded to `get_split`.
      log_dir: Directory used as the supervisor's `logdir`. Created (including
        missing parents) when it does not exist.
      tfrecord_filename: Prefix of the TFRecord files; the file pattern is
        `<tfrecord_filename>_%s_*.tfrecord`.
    """
    # Location of the pretrained checkpoint.
    checkpoint_file = 'init_ckpt/inception_resnet_v2.ckpt'

    # Read the labels file into a {label id: label name} dictionary. The file
    # is closed as soon as it has been parsed.
    labels_file = os.path.join(dataset_dir, 'labels.txt')
    labels_to_name = {}
    with open(labels_file, 'r') as labels_handle:
        for line in labels_handle:
            line = line.rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split on the first ':' only so names may themselves contain ':'.
            label, string_name = line.split(':', 1)
            labels_to_name[int(label)] = string_name
    # NOTE(review): `labels_to_name` is not used again in this function; it is
    # kept so a missing or malformed labels file still fails early.

    # File pattern of the TFRecord files, resolved later by `get_split`.
    file_pattern = tfrecord_filename + '_%s_*.tfrecord'

    # ================= TRAINING INFORMATION ==================
    # Number of epochs to train.
    num_epochs = 2

    # Batch size.
    batch_size = 4

    # Learning rate configuration (open to experimentation).
    initial_learning_rate = 0.001
    learning_rate_decay_factor = 0.5
    num_epochs_before_decay = 1

    # Create the log directory here rather than at import time.
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Construct the graph and build the model.
    with tf.Graph().as_default():
        tf.logging.set_verbosity(tf.logging.INFO)

        # Create the dataset and load one batch.
        dataset = get_split('train', dataset_dir,
                            file_pattern=file_pattern,
                            tfrecord_filename=tfrecord_filename)
        images, _, labels = load_batch(dataset, height=image_size,
                                       width=image_size,
                                       batch_size=batch_size)

        # One step processes one batch, so steps per epoch equals batches per
        # epoch; the learning rate decays every `decay_steps` steps.
        num_batches_per_epoch = dataset.num_samples / batch_size
        num_steps_per_epoch = num_batches_per_epoch
        decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)

        # Create the model inference.
        with slim.arg_scope(inception_resnet_v2_arg_scope()):
            logits, end_points = inception_resnet_v2(
                images, num_classes=dataset.num_classes, is_training=True)

        # Scopes excluded from restoration.
        exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
        variables_to_restore = slim.get_variables_to_restore(exclude=exclude)

        # One-hot encode the labels.
        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)

        # Cross-entropy compares the predicted class distribution with the
        # desired one: it is always positive and reaches zero only when the
        # prediction matches exactly, so training minimizes it. The call
        # registers the loss; `get_total_loss` then adds the regularization
        # losses on top.
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=one_hot_labels, logits=logits)
        total_loss = tf.losses.get_total_loss()

        # Global step for monitoring the learning rate and training.
        global_step = get_or_create_global_step()

        # Exponentially decaying learning rate.
        lr = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=learning_rate_decay_factor,
            staircase=True)

        # Optimizer driven by the decaying learning rate.
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        # The train op computes the loss and applies the gradients.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # Metrics. `predictions` holds class indices, not one-hot vectors.
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        # Collect existing summaries plus one activation histogram per
        # end point.
        my_summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        for end_point in end_points:
            x = end_points[end_point]
            my_summaries.add(
                tf.summary.histogram('activation/' + end_point, x))

        # Scalar summaries to monitor, merged into a single summary op.
        my_summaries.add(tf.summary.scalar('losses/Total_Loss', total_loss))
        my_summaries.add(tf.summary.scalar('accuracy', accuracy))
        my_summaries.add(tf.summary.scalar('learning_rate', lr))
        my_summary_op = tf.summary.merge(list(my_summaries))

        def train_step(sess, train_op, global_step):
            """Runs one training step and logs its loss and duration.

            Runs `train_op`, `global_step` and the enclosing `metrics_op` in
            a single `sess.run` call.

            Returns:
              A `(total_loss, global_step_count)` tuple of fetched values.
            """
            start_time = time.time()
            total_loss, global_step_count, _ = sess.run(
                [train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            logging.info('global step %s: loss: %.4f (%.2f sec/step)',
                         global_step_count, total_loss, time_elapsed)

            return total_loss, global_step_count

        # Saver that restores the selected variables from the checkpoint.
        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        # Supervisor for the managed session. The summary op is not run
        # automatically because that would consume too much memory.
        sv = tf.train.Supervisor(logdir=log_dir, summary_op=None,
                                 init_fn=restore_fn)

        with sv.managed_session() as sess:
            for step in range(int(num_steps_per_epoch * num_epochs)):
                # At the start of every epoch, show the vital information.
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch %s/%s',
                                 step/num_batches_per_epoch + 1, num_epochs)
                    learning_rate_value, accuracy_value = sess.run(
                        [lr, accuracy])
                    logging.info('Current Learning Rate: %s',
                                 learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s',
                                 accuracy_value)

                    # Sanity check: print logits and predictions.
                    (logits_value, probabilities_value, predictions_value,
                     labels_value) = sess.run(
                         [logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value)
                    print('Probabilities: \n', probabilities_value)
                    print('predictions: \n', predictions_value)
                    print('Labels:\n:', labels_value)

                # Every step trains; every 10th step also writes summaries.
                loss, _ = train_step(sess, train_op, sv.global_step)
                if step % 10 == 0:
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

            # Log the final training loss and accuracy.
            logging.info('Final Loss: %s', loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            # Training is done: save the log files and checkpoint model.
            logging.info('Finished training! Saving model to disk now.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
def __call__(self, run_context, all_workers, lame_workers):
    """Shuts down all workers and asks for a coordinator reset.

    Args:
      run_context: Unused.
      all_workers: Object whose `shutdown(exit_code=...)` is invoked.
      lame_workers: Unused.

    Raises:
      CoordinatorResetError: Always, after the shutdown has been requested.
    """
    # Neither of these arguments is needed by this handler.
    del lame_workers, run_context
    all_workers.shutdown(exit_code=42)
    logging.info('Resetting coordinator.')
    raise CoordinatorResetError()
def main():
    """Evaluates a pretrained `EntityLSTM` tagger and logs token accuracy.

    Loads the word/tag/character maps and the test set, restores the model
    from the checkpoint `char_model_00002.ckpt` under the output path, decodes
    every test sentence (Viterbi decoding when the CRF layer is enabled) and
    logs the number of tokens, the token-level accuracy and the elapsed
    decoding time.
    """
    logging.set_verbosity(logging.INFO)
    output_dir = OutputPath('')
    if not gfile.IsDirectory(output_dir):
        gfile.MakeDirs(output_dir)

    parameters = {
        'use_character_lstm': True,
        'character_embedding_dimension': 25,
        'token_embedding_dimension': 100,
        'token_pretrained_embedding_filepath': '',
        'pretrained_model_checkpoint_filepath': OutputPath(
            'char_model_{0:05d}.ckpt'.format(2)),
        'character_lstm_hidden_state_dimension': 25,
        'token_lstm_hidden_state_dimension': 100,
        'use_crf': True,
        'optimizer': 'adam',
        'learning_rate': 0.005,
        'gradient_clipping_value': 5.0,
        'dropout_rate': 0.2,
        'maximum_number_of_epochs': 10,
    }

    test_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
    word_map = readMap('word_map')
    tag_map = readMap('tag_map')
    char_map = readMap('char_map')
    test_data = Dataset(test_data_path)
    test_data.load_dataset(word_map, tag_map, char_map)

    # The context manager installs the session as default and closes it on
    # exit, releasing its resources even if evaluation fails.
    with tf.Session() as sess:
        model = EntityLSTM(test_data, parameters)
        sess.run(tf.global_variables_initializer())
        model_saver = tf.train.Saver(
            max_to_keep=parameters['maximum_number_of_epochs'])
        model_saver.restore(
            sess, parameters['pretrained_model_checkpoint_filepath'])

        total_token_num = 0
        correct_token_num = 0
        transition_params_trained = sess.run(model.transition_parameters)
        start = time.time()
        while test_data.has_next_sent():
            sent = test_data.get_next_sent()
            feed_dict = {
                model.input_token_indices: sent.word_ids,
                model.input_label_indices: sent.tag_ids,
                model.input_token_character_indices:
                    utils.pad_lists(sent.char_lists),
                model.input_token_lengths: sent.word_lengths,
                # Dropout must be disabled at evaluation time: keep every
                # unit so predictions are deterministic.
                model.dropout_keep_prob: 1.0,
            }
            unary_scores, predictions = sess.run(
                [model.unary_scores, model.predictions], feed_dict)
            if parameters['use_crf']:
                predictions, _ = tf.contrib.crf.viterbi_decode(
                    unary_scores, transition_params_trained)
                # Drop the first and last decoded positions.
                # NOTE(review): presumably padding/boundary tags added by the
                # model -- confirm against EntityLSTM.
                predictions = predictions[1:-1]
            else:
                predictions = predictions.tolist()
            gold_labels = sent.tag_ids
            total_token_num += len(predictions)
            correct_token_num += sum(
                1 for predicted, gold in zip(predictions, gold_labels)
                if predicted == gold)

        # Guard against an empty test set instead of dividing by zero.
        if total_token_num:
            accuracy = 100.0 * correct_token_num / total_token_num
        else:
            accuracy = 0.0
        logging.info('token number is %d, accuracy is %.2f%%, time is %.2f',
                     total_token_num, accuracy, time.time() - start)
def _train_model(self,
                 input_fn,
                 steps,
                 feed_fn=None,
                 init_op=None,
                 init_feed_fn=None,
                 init_fn=None,
                 device_fn=None,
                 monitors=None,
                 log_every_steps=100,
                 fail_on_nan_loss=True):
    """Builds the training graph and runs `train` on it.

    Does nothing when the configured execution mode is neither 'all' nor
    'train'. Otherwise it staggers start-up by task id, builds a fresh graph
    from `input_fn` and `_get_train_ops`, attaches the default monitors (on
    the chief task only) and delegates to `train`.

    Args:
      input_fn: Callable returning a `(features, targets)` pair.
      steps: Forwarded to `train` as `max_steps`.
      feed_fn: Forwarded to `train`.
      init_op: Forwarded to `train`.
      init_feed_fn: Optional callable; its result is passed to `train` as
        `init_feed_dict`.
      init_fn: Forwarded to `train`.
      device_fn: Device function for the graph; defaults to `self._device_fn`.
      monitors: Optional iterable of monitors. It is never modified; the
        default monitors are added to a private copy.
      log_every_steps: Forwarded to `train`.
      fail_on_nan_loss: Forwarded to `train`.

    Returns:
      The result of `train`, or `None` when training is disabled by the
      execution mode.
    """
    if self._config.execution_mode not in ('all', 'train'):
        return

    # Stagger startup of worker sessions based on task id.
    sleep_secs = min(
        self._config.training_worker_max_startup_secs,
        self._config.task *
        self._config.training_worker_session_startup_stagger_secs)
    if sleep_secs:
        logging.info('Waiting %d secs before starting task %d.', sleep_secs,
                     self._config.task)
        time.sleep(sleep_secs)

    # Device allocation
    device_fn = device_fn or self._device_fn

    self._graph = ops.Graph()
    with self._graph.as_default() as g, g.device(device_fn):
        random_seed.set_random_seed(self._config.tf_random_seed)
        global_step = contrib_framework.create_global_step(g)
        features, targets = input_fn()
        self._check_inputs(features, targets)
        train_op, loss_op = self._get_train_ops(features, targets)

        # Add default monitors. Copy first: extending the argument in place
        # would grow the caller's list on every call.
        monitors = list(monitors) if monitors is not None else []
        monitors += monitors_lib.get_default_monitors(
            loss_op=loss_op,
            summary_op=logging_ops.get_summary_op(),
            save_summary_steps=100)

        is_chief = self._config.task == 0

        if not is_chief:
            # Run monitors only on chief.
            monitors = []

        # Setup monitors.
        for monitor in monitors:
            monitor.set_estimator(self)

        return train(
            graph=g,
            output_dir=self._model_dir,
            train_op=train_op,
            loss_op=loss_op,
            global_step_tensor=global_step,
            init_op=init_op,
            init_feed_dict=init_feed_fn() if init_feed_fn is not None else None,
            init_fn=init_fn,
            log_every_steps=log_every_steps,
            supervisor_is_chief=is_chief,
            supervisor_master=self._config.master,
            feed_fn=feed_fn,
            max_steps=steps,
            fail_on_nan_loss=fail_on_nan_loss,
            monitors=monitors)
def _init_distributed_setting_from_environment_var(self):
    """Populates cluster/task attributes from the TF_CONFIG environment variable.

    Sets `_cluster_spec`, `_task_type`, `_task_id`, `_master`, `_is_chief`,
    `_num_ps_replicas` and `_num_worker_replicas`.

    Raises:
      ValueError: If the cluster lacks exactly one chief, if the task type or
        index is missing or negative in distributed mode, or if a local
        (cluster-less) configuration names a non-worker task or a non-zero
        task index.
    """
    raw_config = os.environ.get(_TF_CONFIG_ENV)
    tf_config = json.loads(raw_config or '{}')
    if tf_config:
        logging.info('TF_CONFIG environment variable: %s', tf_config)

    cluster_dict = tf_config.get(_CLUSTER_KEY, {})
    self._cluster_spec = server_lib.ClusterSpec(cluster_dict)
    task_env = tf_config.get(_TASK_ENV_KEY, {})

    if not self._cluster_spec:
        # Local mode: only worker 0 is acceptable.
        self._task_type = task_env.get(_TASK_TYPE_KEY, TaskType.WORKER)
        self._task_id = int(task_env.get(_TASK_ID_KEY, 0))

        if self._task_type != TaskType.WORKER:
            raise ValueError(
                'If "cluster" is not set in TF_CONFIG, task type must be WORKER.'
            )
        if self._task_id != 0:
            raise ValueError(
                'If "cluster" is not set in TF_CONFIG, task index must be 0.'
            )

        self._master = ''
        self._is_chief = True
        self._num_ps_replicas = 0
        self._num_worker_replicas = 1
        return

    # Distributed mode: the cluster must declare exactly one chief.
    if TaskType.CHIEF not in self._cluster_spec.jobs:
        raise ValueError(
            'If "cluster" is set in TF_CONFIG, it must have one "chief" node.'
        )
    if len(self._cluster_spec.job_tasks(TaskType.CHIEF)) > 1:
        raise ValueError(
            'The "cluster" in TF_CONFIG must have only one "chief" node.'
        )

    self._task_type = task_env.get(_TASK_TYPE_KEY, None)
    raw_task_id = task_env.get(_TASK_ID_KEY, None)

    if not self._task_type:
        raise ValueError(
            'If "cluster" is set in TF_CONFIG, task type must be set.')
    if raw_task_id is None:
        raise ValueError(
            'If "cluster" is set in TF_CONFIG, task index must be set.'
        )

    self._task_id = int(raw_task_id)

    # Only the lower bound is checked here:
    # - an evaluator has no upper bound;
    # - for any other task the upper bound is the number of jobs in the
    #   cluster spec, which is checked later when retrieving the `master`.
    if self._task_id < 0:
        raise ValueError('Task index must be non-negative number.')

    if self._task_type == TaskType.EVALUATOR:
        # Evaluator is not part of the training cluster.
        self._cluster_spec = server_lib.ClusterSpec({})
        self._master = _LOCAL_MASTER
        self._num_ps_replicas = 0
        self._num_worker_replicas = 0
    else:
        self._master = _get_master(
            self._cluster_spec, self._task_type, self._task_id)
        self._num_ps_replicas = _count_ps(self._cluster_spec)
        self._num_worker_replicas = _count_worker(self._cluster_spec)

    self._is_chief = self._task_type == TaskType.CHIEF
def build_training(self,
                   handle,
                   component_weights=None,
                   unroll_using_oracle=None,
                   max_index=-1):
    """Builds a training pipeline.

    Args:
      handle: Handle tensor for the ComputeSession.
      component_weights: If set, this is a list of relative weights each
        component's cost should get in the pipeline. Defaults to 1.0 for each
        component. The weights are normalized to sum to one.
      unroll_using_oracle: If set, this is a list of booleans indicating
        whether or not to use the gold decodings for each component. Defaults
        to True for each component.
      max_index: Training will use only the first max_index components, or
        any negative value for all components.

    Returns:
      handle: to the ComputeSession, conditioned on completing training step.
      outputs: a dictionary of useful training tensors ('cost', 'batch' and
        'metrics').

    Raises:
      IndexError: if max_index is zero or larger than the number of
        components.
    """
    self.read_from_avg = False
    num_components = len(self.components)
    if max_index < 0:
        max_index = num_components
    elif not 0 < max_index <= num_components:
        raise IndexError(
            'Invalid max_index {} for components {}; handle {}'.format(
                max_index, self.component_names, handle.name))

    # By default, we train every component supervised.
    if not component_weights:
        component_weights = [1] * max_index
    if not unroll_using_oracle:
        unroll_using_oracle = [True] * max_index

    # Restrict to the trained components and normalize the weights.
    truncated_weights = component_weights[:max_index]
    weight_sum = float(sum(truncated_weights))
    component_weights = [w / weight_sum for w in truncated_weights]
    unroll_using_oracle = unroll_using_oracle[:max_index]

    logging.info('Creating training target:')
    logging.info('\tWeights: %s', component_weights)
    logging.info('\tOracle: %s', unroll_using_oracle)

    metrics_list = []
    cost = tf.constant(0.)
    effective_batch = tf.constant(0)

    avg_ops = []
    params_to_train = []

    network_states = {}
    for idx in range(max_index):
        comp = self.components[idx]
        network_states[comp.name] = component.NetworkState()

        logging.info('Initializing data for component "%s"', comp.name)
        handle = dragnn_ops.init_component_data(
            handle, component=comp.name, clear_existing_annotations=False)
        # TODO(googleuser): Phase out component.MasterState.
        master_state = component.MasterState(
            handle, dragnn_ops.batch_size(handle, component=comp.name))
        with tf.control_dependencies([handle, cost]):
            build_args = (master_state, network_states)
            if unroll_using_oracle[idx]:
                handle, component_cost, correct, total = comp.build_training(
                    *build_args)
            else:
                # Unsupervised component: run inference, contribute no cost.
                handle = comp.build_inference(
                    *build_args, during_training=True)
                component_cost = tf.constant(0.)
                correct, total = tf.constant(0), tf.constant(0)

            weighted_component_cost = tf.multiply(
                component_cost,
                tf.constant(float(component_weights[idx])),
                name='weighted_component_cost')

            cost = cost + weighted_component_cost
            effective_batch = effective_batch + total
            metrics_list.extend([[total], [correct]])

            with tf.control_dependencies([comp.advance_counters(total)]):
                cost = tf.identity(cost)

                # Keep track of which parameters will be trained, and any
                # moving average updates to apply for these parameters.
                params_to_train.extend(comp.network.params)
                if self.hyperparams.use_moving_average:
                    avg_ops.extend(comp.avg_ops)

    # Concatenate evaluation results
    metrics = tf.concat(metrics_list, 0)

    # Now that the cost is computed:
    # 1. compute the gradients,
    # 2. add an optimizer to update the parameters using the gradients,
    # 3. make the ComputeSession handle depend on the optimizer.
    grads_and_vars = self.optimizer.compute_gradients(
        cost, var_list=params_to_train)
    clipped_gradients = [
        (self._clip_gradients(grad), var) for grad, var in grads_and_vars
    ]
    minimize_op = self.optimizer.apply_gradients(
        clipped_gradients, global_step=self.master_vars['step'])

    if self.hyperparams.use_moving_average:
        # The averaging ops run only after the parameters have been updated.
        with tf.control_dependencies([minimize_op]):
            minimize_op = tf.group(*avg_ops)

    # Make sure all the side-effectful minimizations ops finish before
    # proceeding.
    with tf.control_dependencies([minimize_op]):
        handle = tf.identity(handle)

    # Restore that subsequent builds don't use average by default.
    self.read_from_avg = False

    # Returns named access to common outputs.
    outputs = {
        'cost': cost,
        'batch': effective_batch,
        'metrics': metrics,
    }
    return handle, outputs
def stop(self):
    """Stops the worker watchdog and waits for it to finish.

    Resets the manager in stopping mode, clears the running flag, then joins.
    """
    logging.info('Stopping worker watchdog.')
    # The manager is reset before the running flag is cleared.
    self._reset_manager(stopping=True)
    self._running = False
    # NOTE(review): presumably blocks until the watchdog thread exits --
    # confirm against the enclosing class.
    self.join()
# ============================================================================== """Builds a DRAGNN graph for local training.""" import tensorflow as tf from tensorflow.core.protobuf import saver_pb2 from tensorflow.python.platform import tf_logging as logging from dragnn.protos import spec_pb2 from dragnn.python import component from dragnn.python import dragnn_ops from dragnn.python import check try: tf.NotDifferentiable('ExtractFixedFeatures') except KeyError, e: logging.info(str(e)) def _create_learning_rate(hyperparams, step_var): """Creates learning rate var, with decay and switching for CompositeOptimizer. Args: hyperparams: a GridPoint proto containing optimizer spec, particularly learning_method to determine optimizer class to use. step_var: tf.Variable, global training step. Returns: a scalar `Tensor`, the learning rate based on current step and hyperparams. """ base_rate = hyperparams.learning_rate return tf.train.exponential_decay(base_rate,
def train_helper(self,
                 input_window_size,
                 loss,
                 max_loss=None,
                 train_steps=200,
                 anomaly_prob=0.01,
                 anomaly_distribution=None,
                 multiple_periods=False):
    """Trains an `ARRegressor` on synthetic data and checks its test loss.

    Args:
      input_window_size: Number of input time steps fed to the model.
      loss: One of the `ar_model.ARModel` loss constants.
      max_loss: Upper bound asserted on the test loss. When `None` it defaults
        to 1.0 for the normal-likelihood loss and to
        `0.05 / data_noise_stddev**2` otherwise.
      train_steps: Number of training steps.
      anomaly_prob: Forwarded to `self.create_data`.
      anomaly_distribution: Forwarded to the regressor; when truthy the
        regressor also gets an anomaly prior probability of 0.01.
      multiple_periods: Forwarded to `self.create_data`.
    """
    np.random.seed(3)
    data_noise_stddev = 0.2
    if max_loss is None:
        max_loss = (1.0 if loss == ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS
                    else 0.05 / (data_noise_stddev**2))
    train_data, test_data = self.create_data(
        noise_stddev=data_noise_stddev,
        anomaly_prob=anomaly_prob,
        multiple_periods=multiple_periods)
    output_window_size = 10
    window_size = input_window_size + output_window_size

    class _RunConfig(estimator_lib.RunConfig):
        """Run configuration that pins the TensorFlow random seed."""

        @property
        def tf_random_seed(self):
            return 3

    anomaly_prior = 0.01 if anomaly_distribution else None
    estimator = ARRegressor(
        periodicities=self.period,
        anomaly_prior_probability=anomaly_prior,
        anomaly_distribution=anomaly_distribution,
        num_features=2,
        output_window_size=output_window_size,
        num_time_buckets=20,
        input_window_size=input_window_size,
        hidden_layer_sizes=[16],
        loss=loss,
        config=_RunConfig())
    train_reader = input_pipeline.NumpyReader(train_data)
    train_input_fn = input_pipeline.RandomWindowInputFn(
        time_series_reader=train_reader,
        window_size=window_size,
        batch_size=64,
        num_threads=1,
        shuffle_seed=2)
    test_reader = input_pipeline.NumpyReader(test_data)
    test_input_fn = test_utils.AllWindowInputFn(
        time_series_reader=test_reader, window_size=window_size)

    # Test training
    estimator.train(input_fn=train_input_fn, steps=train_steps)
    test_evaluation = estimator.evaluate(input_fn=test_input_fn, steps=1)
    test_loss = test_evaluation["loss"]
    logging.info("Final test loss: %f", test_loss)
    self.assertLess(test_loss, max_loss)
    if loss == ar_model.ARModel.SQUARED_LOSS:
        # Test that the evaluation loss is reported without input scaling.
        residuals = test_evaluation["mean"] - test_evaluation["observed"]
        self.assertAllClose(test_loss, np.mean(residuals**2))

    # Test predict
    train_times = train_data[TrainEvalFeatures.TIMES]
    train_values = train_data[TrainEvalFeatures.VALUES]
    test_times = test_data[TrainEvalFeatures.TIMES]
    test_values = test_data[TrainEvalFeatures.VALUES]
    # Predict everything after the initial input window, with a leading
    # axis of size one added in front.
    all_times = np.concatenate(
        [train_times[input_window_size:], test_times])
    predict_times = np.expand_dims(all_times, 0)
    all_values = np.concatenate(
        [train_values[input_window_size:], test_values])
    predict_true_values = np.expand_dims(all_values, 0)
    # The first `input_window_size` training steps seed the model state.
    state_times = np.expand_dims(train_times[:input_window_size], 0)
    state_values = np.expand_dims(train_values[:input_window_size, :], 0)
    # Empty last axis: no exogenous features.
    state_exogenous = state_times[:, :, None][:, :, :0]

    def prediction_input_fn():
        features = {
            PredictionFeatures.TIMES:
                training.limit_epochs(predict_times, num_epochs=1),
            PredictionFeatures.STATE_TUPLE:
                (state_times, state_values, state_exogenous),
        }
        return (features, {})

    (predictions,) = tuple(estimator.predict(input_fn=prediction_input_fn))
    predicted_mean = predictions["mean"][:, 0]
    true_values = predict_true_values[0, :, 0]

    if loss == ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS:
        variances = predictions["covariance"][:, 0]
        standard_deviations = np.sqrt(variances)
        # Note that we may get tighter bounds with more training steps.
        deviations = np.abs(predicted_mean - true_values)
        errors = deviations > 4 * standard_deviations
        fraction_errors = np.mean(errors)
        logging.info("Fraction errors: %f", fraction_errors)