class BaseHook(SessionRunHook):
    """Abstract hook that fires `_triggered_action` on a step/second schedule."""

    def __init__(self, every_n_steps=None, every_n_secs=None):
        """Validate the schedule arguments and build the trigger timer."""
        super(BaseHook, self).__init__()
        validate_every_n(every_n_steps, every_n_secs)
        self._timer = SecondOrStepTimer(
            every_secs=every_n_secs, every_steps=every_n_steps)

    def begin(self):
        # Restart both the schedule and the local step counter for a new session.
        self._iter_count = 0
        self._timer.reset()

    def before_run(self, run_context):  # pylint: disable=unused-argument
        # Decide up front whether this step should fire the action.
        self._should_trigger = self._timer.should_trigger_for_step(
            self._iter_count)

    def after_run(self, run_context, run_values):
        del run_context, run_values  # unused
        if self._should_trigger:
            self._triggered_action()
        self._iter_count += 1

    @abstractmethod
    def _triggered_action(self):
        """Subclasses implement the periodic work here."""
        pass
def __init__(self, estimator, dev_features, dev_label, dev_cid, max_seq_length,
             eval_steps=None, checkpoint_dir=None, model_name=None,
             _input_fn_builder=None, checkpoint_basename="eval.log"):
    """Create the EvalHook: store dev-set data and set up the eval timer.

    Args:
        estimator: estimator used to run predictions on the dev set.
        dev_features/dev_label/dev_cid: dev-set inputs, labels and ids.
        max_seq_length: maximum sequence length fed to the model.
        eval_steps: trigger evaluation every N global steps.
        checkpoint_dir: checkpoint directory (stored, not created here).
        model_name: used to name the log file under ./EVAL_LOG.
        _input_fn_builder: optional override of the module-level builder.
        checkpoint_basename: kept for interface compatibility (unused here).
    """
    logging.info("Create EvalHook.")
    self.estimator = estimator
    self.dev_features = dev_features
    self.dev_label = dev_label
    self.dev_cid = dev_cid
    self.max_seq_length = max_seq_length
    self._checkpoint_dir = checkpoint_dir
    # makedirs(exist_ok=True) replaces the racy `exists() is False` + mkdir
    # pattern (TOCTOU) and is the idiomatic "ensure directory" form.
    os.makedirs('./EVAL_LOG', exist_ok=True)
    self._save_path = os.path.join('./EVAL_LOG', model_name + '_log')
    self._timer = SecondOrStepTimer(every_steps=eval_steps)
    self._steps_per_run = 1
    self._global_step_tensor = None
    if _input_fn_builder is not None:
        self.input_fn_builder = _input_fn_builder
    else:
        self.input_fn_builder = input_fn_builder
def __init__(self, save_steps=None, save_secs=None, output_dir=""):
    """Remember the output location and build the step/seconds trigger timer."""
    self._output_tag = "blah-{}"
    self._output_dir = output_dir
    # Run-state fields, filled in during the session.
    self._atomic_counter = 0
    self.start_time = None
    self._timer = SecondOrStepTimer(
        every_secs=save_secs, every_steps=save_steps)
def __init__(self, scale=1, every_n_steps=100, every_n_secs=None,
             output_dir=None, summary_writer=None, summary_train_op=None,
             summary_test_op=None, summary_evaluator=None,
             test_every_n_steps=None, local_step_tensor=None):
    """Set up the summary-writing hook; exactly one schedule must be given."""
    if (every_n_steps is None) == (every_n_secs is None):
        raise ValueError(
            "exactly one of every_n_steps and every_n_secs should be provided."
        )
    self._timer = SecondOrStepTimer(every_steps=every_n_steps,
                                    every_secs=every_n_secs)
    # Output targets.
    self._summary_writer = summary_writer
    self._output_dir = output_dir
    # Summary ops and their evaluator.
    self._summary_train_op = summary_train_op
    self._summary_test_op = summary_test_op
    self._summary_evaluator = summary_evaluator
    self._test_every_n_steps = test_every_n_steps
    self._local_step_tensor = local_step_tensor
    # Step bookkeeping.
    self._scale = scale
    self._last_global_step = 0
    self._last_local_step = None
    self._exec_count = 0
class RunAfterCheckpointHook(session_run_hook.SessionRunHook):
    """Invokes a callback right after each checkpoint save.

    The timer mirrors the checkpoint schedule from `run_config`, so the
    callback fires with the same cadence as checkpoint writing (the callback
    restores the model from the checkpoint, hence the matching interval).
    The very first trigger is deliberately swallowed.
    """

    def __init__(self, run_config, callback):
        self._timer = SecondOrStepTimer(
            every_secs=run_config.save_checkpoints_secs,
            every_steps=run_config.save_checkpoints_steps)
        self.callback = callback
        self.is_first_run = True

    def begin(self):
        # pylint: disable=protected-access
        self._global_step_tensor = training_util._get_or_create_global_step_read()

    def after_run(self, run_context, run_values):
        global_step = run_context.session.run(self._global_step_tensor)
        if not self._timer.should_trigger_for_step(global_step):
            return
        self._timer.update_last_triggered_step(global_step)
        # The timer reports a trigger on the very first run, which does not
        # make sense here, so exactly one trigger is skipped.
        if self.is_first_run:
            self.is_first_run = False
        else:
            self.callback()
def __init__(self, tensors, every_n_iter=None, every_n_secs=None,
             formatter=None):
    """Initializes a LoggingHook monitor.

    Args:
        tensors: `dict` that maps string-valued tags to tensors/tensor names,
            or `iterable` of tensors/tensor names.
        every_n_iter: `int`, print the values of `tensors` once every N local
            steps taken on the current worker.
        every_n_secs: `int` or `float`, print the values of `tensors` once
            every N seconds. Exactly one of `every_n_iter` and `every_n_secs`
            should be provided.
        formatter: function, takes dict of `tag`->`Tensor` and returns a
            string. If `None` uses default printing all tensors.

    Raises:
        ValueError: if `every_n_iter` is non-positive.
    """
    if (every_n_iter is None) == (every_n_secs is None):
        raise ValueError(
            "exactly one of every_n_iter and every_n_secs must be provided.")
    if every_n_iter is not None and every_n_iter <= 0:
        raise ValueError("invalid every_n_iter=%s." % every_n_iter)
    if not isinstance(tensors, dict):
        # Snapshot the iterable into a list first: previously the raw
        # iterable was stored and then consumed by the dict comprehension,
        # leaving `_tag_order` exhausted for generator inputs.
        self._tag_order = list(tensors)
        tensors = {item: item for item in self._tag_order}
    else:
        # A list (not a live dict view) keeps the tag order stable even if
        # the dict is mutated later.
        self._tag_order = list(tensors.keys())
    self._tensors = tensors
    self._formatter = formatter
    self._timer = SecondOrStepTimer(every_secs=every_n_secs,
                                    every_steps=every_n_iter)
class BlastHook(session_run_hook.SessionRunHook):
    """Periodically runs the `summary` evaluator on generated sequences.

    Note: the original docstring ("Hook that counts steps per second") was a
    copy-paste leftover and did not describe this hook.
    """

    def __init__(self, summary, config, id_to_enzyme_class, every_n_steps=1200,
                 every_n_secs=None, output_dir=None, summary_writer=None,
                 n_examples=2, running_mode="train"):
        self._timer = SecondOrStepTimer(every_steps=every_n_steps,
                                        every_secs=every_n_secs)
        self.summary = summary
        self.config = config
        self.summary_writer = summary_writer
        self.output_dir = output_dir
        self.last_global_step = None
        self.id_to_enzyme_class = id_to_enzyme_class
        self.global_step_check_count = 0
        self.steps_per_run = 1
        # Bug fix: a trailing comma previously made this a 1-tuple, which
        # after_run then had to unwrap with `n_examples[0]`.
        self.n_examples = n_examples
        self.running_mode = running_mode

    def _set_steps_per_run(self, steps_per_run):
        self.steps_per_run = steps_per_run

    def begin(self):
        if self.summary_writer is None and self.output_dir:
            self.summary_writer = SummaryWriterCache.get(self.output_dir)
        graph = ops.get_default_graph()
        self.fake_seq = graph.get_tensor_by_name("model/" + FAKE_PROTEINS + ":0")
        self.labels = graph.get_tensor_by_name("model/" + LABELS + ":0")
        self.d_score = graph.get_tensor_by_name("model/d_score:0")
        self.global_step_tensor = training_util._get_or_create_global_step_read()
        if self.global_step_tensor is None:
            # Message fix: original read "Could not global step tensor".
            raise RuntimeError("Could not get global step tensor")
        if self.fake_seq is None:
            raise RuntimeError("Could not get fake seq tensor")

    def before_run(self, run_context):  # pylint: disable=unused-argument
        return SessionRunArgs([
            self.global_step_tensor, self.fake_seq, self.labels, self.d_score
        ])

    def after_run(self, run_context, run_values):
        global_step, fake_seq, labels, d_score = run_values.results
        if self._timer.should_trigger_for_step(global_step):
            # Launch the (threaded) summary evaluation for this step.
            self.summary(self.config, self.summary_writer, global_step,
                         fake_seq, labels, self.id_to_enzyme_class,
                         self.n_examples, self.running_mode, d_score).start()
            self._timer.update_last_triggered_step(global_step)
def __init__(self, save_steps=None, save_secs=None,
             report_tensor_allocation_upon_oom=False, output_dir=""):
    """Store output settings and build the step/seconds trigger timer."""
    self._output_tag = "step-{}"
    self._output_dir = output_dir
    self._report_tensor_allocation_upon_oom = report_tensor_allocation_upon_oom
    self._timer = SecondOrStepTimer(every_steps=save_steps,
                                    every_secs=save_secs)
def __init__(self, save_steps=None, save_secs=None, model_dir=""):
    """Set up the trace tag, model dir, trigger timer and run-state fields."""
    self._output_tag = "step-{}"
    self._model_dir = model_dir
    self._timer = SecondOrStepTimer(every_steps=save_steps,
                                    every_secs=save_secs)
    # Run state, populated lazily in begin()/before_run().
    self._next_step = None
    self._global_step_tensor = None
    self._writer = None
    self._request_summary = None
class IntervalHook(tf.train.SessionRunHook):
    """A hook which runs every # of iterations. Useful for subclassing."""

    def __init__(self, interval):
        """Construct the interval.

        :param interval: The interval in steps, or None to disable triggering.
        """
        self.global_step = None
        self.interval = interval
        if interval is not None:
            self.timer = SecondOrStepTimer(every_steps=interval)
        else:
            self.timer = None

    def begin(self):
        self.global_step = tf.train.get_or_create_global_step()

    def before_run(self, run_context):
        # Always fetch the global step first; subclass fetches follow.
        return tf.train.SessionRunArgs(
            [self.global_step, *self.session_run_args(run_context)])

    # noinspection PyMethodMayBeStatic, PyUnusedLocal
    def session_run_args(self, run_context):  # pylint: disable=unused-argument
        """Create the session run arguments.

        :param run_context: The run context.
        :return: The list of arguments to run.
        """
        return list()

    def after_run(self, run_context, run_values):
        if self.interval is None:
            return
        global_step = run_values.results[0]
        if self.timer.should_trigger_for_step(global_step):
            self.timer.update_last_triggered_step(global_step)
            self.run_interval_operations(run_context, run_values.results[1:],
                                         global_step)

    # Bug fix: `@abc.abstract` does not exist and raised AttributeError at
    # class-definition time; the intended decorator is `abc.abstractmethod`.
    @abc.abstractmethod
    def run_interval_operations(self, run_context, results, global_step):
        """The method to override.

        :param run_context: The run context.
        :param results: The results of running the given arguments.
        :param global_step: The evaluated global step tensor.
        """
        pass
def __init__(self, every_n_steps=None, every_n_secs=None):
    """Validate the schedule arguments, then build the trigger timer."""
    super(BaseHook, self).__init__()
    validate_every_n(every_n_steps, every_n_secs)
    self._timer = SecondOrStepTimer(every_steps=every_n_steps,
                                    every_secs=every_n_secs)
def __init__(self, params, model_dir, run_config):
    """Delegate to the base hook, then set up sample dir, timer and state."""
    super(TrainSampleHook, self).__init__(params, model_dir, run_config)
    self._sample_dir = os.path.join(self.model_dir, "samples")
    self._timer = SecondOrStepTimer(
        every_steps=self.params["every_n_steps"],
        every_secs=self.params["every_n_secs"])
    self._source_delimiter = self.params["source_delimiter"]
    self._target_delimiter = self.params["target_delimiter"]
    # Mutable run state, reset in begin().
    self._pred_dict = {}
    self._iter_count = 0
    self._should_trigger = False
    self._global_step = None
def __init__(self, interval):
    """Construct the hook for the given trigger interval.

    :param interval: The interval in steps, or None to disable triggering.
    """
    self.global_step = None
    self.interval = interval
    self.timer = (SecondOrStepTimer(every_steps=interval)
                  if interval is not None else None)
def __init__(self, params, model_dir, summary_writer=None):
    """Set up the tokens/sec counter: tag, timer and summary writer."""
    super(TokensPerSecondCounter, self).__init__(params, model_dir)
    self._summary_tag = "tokens/sec"
    self._timer = SecondOrStepTimer(
        every_steps=self.params["every_n_steps"],
        every_secs=self.params["every_n_secs"])
    self._tokens_last_step = 0
    # Fall back to the cached writer for model_dir when none was given.
    self._summary_writer = summary_writer
    if summary_writer is None and self.model_dir:
        self._summary_writer = SummaryWriterCache.get(self.model_dir)
def __init__(self, estimator, eval_features, max_seq_length, eval_steps,
             save_model_dir, th, output_dir):
    """Store evaluation settings and ensure `save_model_dir` exists.

    Args:
        estimator: estimator used to run evaluation.
        eval_features: features of the eval set.
        max_seq_length: maximum sequence length fed to the model.
        eval_steps: trigger evaluation every N global steps.
        save_model_dir: directory for saved models (created if missing).
        th: score threshold used by the evaluation logic.
        output_dir: directory for evaluation outputs.
    """
    self.estimator = estimator
    self.eval_features = eval_features
    self.max_seq_length = max_seq_length
    self.eval_steps = eval_steps
    self.save_model_dir = save_model_dir
    self.th = th
    self.output_dir = output_dir
    # makedirs(exist_ok=True) replaces the racy `exists() is False` + mkdir
    # pattern and also creates missing parent directories.
    os.makedirs(self.save_model_dir, exist_ok=True)
    self._timer = SecondOrStepTimer(every_steps=eval_steps)
    self._steps_per_run = 1
    self._global_step_tensor = None
class BestSaverHook(tf.train.CheckpointSaverHook):
    """CheckpointSaverHook variant whose first listener can request early stop."""

    def __init__(self, checkpoint_dir, save_secs=None, save_steps=None,
                 saver=None, checkpoint_basename="model.ckpt", scaffold=None,
                 listeners=None):
        # Assumes `listeners` is a non-empty sequence; the first listener
        # drives both save callbacks and the early-stopping decision.
        self.saver_listener = listeners[0]
        super(BestSaverHook, self).__init__(checkpoint_dir, save_secs,
                                            save_steps, saver,
                                            checkpoint_basename, scaffold,
                                            listeners)
        logging.info("Create CheckpointSaverHook.")
        if saver is not None and scaffold is not None:
            raise ValueError("You cannot provide both saver and scaffold.")
        self._saver = saver
        self._checkpoint_dir = checkpoint_dir
        self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
        self._scaffold = scaffold
        self._timer = SecondOrStepTimer(every_secs=save_secs,
                                        every_steps=save_steps)
        self._listeners = listeners or []
        print('__init__ listeners:{}, {}'.format(len(listeners),
                                                 len(self._listeners)))

    def after_run(self, run_context, run_values):
        # The fetched step was read before the train op ran and may be stale;
        # cheap pre-check first, then re-read the real value before saving.
        stale_global_step = run_values.results
        if self._timer.should_trigger_for_step(stale_global_step + 1):
            global_step = run_context.session.run(self._global_step_tensor)
            if self._timer.should_trigger_for_step(global_step):
                self._timer.update_last_triggered_step(global_step)
                self._save(run_context.session, global_step)
                if self.saver_listener.should_stop(run_context.session):
                    print('early stop')
                    run_context.request_stop()

    def _save(self, session, step):
        """Saves the latest checkpoint."""
        # `_global_step_tensor`, `_summary_writer` and `_get_saver()` are
        # provided by the parent CheckpointSaverHook.
        self.saver_listener.before_save(session, step)
        self._get_saver().save(session, self._save_path, global_step=step)
        self._summary_writer.add_session_log(
            SessionLog(status=SessionLog.CHECKPOINT,
                       checkpoint_path=self._save_path), step)
        self.saver_listener.after_save(session, step)
        # Fix: the format string had been broken in two; reconstructed as
        # one message.
        logging.info("Saving checkpoints for %d into %s.", step,
                     self._save_path)
class MetadataHook(SessionRunHook):
    """Collects full-trace run metadata on a step/seconds schedule."""

    def __init__(self, save_steps=None, save_secs=None, output_dir=""):
        self._output_tag = "step-{}"
        self._output_dir = output_dir
        self._timer = SecondOrStepTimer(every_secs=save_secs,
                                        every_steps=save_steps)

    def begin(self):
        self._next_step = None
        self._global_step_tensor = training_util.get_global_step()
        self._writer = tf.summary.FileWriter(self._output_dir,
                                             tf.get_default_graph())
        if self._global_step_tensor is None:
            raise RuntimeError(
                "Global step should be created to use ProfilerHook.")

    def before_run(self, run_context):
        # Trace on the very first run, then whenever the timer says so.
        self._request_summary = (
            self._next_step is None
            or self._timer.should_trigger_for_step(self._next_step))
        fetches = {"global_step": self._global_step_tensor}
        if self._request_summary:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        else:
            run_options = None
        return SessionRunArgs(fetches, options=run_options)

    def after_run(self, run_context, run_values):
        # The fetched step predates the train op; estimate +1, and re-read
        # the true value only when we actually write metadata.
        global_step = run_values.results["global_step"] + 1
        if self._request_summary:
            global_step = run_context.session.run(self._global_step_tensor)
            self._writer.add_run_metadata(run_values.run_metadata,
                                          self._output_tag.format(global_step))
            self._writer.flush()
        self._next_step = global_step + 1

    def end(self, session):
        self._writer.close()
def __init__(self, checkpoint_dir, save_secs=None, save_steps=None, saver=None,
             checkpoint_basename="model.ckpt", scaffold=None, listeners=None):
    """Wrap CheckpointSaverHook, keeping the first listener for early stop."""
    # Assumes `listeners` is a non-empty sequence; its head is the listener
    # consulted for early stopping.
    self.saver_listener = listeners[0]
    super(BestSaverHook, self).__init__(checkpoint_dir, save_secs, save_steps,
                                        saver, checkpoint_basename, scaffold,
                                        listeners)
    logging.info("Create CheckpointSaverHook.")
    if saver is not None and scaffold is not None:
        raise ValueError("You cannot provide both saver and scaffold.")
    self._saver = saver
    self._scaffold = scaffold
    self._checkpoint_dir = checkpoint_dir
    self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
    self._timer = SecondOrStepTimer(every_secs=save_secs,
                                    every_steps=save_steps)
    self._listeners = listeners or []
    print('__init__ listeners:{}, {}'.format(len(listeners),
                                             len(self._listeners)))
def __init__(self, estimator, dev_features, dev_label, dev_cid, max_seq_length,
             th=82.0, eval_steps=None, checkpoint_dir=None, model_name=None,
             _input_fn_builder=None, tail_num=0, type_word=''):
    """Create the EvalHook: store dev-set data, log/save paths and timer.

    Args:
        estimator: estimator used to run predictions on the dev set.
        dev_features/dev_label/dev_cid: dev-set inputs, labels and ids.
        max_seq_length: maximum sequence length fed to the model.
        th: score threshold used by the evaluation logic.
        eval_steps: trigger evaluation every N global steps.
        checkpoint_dir: directory for saved checkpoints (created if missing).
        model_name: used in log and output directory names.
        _input_fn_builder: optional override of the module-level builder.
        tail_num/type_word: extra tags baked into `org_dir` and log names.
    """
    logging.info("Create EvalHook.")
    self.estimator = estimator
    self.dev_features = dev_features
    self.dev_label = dev_label
    self.dev_cid = dev_cid
    self.max_seq_length = max_seq_length
    self.th = th
    self._checkpoint_dir = checkpoint_dir
    # makedirs(exist_ok=True) replaces the racy `exists() is False` + mkdir
    # pattern and also creates missing parent directories.
    os.makedirs('./EVAL_LOG', exist_ok=True)
    self.model_name = model_name
    self.tail_num = tail_num
    self.org_dir = "CQA_" + type_word + self.model_name + "_{}".format(
        self.tail_num)
    self._log_save_path = os.path.join(
        './EVAL_LOG', model_name + '_' + type_word + '_log')
    self._save_path = checkpoint_dir
    os.makedirs(self._save_path, exist_ok=True)
    self._timer = SecondOrStepTimer(every_steps=eval_steps)
    self._steps_per_run = 1
    self._global_step_tensor = None
    self._saver = None
    if _input_fn_builder is not None:
        self.input_fn_builder = _input_fn_builder
    else:
        self.input_fn_builder = input_fn_builder
def __init__(self, id_to_enzyme_class, every_n_steps=1200, every_n_secs=None,
             output_dir=None, summary_writer=None, n_examples=2,
             running_mode="train"):
    """Set up the hook: a step/seconds trigger timer plus bookkeeping fields.

    Args:
        id_to_enzyme_class: mapping from ids to enzyme classes used downstream.
        every_n_steps: trigger every N global steps (default 1200).
        every_n_secs: alternatively, trigger every N seconds.
        output_dir: directory for summaries (used if no writer is given).
        summary_writer: pre-built summary writer, if any.
        n_examples: number of examples to sample per trigger.
        running_mode: mode tag, e.g. "train".
    """
    self._timer = SecondOrStepTimer(every_steps=every_n_steps,
                                    every_secs=every_n_secs)
    self.summary_writer = summary_writer
    self.output_dir = output_dir
    self.last_global_step = None
    self.id_to_enzyme_class = id_to_enzyme_class
    self.global_step_check_count = 0
    self.steps_per_run = 1
    # NOTE(review): the trailing comma makes this a 1-tuple `(n_examples,)`.
    # A sibling hook in this file unwraps it with `n_examples[0]` in
    # after_run, so callers here may depend on the tuple — confirm before
    # "fixing" the comma.
    self.n_examples = n_examples,
    self.running_mode = running_mode
def __init__(self, every_n_iter=None, every_n_secs=None, at_end=False,
             formatter=None, logging_mode=LoggingMode.LAST,
             feed_name="logging_hook", replication_factor=1):
    """Initializes the hook.

    Args:
        every_n_iter: `int`, print the tensor values once every N steps.
        every_n_secs: `int` or `float`, print the tensor values once every N
            seconds. Exactly one of `every_n_iter` and `every_n_secs` should
            be provided (unless `at_end` is True).
        at_end: `bool`, whether to also print the tensor values at the end of
            the run.
        formatter: function taking a dict of tensor names/values and
            returning a string; `None` means default formatting.
        logging_mode: `IPULoggingTensorHook.LoggingMode` controlling how
            multiple enqueued values are handled between dequeues.
        feed_name: `string`, unique name of the outfeed queue.
        replication_factor: `int`, number of replicas logging is done from.
    """
    if every_n_iter is not None and every_n_secs is not None:
        raise ValueError(
            "Cannot provide both every_n_iter and every_n_secs")
    if every_n_iter is None and every_n_secs is None and not at_end:
        raise ValueError(
            "Either every_n_iter, every_n_secs or at_end should be provided"
        )
    log_only_at_end = (at_end and every_n_iter is None
                       and every_n_secs is None)
    if log_only_at_end:
        self._timer = NeverTriggerTimer()
    else:
        self._timer = SecondOrStepTimer(every_secs=every_n_secs,
                                        every_steps=every_n_iter)
    self._log_at_end = at_end
    self._formatter = formatter
    self._outfeed = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name=feed_name,
        outfeed_mode=logging_mode,
        replication_factor=replication_factor)
    self._dequeue_op = None
    self._deleter_op = None
    self._iter_count = 0
class MetadataHook(SessionRunHook):
    """Hook that times each session.run call and counts runs locally.

    NOTE(review): despite the name, the metadata-writing path in after_run is
    commented out; as written the hook only logs timestamps and durations,
    while still requesting FULL_TRACE run options every step.
    """

    def __init__(self, save_steps=None, save_secs=None, output_dir=""):
        """Remember output settings and build the step/seconds timer."""
        self._output_tag = "blah-{}"
        self._output_dir = output_dir
        self._timer = SecondOrStepTimer(every_secs=save_secs,
                                        every_steps=save_steps)
        # Local run counter used in place of the fetched global step.
        self._atomic_counter = 0
        self.start_time = None

    def begin(self):
        """Create the summary writer and locate the global step tensor."""
        self._next_step = None
        self._global_step_tensor = training_util.get_global_step()
        self._writer = tf.summary.FileWriter(self._output_dir,
                                             tf.get_default_graph())
        if self._global_step_tensor is None:
            raise RuntimeError(
                "Global step should be created to use ProfilerHook.")

    def before_run(self, run_context):
        """Record the wall-clock start time and request FULL_TRACE options."""
        # Computed for parity with the original design, but currently unused
        # because the metadata write below is commented out.
        self._request_summary = (self._next_step is None or
                                 self._timer.should_trigger_for_step(
                                     self._next_step))
        requests = {}  #{"global_step": self._global_step_tensor}
        # NOTE(review): FULL_TRACE is requested on every run, not only when
        # `_request_summary` is true — this adds tracing overhead each step.
        opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        self.start_time = time.time()
        date_time = datetime.datetime.utcfromtimestamp(
            self.start_time).strftime('%Y-%m-%d %H:%M:%S')
        tf.logging.info(f'Before Run: {date_time}')
        return SessionRunArgs(requests, options=opts)

    def after_run(self, run_context, run_values):
        """Log the run duration and advance the local step counter."""
        tf.logging.info(f'Inference Time: {time.time() - self.start_time}')
        # The "global step" here is the local run count, not the graph's
        # global step tensor.
        global_step = self._atomic_counter + 1
        self._atomic_counter = self._atomic_counter + 1
        # if self._request_summary:
        #     tf.logging.error(f'global step is {global_step}, atomic counter is {self._atomic_counter}')
        #     # fetched_timeline = timeline.Timeline(run_values.run_metadata.step_stats)
        #     # chrome_trace = fetched_timeline.generate_chrome_trace_format()
        #     # with open(os.path.join(self._output_dir, f'timeline_{global_step}.json'), 'w') as f:
        #     # with tf.gfile.GFile(os.path.join(self._output_dir, f'timeline_{global_step}.json'), 'w') as f:
        #     #     f.write(chrome_trace)
        #     self._writer.add_run_metadata(run_values.run_metadata,
        #                                   self._output_tag.format(global_step))
        #     self._writer.flush()
        self._next_step = global_step + 1

    def end(self, session):
        """Close the summary writer."""
        self._writer.close()
def __init__(self, save_steps=None, save_secs=None, output_dir="",
             show_dataflow=True, show_memory=False):
    """Initializes a hook that takes periodic profiling snapshots.

    Args:
        save_steps: `int`, save profile traces every N steps. Exactly one of
            `save_secs` and `save_steps` should be set.
        save_secs: `int`, save profile traces every N seconds.
        output_dir: `string`, directory to save the profile traces to.
            Defaults to the current directory.
        show_dataflow: `bool`, if True, add flow events to the trace
            connecting producers and consumers of tensors.
        show_memory: `bool`, if True, add object snapshot events to the trace
            showing the sizes and lifetimes of tensors.
    """
    self._show_dataflow = show_dataflow
    self._show_memory = show_memory
    self._output_file = os.path.join(output_dir, "timeline-{}.json")
    self._timer = SecondOrStepTimer(every_steps=save_steps,
                                    every_secs=save_secs)
def __init__(self, estimator, dev_file, org_dev_file, eval_features,
             eval_steps=100, max_seq_length=300, max_answer_length=15,
             checkpoint_dir=None, input_fn_builder=None, th=86,
             model_name=None):
    """Create the QA eval hook: store dev data and prepare log/save dirs.

    Args:
        estimator: estimator used to run predictions.
        dev_file/org_dev_file: processed and original dev-set files.
        eval_features: features of the dev set.
        eval_steps: trigger evaluation every N global steps.
        max_seq_length: maximum sequence length fed to the model.
        max_answer_length: maximum answer span length.
        checkpoint_dir: checkpoint directory (created if missing).
        input_fn_builder: builder for the evaluation input_fn.
        th: score threshold used by the evaluation logic.
        model_name: used in log and save directory names.
    """
    self.estimator = estimator
    self.max_seq_length = max_seq_length
    self.max_answer_length = max_answer_length
    self.dev_file = dev_file
    self.org_dev_file = org_dev_file
    self.eval_features = eval_features
    self.th = th
    self.checkpoint_dir = checkpoint_dir
    self.org_dir = model_name
    # makedirs(exist_ok=True) replaces the racy `exists() is False` + mkdir
    # pattern and also creates missing parent directories.
    os.makedirs("./EVAL_LOG", exist_ok=True)
    os.makedirs(self.checkpoint_dir, exist_ok=True)
    self._log_save_path = os.path.join("./EVAL_LOG", model_name)
    self.save_path = os.path.join(self.checkpoint_dir, model_name)
    os.makedirs(self.save_path, exist_ok=True)
    self._timer = SecondOrStepTimer(every_steps=eval_steps)
    self._steps_per_run = 1
    self._global_step_tensor = None
    self.input_fn_builder = input_fn_builder
def __init__(self, model, sample_img, img_path, every_n_iter=None,
             every_n_secs=None):
    '''
    Args:
        model : In order to retrieve `model.epoch_id` and `model.batch_id`
            for naming.
        sample_img : `Tensor`, sample images to save.
        img_path: 'String', path containing the directory and filename prefix
        every_n_iter: `int`, save the sample images every N local steps.
        every_n_secs: `int` or `float`, save sample images every N seconds.
            Exactly one of `every_n_iter` and `every_n_secs` should be
            provided.
    '''
    self.model = model
    self.sample_img = sample_img
    self.img_path = img_path
    # Calculate an appropriate grid size automatically.
    # Bug fix: the old form `(int(sqrt(n)), ceil(sqrt(n)))` truncated the row
    # count, so e.g. 8 images got a (2, 3) grid holding only 6 cells.
    # Use cols = ceil(sqrt(n)) and rows = ceil(n / cols), which always yields
    # rows * cols >= n and matches the old result whenever it was valid.
    num_imgs = sample_img.get_shape().as_list()[0]
    cols = int(math.ceil(math.sqrt(num_imgs)))
    rows = int(math.ceil(num_imgs / cols))
    self.grid_size = (rows, cols)
    self._timer = SecondOrStepTimer(every_secs=every_n_secs,
                                    every_steps=every_n_iter)
def __init__(self, user_info, server_info, every_n_iter=None,
             every_n_secs=None, at_end=False):
    """Validate the logging schedule and set up the trigger timer."""
    only_log_at_end = (at_end and (every_n_iter is None)
                       and (every_n_secs is None))
    if not only_log_at_end and (every_n_iter is None) == (every_n_secs is None):
        raise ValueError(
            "either at_end and/or exactly one of every_n_iter and every_n_secs "
            "must be provided.")
    if every_n_iter is not None and every_n_iter <= 0:
        raise ValueError("invalid every_n_iter=%s." % every_n_iter)
    if only_log_at_end:
        self._timer = NeverTriggerTimer()
    else:
        self._timer = SecondOrStepTimer(every_secs=every_n_secs,
                                        every_steps=every_n_iter)
    self._log_at_end = at_end
    self._user_info = user_info
    self._server_info = server_info
    self._iter_count = 0
    self._timer.reset()
class EvalHook(SessionRunHook):
    """Runs dev-set evaluation every `eval_steps` global steps and at the end."""

    def __init__(self, estimator, dev_features, dev_label, dev_cid,
                 max_seq_length, eval_steps=None, checkpoint_dir=None,
                 model_name=None, _input_fn_builder=None,
                 checkpoint_basename="eval.log"):
        """Store dev-set data, prepare the log path and the eval timer."""
        logging.info("Create EvalHook.")
        self.estimator = estimator
        self.dev_features = dev_features
        self.dev_label = dev_label
        self.dev_cid = dev_cid
        self.max_seq_length = max_seq_length
        self._checkpoint_dir = checkpoint_dir
        # makedirs(exist_ok=True) replaces the racy `exists() is False` +
        # mkdir pattern.
        os.makedirs('./EVAL_LOG', exist_ok=True)
        self._save_path = os.path.join('./EVAL_LOG', model_name + '_log')
        self._timer = SecondOrStepTimer(every_steps=eval_steps)
        self._steps_per_run = 1
        self._global_step_tensor = None
        if _input_fn_builder is not None:
            self.input_fn_builder = _input_fn_builder
        else:
            self.input_fn_builder = input_fn_builder

    def _set_steps_per_run(self, steps_per_run):
        self._steps_per_run = steps_per_run

    def begin(self):
        self._global_step_tensor = get_global_step()  # pylint: disable=protected-access
        if self._global_step_tensor is None:
            raise RuntimeError(
                "Global step should be created to use EvalHook.")

    def before_run(self, run_context):  # pylint: disable=unused-argument
        return SessionRunArgs(self._global_step_tensor)

    def after_run(self, run_context, run_values):
        # The fetched step was read before the train op; cheap pre-check
        # first, then re-read the real value before triggering evaluation.
        stale_global_step = run_values.results
        if self._timer.should_trigger_for_step(
                stale_global_step + self._steps_per_run):
            global_step = run_context.session.run(self._global_step_tensor)
            if self._timer.should_trigger_for_step(global_step):
                self._timer.update_last_triggered_step(global_step)
                self.evaluation(global_step)

    def end(self, session):
        # Run one final evaluation unless the last step already triggered one.
        last_step = session.run(self._global_step_tensor)
        if last_step != self._timer.last_triggered_step():
            self.evaluation(last_step)

    def evaluation(self, global_step):
        """Predict on the dev set and print classification/ranking metrics."""
        eval_input_fn = self.input_fn_builder(
            features=self.dev_features,
            seq_length=self.max_seq_length,
            is_training=False,
            drop_remainder=False)
        predictions = self.estimator.predict(eval_input_fn,
                                             yield_single_examples=False)
        # `list(predictions)` drains the generator directly; the identity
        # comprehension `[a for a in predictions]` added nothing.
        res = np.concatenate(list(predictions), axis=0)
        metrics = PRF(np.array(self.dev_label), res.argmax(axis=-1))
        print('\n Global step is : ', global_step)
        MAP, AvgRec, MRR = eval_reranker(self.dev_cid, self.dev_label,
                                         res[:, 0])
        metrics['MAP'] = MAP
        metrics['AvgRec'] = AvgRec
        metrics['MRR'] = MRR
        metrics['global_step'] = global_step
        print_metrics(metrics, 'dev', save_dir=self._save_path)
class ProfilerHook(session_run_hook.SessionRunHook):
    """Captures CPU/GPU profiling information every N steps or seconds.

    This produces files called "timeline-<step>.json", which are in Chrome
    Trace format.  For more information see:
    https://github.com/catapult-project/catapult/blob/master/tracing/README.md"""

    def __init__(self, save_steps=None, save_secs=None, output_dir="",
                 show_dataflow=True, show_memory=False):
        """Initializes a hook that takes periodic profiling snapshots.

        Args:
            save_steps: `int`, save profile traces every N steps. Exactly one
                of `save_secs` and `save_steps` should be set.
            save_secs: `int`, save profile traces every N seconds.
            output_dir: `string`, directory to save the profile traces to.
                Defaults to the current directory.
            show_dataflow: `bool`, if True, add flow events to the trace
                connecting producers and consumers of tensors.
            show_memory: `bool`, if True, add object snapshot events to the
                trace showing the sizes and lifetimes of tensors.
        """
        self._show_dataflow = show_dataflow
        self._show_memory = show_memory
        self._output_file = os.path.join(output_dir, "timeline-{}.json")
        self._timer = SecondOrStepTimer(every_steps=save_steps,
                                        every_secs=save_secs)

    def begin(self):
        self._next_step = None
        self._global_step_tensor = training_util.get_global_step()
        if self._global_step_tensor is None:
            raise RuntimeError(
                "Global step should be created to use ProfilerHook.")

    def before_run(self, run_context):
        # Trace on the very first run, then whenever the timer says so.
        self._request_summary = (
            self._next_step is None
            or self._timer.should_trigger_for_step(self._next_step))
        fetches = {"global_step": self._global_step_tensor}
        if self._request_summary:
            run_options = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)
        else:
            run_options = None
        return SessionRunArgs(fetches, options=run_options)

    def after_run(self, run_context, run_values):
        global_step = run_values.results["global_step"]
        if self._request_summary:
            self._timer.update_last_triggered_step(global_step)
            self._save(global_step, self._output_file.format(global_step),
                       run_values.run_metadata.step_stats)
        self._next_step = global_step + 1

    def _save(self, step, save_path, step_stats):
        """Write one Chrome-trace timeline file for `step`."""
        logging.info("Saving timeline for %d into '%s'.", step, save_path)
        trace = timeline.Timeline(step_stats)
        with gfile.Open(save_path, "w") as f:
            f.write(trace.generate_chrome_trace_format(
                show_dataflow=self._show_dataflow,
                show_memory=self._show_memory))
class TrainSampleHook(TrainingHook):
    """Occasionally samples predictions from the training run and prints them.

    Params:
        every_n_secs: Sample predictions every N seconds.
            If set, `every_n_steps` must be None.
        every_n_steps: Sample predictions every N steps.
            If set, `every_n_secs` must be None.
        sample_dir: Optional, a directory to write samples to.
        delimiter: Join tokens on this delimiter. Defaults to space.
    """
    #pylint: disable=missing-docstring

    def __init__(self, params, model_dir, run_config):
        # Timer drives how often before_run requests prediction fetches.
        super(TrainSampleHook, self).__init__(params, model_dir, run_config)
        self._sample_dir = os.path.join(self.model_dir, "samples")
        self._timer = SecondOrStepTimer(
            every_secs=self.params["every_n_secs"],
            every_steps=self.params["every_n_steps"])
        self._pred_dict = {}
        self._should_trigger = False
        self._iter_count = 0
        self._global_step = None
        self._source_delimiter = self.params["source_delimiter"]
        self._target_delimiter = self.params["target_delimiter"]

    @staticmethod
    def default_params():
        # Defaults: sample every 1000 steps, join tokens on spaces.
        return {
            "every_n_secs": None,
            "every_n_steps": 1000,
            "source_delimiter": " ",
            "target_delimiter": " "
        }

    def begin(self):
        # Reset the local counter and pick up the prediction tensors that the
        # model published in the "predictions" collection.
        self._iter_count = 0
        self._global_step = tf.train.get_global_step()
        self._pred_dict = graph_utils.get_dict_from_collection("predictions")
        # Create the sample directory
        if self._sample_dir is not None:
            gfile.MakeDirs(self._sample_dir)

    def before_run(self, _run_context):
        self._should_trigger = self._timer.should_trigger_for_step(
            self._iter_count)
        if self._should_trigger:
            fetches = {
                "predicted_tokens": self._pred_dict["predicted_tokens"],
                "target_words": self._pred_dict["labels.target_tokens"],
                "target_len": self._pred_dict["labels.target_len"]
            }
            return tf.train.SessionRunArgs([fetches, self._global_step])
        # Off-cycle: still fetch the global step so after_run can track it.
        return tf.train.SessionRunArgs([{}, self._global_step])

    def after_create_session(self, session, coord):
        # Freezing the graph here guarantees no ops are added after startup.
        print("Session created. Finalizing graph.")
        session.graph.finalize()

    def after_run(self, _run_context, run_values):
        result_dict, step = run_values.results
        # Local counter tracks the fetched global step from here on.
        self._iter_count = step
        if not self._should_trigger:
            return None
        # Convert dict of lists to list of dicts
        result_dicts = [
            dict(zip(result_dict, t)) for t in zip(*result_dict.values())
        ]
        # Print results
        result_str = ""
        result_str += "Prediction followed by Target @ Step {}\n".format(step)
        result_str += ("=" * 100) + "\n"
        for result in result_dicts:
            target_len = result["target_len"]
            # Slices drop the sequence-boundary tokens; tokens are bytes, so
            # join with an encoded delimiter and decode back to text.
            predicted_slice = result["predicted_tokens"][:target_len - 1]
            target_slice = result["target_words"][1:target_len]
            result_str += self._target_delimiter.encode("utf-8").join(
                predicted_slice).decode("utf-8") + "\n"
            result_str += self._target_delimiter.encode("utf-8").join(
                target_slice).decode("utf-8") + "\n\n"
        result_str += ("=" * 100) + "\n\n"
        tf.logging.info(result_str)
        if self._sample_dir:
            filepath = os.path.join(self._sample_dir,
                                    "samples_{:06d}.txt".format(step))
            with gfile.GFile(filepath, "w") as file:
                file.write(result_str)
        # NOTE(review): updates the timer with step - 1 rather than step —
        # presumably to re-arm triggering sooner; confirm before changing.
        self._timer.update_last_triggered_step(self._iter_count - 1)
class TrainSampleHook(TrainingHook):
    """Occasionally samples predictions from the training run and prints them.

    Params:
        every_n_secs: Sample predictions every N seconds.
            If set, `every_n_steps` must be None.
        every_n_steps: Sample predictions every N steps.
            If set, `every_n_secs` must be None.
        sample_dir: Optional, a directory to write samples to.
        delimiter: Join tokens on this delimiter. Defaults to space.
    """
    #pylint: disable=missing-docstring

    def __init__(self, params, model_dir, run_config):
        # Timer drives how often before_run requests prediction fetches.
        super(TrainSampleHook, self).__init__(params, model_dir, run_config)
        self._sample_dir = os.path.join(self.model_dir, "samples")
        self._timer = SecondOrStepTimer(
            every_secs=self.params["every_n_secs"],
            every_steps=self.params["every_n_steps"])
        self._pred_dict = {}
        self._should_trigger = False
        self._iter_count = 0
        self._global_step = None
        self._source_delimiter = self.params["source_delimiter"]
        self._target_delimiter = self.params["target_delimiter"]

    @staticmethod
    def default_params():
        # Defaults: sample every 1000 steps, join tokens on spaces.
        return {
            "every_n_secs": None,
            "every_n_steps": 1000,
            "source_delimiter": " ",
            "target_delimiter": " "
        }

    def begin(self):
        # Reset the local counter and pick up the prediction tensors that the
        # model published in the "predictions" collection.
        self._iter_count = 0
        self._global_step = tf.train.get_global_step()
        self._pred_dict = graph_utils.get_dict_from_collection("predictions")
        # Create the sample directory
        if self._sample_dir is not None:
            gfile.MakeDirs(self._sample_dir)

    def before_run(self, _run_context):
        self._should_trigger = self._timer.should_trigger_for_step(
            self._iter_count)
        if self._should_trigger:
            fetches = {
                "predicted_tokens": self._pred_dict["predicted_tokens"],
                "target_words": self._pred_dict["labels.target_tokens"],
                "target_len": self._pred_dict["labels.target_len"]
            }
            return tf.train.SessionRunArgs([fetches, self._global_step])
        # Off-cycle: still fetch the global step so after_run can track it.
        return tf.train.SessionRunArgs([{}, self._global_step])

    def after_run(self, _run_context, run_values):
        result_dict, step = run_values.results
        # Local counter tracks the fetched global step from here on.
        self._iter_count = step
        if not self._should_trigger:
            return None
        # Convert dict of lists to list of dicts
        result_dicts = [
            dict(zip(result_dict, t)) for t in zip(*result_dict.values())
        ]
        # Print results
        result_str = ""
        result_str += "Prediction followed by Target @ Step {}\n".format(step)
        result_str += ("=" * 100) + "\n"
        for result in result_dicts:
            target_len = result["target_len"]
            # Slices drop the sequence-boundary tokens; tokens are bytes, so
            # join with an encoded delimiter and decode back to text.
            predicted_slice = result["predicted_tokens"][:target_len - 1]
            target_slice = result["target_words"][1:target_len]
            result_str += self._target_delimiter.encode("utf-8").join(
                predicted_slice).decode("utf-8") + "\n"
            result_str += self._target_delimiter.encode("utf-8").join(
                target_slice).decode("utf-8") + "\n\n"
        result_str += ("=" * 100) + "\n\n"
        tf.logging.info(result_str)
        if self._sample_dir:
            filepath = os.path.join(self._sample_dir,
                                    "samples_{:06d}.txt".format(step))
            with gfile.GFile(filepath, "w") as file:
                file.write(result_str)
        # NOTE(review): updates the timer with step - 1 rather than step —
        # presumably to re-arm triggering sooner; confirm before changing.
        self._timer.update_last_triggered_step(self._iter_count - 1)
def __init__(self, save_steps=None, save_secs=None, output_dir=""):
    """Record the output location and build the step/seconds trigger timer."""
    self._output_tag = "step-{}"
    self._output_dir = output_dir
    self._timer = SecondOrStepTimer(every_steps=save_steps,
                                    every_secs=save_secs)