Esempio n. 1
0
class BaseHook(SessionRunHook):
    """Abstract hook that fires `_triggered_action` on a periodic schedule.

    The schedule is driven by a `SecondOrStepTimer`, configured either with
    a local-step interval or a wall-clock interval; the combination of the
    two arguments is checked by `validate_every_n` up front.
    """

    def __init__(
        self,
        every_n_steps=None,
        every_n_secs=None):
        super(BaseHook, self).__init__()
        # Enforce the allowed combination of interval arguments early.
        validate_every_n(every_n_steps, every_n_secs)
        self._timer = SecondOrStepTimer(
            every_secs=every_n_secs,
            every_steps=every_n_steps)

    def begin(self):
        # New session: restart both the timer and the local step counter.
        self._iter_count = 0
        self._timer.reset()

    def before_run(self, run_context):  # pylint: disable=unused-argument
        # Decide now whether this step triggers; the action runs in after_run.
        self._should_trigger = self._timer.should_trigger_for_step(
            self._iter_count)

    def after_run(self, run_context, run_values):
        del run_context, run_values  # unused
        if self._should_trigger:
            self._triggered_action()
        self._iter_count += 1

    @abstractmethod
    def _triggered_action(self):
        """Periodic action; implemented by subclasses."""
        pass
Esempio n. 2
0
    def __init__(self,
                 estimator,
                 dev_features,
                 dev_label,
                 dev_cid,
                 max_seq_length,
                 eval_steps=None,
                 checkpoint_dir=None,
                 model_name=None,
                 _input_fn_builder=None,
                 checkpoint_basename="eval.log"):
        """Creates the evaluation hook.

        Args:
          estimator: estimator used to run evaluation.
          dev_features: pre-built dev-set features.
          dev_label: dev-set labels.
          dev_cid: dev-set candidate/context ids.
          max_seq_length: maximum sequence length fed to the model.
          eval_steps: evaluate every N global steps (drives the timer).
          checkpoint_dir: directory holding checkpoints to evaluate.
          model_name: used to name the log file written under ./EVAL_LOG.
          _input_fn_builder: optional override for the module-level
              `input_fn_builder`.
          checkpoint_basename: unused here; kept for interface compatibility.
        """
        logging.info("Create EvalHook.")
        self.estimator = estimator
        self.dev_features = dev_features
        self.dev_label = dev_label
        self.dev_cid = dev_cid
        self.max_seq_length = max_seq_length
        self._checkpoint_dir = checkpoint_dir
        # Idiomatic existence check (was `os.path.exists(...) is False`).
        if not os.path.exists('./EVAL_LOG'):
            os.mkdir('./EVAL_LOG')
        self._save_path = os.path.join('./EVAL_LOG', model_name+'_log')
        self._timer = SecondOrStepTimer(every_steps=eval_steps)
        self._steps_per_run = 1
        self._global_step_tensor = None

        # Allow callers to inject a custom input_fn builder.
        if _input_fn_builder is not None:
            self.input_fn_builder = _input_fn_builder
        else:
            self.input_fn_builder = input_fn_builder
Esempio n. 3
0
 def __init__(self, save_steps=None, save_secs=None, output_dir=""):
     """Sets up output naming, counters, and the step/second trigger timer."""
     self._output_tag = "blah-{}"
     self._output_dir = output_dir
     self._atomic_counter = 0
     self.start_time = None
     # Fires every `save_steps` steps or every `save_secs` seconds.
     self._timer = SecondOrStepTimer(every_secs=save_secs,
                                     every_steps=save_steps)
Esempio n. 4
0
    def __init__(self,
                 scale=1,
                 every_n_steps=100,
                 every_n_secs=None,
                 output_dir=None,
                 summary_writer=None,
                 summary_train_op=None,
                 summary_test_op=None,
                 summary_evaluator=None,
                 test_every_n_steps=None,
                 local_step_tensor=None):
        """Configures periodic summary writing and evaluation state."""
        # Exactly one of the two interval arguments must be supplied.
        if (every_n_steps is None) == (every_n_secs is None):
            raise ValueError(
                "exactly one of every_n_steps and every_n_secs should be provided."
            )
        self._timer = SecondOrStepTimer(every_steps=every_n_steps,
                                        every_secs=every_n_secs)
        self._scale = scale
        self._output_dir = output_dir
        self._summary_writer = summary_writer
        self._summary_train_op = summary_train_op
        self._summary_test_op = summary_test_op
        self._summary_evaluator = summary_evaluator
        self._test_every_n_steps = test_every_n_steps
        self._local_step_tensor = local_step_tensor
        # Bookkeeping for step tracking across runs.
        self._last_global_step = 0
        self._last_local_step = None
        self._exec_count = 0
class RunAfterCheckpointHook(session_run_hook.SessionRunHook):
    """Runs a callback function right after each checkpoint save.

    Used to generate some text at regular intervals during training to show
    progress.  Because the callback restores the model from a checkpoint, it
    must run on the same schedule as checkpoint saving, so the timer mirrors
    the run config's checkpoint interval.
    """

    def __init__(self, run_config, callback):
        self.callback = callback
        self.is_first_run = True
        # Mirror the checkpoint-saving schedule exactly.
        self._timer = SecondOrStepTimer(
            every_secs=run_config.save_checkpoints_secs,
            every_steps=run_config.save_checkpoints_steps)

    def begin(self):
        # pylint: disable=protected-access
        self._global_step_tensor = training_util._get_or_create_global_step_read()

    def after_run(self, run_context, run_values):
        global_step = run_context.session.run(self._global_step_tensor)
        if not self._timer.should_trigger_for_step(global_step):
            return
        self._timer.update_last_triggered_step(global_step)
        # The timer also fires on the very first run, before any checkpoint
        # exists — skip the callback exactly once.
        if self.is_first_run:
            self.is_first_run = False
        else:
            self.callback()
Esempio n. 6
0
 def __init__(self, tensors, every_n_iter=None, every_n_secs=None,
              formatter=None):
   """Initializes a LoggingHook monitor.

   Args:
     tensors: `dict` mapping string tags to tensors/tensor names, or an
         `iterable` of tensors/tensor names (each used as its own tag).
     every_n_iter: `int`, print the values of `tensors` once every N local
         steps taken on the current worker.
     every_n_secs: `int` or `float`, print the values once every N seconds.
         Exactly one of `every_n_iter` and `every_n_secs` should be given.
     formatter: function taking a dict of `tag`->`Tensor` and returning a
         string; `None` means default printing of all tensors.
   Raises:
     ValueError: if `every_n_iter` is non-positive, or if neither/both of
         the interval arguments are provided.
   """
   if (every_n_iter is None) == (every_n_secs is None):
     raise ValueError(
         "exactly one of every_n_iter and every_n_secs must be provided.")
   if every_n_iter is not None and every_n_iter <= 0:
     raise ValueError("invalid every_n_iter=%s." % every_n_iter)
   # Normalize the iterable form into a tag->tensor dict; remember the
   # original ordering for display.
   if isinstance(tensors, dict):
     self._tag_order = tensors.keys()
   else:
     self._tag_order = tensors
     tensors = {item: item for item in tensors}
   self._tensors = tensors
   self._formatter = formatter
   self._timer = SecondOrStepTimer(every_secs=every_n_secs,
                                   every_steps=every_n_iter)
Esempio n. 7
0
class BlastHook(session_run_hook.SessionRunHook):
    """Hook that periodically launches a summary job for generated sequences.

    Every `every_n_steps` (or `every_n_secs`) it fetches the generated
    sequences, labels and discriminator scores and hands them to the
    `summary` callable, which is `.start()`-ed (presumably a thread —
    confirm against the `summary` implementation).
    """

    def __init__(self,
                 summary,
                 config,
                 id_to_enzyme_class,
                 every_n_steps=1200,
                 every_n_secs=None,
                 output_dir=None,
                 summary_writer=None,
                 n_examples=2,
                 running_mode="train"):
        """Creates the hook.

        Args:
          summary: callable building the summary job to start.
          config: configuration object forwarded to `summary`.
          id_to_enzyme_class: mapping from class id to enzyme class name.
          every_n_steps: trigger interval in global steps.
          every_n_secs: alternative wall-clock trigger interval.
          output_dir: directory used to obtain a cached summary writer.
          summary_writer: explicit summary writer (overrides output_dir).
          n_examples: number of examples passed to the summary job.
          running_mode: forwarded to the summary job (e.g. "train").
        """
        self._timer = SecondOrStepTimer(every_steps=every_n_steps,
                                        every_secs=every_n_secs)
        self.summary = summary
        self.config = config
        self.summary_writer = summary_writer
        self.output_dir = output_dir
        self.last_global_step = None
        self.id_to_enzyme_class = id_to_enzyme_class
        self.global_step_check_count = 0
        self.steps_per_run = 1
        # Fixed: a stray trailing comma previously made this a one-element
        # tuple that after_run had to unwrap with `[0]`.
        self.n_examples = n_examples
        self.running_mode = running_mode

    def _set_steps_per_run(self, steps_per_run):
        self.steps_per_run = steps_per_run

    def begin(self):
        if self.summary_writer is None and self.output_dir:
            self.summary_writer = SummaryWriterCache.get(self.output_dir)
        graph = ops.get_default_graph()
        # Resolve the tensors produced by the model graph by name.
        self.fake_seq = graph.get_tensor_by_name("model/" + FAKE_PROTEINS +
                                                 ":0")
        self.labels = graph.get_tensor_by_name("model/" + LABELS + ":0")
        self.d_score = graph.get_tensor_by_name("model/d_score:0")
        self.global_step_tensor = training_util._get_or_create_global_step_read(
        )
        if self.global_step_tensor is None:
            # Fixed error message (was "Could not global step tensor").
            raise RuntimeError("Could not get global step tensor")
        if self.fake_seq is None:
            raise RuntimeError("Could not get fake seq tensor")

    def before_run(self, run_context):  # pylint: disable=unused-argument
        return SessionRunArgs([
            self.global_step_tensor, self.fake_seq, self.labels, self.d_score
        ])

    def after_run(self, run_context, run_values):
        global_step, fake_seq, labels, d_score = run_values.results
        if self._timer.should_trigger_for_step(global_step):
            self.summary(self.config, self.summary_writer, global_step,
                         fake_seq, labels, self.id_to_enzyme_class,
                         self.n_examples, self.running_mode,
                         d_score).start()
            self._timer.update_last_triggered_step(global_step)
Esempio n. 8
0
 def __init__(self,
              save_steps=None,
              save_secs=None,
              report_tensor_allocation_upon_oom=False,
              output_dir=""):
     """Stores output settings and builds the step/second trigger timer."""
     self._output_tag = "step-{}"
     self._output_dir = output_dir
     self._report_tensor_allocation_upon_oom = report_tensor_allocation_upon_oom
     # Fires every `save_steps` steps or every `save_secs` seconds.
     self._timer = SecondOrStepTimer(every_secs=save_secs,
                                     every_steps=save_steps)
Esempio n. 9
0
    def __init__(self, save_steps=None, save_secs=None, model_dir=""):
        """Initializes tag/dir state and the trigger timer."""
        self._output_tag = "step-{}"
        self._model_dir = model_dir
        self._timer = SecondOrStepTimer(every_secs=save_secs,
                                        every_steps=save_steps)
        # Run-time state; populated lazily in begin()/before_run().
        self._next_step = None
        self._global_step_tensor = None
        self._writer = None
        self._request_summary = None
Esempio n. 10
0
class IntervalHook(tf.train.SessionRunHook):
    """
    A hook which runs every # of iterations. Useful for subclassing.
    """
    def __init__(self, interval):
        """
        Construct the interval.

        :param interval: The interval in steps, or None to disable triggering.
        """
        self.global_step = None
        self.interval = interval

        # With no interval the hook is inert: after_run becomes a no-op.
        if interval is not None:
            self.timer = SecondOrStepTimer(every_steps=interval)
        else:
            self.timer = None

    def begin(self):
        self.global_step = tf.train.get_or_create_global_step()

    def before_run(self, run_context):
        # Always fetch the global step first, then any subclass extras.
        return tf.train.SessionRunArgs(
            [self.global_step, *self.session_run_args(run_context)])

    # noinspection PyMethodMayBeStatic, PyUnusedLocal
    def session_run_args(self, run_context):  # pylint: disable=unused-argument
        """
        Create the session run arguments.

        :param run_context: The run context.
        :return: The list of arguments to run.
        """
        return list()

    def after_run(self, run_context, run_values):
        if self.interval is None:
            return

        global_step = run_values.results[0]
        if self.timer.should_trigger_for_step(global_step):
            self.timer.update_last_triggered_step(global_step)
            self.run_interval_operations(run_context, run_values.results[1:],
                                         global_step)

    # Fixed: `abc.abstract` does not exist — the decorator is
    # `abc.abstractmethod`; the old spelling raised AttributeError at class
    # definition time.
    @abc.abstractmethod
    def run_interval_operations(self, run_context, results, global_step):
        """
        The method to override.

        :param run_context: The run context.
        :param results: The results of running the given arguments.
        :param global_step: The evaluated global step tensor.
        """
        pass
Esempio n. 11
0
    def __init__(
        self,
        every_n_steps=None,
        every_n_secs=None):
        """Validates the interval arguments and builds the trigger timer."""
        super(BaseHook, self).__init__()
        # Allowed combinations of the two intervals are checked up front.
        validate_every_n(every_n_steps, every_n_secs)
        self._timer = SecondOrStepTimer(
            every_secs=every_n_secs,
            every_steps=every_n_steps)
Esempio n. 12
0
 def __init__(self, params, model_dir, run_config):
   """Sets up the sample output directory, trigger timer, and run state."""
   super(TrainSampleHook, self).__init__(params, model_dir, run_config)
   self._sample_dir = os.path.join(self.model_dir, "samples")
   # Step-based or wall-clock-based triggering, per hook params.
   self._timer = SecondOrStepTimer(
       every_secs=self.params["every_n_secs"],
       every_steps=self.params["every_n_steps"])
   self._source_delimiter = self.params["source_delimiter"]
   self._target_delimiter = self.params["target_delimiter"]
   self._pred_dict = {}
   self._should_trigger = False
   self._iter_count = 0
   self._global_step = None
Esempio n. 13
0
    def __init__(self, interval):
        """
        Construct the interval.

        :param interval: The interval in steps; None disables the timer.
        """
        self.global_step = None
        self.interval = interval
        # Only build a timer when an interval was actually requested.
        self.timer = (SecondOrStepTimer(every_steps=interval)
                      if interval is not None else None)
Esempio n. 14
0
    def __init__(self, params, model_dir, summary_writer=None):
        """Initializes the tokens/sec counter state and its summary writer."""
        super(TokensPerSecondCounter, self).__init__(params, model_dir)
        self._summary_tag = "tokens/sec"
        self._timer = SecondOrStepTimer(
            every_steps=self.params["every_n_steps"],
            every_secs=self.params["every_n_secs"])
        self._tokens_last_step = 0
        # Fall back to the cached writer for model_dir when none was given.
        if summary_writer is not None:
            self._summary_writer = summary_writer
        elif self.model_dir:
            self._summary_writer = SummaryWriterCache.get(self.model_dir)
        else:
            self._summary_writer = None
Esempio n. 15
0
    def __init__(self, estimator, eval_features, max_seq_length, eval_steps,
                 save_model_dir, th, output_dir):
        """Creates the hook, ensuring `save_model_dir` exists.

        Args:
          estimator: estimator used for evaluation.
          eval_features: pre-built evaluation features.
          max_seq_length: maximum input sequence length.
          eval_steps: evaluate every N global steps (drives the timer).
          save_model_dir: directory for saved models; created if missing.
          th: threshold value used by the evaluation logic.
          output_dir: directory for evaluation outputs.
        """
        self.estimator = estimator
        self.eval_features = eval_features
        self.max_seq_length = max_seq_length
        self.eval_steps = eval_steps
        self.save_model_dir = save_model_dir
        self.th = th
        self.output_dir = output_dir

        # Idiomatic existence check (was `os.path.exists(...) is False`).
        if not os.path.exists(self.save_model_dir):
            os.mkdir(self.save_model_dir)
        self._timer = SecondOrStepTimer(every_steps=eval_steps)
        self._steps_per_run = 1
        self._global_step_tensor = None
Esempio n. 16
0
class BestSaverHook(tf.train.CheckpointSaverHook):
    """Checkpoint saver with early stopping driven by its first listener.

    The first element of `listeners` is polled via `should_stop(session)`
    after every run; a truthy result requests that training stop.  Saving
    reuses the timer/stale-step double-check pattern of
    `tf.train.CheckpointSaverHook`.
    """

    def __init__(self, checkpoint_dir, save_secs=None, save_steps=None, saver=None,
                 checkpoint_basename="model.ckpt", scaffold=None, listeners=None):
        # NOTE(review): assumes `listeners` is a non-empty sequence — a None
        # or empty value raises here; confirm all callers pass one.
        self.saver_listener = listeners[0]
        super(BestSaverHook, self).__init__(checkpoint_dir, save_secs, save_steps, saver,
                                            checkpoint_basename, scaffold, listeners)

        logging.info("Create CheckpointSaverHook.")
        if saver is not None and scaffold is not None:
            raise ValueError("You cannot provide both saver and scaffold.")
        # The assignments below largely repeat work already done by the
        # parent __init__; they are kept to pin these attributes explicitly.
        self._saver = saver
        self._checkpoint_dir = checkpoint_dir
        self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
        self._scaffold = scaffold
        self._timer = SecondOrStepTimer(every_secs=save_secs,
                                        every_steps=save_steps)
        self._listeners = listeners or []

        print('__init__ listeners:{}, {}'.format(len(listeners), len(self._listeners)))

    # def after_run(self, run_context, run_values):
    #     print('EarlyStoppingHook:{}'.format(run_values.results))
    #     super(EarlyStoppingHook, self).after_run(run_context, run_values)
    #     if self.saver_listener.should_stop():
    #         run_context.request_stop()

    def after_run(self, run_context, run_values):
        # `run_values.results` holds the global step fetched before the
        # train op ran, so it may be stale by one step — hence the +1
        # pre-check followed by a re-read of the true step.
        # print('EarlyStoppingHook:{}'.format(run_values.results))
        stale_global_step = run_values.results
        if self._timer.should_trigger_for_step(stale_global_step+1):
            global_step = run_context.session.run(self._global_step_tensor)
            if self._timer.should_trigger_for_step(global_step):
                self._timer.update_last_triggered_step(global_step)
                self._save(run_context.session, global_step)

        # Poll the early-stopping listener after every run.
        if self.saver_listener.should_stop(run_context.session):
            print('early stop')
            run_context.request_stop()

    def _save(self, session, step):
        """Saves the latest checkpoint."""
        # Listener callbacks bracket the actual save; the CHECKPOINT session
        # log entry records that a checkpoint was written at this step.
        self.saver_listener.before_save(session, step)
        self._get_saver().save(session, self._save_path, global_step=step)
        self._summary_writer.add_session_log(SessionLog(status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path), step)
        self.saver_listener.after_save(session, step)
        logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
Esempio n. 17
0
class MetadataHook(SessionRunHook):
  """Periodically collects full-trace run metadata and writes it for TensorBoard."""

  def __init__(self, save_steps=None, save_secs=None, output_dir=""):
    self._output_tag = "step-{}"
    self._output_dir = output_dir
    self._timer = SecondOrStepTimer(every_secs=save_secs, every_steps=save_steps)

  def begin(self):
    self._next_step = None
    self._global_step_tensor = training_util.get_global_step()
    self._writer = tf.summary.FileWriter(self._output_dir, tf.get_default_graph())
    if self._global_step_tensor is None:
      raise RuntimeError("Global step should be created to use ProfilerHook.")

  def before_run(self, run_context):
    # Trace the very first step, then whatever the timer schedules.
    self._request_summary = (
        self._next_step is None
        or self._timer.should_trigger_for_step(self._next_step))
    opts = None
    if self._request_summary:
      opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    requests = {"global_step": self._global_step_tensor}
    return SessionRunArgs(requests, options=opts)

  def after_run(self, run_context, run_values):
    stale_global_step = run_values.results["global_step"]
    global_step = stale_global_step + 1
    if self._request_summary:
      # Re-read the true step after the train op, then attach the metadata.
      global_step = run_context.session.run(self._global_step_tensor)
      self._writer.add_run_metadata(run_values.run_metadata,
                                    self._output_tag.format(global_step))
      self._writer.flush()
    self._next_step = global_step + 1

  def end(self, session):
    self._writer.close()
Esempio n. 18
0
    def __init__(self, checkpoint_dir, save_secs=None, save_steps=None, saver=None,
                 checkpoint_basename="model.ckpt", scaffold=None, listeners=None):
        """Initializes the saver hook; the first listener drives early stop."""
        # NOTE(review): assumes `listeners` is a non-empty sequence.
        self.saver_listener = listeners[0]
        super(BestSaverHook, self).__init__(checkpoint_dir, save_secs, save_steps, saver,
                                            checkpoint_basename, scaffold, listeners)

        logging.info("Create CheckpointSaverHook.")
        if saver is not None and scaffold is not None:
            raise ValueError("You cannot provide both saver and scaffold.")
        # Pin saver/path/timer attributes explicitly on this instance.
        self._saver = saver
        self._checkpoint_dir = checkpoint_dir
        self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
        self._scaffold = scaffold
        self._timer = SecondOrStepTimer(every_secs=save_secs,
                                        every_steps=save_steps)
        self._listeners = listeners if listeners else []

        print('__init__ listeners:{}, {}'.format(len(listeners), len(self._listeners)))
Esempio n. 19
0
    def __init__(self,
                 estimator,
                 dev_features,
                 dev_label,
                 dev_cid,
                 max_seq_length,
                 th=82.0,
                 eval_steps=None,
                 checkpoint_dir=None,
                 model_name=None,
                 _input_fn_builder=None,
                 tail_num=0,
                 type_word=''):
        """Creates the eval hook, preparing log and checkpoint directories.

        Args:
          estimator: estimator used for evaluation.
          dev_features / dev_label / dev_cid: dev-set inputs and targets.
          max_seq_length: maximum input sequence length.
          th: score threshold used by the evaluation logic.
          eval_steps: evaluate every N global steps (drives the timer).
          checkpoint_dir: directory used as the save path; created if absent.
          model_name: used to build log/output names.
          _input_fn_builder: optional override for the module-level
              `input_fn_builder`.
          tail_num: suffix number appended to the output directory name.
          type_word: infix used in output and log file names.
        """
        logging.info("Create EvalHook.")
        self.estimator = estimator
        self.dev_features = dev_features
        self.dev_label = dev_label
        self.dev_cid = dev_cid
        self.max_seq_length = max_seq_length
        self.th = th
        self._checkpoint_dir = checkpoint_dir
        # Idiomatic existence checks (was `os.path.exists(...) is False`).
        if not os.path.exists('./EVAL_LOG'):
            os.mkdir('./EVAL_LOG')
        self.model_name = model_name
        self.tail_num = tail_num
        self.org_dir = "CQA_" + type_word + self.model_name + "_{}".format(
            self.tail_num)

        self._log_save_path = os.path.join(
            './EVAL_LOG', model_name + '_' + type_word + '_log')
        self._save_path = checkpoint_dir
        if not os.path.exists(self._save_path):
            os.mkdir(self._save_path)
        self._timer = SecondOrStepTimer(every_steps=eval_steps)
        self._steps_per_run = 1
        self._global_step_tensor = None
        self._saver = None

        if _input_fn_builder is not None:
            self.input_fn_builder = _input_fn_builder
        else:
            self.input_fn_builder = input_fn_builder
Esempio n. 20
0
    def __init__(self,
                 id_to_enzyme_class,
                 every_n_steps=1200,
                 every_n_secs=None,
                 output_dir=None,
                 summary_writer=None,
                 n_examples=2,
                 running_mode="train"):
        """Initializes periodic-summary state for enzyme-class reporting."""
        # Triggers every `every_n_steps` steps or `every_n_secs` seconds.
        self._timer = SecondOrStepTimer(every_steps=every_n_steps,
                                        every_secs=every_n_secs)

        self.summary_writer = summary_writer
        self.output_dir = output_dir
        self.last_global_step = None
        self.id_to_enzyme_class = id_to_enzyme_class
        self.global_step_check_count = 0
        self.steps_per_run = 1
        # NOTE(review): the trailing comma makes this a one-element tuple;
        # consumers presumably unwrap it with `[0]` — confirm before fixing.
        self.n_examples = n_examples,
        self.running_mode = running_mode
Esempio n. 21
0
 def __init__(self, params, model_dir, run_config):
   """Prepares the sample directory, trigger timer, and iteration state."""
   super(TrainSampleHook, self).__init__(params, model_dir, run_config)
   self._sample_dir = os.path.join(self.model_dir, "samples")
   # Triggering is either step-based or wall-clock-based, per hook params.
   self._timer = SecondOrStepTimer(
       every_secs=self.params["every_n_secs"],
       every_steps=self.params["every_n_steps"])
   self._source_delimiter = self.params["source_delimiter"]
   self._target_delimiter = self.params["target_delimiter"]
   self._pred_dict = {}
   self._should_trigger = False
   self._iter_count = 0
   self._global_step = None
Esempio n. 22
0
    def __init__(self,
                 every_n_iter=None,
                 every_n_secs=None,
                 at_end=False,
                 formatter=None,
                 logging_mode=LoggingMode.LAST,
                 feed_name="logging_hook",
                 replication_factor=1):
        """Initializes the hook.

    Args:
      every_n_iter: `int`, print the tensor values once every N steps.
      every_n_secs: `int` or `float`, print the tensor values once every N
        seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
        provided (unless `at_end` is True).
      at_end: `bool`, whether to print the tensor values at the end of the
        run.
      formatter: function taking a dict of tensor names and values and
        returning a string; `None` means default formatting.
      logging_mode: `IPULoggingTensorHook.LoggingMode` controlling behaviour
        when multiple values are enqueued between dequeues (print all of
        them or only the last one).
      feed_name: `string`, unique name for the outfeed queue.
      replication_factor: `int`, number of replicas logging is performed
        from.
    """
        if every_n_iter is not None and every_n_secs is not None:
            raise ValueError(
                "Cannot provide both every_n_iter and every_n_secs")
        if every_n_iter is None and every_n_secs is None and not at_end:
            raise ValueError(
                "Either every_n_iter, every_n_secs or at_end should be provided"
            )

        # With no periodic interval at all, logging happens only at the end.
        only_log_at_end = (at_end and every_n_iter is None
                           and every_n_secs is None)
        if only_log_at_end:
            self._timer = NeverTriggerTimer()
        else:
            self._timer = SecondOrStepTimer(every_secs=every_n_secs,
                                            every_steps=every_n_iter)
        self._log_at_end = at_end
        self._formatter = formatter

        self._outfeed = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name,
            outfeed_mode=logging_mode,
            replication_factor=replication_factor)

        self._dequeue_op = None
        self._deleter_op = None
        self._iter_count = 0
Esempio n. 23
0
class MetadataHook(SessionRunHook):
    """Traces every session run and logs per-run inference latency.

    Attaches FULL_TRACE run options on every step, logs wall-clock time
    around each run, and numbers runs with a local counter (the global step
    tensor is resolved but not fetched).
    """

    def __init__(self, save_steps=None, save_secs=None, output_dir=""):
        """Sets up output naming, the trigger timer, and timing state."""
        self._output_tag = "blah-{}"
        self._output_dir = output_dir
        self._timer = SecondOrStepTimer(every_secs=save_secs,
                                        every_steps=save_steps)
        self._atomic_counter = 0
        self.start_time = None

    def begin(self):
        self._next_step = None
        self._global_step_tensor = training_util.get_global_step()
        self._writer = tf.summary.FileWriter(self._output_dir,
                                             tf.get_default_graph())

        if self._global_step_tensor is None:
            raise RuntimeError(
                "Global step should be created to use ProfilerHook.")

    def before_run(self, run_context):
        self._request_summary = (self._next_step is None
                                 or self._timer.should_trigger_for_step(
                                     self._next_step))
        # No tensors are fetched; only full-trace run options are attached.
        requests = {}
        opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        self.start_time = time.time()
        date_time = datetime.datetime.utcfromtimestamp(
            self.start_time).strftime('%Y-%m-%d %H:%M:%S')
        tf.logging.info(f'Before Run: {date_time}')
        return SessionRunArgs(requests, options=opts)

    def after_run(self, run_context, run_values):
        tf.logging.info(f'Inference Time: {time.time() - self.start_time}')
        # Single increment replaces the previous duplicated "+ 1" pair
        # (dead commented-out metadata-writing code removed).
        self._atomic_counter += 1
        global_step = self._atomic_counter
        self._next_step = global_step + 1

    def end(self, session):
        self._writer.close()
Esempio n. 24
0
 def __init__(self,
              save_steps=None,
              save_secs=None,
              output_dir="",
              show_dataflow=True,
              show_memory=False):
     """Initializes a hook that takes periodic profiling snapshots.

     Args:
       save_steps: `int`, save profile traces every N steps. Exactly one of
           `save_secs` and `save_steps` should be set.
       save_secs: `int`, save profile traces every N seconds.
       output_dir: `string`, directory to save the profile traces to
           (defaults to the current directory).
       show_dataflow: `bool`, if True, add flow events to the trace
           connecting producers and consumers of tensors.
       show_memory: `bool`, if True, add object snapshot events showing
           the sizes and lifetimes of tensors.
     """
     self._show_dataflow = show_dataflow
     self._show_memory = show_memory
     # Traces are written as timeline-<step>.json inside output_dir.
     self._output_file = os.path.join(output_dir, "timeline-{}.json")
     self._timer = SecondOrStepTimer(every_secs=save_secs,
                                     every_steps=save_steps)
    def __init__(self,
                 estimator,
                 dev_file,
                 org_dev_file,
                 eval_features,
                 eval_steps=100,
                 max_seq_length=300,
                 max_answer_length=15,
                 checkpoint_dir=None,
                 input_fn_builder=None,
                 th=86,
                 model_name=None):
        """Creates the QA eval hook, preparing log and checkpoint directories.

        Args:
          estimator: estimator used for evaluation.
          dev_file: processed dev-set file.
          org_dev_file: original dev-set file.
          eval_features: pre-built evaluation features.
          eval_steps: evaluate every N global steps (drives the timer).
          max_seq_length: maximum input sequence length.
          max_answer_length: maximum predicted answer span length.
          checkpoint_dir: base directory for saved checkpoints.
          input_fn_builder: builder for the evaluation input function.
          th: score threshold used by the evaluation logic.
          model_name: used to name log and save directories.
        """
        self.estimator = estimator
        self.max_seq_length = max_seq_length
        self.max_answer_length = max_answer_length
        self.dev_file = dev_file
        self.org_dev_file = org_dev_file
        self.eval_features = eval_features
        self.th = th
        self.checkpoint_dir = checkpoint_dir
        self.org_dir = model_name
        # Idiomatic existence checks (was `os.path.exists(...) is False`).
        if not os.path.exists("./EVAL_LOG"):
            os.mkdir("./EVAL_LOG")
        if not os.path.exists(self.checkpoint_dir):
            os.mkdir(self.checkpoint_dir)
        self._log_save_path = os.path.join("./EVAL_LOG", model_name)
        self.save_path = os.path.join(self.checkpoint_dir, model_name)
        if not os.path.exists(self.save_path):
            os.mkdir(self.save_path)

        self._timer = SecondOrStepTimer(every_steps=eval_steps)
        self._steps_per_run = 1
        self._global_step_tensor = None

        self.input_fn_builder = input_fn_builder
Esempio n. 26
0
 def __init__(self,
              model,
              sample_img,
              img_path,
              every_n_iter=None,
              every_n_secs=None):
     '''
     Args:
       model : In order to retrieve `model.epoch_id` and `model.batch_id` for naming.
       sample_img : `Tensor`, sample images to save.
       img_path: 'String', path containing the directory and filename prefix
       every_n_iter: `int`, save the sample images every N local steps.
       every_n_secs: `int` or `float`, save sample images every N seconds.
             Exactly one of `every_n_iter` and `every_n_secs` should be provided.
     '''
     self.model = model
     self.sample_img = sample_img
     self.img_path = img_path
     # Pick a near-square grid guaranteed to hold all samples.  The previous
     # (int(sqrt(n)), ceil(sqrt(n))) pair could be too small, e.g. n=7 gave
     # a 2x3 grid with room for only 6 images.
     n = sample_img.get_shape().as_list()[0]
     rows = max(int(math.sqrt(n)), 1)
     cols = math.ceil(n / rows)
     self.grid_size = (rows, cols)
     self._timer = SecondOrStepTimer(every_secs=every_n_secs,
                                     every_steps=every_n_iter)
Esempio n. 27
0
    def __init__(self,
                 user_info,
                 server_info,
                 every_n_iter=None,
                 every_n_secs=None,
                 at_end=False):
        """Validates the logging schedule and initializes timer state."""
        # Log only at the end when at_end is set and no interval was given.
        only_log_at_end = (at_end and every_n_iter is None
                           and every_n_secs is None)
        if (not only_log_at_end
                and (every_n_iter is None) == (every_n_secs is None)):
            raise ValueError(
                "either at_end and/or exactly one of every_n_iter and every_n_secs "
                "must be provided.")
        if every_n_iter is not None and every_n_iter <= 0:
            raise ValueError("invalid every_n_iter=%s." % every_n_iter)

        if only_log_at_end:
            self._timer = NeverTriggerTimer()
        else:
            self._timer = SecondOrStepTimer(every_secs=every_n_secs,
                                            every_steps=every_n_iter)
        self._log_at_end = at_end
        self._user_info = user_info
        self._server_info = server_info
        self._iter_count = 0
        self._timer.reset()
Esempio n. 28
0
  def __init__(self,
               save_steps=None,
               save_secs=None,
               output_dir="",
               show_dataflow=True,
               show_memory=False):
    """Initializes a hook that takes periodic profiling snapshots.

    Args:
      save_steps: `int`, save profile traces every N steps. Exactly one of
          `save_secs` and `save_steps` should be set.
      save_secs: `int`, save profile traces every N seconds.
      output_dir: `string`, directory the traces are written to (defaults
          to the current directory).
      show_dataflow: `bool`, if True, add flow events connecting producers
          and consumers of tensors.
      show_memory: `bool`, if True, add object snapshot events showing the
          sizes and lifetimes of tensors.
    """
    self._show_dataflow = show_dataflow
    self._show_memory = show_memory
    # Traces are written as timeline-<step>.json inside output_dir.
    self._output_file = os.path.join(output_dir, "timeline-{}.json")
    self._timer = SecondOrStepTimer(every_secs=save_secs,
                                    every_steps=save_steps)
Esempio n. 29
0
class EvalHook(SessionRunHook):
    """Periodically evaluates an Estimator on a fixed dev set during training.

    The hook tracks the global step with a ``SecondOrStepTimer`` and, every
    ``eval_steps`` steps, runs prediction over ``dev_features`` and logs
    classification and reranking metrics to ``./EVAL_LOG/<model_name>_log``.
    ``end()`` runs one final evaluation if the last step was not already
    evaluated.
    """

    def __init__(self,
                 estimator,
                 dev_features,
                 dev_label,
                 dev_cid,
                 max_seq_length,
                 eval_steps=None,
                 checkpoint_dir=None,
                 model_name=None,
                 _input_fn_builder=None,
                 checkpoint_basename="eval.log"):
        """Creates the hook.

        Args:
          estimator: the Estimator used for prediction on the dev set.
          dev_features: dev-set features forwarded to the input_fn builder.
          dev_label: ground-truth labels for the dev set.
          dev_cid: per-example ids passed to ``eval_reranker`` (presumably
              question/group ids for ranking -- confirm against caller).
          max_seq_length: sequence length forwarded to the input_fn builder.
          eval_steps: evaluate every this many global steps.
          checkpoint_dir: optional checkpoint directory (stored, unused here).
          model_name: prefix of the metrics log file under ./EVAL_LOG.
          _input_fn_builder: optional override for the module-level
              ``input_fn_builder``.
          checkpoint_basename: kept for interface compatibility (unused).
        """
        logging.info("Create EvalHook.")
        self.estimator = estimator
        self.dev_features = dev_features
        self.dev_label = dev_label
        self.dev_cid = dev_cid
        self.max_seq_length = max_seq_length
        self._checkpoint_dir = checkpoint_dir
        # makedirs(exist_ok=True) is race-free, unlike the previous
        # exists()-then-mkdir pattern, if several processes start at once.
        os.makedirs('./EVAL_LOG', exist_ok=True)
        self._save_path = os.path.join('./EVAL_LOG', model_name + '_log')
        self._timer = SecondOrStepTimer(every_steps=eval_steps)
        self._steps_per_run = 1
        self._global_step_tensor = None

        if _input_fn_builder is not None:
            self.input_fn_builder = _input_fn_builder
        else:
            self.input_fn_builder = input_fn_builder

    def _set_steps_per_run(self, steps_per_run):
        # How many global steps each session.run advances (e.g. >1 on TPU).
        self._steps_per_run = steps_per_run

    def begin(self):
        self._global_step_tensor = get_global_step()  # pylint: disable=protected-access
        if self._global_step_tensor is None:
            raise RuntimeError(
                "Global step should be created to use EvalHook.")

    def before_run(self, run_context):  # pylint: disable=unused-argument
        # Fetch the (possibly stale) global step alongside the train op.
        return SessionRunArgs(self._global_step_tensor)

    def after_run(self, run_context, run_values):
        stale_global_step = run_values.results
        # Cheap pre-check on the stale value avoids an extra session.run on
        # most iterations; re-check with the fresh value before triggering.
        if self._timer.should_trigger_for_step(
                stale_global_step + self._steps_per_run):
            # get the real value after train op.
            global_step = run_context.session.run(self._global_step_tensor)
            if self._timer.should_trigger_for_step(global_step):
                self._timer.update_last_triggered_step(global_step)
                self.evaluation(global_step)

    def end(self, session):
        # Evaluate the final model state unless that step was just evaluated.
        last_step = session.run(self._global_step_tensor)
        if last_step != self._timer.last_triggered_step():
            self.evaluation(last_step)

    def evaluation(self, global_step):
        """Runs prediction on the dev set and logs metrics for this step."""
        eval_input_fn = self.input_fn_builder(
            features=self.dev_features,
            seq_length=self.max_seq_length,
            is_training=False,
            drop_remainder=False)

        predictions = self.estimator.predict(eval_input_fn,
                                             yield_single_examples=False)
        # predict() yields one array per batch; stack into a single matrix.
        res = np.concatenate(list(predictions), axis=0)

        metrics = PRF(np.array(self.dev_label), res.argmax(axis=-1))

        print('\n Global step is : ', global_step)
        MAP, AvgRec, MRR = eval_reranker(self.dev_cid, self.dev_label,
                                         res[:, 0])

        metrics['MAP'] = MAP
        metrics['AvgRec'] = AvgRec
        metrics['MRR'] = MRR

        metrics['global_step'] = global_step

        print_metrics(metrics, 'dev', save_dir=self._save_path)
Esempio n. 30
0
class ProfilerHook(session_run_hook.SessionRunHook):
  """Writes CPU/GPU profiling snapshots every N steps or seconds.

  Each snapshot is a file named "timeline-<step>.json" in Chrome Trace
  format.

  For more information see:
  https://github.com/catapult-project/catapult/blob/master/tracing/README.md"""

  def __init__(self,
               save_steps=None,
               save_secs=None,
               output_dir="",
               show_dataflow=True,
               show_memory=False):
    """Creates the profiling hook.

    Args:
      save_steps: `int`, save profile traces every N steps. Exactly one of
          `save_secs` and `save_steps` should be set.
      save_secs: `int`, save profile traces every N seconds.
      output_dir: `string`, directory the trace files are written to.
          Defaults to the current directory.
      show_dataflow: `bool`, if True, add flow events linking tensor
          producers with their consumers.
      show_memory: `bool`, if True, add object snapshot events with the
          sizes and lifetimes of tensors.
    """
    self._show_dataflow = show_dataflow
    self._show_memory = show_memory
    self._output_file = os.path.join(output_dir, "timeline-{}.json")
    self._timer = SecondOrStepTimer(every_secs=save_secs,
                                    every_steps=save_steps)

  def begin(self):
    self._next_step = None
    self._global_step_tensor = training_util.get_global_step()
    if self._global_step_tensor is None:
      raise RuntimeError(
          "Global step should be created to use ProfilerHook.")

  def before_run(self, run_context):
    # Always trace the very first run; afterwards defer to the timer.
    if self._next_step is None:
      self._request_summary = True
    else:
      self._request_summary = self._timer.should_trigger_for_step(
          self._next_step)
    opts = None
    if self._request_summary:
      opts = config_pb2.RunOptions(
          trace_level=config_pb2.RunOptions.FULL_TRACE)
    return SessionRunArgs({"global_step": self._global_step_tensor},
                          options=opts)

  def after_run(self, run_context, run_values):
    step = run_values.results["global_step"]
    if self._request_summary:
      self._timer.update_last_triggered_step(step)
      self._save(step,
                 self._output_file.format(step),
                 run_values.run_metadata.step_stats)
    self._next_step = step + 1

  def _save(self, step, save_path, step_stats):
    # Serialize the collected step stats into Chrome trace JSON.
    logging.info("Saving timeline for %d into '%s'.", step, save_path)
    trace = timeline.Timeline(step_stats)
    with gfile.Open(save_path, "w") as f:
      f.write(trace.generate_chrome_trace_format(
          show_dataflow=self._show_dataflow,
          show_memory=self._show_memory))
Esempio n. 31
0
class TrainSampleHook(TrainingHook):
  """Occasionally samples predictions from the training run and prints them.

  Params:
    every_n_secs: Sample predictions every N seconds.
      If set, `every_n_steps` must be None.
    every_n_steps: Sample predictions every N steps.
      If set, `every_n_secs` must be None.
    sample_dir: Optional, a directory to write samples to.
    delimiter: Join tokens on this delimiter. Defaults to space.
  """

  #pylint: disable=missing-docstring

  def __init__(self, params, model_dir, run_config):
    super(TrainSampleHook, self).__init__(params, model_dir, run_config)
    # Sample text files are written under <model_dir>/samples.
    self._sample_dir = os.path.join(self.model_dir, "samples")
    self._timer = SecondOrStepTimer(
        every_secs=self.params["every_n_secs"],
        every_steps=self.params["every_n_steps"])
    self._pred_dict = {}
    self._should_trigger = False
    self._iter_count = 0
    self._global_step = None
    self._source_delimiter = self.params["source_delimiter"]
    self._target_delimiter = self.params["target_delimiter"]

  @staticmethod
  def default_params():
    """Default hook parameters: sample every 1000 steps, space delimiters."""
    return {
        "every_n_secs": None,
        "every_n_steps": 1000,
        "source_delimiter": " ",
        "target_delimiter": " "
    }

  def begin(self):
    """Resets the step counter and resolves the tensors to fetch."""
    self._iter_count = 0
    self._global_step = tf.train.get_global_step()
    self._pred_dict = graph_utils.get_dict_from_collection("predictions")
    # Create the sample directory
    if self._sample_dir is not None:
      gfile.MakeDirs(self._sample_dir)

  def before_run(self, _run_context):
    """Requests prediction tensors only on steps the timer triggers."""
    self._should_trigger = self._timer.should_trigger_for_step(self._iter_count)
    if self._should_trigger:
      fetches = {
          "predicted_tokens": self._pred_dict["predicted_tokens"],
          "target_words": self._pred_dict["labels.target_tokens"],
          "target_len": self._pred_dict["labels.target_len"]
      }
      return tf.train.SessionRunArgs([fetches, self._global_step])
    # Still fetch the global step so _iter_count keeps tracking it.
    return tf.train.SessionRunArgs([{}, self._global_step])

  def after_create_session(self, session, coord):
    # graph.finalize() makes the graph read-only, so any later attempt to
    # add ops raises instead of silently growing the graph.
    print("Session created. Finalizing graph.")
    session.graph.finalize()

  def after_run(self, _run_context, run_values):
    """Formats sampled predictions next to targets; logs and optionally saves."""
    result_dict, step = run_values.results
    self._iter_count = step

    if not self._should_trigger:
      return None

    # Convert dict of lists to list of dicts
    result_dicts = [
        dict(zip(result_dict, t)) for t in zip(*result_dict.values())
    ]

    # Print results
    result_str = ""
    result_str += "Prediction followed by Target @ Step {}\n".format(step)
    result_str += ("=" * 100) + "\n"
    for result in result_dicts:
      target_len = result["target_len"]
      # Drops the last predicted token and the first target token --
      # presumably sequence start/end markers; confirm against the model's
      # token layout.
      predicted_slice = result["predicted_tokens"][:target_len - 1]
      target_slice = result["target_words"][1:target_len]
      result_str += self._target_delimiter.encode("utf-8").join(
          predicted_slice).decode("utf-8") + "\n"
      result_str += self._target_delimiter.encode("utf-8").join(
          target_slice).decode("utf-8") + "\n\n"
    result_str += ("=" * 100) + "\n\n"
    tf.logging.info(result_str)
    if self._sample_dir:
      filepath = os.path.join(self._sample_dir,
                              "samples_{:06d}.txt".format(step))
      with gfile.GFile(filepath, "w") as file:
        file.write(result_str)
    # NOTE(review): the timer is updated with step - 1, so the next trigger
    # is computed relative to the step before this sample -- confirm this
    # off-by-one is intentional.
    self._timer.update_last_triggered_step(self._iter_count - 1)
Esempio n. 32
0
class TrainSampleHook(TrainingHook):
  """Periodically samples model predictions during training and logs them
  alongside their targets.

  Params:
    every_n_secs: Sample predictions every N seconds.
      If set, `every_n_steps` must be None.
    every_n_steps: Sample predictions every N steps.
      If set, `every_n_secs` must be None.
    sample_dir: Optional, a directory to write samples to.
    delimiter: Join tokens on this delimiter. Defaults to space.
  """

  #pylint: disable=missing-docstring

  def __init__(self, params, model_dir, run_config):
    super(TrainSampleHook, self).__init__(params, model_dir, run_config)
    self._timer = SecondOrStepTimer(
        every_secs=self.params["every_n_secs"],
        every_steps=self.params["every_n_steps"])
    self._sample_dir = os.path.join(self.model_dir, "samples")
    self._source_delimiter = self.params["source_delimiter"]
    self._target_delimiter = self.params["target_delimiter"]
    self._pred_dict = {}
    self._should_trigger = False
    self._iter_count = 0
    self._global_step = None

  @staticmethod
  def default_params():
    """Default hook parameters: sample every 1000 steps, space delimiters."""
    return {
        "every_n_secs": None,
        "every_n_steps": 1000,
        "source_delimiter": " ",
        "target_delimiter": " "
    }

  def begin(self):
    """Resets counters, resolves tensors, and prepares the output directory."""
    self._iter_count = 0
    self._global_step = tf.train.get_global_step()
    self._pred_dict = graph_utils.get_dict_from_collection("predictions")
    if self._sample_dir is not None:
      # Make sure the directory for sample files exists up front.
      gfile.MakeDirs(self._sample_dir)

  def before_run(self, _run_context):
    """Fetches prediction tensors only when the timer says to sample."""
    self._should_trigger = self._timer.should_trigger_for_step(self._iter_count)
    fetches = {}
    if self._should_trigger:
      fetches = {
          "predicted_tokens": self._pred_dict["predicted_tokens"],
          "target_words": self._pred_dict["labels.target_tokens"],
          "target_len": self._pred_dict["labels.target_len"]
      }
    # The global step is always fetched so _iter_count stays in sync.
    return tf.train.SessionRunArgs([fetches, self._global_step])

  def after_run(self, _run_context, run_values):
    """Renders fetched samples as text, logs them, and optionally saves them."""
    fetched, step = run_values.results
    self._iter_count = step

    if not self._should_trigger:
      return None

    # Turn the dict of batched arrays into one dict per example.
    per_example = [dict(zip(fetched, row)) for row in zip(*fetched.values())]

    joiner = self._target_delimiter.encode("utf-8")
    pieces = ["Prediction followed by Target @ Step {}\n".format(step),
              ("=" * 100) + "\n"]
    for example in per_example:
      length = example["target_len"]
      prediction = example["predicted_tokens"][:length - 1]
      target = example["target_words"][1:length]
      pieces.append(joiner.join(prediction).decode("utf-8") + "\n")
      pieces.append(joiner.join(target).decode("utf-8") + "\n\n")
    pieces.append(("=" * 100) + "\n\n")
    result_str = "".join(pieces)

    tf.logging.info(result_str)
    if self._sample_dir:
      out_path = os.path.join(self._sample_dir,
                              "samples_{:06d}.txt".format(step))
      with gfile.GFile(out_path, "w") as sample_file:
        sample_file.write(result_str)
    self._timer.update_last_triggered_step(self._iter_count - 1)
Esempio n. 33
0
 def __init__(self, save_steps=None, save_secs=None, output_dir=""):
     self._output_tag = "step-{}"
     self._output_dir = output_dir
     self._timer = SecondOrStepTimer(every_secs=save_secs,
                                     every_steps=save_steps)