Exemple #1
0
 def test_epochs(self):
   """Checks util.epochs with a fixed count, without shuffling, and by default."""
   # A one-shot generator of two zeros is expected to come back as five
   # identical epochs.
   five_epochs = [list(epoch)
                  for epoch in util.epochs((0 for _ in xrange(2)), 5)]
   self.assertEqual([[0, 0]] * 5, five_epochs)

   # With shuffle=False each epoch must reproduce the input order.
   in_order = util.epochs(xrange(5), shuffle=False)
   for _ in xrange(3):
     self.assertSequenceEqual(list(next(in_order)), xrange(5))

   # With default arguments the first epoch is still compared in input order;
   # the following epochs are only compared as sets.
   default_epochs = util.epochs(xrange(5))
   self.assertSequenceEqual(list(next(default_epochs)), xrange(5))
   for _ in xrange(2):
     self.assertEqual(set(next(default_epochs)), set(xrange(5)))
Exemple #2
0
 def test_epochs(self):
     """Exercises util.epochs: bounded count, shuffle=False, and defaults."""
     # Five epochs over a two-element generator of zeros.
     zeros = (0 for _ in xrange(2))
     observed = [list(epoch) for epoch in util.epochs(zeros, 5)]
     self.assertEqual([[0, 0]] * 5, observed)

     # Unshuffled epochs: every one must match the input sequence exactly.
     ordered = util.epochs(xrange(5), shuffle=False)
     for _ in xrange(3):
         current = list(next(ordered))
         self.assertSequenceEqual(current, xrange(5))

     # Default epochs: the first is checked for exact order, the rest only
     # for containing the same elements.
     defaulted = util.epochs(xrange(5))
     self.assertSequenceEqual(list(next(defaulted)), xrange(5))
     for _ in xrange(2):
         self.assertEqual(set(next(defaulted)), set(xrange(5)))
Exemple #3
0
 def _run(self, supervisor, session):
   """Evaluates self.examples once, or repeatedly every eval_interval_secs.

   If self.eval_interval_secs is falsy, a single evaluation is attempted
   (only when _restore succeeds). Otherwise the method loops: each pass
   restores, evaluates if the global step has advanced past the last
   reported one, and sleeps out the remainder of the interval. In both
   modes report_done() is called before returning.

   Args:
     supervisor: Forwarded to _should_stop, _restore, _eval_batches and
       _report_loss_and_save_best.
     session: Forwarded to the same helpers and used to read the global step.
   """
   batches = (  # generates (size, feed_dict) pairs
       (len(group), self.compiler.build_feed_dict(group))
       for group in util.group_by_batches(self.examples, self.batch_size))

   if not self.eval_interval_secs:
     # One-shot mode: evaluate the batches directly, without util.epochs.
     if self._restore(supervisor, session):
       step = tf.train.global_step(session, self.global_step)
       results = self._eval_batches(supervisor, session, batches, step)
       if results[0] is not None:
         self._report_loss_and_save_best(supervisor, session, step, *results)
     self.report_done()
     return

   gen_batches = util.epochs(batches, shuffle=False)  # memoize batches
   max_reported_step = 0
   while True:
     # Should eval for the final measurement even if _should_stop is true,
     # so stopping is only honoured once something has been reported.
     if self._should_stop(supervisor) and max_reported_step > 0:
       break
     start_time = time.time()
     if self._restore(supervisor, session):
       step = tf.train.global_step(session, self.global_step)
       if step <= max_reported_step:
         self.log_and_print('not running eval because step=%s' % step)
       else:
         max_reported_step = step
         results = self._eval_batches(
             supervisor, session, next(gen_batches), step)
         self._report_loss_and_save_best(supervisor, session, step, *results)
         if self._should_stop(supervisor):
           break
     # Sleep for whatever is left of the interval after this pass.
     remaining = self.eval_interval_secs - (time.time() - start_time)
     if remaining > 0:
       time.sleep(remaining)
   self.report_done()
Exemple #4
0
 def _run(self, supervisor, session):
     """Evaluates self.examples once, or periodically on an interval.

     With a truthy self.eval_interval_secs the method keeps restoring and
     evaluating whenever the global step has moved past the last reported
     step, sleeping between passes; otherwise it attempts one evaluation.
     report_done() is always called at the end.

     Args:
       supervisor: Forwarded to _should_stop, _restore, _eval_batches and
         _report_loss_and_save_best.
       session: Forwarded to the same helpers and used to read the global
         step.
     """
     batches = (  # generates (size, feed_dict) pairs
         (len(chunk), self.compiler.build_feed_dict(chunk))
         for chunk in util.group_by_batches(self.examples, self.batch_size))

     if self.eval_interval_secs:
         # memoize batches
         gen_batches = util.epochs(batches, shuffle=False)
         max_reported_step = 0
         while True:
             # Stop is only honoured after at least one report was made.
             if self._should_stop(supervisor) and max_reported_step > 0:
                 break
             start_time = time.time()
             if self._restore(supervisor, session):
                 step = tf.train.global_step(session, self.global_step)
                 if step <= max_reported_step:
                     message = 'not running eval because step=%s' % step
                     self.log_and_print(message)
                 else:
                     max_reported_step = step
                     eval_batches = next(gen_batches)
                     results = self._eval_batches(supervisor, session,
                                                  eval_batches, step)
                     if results[0] is None:
                         break  # should_stop returned true
                     self._report_loss_and_save_best(
                         supervisor, session, step, *results)
             # Wait out the rest of the interval, if any is left.
             elapsed = time.time() - start_time
             sleep_time = self.eval_interval_secs - elapsed
             if sleep_time > 0:
                 time.sleep(sleep_time)
     elif self._restore(supervisor, session):
         # One-shot mode: evaluate the batches directly.
         step = tf.train.global_step(session, self.global_step)
         results = self._eval_batches(supervisor, session, batches, step)
         if results[0] is not None:
             self._report_loss_and_save_best(supervisor, session, step,
                                             *results)
     self.report_done()
Exemple #5
0
 def _by_feed_dict(self, feed_dict):
   """Setup for reading training data from feed dictionaries.

   Args:
     feed_dict: Dictionary that is updated in place with each batch (and,
       when self.compute_summaries is set, with the batch length).

   Returns:
     A pair `(epochs, train_size)`. `epochs` is a lazy iterable with one
     generator per epoch; advancing that generator writes the next batch
     into `feed_dict` and yields None. `train_size` is the second value
     returned by `_lazy_length(self.examples)` (presumably the example
     count, resolved lazily -- confirm against _lazy_length).
   """
   def prepare_batches(shuffled):
     """Writes each batch of `shuffled` into feed_dict, yielding after each."""
     for batch in util.group_by_batches(shuffled, self.batch_size,
                                        truncate=self.exact_batch_sizes):
       feed_dict[self.compiler.loom_input_tensor] = batch
       if self.compute_summaries:
         feed_dict[self.batch_size_placeholder] = len(batch)
       yield
   examples, train_size = _lazy_length(self.examples)
   loom_inputs = self.compiler.build_loom_inputs(examples)
   # Use a generator expression instead of map(): the builtin map() is eager
   # on Python 2 and would drain util.epochs() before training starts (which
   # could hang if the epoch stream is unbounded). This form is lazy on both
   # Python 2 and Python 3.
   epochs = (prepare_batches(shuffled)
             for shuffled in util.epochs(loom_inputs, self.epochs))
   return epochs, train_size
Exemple #6
0
 def _by_feed_dict(self, feed_dict):
   """Setup for reading training data from feed dictionaries.

   Args:
     feed_dict: Dictionary that is updated in place with each batch (and,
       when self.compute_summaries is set, with the batch length).

   Returns:
     A pair `(epochs, train_size)`. `epochs` is a lazy iterable with one
     generator per epoch; advancing that generator writes the next batch
     into `feed_dict` and yields None. `train_size` is the second value
     returned by `_lazy_length(self.examples)` (presumably the example
     count, resolved lazily -- confirm against _lazy_length).
   """
   def prepare_batches(shuffled):
     """Writes each batch of `shuffled` into feed_dict, yielding after each."""
     for batch in util.group_by_batches(shuffled, self.batch_size,
                                        truncate=self.exact_batch_sizes):
       feed_dict[self.compiler.loom_input_tensor] = batch
       if self.compute_summaries:
         feed_dict[self.batch_size_placeholder] = len(batch)
       yield
   examples, train_size = _lazy_length(self.examples)
   loom_inputs = self.compiler.build_loom_inputs(examples)
   # Use a generator expression instead of map(): the builtin map() is eager
   # on Python 2 and would drain util.epochs() before training starts (which
   # could hang if the epoch stream is unbounded). This form is lazy on both
   # Python 2 and Python 3.
   epochs = (prepare_batches(shuffled)
             for shuffled in util.epochs(loom_inputs, self.epochs))
   return epochs, train_size
Exemple #7
0
  def _run(self, supervisor, session):
    """Runs the training loop, optionally evaluating on dev examples per epoch.

    For every epoch the train op is run once per batch and the summed loss is
    divided by train_size and reported. When self.dev_examples is set, the dev
    set is evaluated after each epoch and the result is handed to _save_best;
    otherwise the chief trainer saves a final checkpoint after the last epoch.
    The method returns early (without saving) whenever _should_stop is true
    or an evaluation signals that it was stopped.

    Args:
      supervisor: Passed to _should_stop and _eval_batches; also receives
        computed summaries and provides the saver.
      session: The session in which the train fetches are run.

    Raises:
      ValueError: If train_size is 0 after an epoch, or if
        self.exact_batch_sizes is set and train_size is smaller than
        self.batch_size on the first epoch.
    """
    # Copy so that per-batch updates do not modify self.train_feeds.
    train_feed_dict = self.train_feeds.copy()
    train_fetches = {'train_op': self.train_op, 'loss': self.loss_total,
                     'step': self.global_step}
    if self.compute_summaries: train_fetches['summaries'] = self.summaries
    # The training loop is essentially the same regardless of whether
    # we are passing batches by feed dict or by loom input
    # tensor. There are a few minor differences:
    #
    # 1. By feed dict, we compute the size of the training set lazily,
    #    as we iterate over it in the first epoch. By input tensor, we
    #    calculate train_size as batch_size * batches_per_epoch.
    #
    # 2. By feed dict, we get the size of each batch by calling len()
    #    on it (since the last batch in the epoch may have less than
    #    batch_size elements). By input tensor, we require that every
    #    batch have exactly batch_size elements.
    #
    # 3. By feed dict we need to create batches of inputs, and feed
    #    them every time we run the train op (obviously).
    if self.examples:
      epochs, train_size = self._by_feed_dict(train_feed_dict)
    else:
      epochs, train_size = self._by_input_tensor(train_feed_dict)
    if self.dev_examples:
      # Memoize a generator of batches of (size, feed_dict) pairs.
      gen_dev_batches = util.epochs(
          ((len(batch), self.compiler.build_feed_dict(batch))
           for batch in util.group_by_batches(
               self.dev_examples, self.batch_size)), shuffle=False)
      # If there is an existing checkpoint in logdir, and we are
      # saving the best model, calculate best_loss before doing any
      # training, so we don't potentially replace a better-performing
      # checkpoint with a worse one.
      ckpt = tf.train.get_checkpoint_state(self.logdir)
      if ckpt and ckpt.model_checkpoint_path:
        _, self._best_loss, _ = self._eval_batches(
            supervisor, session, next(gen_dev_batches), None, is_dev=True)
        if self._best_loss is None: return  # should_stop returned true

    # Epochs are numbered starting from 1.
    for epoch, batches in enumerate(epochs, 1):
      train_loss = 0.0
      for _ in batches:
        if self._should_stop(supervisor): return
        results = session.run(train_fetches, train_feed_dict)
        train_loss += results['loss']
        if self.compute_summaries:
          supervisor.summary_computed(
              session, results['summaries'], results['step'])
      if train_size == 0:
        raise ValueError('examples must be non-empty')
      if self.exact_batch_sizes and epoch == 1:
        if train_size < self.batch_size:
          raise ValueError('when exact_batch_sizes is true, examples must have '
                           'at least batch_size items; %s vs. %s' % (
                               train_size, self.batch_size))
        # Round train_size down to a multiple of batch_size; done only once,
        # on the first epoch.
        train_size -= train_size % self.batch_size
      # Turn the summed batch losses into a per-example figure.
      train_loss /= train_size
      self.report_loss(results['step'], train_loss)
      log_str = 'epoch:%5d train[loss: %.3e]' % (epoch, train_loss)
      if self.dev_examples:
        dev_size, dev_loss, dev_metrics = self._eval_batches(
            supervisor, session, next(gen_dev_batches), results['step'],
            is_dev=True)
        if dev_size is None: return  # should_stop returned true
        if epoch == 1: self.log_and_print('train_size: %d dev_size: %d' %
                                          (train_size, dev_size))
        log_str += ' dev[%s]' % _eval_str(dev_size, dev_loss, dev_metrics)
        self.log_and_print(log_str)
        self._save_best(session, supervisor.saver, dev_loss, results['step'])
      else:
        if epoch == 1: self.log_and_print('train_size: %d' % train_size)
        self.log_and_print(log_str)
    # _save_best is only called on the dev path above, so without dev
    # examples the chief trainer writes the final model here.
    if not self.dev_examples and self.is_chief_trainer:
      save_path = os.path.join(self.logdir, 'model.ckpt')
      save_fname = supervisor.saver.save(
          session, save_path, global_step=results['step'])
      self.log_and_print('final model saved in file: %s' % save_fname)
Exemple #8
0
    def _run(self, supervisor, session):
        """Runs the training loop, logging every batch and every epoch.

        Training batches are always passed by feed dict and summaries are
        always fetched. When self.dev_examples is set, the dev set is
        evaluated after each epoch and the result is handed to _save_best;
        otherwise a final checkpoint is saved after the last epoch.

        Args:
          supervisor: Passed to _eval_batches; also receives computed
            summaries and provides the saver.
          session: The session in which the train fetches are run.

        Raises:
          ValueError: If train_size is 0 after an epoch.
        """
        # Copy so that per-batch updates do not modify self.train_feeds.
        train_feed_dict = self.train_feeds.copy()
        train_fetches = {
            'train_op': self.train_op,
            'loss': self._loss_total,
            'step': self._global_step
        }
        train_fetches['summaries'] = self._summaries
        epochs, train_size = self._by_feed_dict(train_feed_dict)
        if self.dev_examples:
            # Memoize a generator of batches of (size, feed_dict) pairs.
            gen_dev_batches = util.epochs(
                ((len(batch), self.compiler.build_feed_dict(batch)) for batch
                 in util.group_by_batches(self.dev_examples, self.batch_size)),
                shuffle=False)
            # If there is an existing checkpoint in logdir, and we are
            # saving the best model, calculate best_loss before doing any
            # training, so we don't potentially replace a better-performing
            # checkpoint with a worse one.
            ckpt = tf.train.get_checkpoint_state(self.logdir)
            if ckpt and ckpt.model_checkpoint_path:
                _, self._best_loss, _ = self._eval_batches(
                    supervisor,
                    session,
                    next(gen_dev_batches),
                    None,
                    is_dev=True)
                if self._best_loss is None: return  # should_stop returned true

        # Epochs are numbered starting from 1.
        for epoch, batches in enumerate(epochs, 1):
            self.log_and_print('Starting epoch %d.' % epoch)
            train_loss = 0.0
            for (k, _) in enumerate(batches):
                results = session.run(train_fetches, train_feed_dict)
                train_loss += results['loss']
                self.log_and_print('Batch %d: loss %f' % (k, results['loss']))
                supervisor.summary_computed(session, results['summaries'],
                                            results['step'])
            if train_size == 0:
                raise ValueError('examples must be non-empty')
            # Turn the summed batch losses into a per-example figure.
            train_loss /= train_size
            log_str = 'epoch:%5d train[loss: %.3e]' % (epoch, train_loss)

            if self.dev_examples:
                dev_size, dev_loss, dev_metrics = self._eval_batches(
                    supervisor,
                    session,
                    next(gen_dev_batches),
                    results['step'],
                    is_dev=True)
                # Mirror the pre-training check above: an interrupted
                # evaluation yields None values, which must not reach the
                # '%d' formatting below or _save_best.
                if dev_size is None: return  # should_stop returned true
                if epoch == 1:
                    self.log_and_print('train_size: %d dev_size: %d' %
                                       (train_size, dev_size))
                log_str += ' dev[%s]' % _eval_str(dev_size, dev_loss,
                                                  dev_metrics)
                self._save_best(session, supervisor.saver, dev_loss,
                                results['step'])
            else:
                if epoch == 1:
                    self.log_and_print('train_size: %d' % train_size)
            self.log_and_print(log_str)

        # _save_best is only called on the dev path above, so without dev
        # examples the final model is written here.
        if not self.dev_examples:
            save_path = os.path.join(self.logdir, 'model.ckpt')
            save_fname = supervisor.saver.save(session,
                                               save_path,
                                               global_step=results['step'])
            self.log_and_print('final model saved in file: %s' % save_fname)
Exemple #9
0
 def test_epochs_n_is_one(self):
   """A single requested epoch must hand back the very same input object."""
   source = [1]
   # Unpacking into a one-element target also fails unless exactly one
   # epoch is produced.
   [only_epoch] = list(util.epochs(source, 1))
   self.assertIs(source, only_epoch)
Exemple #10
0
    def _run(self, supervisor, session):
        """Runs the training loop, optionally evaluating on dev examples.

        For every epoch the train op is run once per batch and the summed
        loss is divided by train_size and reported. When self.dev_examples is
        set, the dev set is evaluated after each epoch and the result is
        handed to _save_best; otherwise the chief trainer saves a final
        checkpoint after the last epoch. The method returns early (without
        saving) whenever _should_stop is true or the dev evaluation signals
        that it was stopped.

        Args:
          supervisor: Passed to _should_stop and _eval_batches; also receives
            computed summaries and provides the saver.
          session: The session in which the train fetches are run.

        Raises:
          ValueError: If train_size is 0 after an epoch, or if
            self.exact_batch_sizes is set and train_size is smaller than
            self.batch_size on the first epoch.
        """
        # Copy so that per-batch updates do not modify self.train_feeds.
        train_feed_dict = self.train_feeds.copy()
        train_fetches = {
            'train_op': self.train_op,
            'loss': self.loss_total,
            'step': self.global_step
        }
        if self.compute_summaries: train_fetches['summaries'] = self.summaries
        # The training loop is essentially the same regardless of whether
        # we are passing batches by feed dict or by loom input
        # tensor. There are a few minor differences:
        #
        # 1. By feed dict, we compute the size of the training set lazily,
        #    as we iterate over it in the first epoch. By input tensor, we
        #    calculate train_size as batch_size * batches_per_epoch.
        #
        # 2. By feed dict, we get the size of each batch by calling len()
        #    on it (since the last batch in the epoch may have less than
        #    batch_size elements). By input tensor, we require that every
        #    batch have exactly batch_size elements.
        #
        # 3. By feed dict we need to create batches of inputs, and feed
        #    them every time we run the train op (obviously).
        if self.examples:
            epochs, train_size = self._by_feed_dict(train_feed_dict)
        else:
            epochs, train_size = self._by_input_tensor(train_feed_dict)
        if self.dev_examples:
            # Memoize a generator of batches of (size, feed_dict) pairs.
            gen_dev_batches = util.epochs(
                ((len(batch), self.compiler.build_feed_dict(batch)) for batch
                 in util.group_by_batches(self.dev_examples, self.batch_size)),
                shuffle=False)

        # Epochs are numbered starting from 1.
        for epoch, batches in enumerate(epochs, 1):
            train_loss = 0.0
            for _ in batches:
                if self._should_stop(supervisor): return
                results = session.run(train_fetches, train_feed_dict)
                train_loss += results['loss']
                if self.compute_summaries:
                    supervisor.summary_computed(session, results['summaries'],
                                                results['step'])
            if train_size == 0:
                raise ValueError('examples must be non-empty')
            if self.exact_batch_sizes and epoch == 1:
                if train_size < self.batch_size:
                    raise ValueError(
                        'when exact_batch_sizes is true, examples must have '
                        'at least batch_size items; %s vs. %s' %
                        (train_size, self.batch_size))
                # Round train_size down to a multiple of batch_size; done
                # only once, on the first epoch.
                train_size -= train_size % self.batch_size
            # Turn the summed batch losses into a per-example figure.
            train_loss /= train_size
            self.report_loss(results['step'], train_loss)
            log_str = 'epoch:%5d train[loss: %.3e]' % (epoch, train_loss)
            if self.dev_examples:
                dev_size, dev_loss, dev_metrics = self._eval_batches(
                    supervisor,
                    session,
                    next(gen_dev_batches),
                    results['step'],
                    is_dev=True)
                if dev_size is None: return  # should_stop returned true
                if epoch == 1:
                    self.log_and_print('train_size: %d dev_size: %d' %
                                       (train_size, dev_size))
                log_str += ' dev[%s]' % _eval_str(dev_size, dev_loss,
                                                  dev_metrics)
                self.log_and_print(log_str)
                self._save_best(session, supervisor.saver, dev_loss,
                                results['step'])
            else:
                if epoch == 1:
                    self.log_and_print('train_size: %d' % train_size)
                self.log_and_print(log_str)
        # _save_best is only called on the dev path above, so without dev
        # examples the chief trainer writes the final model here.
        if not self.dev_examples and self.is_chief_trainer:
            save_path = os.path.join(self.logdir, 'model.ckpt')
            save_fname = supervisor.saver.save(session,
                                               save_path,
                                               global_step=results['step'])
            self.log_and_print('final model saved in file: %s' % save_fname)
Exemple #11
0
 def test_epochs_n_is_one(self):
     """With n=1, util.epochs must yield the original object, not a copy."""
     original = [1]
     produced = list(util.epochs(original, 1))
     # Unpacking into a one-element target also fails unless exactly one
     # epoch was produced.
     (single,) = produced
     self.assertIs(original, single)