Example #1
import math
import os
import time

from absl import logging  # Assumed; the standard-library logging module would work equally well here.
import tensorflow as tf   # This code uses TF 1.x APIs (Supervisor, ConfigProto, Saver).

import util  # Project-local helpers that select variables to save/restore.

MAX_TO_KEEP = 10  # Module-level constant in the original; the exact value is assumed here.


def train(train_model, pretrained_ckpt, imagenet_ckpt, checkpoint_dir,
          train_steps, summary_freq):
  """Train model."""
  # Decide which variables to restore and from which checkpoint, preferring an
  # explicitly given pretrained checkpoint over ImageNet weights.
  vars_to_restore = None
  if pretrained_ckpt is not None:
    vars_to_restore = util.get_vars_to_save_and_restore(pretrained_ckpt)
    ckpt_path = pretrained_ckpt
  elif imagenet_ckpt:
    vars_to_restore = util.get_imagenet_vars_to_restore(imagenet_ckpt)
    ckpt_path = imagenet_ckpt
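  # Saver used only to load the pretrained weights selected above.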
  pretrain_restorer = tf.train.Saver(vars_to_restore)
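  # Saver for periodic training checkpoints: all model variables plus the global step.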
  vars_to_save = util.get_vars_to_save_and_restore()
  vars_to_save[train_model.global_step.op.name] = train_model.global_step
  saver = tf.train.Saver(vars_to_save, max_to_keep=MAX_TO_KEEP)
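  # The Supervisor only manages the session; checkpointing and summaries are
  # handled manually below, so its own saving is disabled.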
  sv = tf.train.Supervisor(logdir=checkpoint_dir, save_summaries_secs=0,
                           saver=None)
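  # Let the GPU allocator grow memory on demand rather than reserving it all upfront.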
  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  with sv.managed_session(config=config) as sess:
    if pretrained_ckpt is not None or imagenet_ckpt:
      logging.info('Restoring pretrained weights from %s', ckpt_path)
      pretrain_restorer.restore(sess, ckpt_path)

    logging.info('Attempting to resume training from %s...', checkpoint_dir)
    checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    logging.info('Last checkpoint found: %s', checkpoint)
    if checkpoint:
      saver.restore(sess, checkpoint)

    logging.info('Training...')
    start_time = time.time()
    last_summary_time = time.time()
    steps_per_epoch = train_model.reader.steps_per_epoch
    step = 1
    while step <= train_steps:
      fetches = {
          'train': train_model.train_op,
          'global_step': train_model.global_step,
          'incr_global_step': train_model.incr_global_step
      }
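      # Fetch the loss and merged summaries only on summary steps.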
      if step % summary_freq == 0:
        fetches['loss'] = train_model.total_loss
        fetches['summary'] = sv.summary_op

      results = sess.run(fetches)
      global_step = results['global_step']

      if step % summary_freq == 0:
        sv.summary_writer.add_summary(results['summary'], global_step)
        train_epoch = math.ceil(global_step / steps_per_epoch)
        train_step = global_step - (train_epoch - 1) * steps_per_epoch
        this_cycle = time.time() - last_summary_time
        last_summary_time += this_cycle
        logging.info(
            'Epoch: [%2d] [%5d/%5d] time: %4.2fs (%ds total) loss: %.3f',
            train_epoch, train_step, steps_per_epoch, this_cycle,
            time.time() - start_time, results['loss'])

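      # Save a checkpoint once per epoch.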
      if step % steps_per_epoch == 0:
        logging.info('[*] Saving checkpoint to %s...', checkpoint_dir)
        saver.save(sess, os.path.join(checkpoint_dir, 'model'),
                   global_step=global_step)

      # Setting step to global_step allows for training for a total of
      # train_steps even if the program is restarted during training.
      step = global_step + 1
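
The comment at the end of the loop is what makes training restartable: `step` is re-derived from the restored `global_step`, so an interrupted run resumes where it left off and still trains for `train_steps` steps in total. As an illustration only (not part of the original project), here is a minimal, self-contained sketch of that resume pattern; it assumes TF 1.x APIs like the code above, and the directory path is hypothetical:

import os
import tensorflow as tf  # TF 1.x; under TF 2.x, import tensorflow.compat.v1 and disable eager execution.

checkpoint_dir = '/tmp/resume_demo'  # Hypothetical directory used only for this sketch.
os.makedirs(checkpoint_dir, exist_ok=True)

# A trivial "model": one counter variable plus the global step.
global_step = tf.train.get_or_create_global_step()
counter = tf.get_variable('counter', shape=[], initializer=tf.zeros_initializer())
train_op = tf.group(counter.assign_add(1.0), tf.assign_add(global_step, 1))

saver = tf.train.Saver()
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  # Resume from the newest checkpoint, if any; global_step comes back with it.
  latest = tf.train.latest_checkpoint(checkpoint_dir)
  if latest:
    saver.restore(sess, latest)
  step = sess.run(global_step) + 1
  while step <= 10:
    sess.run(train_op)
    step = sess.run(global_step) + 1  # Same trick as above: step follows global_step.
  saver.save(sess, os.path.join(checkpoint_dir, 'model'), global_step=global_step)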