Example #1
 def remove_training_directory(self, train_dir):
     """Removes the training directory."""
     try:
         logging.info("%s: Removing existing train directory.",
                      task_as_string(self.task))
         gfile.DeleteRecursively(train_dir)
     except:
         logging.error(
             "%s: Failed to delete directory " + train_dir +
             " when starting a new model. Please delete it manually and" +
             " try again.", task_as_string(self.task))
Example #2
 def remove_training_directory(self, train_dir):
     """Removes the training directory."""
     if tf.gfile.Exists(train_dir):
         try:
             logging.info("{}: Removing existing train dir.".format(
                 task_as_string(self.task)))
             gfile.DeleteRecursively(train_dir)
         except:
             logging.error(
                 "{}: Failed to delete dir {} when starting a new model. Delete it manually and try again."
                 .format(task_as_string(self.task), train_dir))
Example #3
    def setup_model_checkpoint_and_embeddings(self):
        """Copy embeddings over and precompute theorem database embeddings.

    This function makes sure that we have at least one model checkpoint
    file present. It also copies over the latest checkpoint when a new one
    becomes available and precomputes the embedding store for it.
    """
        logging.info('Setting up model checkpoint and embeddings %s %s',
                     str(self.config.copy_model_checkpoints),
                     str(self.checkpoint_monitor.has_checkpoint()))
        # We can prohibit copying checkpoints by setting copy_model_checkpoints
        # to false, unless we don't have any checkpoint yet, in which case
        # we try to copy a new checkpoint over.
        while self.config.copy_model_checkpoints or not (
                self.checkpoint_monitor.has_checkpoint()):
            # Whether we have a pre-existing checkpoint.
            has_checkpoint = self.checkpoint_monitor.has_checkpoint()
            logging.info('has checkpoint: %s', has_checkpoint)
            # new_checkpoint is None if the training directory does not
            # have a more recent checkpoint than the one stored in the loop
            # directory. Otherwise it refers to the current newest checkpoint.
            new_checkpoint = self.checkpoint_monitor.new_checkpoint()
            logging.info('new checkpoint: %s', new_checkpoint)
            if new_checkpoint is not None:
                # We have a more recent checkpoint than in our local directory.
                logging.info('New checkpoint: "%s"', new_checkpoint)
                self.checkpoint_monitor.copy_latest_checkpoint()
                chkpt = os.path.join(self.loop_meta.checkpoints_path(),
                                     new_checkpoint)
                logging.info('Copied checkpoint: "%s"', chkpt)
                # We try to compute embeddings until we succeed.
                while not gfile.Exists(chkpt + '.npy'):
                    runner.Runner().run(self.embedding_store_pipeline(
                        chkpt)).wait_until_finish()
                    if not gfile.Exists(chkpt + '.npy'):
                        logging.error(
                            'Could not generate embeddings for the latest '
                            'checkpoint %s.', chkpt)
                    else:
                        self.checkpoint_monitor.update_latest_checkpoint(
                            new_checkpoint)
                        break
            # If we had a pre-existing checkpoint or we managed to copy over
            # a new one, then we have succeeded and can stop polling; only
            # keep waiting if we still have no checkpoint at all.
            if has_checkpoint or self.checkpoint_monitor.has_checkpoint():
                break
            else:
                # We don't have a checkpoint and never had one. Wait for one
                # to appear in the training directory.
                logging.info(
                    'Waiting for the first model checkpoint to appear.')
                time.sleep(10)
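The tail of the loop above is a simple poll-and-wait pattern; stripped to its core (checkpoint_monitor and the 10-second interval come from the example, everything else is omitted):

while not self.checkpoint_monitor.has_checkpoint():
    logging.info('Waiting for the first model checkpoint to appear.')
    time.sleep(10)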
Example #4
 def remove_training_directory(self, train_dir):
   """Removes the training directory."""
   try:
     logging.info(
         "%s: Removing existing train directory.",
         task_as_string(self.task))
     gfile.DeleteRecursively(train_dir)
   except:
     logging.error(
         "%s: Failed to delete directory " + train_dir +
         " when starting a new model. Please delete it manually and" +
         " try again.", task_as_string(self.task))
Example #5
def GetListOfFeatureNamesAndSizes(feature_names, feature_sizes):
    list_of_feature_names = [
        feature_name.strip() for feature_name in feature_names.split(",")
    ]
    list_of_feature_sizes = [
        int(feature_size) for feature_size in feature_sizes.split(",")
    ]
    if len(list_of_feature_names) != len(list_of_feature_sizes):
        logging.error("length of the feature names (=" +
                      str(len(list_of_feature_names)) +
                      ") != length of feature "
                      "sizes (=" + str(len(list_of_feature_sizes)) + ")")

    return list_of_feature_names, list_of_feature_sizes
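A short usage sketch for the helper above; the rgb/audio names and sizes are illustrative values in the YouTube-8M style, not taken from this file:

names, sizes = GetListOfFeatureNamesAndSizes("rgb,audio", "1024,128")
# names == ["rgb", "audio"], sizes == [1024, 128]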
Example #6
 def remove_training_directory(self, train_dir):
     """Removes the training directory."""
     try:
         logging.info(
             ("{}: Train dir already exist and start_new_model "
              "set to True. To restart model from scratch, "
              "delete the directory.").format(task_as_string(self.task)))
         # gfile.DeleteRecursively(train_dir)
         sys.exit()
     except:
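          # Note: a bare except also catches the SystemExit raised by
          # sys.exit() above, so this error branch runs on every call.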
         logging.error(
             "{}: Failed to delete directory {} when starting a new "
             "model. Please delete it manually and try again.".format(
                 task_as_string(self.task), train_dir))
         sys.exit()
Example #7
 def watch_threads(self):
     """Watch example queue and batch queue threads and restart if dead."""
     while True:
         time.sleep(60)
         for idx, t in enumerate(self._example_q_threads):
             if not t.is_alive():  # if the thread is dead
                 log.error('Found example queue thread dead. Restarting.')
                 new_t = Thread(target=self.fill_example_queue)
                 self._example_q_threads[idx] = new_t
                 new_t.daemon = True
                 new_t.start()
         for idx, t in enumerate(self._batch_q_threads):
             if not t.is_alive():  # if the thread is dead
                 log.error('Found batch queue thread dead. Restarting.')
                 new_t = Thread(target=self.fill_batch_queue)
                 self._batch_q_threads[idx] = new_t
                 new_t.daemon = True
                 new_t.start()
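The watcher itself normally runs on its own daemon thread so it never blocks the caller; a small sketch, where batcher is assumed to be the object exposing watch_threads:

from threading import Thread

watcher = Thread(target=batcher.watch_threads)
watcher.daemon = True  # do not keep the process alive just for the watchdog
watcher.start()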
Example #8
def GetListOfFeatureNamesAndSizes(feature_names, feature_sizes):
  """Extract the list of feature names and the dimensionality of each feature
     from string of comma separated values.
  Args:
    feature_names: string containing comma separated list of feature names
    feature_sizes: string containing comma separated list of feature sizes
  Returns:
    List of the feature names and list of the dimensionality of each feature.
    Elements in the first/second list are strings/integers.
  """
  list_of_feature_names = [
      feature_name.strip() for feature_name in feature_names.split(',')]
  list_of_feature_sizes = [
      int(feature_size) for feature_size in feature_sizes.split(',')]
  if len(list_of_feature_names) != len(list_of_feature_sizes):
    logging.error("length of the feature names (=" +
                  str(len(list_of_feature_names)) + ") != length of feature "
                  "sizes (=" + str(len(list_of_feature_sizes)) + ")")

  return list_of_feature_names, list_of_feature_sizes
Example #9
def GetListOfFeatureNamesAndSizes(feature_names, feature_sizes):
  """Extract the list of feature names and the dimensionality of each feature
     from string of comma separated values.

  Args:
    feature_names: string containing comma separated list of feature names
    feature_sizes: string containing comma separated list of feature sizes

  Returns:
    List of the feature names and list of the dimensionality of each feature.
    Elements in the first/second list are strings/integers.
  """
  list_of_feature_names = [
      feature_name.strip() for feature_name in feature_names.split(',')]
  list_of_feature_sizes = [
      int(feature_size) for feature_size in feature_sizes.split(',')]
  if len(list_of_feature_names) != len(list_of_feature_sizes):
    logging.error("length of the feature names (=" +
                  str(len(list_of_feature_names)) + ") != length of feature "
                  "sizes (=" + str(len(list_of_feature_sizes)) + ")")

  return list_of_feature_names, list_of_feature_sizes
Example #10
def recover_session(self):
  # Recover session
  saver = None
  latest_checkpoint = tf.train.latest_checkpoint(self.train_dir)
  if self.config.start_new_model:
    logging.info("'start_new_model' flag is set. Removing existing train dir.")
    try:
      gfile.DeleteRecursively(self.train_dir)
    except:
      logging.error(
          "Failed to delete directory " + self.train_dir +
          " when starting a new model. Please delete it manually and" +
          " try again.")
  elif not latest_checkpoint:
    logging.info("No checkpoint file found. Building a new model.")
  else:
    meta_filename = latest_checkpoint + ".meta"
    if not gfile.Exists(meta_filename):
      logging.info("No meta graph file found. Building a new model.")
    else:
      logging.info("Restoring from meta graph file %s", meta_filename)
      saver = tf.train.import_meta_graph(meta_filename)
  return saver
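A sketch of how the returned saver is typically consumed; the session wiring here is an assumption, not part of the original file:

saver = self.recover_session()
with tf.Session() as sess:
  if saver is not None:
    saver.restore(sess, tf.train.latest_checkpoint(self.train_dir))
  else:
    sess.run(tf.global_variables_initializer())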
Example #11
    def __init__(self,
                 num_classes=4800,
                 feature_sizes=[1024],
                 feature_names=["inc3"],
                 max_frames=300):
        """Construct a YT8MFrameFeatureReader.

    Args:
      num_classes: a positive integer for the number of classes.
      feature_sizes: positive integer(s) for the feature dimensions as a list.
      feature_names: the feature name(s) in the tensorflow record as a list.
      max_frames: the maximum number of frames to process.
    """

        if len(feature_names) != len(feature_sizes):
            logging.error("length of the feature names "
                          "(=" + str(len(feature_names)) + ") != length of "
                          "feature sizes (=" + str(len(feature_sizes)) + ")")

        self.num_classes = num_classes
        self.feature_sizes = feature_sizes
        self.feature_names = feature_names
        self.max_frames = max_frames
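A hypothetical construction of the reader; the rgb/audio names and the 3862/1024/128 values follow the usual YouTube-8M frame-feature convention and are assumptions here:

reader = YT8MFrameFeatureReader(num_classes=3862,
                                feature_sizes=[1024, 128],
                                feature_names=["rgb", "audio"],
                                max_frames=300)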
Example #12
    def text_generator(self, example_generator):
        """Generates article and abstract text from tf.Example.

        Args:
          example_generator: a generator of tf.Examples from file. See data.example_generator"""
        while True:
            e = next(example_generator)  # e is a tf.Example
            try:
                # The article and abstract text were saved under the keys
                # 'article' and 'abstract' in the data files.
                article_text = e.features.feature['article'].bytes_list.value[0].decode()
                abstract_text = e.features.feature['abstract'].bytes_list.value[0].decode()
            except ValueError:
                log.error('Failed to get article or abstract from example')
                continue
            if len(article_text) == 0:
                # See https://github.com/abisee/pointer-generator/issues/1
                log.warning(
                    'Found an example with empty article text. Skipping it.')
            else:
                yield (article_text, abstract_text)
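Typical consumption of the generator; data.example_generator is referenced in the docstring, while data_path and single_pass are assumed names from the surrounding pointer-generator code:

input_gen = self.text_generator(data.example_generator(data_path, single_pass=False))
article_text, abstract_text = next(input_gen)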
Example #13
 def __init__(self,
              root: str,
              config: loop_pb2.LoopConfig,
              controller_fingerprint: int,
              read_only=None):
   self.root = os.path.join(root, config.name)
   self.config = config
   self.read_only = read_only
   self.controller_fingerprint = controller_fingerprint
   self.status = None
   self.error = None
   if self.layout_exists():
     self.error = self.check_layout()
     self.read_status()
     assert self.status, 'Could not read status %s.' % self.status_filename()
   else:
     if read_only:
       self.error = 'Non-existent loop layout at %s' % self.root
     else:
       self.status = self.new_status()
       self.make_layout()
       self.error = self.check_layout()
   if self.error is not None:
     logging.error('%s', self.error)
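A hypothetical construction of this object; the class name LoopLayout, the root path, and the fingerprint value are invented for illustration, and only the argument list comes from the __init__ above:

config = loop_pb2.LoopConfig(name='example_loop')
layout = LoopLayout('/tmp/loops', config, controller_fingerprint=42, read_only=False)
if layout.error is not None:
  raise RuntimeError(layout.error)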
Example #14
def run_model(target, num_workers, global_step):
    ##########################
    #  Config learning_rate  #
    ##########################
    learning_rate = optimizer_utils.configure_learning_rate(
        FLAGS.num_sample_per_epoch, global_step)

    ##########################################################
    #  Config optimizer and Wrapper optimizer with PAI-Soar  #
    ##########################################################
    samples_per_step = FLAGS.batch_size
    optimizer = optimizer_utils.configure_optimizer(learning_rate)
    if FLAGS.enable_paisoar:
        import paisoar
        optimizer = paisoar.ReplicatedVarsOptimizer(
            optimizer, clip_norm=FLAGS.max_gradient_norm)
        ctx = paisoar.Config.get()
        samples_per_step *= len(ctx.device_indices) * num_workers

    #######################
    #  Config model func  #
    #######################
    model_fn = model_factory.get_model_fn(FLAGS.model_name,
                                          num_classes=FLAGS.num_classes,
                                          weight_decay=FLAGS.weight_decay,
                                          is_training=True)

    #############################
    #  Config dataset iterator  #
    #############################
    with tf.device('/cpu:0'):
        train_image_size = model_fn.default_image_size

        # split dataset by worker
        data_sources = get_tfrecord_files(
            _DATASET_TRAIN_FILES[FLAGS.dataset_name] or FLAGS.train_files,
            num_workers)

        # select the preprocessing func
        preprocessing_fn = preprocessing_factory.get_preprocessing(
            FLAGS.preprocessing_name or FLAGS.model_name,
            is_training=True) if (FLAGS.preprocessing_name
                                  or FLAGS.model_name) else None

        dataset_iterator = dataset_factory.get_dataset_iterator(
            FLAGS.dataset_name, train_image_size, preprocessing_fn,
            data_sources, FLAGS.reader)
    ###############################################
    #  Config loss_func and Wrapper with PAI-Soar #
    ###############################################
    accuracy = []

    def loss_fn():
        with tf.device('/cpu:0'):
            images, labels = dataset_iterator.get_next()
        logits, end_points = model_fn(images)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=tf.cast(
                                                          logits, tf.float32),
                                                      weights=1.0)
        if 'AuxLogits' in end_points:
            loss += tf.losses.sparse_softmax_cross_entropy(
                labels=labels,
                logits=tf.cast(end_points['AuxLogits'], tf.float32),
                weights=0.4)
        per_accuracy = tf.reduce_mean(
            tf.cast(tf.equal(tf.argmax(logits, axis=1), labels), tf.float32))
        accuracy.append(per_accuracy)
        return loss

    # wrapper loss_fn with PAI-Soar 2.0
    loss = optimizer.compute_loss(loss_fn, loss_scale=FLAGS.loss_scale) if FLAGS.enable_paisoar \
      else loss_fn()

    ########################
    #  Config train tensor #
    ########################
    train_op = optimizer.minimize(loss, global_step=global_step)

    ###############################################
    #  Log trainable or optimizer variables info, #
    #  including name and size.                   #
    ###############################################
    log_trainable_or_optimizer_vars_info()

    ################
    # Restore ckpt #
    ################
    if FLAGS.model_dir and FLAGS.task_type == 'finetune':
        utils.load_checkpoint()

    #########################
    # Config training hooks #
    #########################
    params = dict()
    if FLAGS.log_loss_every_n_iters > 0:
        tensors_to_log = {
            'loss': loss if isinstance(loss, tf.Tensor) else loss.replicas[0],
            'accuracy': tf.reduce_mean(accuracy),
            'lrate': learning_rate
        }
        params['tensors_to_log'] = tensors_to_log
        params['samples_per_step'] = samples_per_step
    hooks = get_hooks(params=params)

    ###########################
    # Kicks off the training. #
    ###########################
    logging.info('training starts.')

    with tf.train.MonitoredTrainingSession(target,
                                           is_chief=(FLAGS.task_index == 0),
                                           hooks=hooks) as sess:
        try:
            while not sess.should_stop():
                sess.run(train_op)
        except tf.errors.OutOfRangeError:
            print('All threads done.')
        except Exception as e:
            import sys
            import traceback
            logging.error(str(e))
            traceback.print_exc(file=sys.stdout)
    logging.info('training ends.')
Example #15
  def run(self, start_new_model=False):
    """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
    if self.is_master and start_new_model:
      self.remove_training_directory(self.train_dir)

    if not os.path.exists(self.train_dir):
      os.makedirs(self.train_dir)

    model_flags_dict = {
        "model": FLAGS.model,
        "feature_sizes": FLAGS.feature_sizes,
        "feature_names": FLAGS.feature_names,
        "frame_features": FLAGS.frame_features,
        "label_loss": FLAGS.label_loss,
    }
    flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
    if os.path.exists(flags_json_path):
      existing_flags = json.load(open(flags_json_path))
      if existing_flags != model_flags_dict:
        logging.error("Model flags do not match existing file %s. Please "
                      "delete the file, change --train_dir, or pass flag "
                      "--start_new_model",
                      flags_json_path)
        logging.error("Ran model with flags: %s", str(model_flags_dict))
        logging.error("Previously ran with flags: %s", str(existing_flags))
        exit(1)
    else:
      # Write the file.
      with open(flags_json_path, "w") as fout:

        fout.write(json.dumps(model_flags_dict))

    target, device_fn = self.start_server_if_distributed()

    meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

    with tf.Graph().as_default() as graph:
      if meta_filename:
        saver = self.recover_model(meta_filename)

      with tf.device(device_fn):
        if not meta_filename:
          saver = self.build_model(self.model, self.reader)

        global_step = tf.get_collection("global_step")[0]
        loss = tf.get_collection("loss")[0]
        predictions = tf.get_collection("predictions")[0]
        labels = tf.get_collection("labels")[0]
        labels_origin = tf.get_collection("original_labels")[0]
        sampling_distribution = tf.get_collection("sampling_probs")[0]
        train_op = tf.get_collection("train_op")[0]
        init_op = tf.global_variables_initializer()

    sv = tf.train.Supervisor(
        graph,
        logdir=self.train_dir,
        init_op=init_op,
        is_chief=self.is_master,
        global_step=global_step,
        #save_model_secs=15 * 60,
        save_model_secs=int(FLAGS.save_checkpoint_every_n_hour * 3600),
        #save_summaries_secs=120,
        save_summaries_secs=int(FLAGS.save_checkpoint_every_n_hour * 3600),
        saver=saver)
    logging.info("%s: Starting managed session.", task_as_string(self.task))
    with sv.managed_session(target, config=self.config) as sess:
      try:
        logging.info("%s: Entering training loop.", task_as_string(self.task))
        while (not sv.should_stop()) and (not self.max_steps_reached):
          batch_start_time = time.time()
          _, global_step_val, loss_val, predictions_val, labels_val, labels_origin_val, sampling_distribution_val = sess.run(
              [train_op, global_step, loss, predictions, labels, labels_origin, sampling_distribution])
          seconds_per_batch = time.time() - batch_start_time
          examples_per_second = labels_val.shape[0] / seconds_per_batch

          ## check on the correctness of label sampling
          #if not np.all(labels_val[0]==labels_origin_val[0]):
          #    print(np.where(labels_val[0])[0], " --- ", np.where(labels_origin_val[0])[0])
          #print(np.all(labels_val == labels_origin_val, 1).mean())

          #temp_bool = np.sum(labels_val, 1)<1
          #if np.any(temp_bool):
          #  print("\n\n\nFOUND!!", np.where(labels_val[temp_bool])[0], "\n\n\n")
          #print(sampling_distribution_val[:5, :5])


          if self.max_steps and self.max_steps <= global_step_val:
            self.max_steps_reached = True

          #if self.is_master and global_step_val % 10 == 0 and self.train_dir:
          if self.is_master and global_step_val % FLAGS.validate_every_n_training_steps == 0 and self.train_dir:
            eval_start_time = time.time()
            hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_origin_val)
            perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
                                                                      labels_origin_val)
            gap = eval_util.calculate_gap(predictions_val, labels_origin_val)
            eval_end_time = time.time()
            eval_time = eval_end_time - eval_start_time

            logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) +
              " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " +
              ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) +
              " GAP: " + ("%.2f" % gap))

            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Perr", perr), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_GAP", gap), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("global_step/Examples/Second",
                                  examples_per_second), global_step_val)
            sv.summary_writer.flush()

            with open(FLAGS.train_dir + '/global_step_{%d}_training_GAP_{%.6f}.txt' % (global_step_val, gap), 'w') as f:
              f.write('\n')

            # Exporting the model every x steps
            time_to_export = ((self.last_model_export_step == 0) or
                (global_step_val - self.last_model_export_step
                 >= self.export_model_steps))

            if self.is_master and time_to_export:
              self.export_model(global_step_val, sv.saver, sv.save_path, sess)
              self.last_model_export_step = global_step_val
          else:
            #logging.info("training step " + str(global_step_val) + " | Loss: " +
              #("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second))
            continue
      except tf.errors.OutOfRangeError:
        logging.info("%s: Done training -- epoch limit reached.",
                     task_as_string(self.task))

    logging.info("%s: Exited training loop.", task_as_string(self.task))
    sv.Stop()
Example #16
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        if not os.path.exists(self.train_dir):
            os.makedirs(self.train_dir)

        logging.info(
            '############## PARAMETERS ##############################')
        logging.info("feature_names {}".format(FLAGS.feature_names))
        logging.info("feature_sizes {}".format(FLAGS.feature_sizes))
        logging.info("frame_features {}".format(FLAGS.frame_features))
        logging.info("model {}".format(FLAGS.model))
        logging.info("start_new_model {}".format(FLAGS.start_new_model))
        logging.info("num_gpu {}".format(FLAGS.num_gpu))
        logging.info("batch_size {}".format(FLAGS.batch_size))
        logging.info("label_loss {}".format(FLAGS.label_loss))
        logging.info("regularization_penalty {}".format(
            FLAGS.regularization_penalty))
        logging.info("base_learning_rate {}".format(FLAGS.base_learning_rate))
        logging.info("learning_rate_decay {}".format(
            FLAGS.learning_rate_decay))
        logging.info("learning_rate_decay_examples {}".format(
            FLAGS.learning_rate_decay_examples))
        logging.info("num_epochs {}".format(FLAGS.num_epochs))
        logging.info("max_steps {}".format(FLAGS.max_steps))
        logging.info("export_model_steps {}".format(FLAGS.export_model_steps))
        logging.info("num_readers {}".format(FLAGS.num_readers))
        logging.info("optimizer {}".format(FLAGS.optimizer))
        logging.info("clip_gradient_norm {}".format(FLAGS.clip_gradient_norm))
        logging.info(
            '########################################################')
        logging.info(' '.join([x for x in sys.argv]))

        model_flags_dict = {
            "model": FLAGS.model,
            "feature_sizes": FLAGS.feature_sizes,
            "feature_names": FLAGS.feature_names,
            "frame_features": FLAGS.frame_features,
            "label_loss": FLAGS.label_loss,
        }
        flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
        if os.path.exists(flags_json_path):
            existing_flags = json.load(open(flags_json_path))
            if existing_flags != model_flags_dict:
                logging.error(
                    "Model flags do not match existing file %s. Please "
                    "delete the file, change --train_dir, or pass flag "
                    "--start_new_model", flags_json_path)
                logging.error("Ran model with flags: %s",
                              str(model_flags_dict))
                logging.error("Previously ran with flags: %s",
                              str(existing_flags))
                exit(1)
        else:
            # Write the file.
            with open(flags_json_path, "w") as fout:
                fout.write(json.dumps(model_flags_dict))

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            with tf.device(device_fn):

                saver = self.build_model(self.model, self.reader)

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]

                init_op, init_fn = None, None
                if meta_filename:
                    saver = tf.train.Saver(tf.global_variables(),
                                           max_to_keep=0,
                                           keep_checkpoint_every_n_hours=0.25)

                    def init_fn(sess):
                        return saver.restore(sess, meta_filename)
                else:
                    init_op = tf.global_variables_initializer()

        sv = tf.train.Supervisor(graph,
                                 logdir=self.train_dir,
                                 init_op=init_op,
                                 init_fn=init_fn,
                                 is_chief=self.is_master,
                                 global_step=global_step,
                                 save_model_secs=40 * 60,
                                 save_summaries_secs=120,
                                 saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:
            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while (not sv.should_stop()) and (not self.max_steps_reached):
                    batch_start_time = time.time()
                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels])
                    seconds_per_batch = time.time() - batch_start_time
                    examples_per_second = labels_val.shape[
                        0] / seconds_per_batch

                    if self.max_steps and self.max_steps <= global_step_val:
                        self.max_steps_reached = True

                    if self.is_master and global_step_val % 10 == 0 and self.train_dir:
                        eval_start_time = time.time()
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)
                        eval_end_time = time.time()
                        eval_time = eval_end_time - eval_start_time

                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second) +
                                     " | Hit@1: " + ("%.2f" % hit_at_one) +
                                     " PERR: " + ("%.2f" % perr) + " GAP: " +
                                     ("%.2f" % gap))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

                        # Exporting the model every x steps
                        time_to_export = (
                            (self.last_model_export_step == 0)
                            or (global_step_val - self.last_model_export_step
                                >= self.export_model_steps))

                        if self.is_master and time_to_export:
                            self.export_model(global_step_val, sv.saver,
                                              sv.save_path, sess)
                            self.last_model_export_step = global_step_val
                    else:
                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second))
            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
Example #17
  def run(self, start_new_model=False):
    """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
    if self.is_master and start_new_model:
      self.remove_training_directory(self.train_dir)

    if not os.path.exists(self.train_dir):
      os.makedirs(self.train_dir)

    model_flags_dict = {
        "model": FLAGS.model,
        "feature_sizes": FLAGS.feature_sizes,
        "feature_names": FLAGS.feature_names,
        "frame_features": FLAGS.frame_features,
        "label_loss": FLAGS.label_loss,
    }
    flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
    if os.path.exists(flags_json_path):
      existing_flags = json.load(open(flags_json_path))
      if existing_flags != model_flags_dict:
        logging.error("Model flags do not match existing file %s. Please "
                      "delete the file, change --train_dir, or pass flag "
                      "--start_new_model",
                      flags_json_path)
        logging.error("Ran model with flags: %s", str(model_flags_dict))
        logging.error("Previously ran with flags: %s", str(existing_flags))
        exit(1)
    else:
      # Write the file.
      with open(flags_json_path, "w") as fout:
        fout.write(json.dumps(model_flags_dict))

    target, device_fn = self.start_server_if_distributed()

    meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

    with tf.Graph().as_default() as graph:
      if meta_filename:
        saver = self.recover_model(meta_filename)

      with tf.device(device_fn):
        if not meta_filename:
          saver = self.build_model(self.model, self.reader)

        global_step = tf.get_collection("global_step")[0]
        loss = tf.get_collection("loss")[0]
        predictions = tf.get_collection("predictions")[0]
        labels = tf.get_collection("labels")[0]
        train_op = tf.get_collection("train_op")[0]
        init_op = tf.global_variables_initializer()
        if FLAGS.ema_source:
            # Here the variables still exist
            ema_op = tf.get_collection("ema_op")[0]
            def_vars = tf.get_collection("updatable_vars")
            ema_vars = tf.get_collection("ema_vars")

    sv = tf.train.Supervisor(
        graph,
        logdir=self.train_dir,
        init_op=init_op,
        is_chief=self.is_master,
        global_step=global_step,
        save_model_secs=15 * 60,
        save_summaries_secs=120,
        saver=saver)

    logging.info("%s: Starting managed session.", task_as_string(self.task))
    with sv.managed_session(target, config=self.config) as sess:

      if FLAGS.ema_source:
        logging.info("%s: Entering training loop.", task_as_string(self.task))
        sess.graph._unsafe_unfinalize()
        ckpt_reader = pywrap_tensorflow.NewCheckpointReader(FLAGS.ema_source)
        for xtensor, ematensor in zip(def_vars, ema_vars):
            src_tensor = ckpt_reader.get_tensor(xtensor.name.split(":")[0])
            # Loading does not take up graph space
            xtensor.load(src_tensor, session=sess)
            ematensor.load(src_tensor, session=sess)
            # sess.run(tf.assign(xtensor, src_tensor))
            # sess.run(tf.assign(ematensor, src_tensor))
      try:
        logging.info("%s: Entering training loop.", task_as_string(self.task))
        while (not sv.should_stop()) and (not self.max_steps_reached):
          batch_start_time = time.time()
          _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
              [train_op, global_step, loss, predictions, labels])
          if FLAGS.ema_source:  # Update EMA if needed
              _ = sess.run(ema_op)

          seconds_per_batch = time.time() - batch_start_time
          examples_per_second = labels_val.shape[0] / seconds_per_batch

          if self.max_steps and self.max_steps <= global_step_val:
            self.max_steps_reached = True

          if self.is_master and global_step_val % 10 == 0 and self.train_dir:
            eval_start_time = time.time()
            hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val)
            perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
                                                                      labels_val)
            gap = eval_util.calculate_gap(predictions_val, labels_val)
            eval_end_time = time.time()
            eval_time = eval_end_time - eval_start_time

            logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) +
              " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " +
              ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) +
              " GAP: " + ("%.2f" % gap))

            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Perr", perr), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_GAP", gap), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("global_step/Examples/Second",
                                  examples_per_second), global_step_val)
            sv.summary_writer.flush()

            # Exporting the model every x steps
            time_to_export = ((self.last_model_export_step == 0) or
                (global_step_val - self.last_model_export_step
                 >= self.export_model_steps))

            if self.is_master and time_to_export:
              # self.export_model(global_step_val, sv.saver, sv.save_path, sess)
              self.last_model_export_step = global_step_val
          else:
            logging.info("training step " + str(global_step_val) + " | Loss: " +
              ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second))
      except tf.errors.OutOfRangeError:
        save_name = "{0}/model.ckpt".format(self.train_dir)
        saver.save(sess, save_name, global_step_val)
        logging.info("Final model export.")
        logging.info("%s: Done training -- epoch limit reached.",
                     task_as_string(self.task))

    logging.info("%s: Exited training loop.", task_as_string(self.task))
    sv.Stop()
Example #18
    def fit(self,
            train_data_pipeline,
            start_new_model=False,
            tr_data_fn=None,
            tr_data_paras=None,
            validate_set=None,
            validate_fn=None,
            bootstrap=False,
            init_learning_rate=0.01,
            decay_steps=40000,
            decay_rate=0.95,
            epochs=None,
            l1_reg_rate=None,
            l2_reg_rate=0.01,
            pos_weights=None,
            initial_weights=None,
            initial_biases=None):
        """
        Logistic regression fit function.
        Args:
            train_data_pipeline: A namedtuple consisting of reader, data_pattern, batch_size and num_readers.
            start_new_model: If True, start a new model instead of restoring from existing checkpoints.
            tr_data_fn: a function that transforms input data.
            tr_data_paras: Other parameters should be passed to tr_data_fn. A dictionary.
            validate_set: If not None, check validation loss regularly. Else, ignored.
            validate_fn: The function to check the performance of learned model parameters on validate set.
            bootstrap: If True, sampling training examples with replacement by differential weighting.
            init_learning_rate: Decayed gradient descent parameter.
            decay_steps: Decayed gradient descent parameter.
            decay_rate: Decayed gradient descent parameter.
            epochs: Maximal epochs to use.
            l1_reg_rate: If None, do not impose l1 regularization.
            l2_reg_rate: l2 regularization rate.
            pos_weights: For imbalanced binary classes. Here, num_pos << num_neg, the weights should be > 1.0.
                If None, treated as 1.0 for all binary classifiers.
            initial_weights: If not None, the weights will be initialized with it.
            initial_biases: If not None, the biases will be initialized with it.
        Returns: None.
        """
        reader = train_data_pipeline.reader
        batch_size = train_data_pipeline.batch_size
        num_classes = reader.num_classes
        feature_names = reader.feature_names
        feature_sizes = reader.feature_sizes
        logging.info(
            'Logistic regression uses {} features with dims {}.'.format(
                feature_names, feature_sizes))

        raw_feature_size = sum(feature_sizes)

        self.train_data_pipeline = train_data_pipeline
        self.raw_feature_size = raw_feature_size
        self.feature_size = raw_feature_size
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.tr_data_fn = tr_data_fn
        self.tr_data_paras = tr_data_paras
        self.bootstrap = bootstrap
        self.init_learning_rate = init_learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.epochs = epochs
        self.l1_reg_rate = l1_reg_rate
        self.l2_reg_rate = l2_reg_rate
        self.pos_weights = pos_weights
        self.initial_weights = initial_weights
        self.initial_biases = initial_biases

        # Check extra data transform function arguments.
        # If transform changes the features size, change it.
        if self.tr_data_fn is not None:
            if self.tr_data_paras is None:
                self.tr_data_paras = dict()
            else:
                if ('reshape' in self.tr_data_paras) and (
                        self.tr_data_paras['reshape'] is True):
                    self.feature_size = self.tr_data_paras['size']
                    logging.warn(
                        'Data transform changes the features size to {}.'.
                        format(self.feature_size))

            logging.debug('Data transform arguments are {}.'.format(
                self.tr_data_paras))
        else:
            self.tr_data_paras = dict()

        start_new_model = start_new_model or (not tf.gfile.Exists(self.logdir))

        # This is NECESSARY to avoid contaminating default graph.
        # Alternatively, we can define a member graph variable. When building a new graph or
        # restoring a graph, wrap the code into a similar contextmanager.
        self.graph = tf.Graph()
        with self.graph.as_default():
            if start_new_model:
                logging.info('Starting a new model...')
                # Start new model, delete existing checkpoints.
                if tf.gfile.Exists(self.logdir):
                    try:
                        tf.gfile.DeleteRecursively(self.logdir)
                    except tf.errors.OpError:
                        logging.error('Failed to delete dir {}.'.format(
                            self.logdir))
                    else:
                        logging.info(
                            'Succeeded to delete train dir {}.'.format(
                                self.logdir))
                else:
                    # Do nothing.
                    pass

                # Build graph, namely building a graph and initialize member variables associated with graph.
                self.saver = self._build_graph()
            else:
                self.saver = self._restore_graph()

            # After either building a graph or restoring a graph, graph is CONSTRUCTED successfully.
            # Get collections to be used in training.
            self.global_step = tf.get_collection('global_step')[0]
            self.init_op = tf.get_collection('init_op')[0]
            self.train_op = tf.get_collection('train_op')[0]
            self.summary_op = tf.get_collection('summary_op')[0]
            self.raw_features_batch = tf.get_collection(
                'raw_features_batch')[0]
            self.labels_batch = tf.get_collection('labels_batch')[0]
            self.loss = tf.get_collection('loss')[0]
            self.pred_prob = tf.get_collection('predictions')[0]

        if self._check_graph_initialized():
            logging.info('Succeeded to initialize logistic regression Graph.')
        else:
            logging.error('Failed to initialize logistic regression Graph.')

        # Start or restore training.
        # To avoid summary causing memory usage peak, manually save summaries.
        sv = tf.train.Supervisor(graph=self.graph,
                                 init_op=self.init_op,
                                 logdir=self.logdir,
                                 global_step=self.global_step,
                                 summary_op=None,
                                 save_model_secs=600,
                                 saver=self.saver)

        with sv.managed_session() as sess:
            logging.info("Entering training loop...")
            for step in range(self.max_train_steps):
                if sv.should_stop():
                    # Save the final model and break.
                    self.saver.save(sess,
                                    save_path='{}_{}'.format(
                                        sv.save_path, 'final'))
                    break

                if step % 500 == 0:
                    if validate_fn is not None:
                        _, summary, train_pred_prob_batch, train_labels_batch, global_step_val = sess.run(
                            [
                                self.train_op, self.summary_op, self.pred_prob,
                                self.labels_batch, self.global_step
                            ])

                        # Evaluate on train data.
                        train_per = validate_fn(
                            predictions=train_pred_prob_batch,
                            labels=train_labels_batch)
                        sv.summary_writer.add_summary(
                            MakeSummary(
                                'train/{}'.format(validate_fn.func_name),
                                train_per), global_step_val)
                        logging.info('Step {}, train {}: {}.'.format(
                            global_step_val, validate_fn.func_name, train_per))
                    else:
                        _, summary, global_step_val = sess.run(
                            [self.train_op, self.summary_op, self.global_step])

                    # Add train summary.
                    sv.summary_computed(sess,
                                        summary,
                                        global_step=global_step_val)

                    # Compute validate loss and performance (validate_fn).
                    if validate_set is not None:
                        validate_data, validate_labels = validate_set

                        # Compute validation loss.
                        num_validate_videos = validate_data.shape[0]
                        split_indices = np.linspace(
                            0,
                            num_validate_videos + 1,
                            num=max(
                                num_validate_videos // (2 * batch_size) + 1,
                                2),
                            dtype=np.int32)

                        validate_loss_vals, predictions = [], []
                        for i in range(len(split_indices) - 1):
                            start_ind = split_indices[i]
                            end_ind = split_indices[i + 1]

                            if validate_fn is not None:
                                ith_validate_loss_val, ith_predictions = sess.run(
                                    [self.loss, self.pred_prob],
                                    feed_dict={
                                        self.raw_features_batch:
                                        validate_data[start_ind:end_ind],
                                        self.labels_batch:
                                        validate_labels[start_ind:end_ind]
                                    })

                                validate_loss_vals.append(
                                    ith_validate_loss_val *
                                    (end_ind - start_ind))
                                predictions.append(ith_predictions)
                            else:
                                ith_validate_loss_val = sess.run(
                                    self.loss,
                                    feed_dict={
                                        self.raw_features_batch:
                                        validate_data[start_ind:end_ind],
                                        self.labels_batch:
                                        validate_labels[start_ind:end_ind]
                                    })

                                validate_loss_vals.append(
                                    ith_validate_loss_val *
                                    (end_ind - start_ind))

                        validate_loss_val = sum(
                            validate_loss_vals) / num_validate_videos
                        # Add validate summary.
                        sv.summary_writer.add_summary(
                            MakeSummary('validate/xentropy',
                                        validate_loss_val), global_step_val)

                        if validate_fn is not None:
                            validate_per = validate_fn(
                                predictions=np.concatenate(predictions,
                                                           axis=0),
                                labels=validate_labels)

                            sv.summary_writer.add_summary(
                                MakeSummary(
                                    'validate/{}'.format(
                                        validate_fn.func_name), validate_per),
                                global_step_val)
                            logging.info('Step {}, validate {}: {}.'.format(
                                global_step_val, validate_fn.func_name,
                                validate_per))

                elif step % 200 == 0:
                    _, summary, global_step_val = sess.run(
                        [self.train_op, self.summary_op, self.global_step])
                    sv.summary_computed(sess,
                                        summary,
                                        global_step=global_step_val)
                else:
                    sess.run(self.train_op)

            logging.info("Exited training loop.")

        # Session will close automatically when with clause exits.
        # sess.close()
        sv.stop()
Example #19
from ffn.inference import movement
from ffn.training import mask
from ffn.training.import_util import import_symbol
from ffn.training import inputs
from ffn.training import augmentation
# Necessary so that optimizer flags are defined.
# pylint: disable=unused-import
from ffn.training import optimizer
# pylint: enable=unused-import
try:
  import horovod.tensorflow as hvd
  horovodworks = True
except Exception as e:
  horovodworks = False
  logging.error('Horovod not found')
  logging.error(e)
  class hvd():
    __version__='0.0.0'
    def rank():
        return 0
    def local_rank():
        return 0
    def size():
        return 1
    def init():
        pass
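# With this fallback stub in place, the rest of the script can call the same
# Horovod entry points whether or not the import succeeded, e.g. (illustrative):
#   hvd.init()
#   logging.info('rank %d of %d', hvd.rank(), hvd.size())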
FLAGS = flags.FLAGS

# Options related to training data.
flags.DEFINE_string('train_coords', None,
Example #20
def main(unused_argv):
    """
    Train the rbf network.
    """
    logging.set_verbosity(logging.INFO)

    start_new_model = FLAGS.start_new_model
    output_dir = FLAGS.output_dir

    # The ratio of examples to sample as centers (prototypes).
    num_centers_ratio = FLAGS.num_centers_ratio
    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    # distance metric, cosine or euclidean.
    dist_metric = FLAGS.dist_metric
    init_with_linear_clf = FLAGS.init_with_linear_clf

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    train_epochs = FLAGS.train_epochs
    l1_reg_rate = FLAGS.l1_reg_rate
    l2_reg_rate = FLAGS.l2_reg_rate

    # ....Start rbf network...
    logging.info('Entering rbf network...')
    # Validate set is not stored in graph or meta data. Re-create it any way.
    # Sample validate set for logistic regression early stopping.
    validate_data_pipeline = DataPipeline(reader=reader,
                                          data_pattern=validate_data_pattern,
                                          batch_size=batch_size,
                                          num_readers=num_readers)

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Sample validate set.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline,
            name_scope='sample_validate')
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    # DataPipeline consists of reader, batch size, no. of readers and data pattern.
    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    # If start a new model or output dir does not exist, truly start a new model.
    start_new_model = start_new_model or (not tf.gfile.Exists(output_dir))

    if start_new_model:
        # PHASE ONE - selecting prototypes c, computing scaling factors sigma.
        # num_centers = FLAGS.num_centers
        # num_centers_ratio = float(num_centers) / NUM_TRAIN_EXAMPLES

        # metric is euclidean or cosine. If cosine, alpha=1.0, otherwise can be less than 1.0.
        if 'cosine' == dist_metric:
            # 200 will lead to decreasing drastically and increasing slowly.
            alpha = 1.0
        else:
            alpha = 1.0
        centers, sigmas = initialize(num_centers_ratio,
                                     data_pipeline=train_data_pipeline,
                                     method='kmeans',
                                     metric=dist_metric,
                                     scaling_method=4,
                                     alpha=alpha)

        # PHASE TWO - computing linear regression weights and biases.
        num_centers = centers.shape[0]
        # Compute mean and variance after data transform.
        tr_data_fn = rbf_transform
        tr_data_paras = {
            'centers': centers,
            'sigmas': sigmas,
            'metric': dist_metric,
            'reshape': True,
            'size': num_centers
        }
        """
        # Include standard scale to rbf transform.
        tr_data_mean, tr_data_var = compute_data_mean_var(train_data_pipeline,
                                                          tr_data_fn=tr_data_fn,
                                                          tr_data_paras=tr_data_paras)
        logging.debug('tr_data_mean: {}\ntr_data_var: {}'.format(tr_data_mean, tr_data_var))
        tr_data_paras.update({'mean': tr_data_mean, 'variance': tr_data_var})
        """
        if init_with_linear_clf:
            # Call linear classification to get good initial values for weights and biases.
            linear_clf = LinearClassifier(
                logdir=path_join(output_dir, 'linear_classifier'))
            linear_clf.fit(data_pipeline=train_data_pipeline,
                           tr_data_fn=tr_data_fn,
                           tr_data_paras=tr_data_paras,
                           l2_regs=[
                               0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
                               1.0, 10.0, 100.0, 1000.0
                           ],
                           validate_set=(validate_data, validate_labels),
                           line_search=True)
            linear_clf_weights, linear_clf_biases = linear_clf.weights, linear_clf.biases
        else:
            linear_clf_weights, linear_clf_biases = None, None

        # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers.
        try:
            # Load sum_labels of the training set (as numpy floats) to compute pos_weights.
            train_sum_labels = load_sum_labels()
            # sqrt(num_neg / num_pos), assuming neg_weights == 1.0.
            pos_weights = np.sqrt(
                float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0)
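            # For example (hypothetical numbers): with NUM_TRAIN_EXAMPLES = 1000 and
            # 10 positives for a class, pos_weight = sqrt(1000 / 10 - 1) ~= 9.9.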
            logging.info(
                'Computed pos_weights from sum_labels in the train set successfully.'
            )
        except IOError:
            logging.error('Cannot load train sum_labels. Using the default value.')
            pos_weights = None
        finally:
            # pos_weights is currently disabled regardless of the result above.
            pos_weights = None
    else:
        linear_clf_weights, linear_clf_biases = None, None
        tr_data_fn, tr_data_paras = None, None
        pos_weights = None

    # PHASE THREE - fine tuning prototypes c, scaling factors sigma and weights and biases.
    log_reg_clf = LogisticRegression(logdir=path_join(output_dir, 'log_reg'))
    log_reg_clf.fit(train_data_pipeline=train_data_pipeline,
                    start_new_model=start_new_model,
                    tr_data_fn=tr_data_fn,
                    tr_data_paras=tr_data_paras,
                    validate_set=(validate_data, validate_labels),
                    validate_fn=gap_fn,
                    init_learning_rate=init_learning_rate,
                    decay_steps=decay_steps,
                    decay_rate=decay_rate,
                    epochs=train_epochs,
                    l1_reg_rate=l1_reg_rate,
                    l2_reg_rate=l2_reg_rate,
                    pos_weights=pos_weights,
                    initial_weights=linear_clf_weights,
                    initial_biases=linear_clf_biases)

    # ....Exit rbf network...
    logging.info('Exit rbf network.')
Example #21
0
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    start_new_model = FLAGS.start_new_model
    output_dir = FLAGS.output_dir

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    l1_reg_rate = FLAGS.l1_reg_rate
    l2_reg_rate = FLAGS.l2_reg_rate
    is_bootstrap = FLAGS.is_bootstrap
    train_epochs = FLAGS.train_epochs

    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Increase num_readers.
        validate_data_pipeline = DataPipeline(
            reader=reader,
            data_pattern=validate_data_pattern,
            batch_size=batch_size,
            num_readers=num_readers)

        # Sample validate set.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline,
            name_scope='sample_validate')
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    model_save_path = path_join(output_dir, 'mlp_fuse')
    if start_new_model and tf.gfile.Exists(model_save_path):
        logging.info('Starting a new model...')
        # Start new model, delete existing checkpoints.
        try:
            tf.gfile.DeleteRecursively(model_save_path)
        except tf.errors.OpError:
            logging.error('Failed to delete dir {}.'.format(model_save_path))
        else:
            logging.info(
                'Succeeded in deleting train dir {}.'.format(model_save_path))

    # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers.
    try:
        # Load sum_labels of the training set (as numpy floats) to compute pos_weights.
        train_sum_labels = load_sum_labels()
        # sqrt(num_neg / num_pos), assuming neg_weights == 1.0.
        pos_weights = np.sqrt(
            float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0)
        logging.info(
            'Computed pos_weights from sum_labels in the train set successfully.'
        )
    except IOError:
        logging.error('Cannot load train sum_labels. Using the default value.')
        pos_weights = None
    finally:
        logging.warn('Not using positive weights.')
        pos_weights = None

    train(train_data_pipeline,
          epochs=train_epochs,
          pos_weights=pos_weights,
          l1_reg_rate=l1_reg_rate,
          l2_reg_rate=l2_reg_rate,
          init_learning_rate=init_learning_rate,
          bootstrap=is_bootstrap,
          validate_set=(validate_data, validate_labels),
          validate_fn=gap_fn,
          logdir=model_save_path)
Example #22
0
    arg_parser.add_argument("--preprocess", "-p", type=bool, default=False)
    arg_parser.add_argument("--data", "-d", type=str, default="./data")

    args = arg_parser.parse_args()
    mode = args.mode
    if_preprocess = args.preprocess
    data_dir = args.data

    logging.set_verbosity(logging.INFO)

    train_x, train_y, test_x, train_data, test_data = load_data(data_dir=data_dir, if_preprocess=if_preprocess)

    logging.info("building model...")
    model = attention()
    restored = model.restore()

    if mode == "train":
        logging.info("training...")
        model.train(train_x, train_y, epochs=100, batch_size=150)
    elif mode == "evaluate":
        logging.info("evaluating...")
        if restored:
            for name, value in model.evaluate(train_x, train_y, batch_size=150):
                print("name: %s, value: %f" % (name, value))
        else:
            logging.error("error: model weights do not exist!")
    elif mode == "submit":
        logging.info("predicting final result...")
        test_data[LABEL_LIST] = model.predict(test_x, batch_size=150)
        test_data = test_data[["id"]+LABEL_LIST]
        test_data.to_csv("submission.csv", index=False)
Example #23
0
        if msg == 'reset':
            for data_path in paths:
                filename = os.path.split(data_path)[1]
                filepath = os.path.join(data_dir, filename)
                if os.path.exists(filepath):
                    logging.debug(filename + " already exists locally")
                    data_msg_q.put_data(filepath)
                    logging.debug("reuse local data " + filepath + " done")
                    continue
                count, ret_code = 0, -1
                command = ["hadoop", "fs", "-get", data_path, data_dir]
                while count < retry_times and ret_code != 0:
                    try:
                        ret_code = subprocess.check_call(command)
                    except subprocess.CalledProcessError as e:
                        logging.error(e)
                        count += 1

                if ret_code == 0:
                    data_msg_q.put_data(filepath)
                    logging.debug("fetch " + data_path + " done")
                else:
                    logging.debug("fetch " + data_path + " failed")

            data_msg_q.put_data(None)
        elif msg == 'shuffle':
            random.shuffle(paths)
        elif msg == 'Done':
            break
        msg = data_msg_q.get_msg()
Example #24
0
  def run(self, start_new_model=False):
    """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
    if self.is_master and start_new_model:
      self.remove_training_directory(self.train_dir)

    if not os.path.exists(self.train_dir):
      os.makedirs(self.train_dir)

    model_flags_dict = {
        "model": FLAGS.model,
        "feature_sizes": FLAGS.feature_sizes,
        "feature_names": FLAGS.feature_names,
        "frame_features": FLAGS.frame_features,
        "label_loss": FLAGS.label_loss,
    }
    flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
    if os.path.exists(flags_json_path):
      existing_flags = json.load(open(flags_json_path))
      if existing_flags != model_flags_dict:
        logging.error("Model flags do not match existing file %s. Please "
                      "delete the file, change --train_dir, or pass flag "
                      "--start_new_model",
                      flags_json_path)
        logging.error("Ran model with flags: %s", str(model_flags_dict))
        logging.error("Previously ran with flags: %s", str(existing_flags))
        exit(1)
    else:
      # Write the file.
      with open(flags_json_path, "w") as fout:
        fout.write(json.dumps(model_flags_dict))

    target, device_fn = self.start_server_if_distributed()

    meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

    with tf.Graph().as_default() as graph:
      if meta_filename:
        saver = self.recover_model(meta_filename)

      with tf.device(device_fn):
        if not meta_filename:
          saver = self.build_model(self.model, self.reader)

        global_step = tf.get_collection("global_step")[0]
        loss = tf.get_collection("loss")[0]
        predictions = tf.get_collection("predictions")[0]
        labels = tf.get_collection("labels")[0]
        train_op = tf.get_collection("train_op")[0]
        init_op = tf.global_variables_initializer()

    sv = tf.train.Supervisor(
        graph,
        logdir=self.train_dir,
        init_op=init_op,
        is_chief=self.is_master,
        global_step=global_step,
        save_model_secs=15 * 60,
        save_summaries_secs=120,
        saver=saver)

    logging.info("%s: Starting managed session.", task_as_string(self.task))
    with sv.managed_session(target, config=self.config) as sess:
      try:
        logging.info("%s: Entering training loop.", task_as_string(self.task))
        while (not sv.should_stop()) and (not self.max_steps_reached):
          batch_start_time = time.time()
          _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
              [train_op, global_step, loss, predictions, labels])
          seconds_per_batch = time.time() - batch_start_time
          examples_per_second = labels_val.shape[0] / seconds_per_batch

          if self.max_steps and self.max_steps <= global_step_val:
            self.max_steps_reached = True

          if self.is_master and global_step_val % 10 == 0 and self.train_dir:
            eval_start_time = time.time()
            hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val)
            perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
                                                                      labels_val)
            gap = eval_util.calculate_gap(predictions_val, labels_val)
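            # GAP here is the Global Average Precision over the top predictions,
            # the main ranking metric in the YouTube-8M benchmark.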
            eval_end_time = time.time()
            eval_time = eval_end_time - eval_start_time

            logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) +
              " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " +
              ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) +
              " GAP: " + ("%.2f" % gap))

            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Perr", perr), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_GAP", gap), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("global_step/Examples/Second",
                                  examples_per_second), global_step_val)
            sv.summary_writer.flush()

            # Exporting the model every x steps
            time_to_export = ((self.last_model_export_step == 0) or
                (global_step_val - self.last_model_export_step
                 >= self.export_model_steps))

            if self.is_master and time_to_export:
              self.export_model(global_step_val, sv.saver, sv.save_path, sess)
              self.last_model_export_step = global_step_val
          else:
            logging.info("training step " + str(global_step_val) + " | Loss: " +
              ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second))
      except tf.errors.OutOfRangeError:
        logging.info("%s: Done training -- epoch limit reached.",
                     task_as_string(self.task))

    logging.info("%s: Exited training loop.", task_as_string(self.task))
    sv.Stop()
Example #25
0
def main(unused_argv):
    """
    Training.
    init_learning_rate: Initial learning rate.
    decay_steps: How many training steps between learning-rate decays.
    decay_rate: How much to decay the learning rate each time.
    l2_reg_rate: L2 regularization rate.
    epochs: The maximum number of epochs over the training data.
    """
    logging.set_verbosity(logging.INFO)

    output_dir = FLAGS.output_dir
    start_new_model = FLAGS.start_new_model

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    l2_reg_rate = FLAGS.l2_reg_rate
    train_epochs = FLAGS.train_epochs
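    # (Assumption) the LogisticRegression fit below presumably applies an exponential
    # decay schedule, i.e. lr = init_learning_rate * decay_rate ** (global_step / decay_steps);
    # e.g. init_learning_rate=0.01, decay_rate=0.95, decay_steps=1000 gives lr ~= 0.0095
    # after 1000 steps.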

    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers
    init_with_linear_clf = FLAGS.init_with_linear_clf
    is_bootstrap = FLAGS.is_bootstrap

    # Increase num_readers.
    validate_data_pipeline = DataPipeline(reader=reader,
                                          data_pattern=validate_data_pattern,
                                          batch_size=batch_size,
                                          num_readers=num_readers)

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Sample validate set for line search in linear classifier or logistic regression early stopping.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline)
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    start_new_model = start_new_model or (not tf.gfile.Exists(output_dir))

    # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers.
    try:
        # Load sum_labels of the training set (as numpy floats) to compute pos_weights.
        train_sum_labels = load_sum_labels()
        # sqrt(num_neg / num_pos), assuming neg_weights == 1.0.
        pos_weights = np.sqrt(
            (float(NUM_TRAIN_EXAMPLES) - train_sum_labels) / train_sum_labels)
        logging.info(
            'Computed pos_weights from sum_labels in the train set successfully.'
        )
    except IOError:
        logging.error('Cannot load train sum_labels. Using the default value.')
        pos_weights = None
    finally:
        logging.info('Disabling pos_weights.')
        # Set it as None to disable pos_weights.
        pos_weights = None

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)
    if start_new_model:
        # Load train data mean and std.
        train_features_mean, train_features_var = load_features_mean_var(
            reader)

        tr_data_fn = standard_scale
        tr_data_paras = {
            'mean': train_features_mean,
            'variance': train_features_var,
            'reshape': False,
            'size': None
        }
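        # (Assumption) standard_scale presumably standardizes each feature as
        #   x_scaled = (x - mean) / sqrt(variance)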

        if init_with_linear_clf:
            # ...Start linear classifier...
            # Compute weights and biases of the linear classifier using the normal equation.
            # Line search helps little.
            linear_clf = LinearClassifier(
                logdir=path_join(output_dir, 'linear_classifier'))
            linear_clf.fit(data_pipeline=train_data_pipeline,
                           tr_data_fn=tr_data_fn,
                           tr_data_paras=tr_data_paras,
                           l2_regs=[
                               0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
                               1.0, 10.0, 100.0, 1000.0
                           ],
                           validate_set=(validate_data, validate_labels),
                           line_search=True)
            linear_clf_weights, linear_clf_biases = linear_clf.weights, linear_clf.biases

            logging.info(
                'linear classifier weights and biases with shapes {}, {}'.
                format(linear_clf_weights.shape, linear_clf_biases.shape))
            logging.debug(
                'linear classifier weights: {} and biases: {}.'.format(
                    linear_clf_weights, linear_clf_biases))
            # ...Exit linear classifier...
        else:
            linear_clf_weights, linear_clf_biases = None, None
    else:
        linear_clf_weights, linear_clf_biases = None, None
        tr_data_fn = None
        tr_data_paras = None

    # Run logistic regression.
    log_reg = LogisticRegression(logdir=path_join(output_dir, 'log_reg'))
    log_reg.fit(train_data_pipeline,
                start_new_model=start_new_model,
                tr_data_fn=tr_data_fn,
                tr_data_paras=tr_data_paras,
                validate_set=(validate_data, validate_labels),
                validate_fn=gap_fn,
                bootstrap=is_bootstrap,
                init_learning_rate=init_learning_rate,
                decay_steps=decay_steps,
                decay_rate=decay_rate,
                epochs=train_epochs,
                l2_reg_rate=l2_reg_rate,
                pos_weights=pos_weights,
                initial_weights=linear_clf_weights,
                initial_biases=linear_clf_biases)
Example #26
0
def main(unused_argv):
  logging.set_verbosity(tf.logging.INFO)
  print("tensorflow version: %s" % tf.__version__)
  is_chief = (FLAGS.task == 0)

  # Recover session
  saver = None
  latest_checkpoint = tf.train.latest_checkpoint(FLAGS.train_dir)
  if FLAGS.start_new_model:
    logging.info("'start_new_model' flag is set. Removing existing train dir.")
    try:
      gfile.DeleteRecursively(FLAGS.train_dir)
    except:
      logging.error(
          "Failed to delete directory " + FLAGS.train_dir +
          " when starting a new model. Please delete it manually and" +
          " try again.")
  elif not latest_checkpoint:
    logging.info("No checkpoint file found. Building a new model.")
  else:
    meta_filename = latest_checkpoint + ".meta"
    if not gfile.Exists(meta_filename):
      logging.info("No meta graph file found. Building a new model.")
    else:
      logging.info("Restoring from meta graph file %s", meta_filename)
      saver = tf.train.import_meta_graph(meta_filename)

  if not saver:
    # convert feature_names and feature_sizes to lists of values
    feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
        FLAGS.feature_names, FLAGS.feature_sizes)

    if FLAGS.frame_features:
      reader = readers.YT8MFrameFeatureReader(
          feature_names=feature_names,
          feature_sizes=feature_sizes)
    else:
      reader = readers.YT8MAggregatedFeatureReader(
          feature_names=feature_names,
          feature_sizes=feature_sizes)

    model = find_class_by_name(FLAGS.model,
        [frame_level_models, video_level_models])()
    label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])()
    optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train])
    build_graph(reader=reader,
                model=model,
                optimizer_class=optimizer_class,
                clip_gradient_norm=FLAGS.clip_gradient_norm,
                train_data_pattern=FLAGS.train_data_pattern,
                label_loss_fn=label_loss_fn,
                base_learning_rate=FLAGS.base_learning_rate,
                learning_rate_decay=FLAGS.learning_rate_decay,
                learning_rate_decay_examples=FLAGS.learning_rate_decay_examples,
                regularization_penalty=FLAGS.regularization_penalty,
                num_readers=FLAGS.num_readers,
                batch_size=FLAGS.batch_size,
                num_epochs=FLAGS.num_epochs)
    logging.info("built graph")
    saver = tf.train.Saver(max_to_keep=0, keep_checkpoint_every_n_hours=0.25)

  train_loop(is_chief=is_chief,
             train_dir=FLAGS.train_dir,
             saver=saver,
             master=FLAGS.master)
Example #27
0
def define_data_input(model, queue_batch=None):
  """Adds TF ops to load input data."""

  label_volume_map = {}
  for vol in FLAGS.label_volumes.split(','):
    volname, path, dataset = vol.split(':')
    label_volume_map[volname] = h5py.File(path)[dataset]

  image_volume_map = {}
  for vol in FLAGS.data_volumes.split(','):
    volname, path, dataset = vol.split(':')
    image_volume_map[volname] = h5py.File(path)[dataset]
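  # (Illustrative, assumption) both flags take volname:path:dataset triples, e.g.
  #   --data_volumes=vol0:/path/to/raw.h5:raw
  #   --label_volumes=vol0:/path/to/groundtruth.h5:stack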

  if queue_batch is None:
    queue_batch = FLAGS.batch_size

  # Fetch sizes of images and labels
  label_size = train_labels_size(model)
  image_size = train_image_size(model)

  label_radii = (label_size // 2).tolist()
  label_size = label_size.tolist()
  image_radii = (image_size // 2).tolist()
  image_size = image_size.tolist()

  # Fetch a single coordinate and volume name from a queue reading the
  # coordinate files or from saved hard/important examples
  import os.path
  if os.path.isfile(FLAGS.train_coords):
    logging.info('{} exists.'.format(FLAGS.train_coords))
  else:
    logging.error('{} does not exist.'.format(FLAGS.train_coords))
  if FLAGS.sharding_rule == 0:
    coord, volname = inputs.load_patch_coordinates(FLAGS.train_coords)
  elif FLAGS.sharding_rule == 1 and 'horovod' in sys.modules:
    d = tf.data.TFRecordDataset(FLAGS.train_coords, compression_type='GZIP')
    d = d.shard(hvd.size(), hvd.rank())
    d = d.map(parser_fn)
    iterator = d.make_one_shot_iterator()
    coord, volname = iterator.get_next()
  else:
    logging.warning("You need to install Horovod to use sharding. Turning sharding off..")
    FLAGS.sharding_rule = 0
    coord, volname = inputs.load_patch_coordinates(FLAGS.train_coords)

  # Load object labels (segmentation).
  labels = inputs.load_from_numpylike(
      coord, volname, label_size, label_volume_map)

  label_shape = [1] + label_size[::-1] + [1]
  #label_shape = [1] + [1] + label_size[::-1] # NCDHW
  labels = tf.reshape(labels, label_shape)

  loss_weights = tf.constant(np.ones(label_shape, dtype=np.float32))

  # Load image data.
  patch = inputs.load_from_numpylike(
      coord, volname, image_size, image_volume_map)
  data_shape = [1] + image_size[::-1] + [1]
  patch = tf.reshape(patch, shape=data_shape)

  if ((FLAGS.image_stddev is None or FLAGS.image_mean is None) and
      not FLAGS.image_offset_scale_map):
    raise ValueError('--image_mean, --image_stddev or --image_offset_scale_map '
                     'need to be defined')

  # Convert segmentation into a soft object mask.
  lom = tf.logical_and(
      labels > 0,
      tf.equal(labels, labels[0,
                              label_radii[2],
                              label_radii[1],
                              label_radii[0],
                              0]))
  labels = inputs.soften_labels(lom)

  # Apply basic augmentations.
  transform_axes = augmentation.PermuteAndReflect(
      rank=5, permutable_axes=_get_permutable_axes(),
      reflectable_axes=_get_reflectable_axes())
  labels = transform_axes(labels)
  patch = transform_axes(patch)
  loss_weights = transform_axes(loss_weights)

  # Normalize image data.
  patch = inputs.offset_and_scale_patches(
      patch, volname[0],
      offset_scale_map=_get_offset_and_scale_map(),
      default_offset=FLAGS.image_mean,
      default_scale=FLAGS.image_stddev)

  # Create a batch of examples. Note that any TF operation before this line
  # will be hidden behind a queue, so expensive/slow ops can take advantage
  # of multithreading.
  #MK TODO: check num_threads usage here
  patches, labels, loss_weights = tf.train.shuffle_batch(
      [patch, labels, loss_weights], queue_batch,
      num_threads=max(1, FLAGS.batch_size // 2),
      capacity=32 * FLAGS.batch_size,
      min_after_dequeue=4 * FLAGS.batch_size,
      enqueue_many=True)

  return patches, labels, loss_weights, coord, volname
Example #28
0
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

        Returns:
          A tuple of the training Hit@1 and the training PERR.
        """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        if not os.path.exists(self.train_dir):
            os.makedirs(self.train_dir)

        model_flags_dict = {
            "model": FLAGS.model,
            "feature_sizes": FLAGS.feature_sizes,
            "feature_names": FLAGS.feature_names,
            "frame_features": FLAGS.frame_features,
            "label_loss": FLAGS.label_loss,
        }
        flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
        if file_io.file_exists(flags_json_path):
            existing_flags = json.load(
                file_io.FileIO(flags_json_path, mode="r"))
            if existing_flags != model_flags_dict:
                logging.error(
                    "Model flags do not match existing file %s. Please "
                    "delete the file, change --train_dir, or pass flag "
                    "--start_new_model", flags_json_path)
                logging.error("Ran model with flags: %s",
                              str(model_flags_dict))
                logging.error("Previously ran with flags: %s",
                              str(existing_flags))
                exit(1)
        else:
            # Write the file.
            with file_io.FileIO(flags_json_path, mode="w") as fout:
                fout.write(json.dumps(model_flags_dict))

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            if meta_filename:
                saver = self.recover_model(meta_filename)

            with tf.device(device_fn):

                if not meta_filename:
                    saver = self.build_model()

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                init_op = tf.global_variables_initializer()

                if FLAGS.dropout:
                    keep_prob_tensor = tf.get_collection("keep_prob")[0]
                if FLAGS.noise_level > 0:
                    noise_level_tensor = tf.get_collection("noise_level")[0]
                if FLAGS.reweight:
                    weights_input, weights_assignment = None, None
                    if len(tf.get_collection("weights_input")) > 0:
                        weights_input = tf.get_collection("weights_input")[0]
                        weights_assignment = tf.get_collection(
                            "weights_assignment")[0]

        sv = tf.train.Supervisor(
            graph,
            logdir=self.train_dir,
            init_op=init_op,
            is_chief=self.is_master,
            global_step=global_step,
            save_model_secs=FLAGS.keep_checkpoint_interval * 60,
            save_summaries_secs=120,
            saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:

            # re-assign weights
            if FLAGS.reweight:
                optional_assign_weights(sess, weights_input,
                                        weights_assignment)

            steps = 0
            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while not sv.should_stop():

                    steps += 1
                    batch_start_time = time.time()
                    custom_feed = {}
                    if FLAGS.dropout:
                        custom_feed[keep_prob_tensor] = FLAGS.keep_prob
                    if FLAGS.noise_level > 0:
                        custom_feed[noise_level_tensor] = FLAGS.noise_level

                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels],
                        feed_dict=custom_feed)
                    seconds_per_batch = time.time() - batch_start_time

                    if self.is_master:
                        examples_per_second = labels_val.shape[
                            0] / seconds_per_batch
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        recall = "N/A"
                        if False:
                            recall = eval_util.calculate_recall_at_n(
                                predictions_val, labels_val, FLAGS.recall_at_n)
                            sv.summary_writer.add_summary(
                                utils.MakeSummary(
                                    "model/Training_Recall@%d" %
                                    FLAGS.recall_at_n, recall),
                                global_step_val)
                            recall = "%.2f" % recall
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)

                        logging.info(
                            "%s: training step " + str(global_step_val) +
                            "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " +
                            ("%.2f" % perr) + " GAP: " + ("%.2f" % gap) +
                            " Recall@%d: " % FLAGS.recall_at_n +
                            recall + " Loss: " + str(loss_val),
                            task_as_string(self.task))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

                    if FLAGS.max_steps is not None and steps > FLAGS.max_steps:
                        logging.info(
                            "%s: Done training -- max_steps limit reached.",
                            task_as_string(self.task))
                        break

            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
Example #29
0
def makeTFRecordBatches(save_path,
                        xml_path,
                        img_path,
                        batch_size,
                        words=False,
                        bounded_size_h=25,
                        bounded_size_w=350):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    totalT, totalI, totalL = [], [], []
    nr = 0

    xml_files = sorted(glob.glob(xml_path + "*.xml"))
    for i, xml in enumerate(xml_files):
        print(
            "=============================== FILE: {} / {} ==============================="
            .format(i + 1, len(xml_files)))
        name = xml.split('.xml')[0].split('/')[-1]
        img = img_path + name + '.png'
        if not os.path.isfile(img):
            continue
        ret = process_page(xml, img, words)
        if ret is None:
            continue

        texts, imgs, lengths = ret
        totalT.extend(texts)
        totalI.extend(imgs)
        totalL.extend(lengths)
        if len(totalT) >= batch_size:
            writer = tf.python_io.TFRecordWriter(
                path=save_path + "/iam_data-{}.tfrecords".format(nr))

            for ii in range(batch_size):
                imshape_1_bad = False
                imshape_2_bad = False
                old_shape = totalI[ii].shape
                if totalI[ii].shape[0] > bounded_size_h:
                    imshape_1_bad = True
                    totalI[ii] = cv2.resize(
                        totalI[ii], (totalI[ii].shape[1], bounded_size_h))
                if totalI[ii].shape[1] > bounded_size_w:
                    imshape_2_bad = True
                    totalI[ii] = cv2.resize(
                        totalI[ii], (bounded_size_w, totalI[ii].shape[0]))
                if imshape_1_bad or imshape_2_bad:
                    logging.error(
                        "Image shape was bounded: {} -> {}".format(
                            old_shape, totalI[ii].shape))
                imgI = add_padding(totalI[ii],
                                   pad_to=(bounded_size_h, bounded_size_w))

                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'seq_len':
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[totalL[ii]])),
                        'target':
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=totalT[ii].astype("int64"))),
                        'imageInput':
                        tf.train.Feature(float_list=tf.train.FloatList(
                            value=(imgI).reshape(-1).astype("float")))
                    }))
                serialized = example.SerializeToString()
                writer.write(serialized)
            writer.flush()
            writer.close()
            print("Batch file #({}) saved successfully to: {}!".format(
                nr + 1, (save_path + "/iam_data-{}.tfrecords".format(nr))))
            nr += 1
            totalT = totalT[batch_size:]
            totalI = totalI[batch_size:]
            totalL = totalL[batch_size:]
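
# A minimal read-back sketch (assumption; not part of the original pipeline).
# The feature spec mirrors the Example written above: 'target' has variable length,
# while 'imageInput' is a flattened padded image of bounded_size_h * bounded_size_w floats.
def parse_iam_example(serialized_example, bounded_size_h=25, bounded_size_w=350):
    features = tf.parse_single_example(
        serialized_example,
        features={
            'seq_len': tf.FixedLenFeature([1], tf.int64),
            'target': tf.VarLenFeature(tf.int64),
            'imageInput': tf.FixedLenFeature(
                [bounded_size_h * bounded_size_w], tf.float32),
        })
    # Restore the padded image to its 2-D shape.
    image = tf.reshape(features['imageInput'],
                       [bounded_size_h, bounded_size_w])
    return features['seq_len'], features['target'], image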
Example #30
0
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.
    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        if not os.path.exists(self.train_dir):
            os.makedirs(self.train_dir)

        model_flags_dict = {
            "model": FLAGS.model,
            "feature_sizes": FLAGS.feature_sizes,
            "feature_names": FLAGS.feature_names,
            "frame_features": FLAGS.frame_features,
            "label_loss": FLAGS.label_loss,
            "netvlad_cluster_size": FLAGS.netvlad_cluster_size,
            "netvlad_hidden_size": FLAGS.netvlad_hidden_size,
            "moe_l2": FLAGS.moe_l2,
            "iterations": FLAGS.iterations,
            "netvlad_relu": FLAGS.netvlad_relu,
            "gating": FLAGS.gating,
            "moe_num_mixtures": FLAGS.moe_num_mixtures,
            "moe_prob_gating": FLAGS.moe_prob_gating,
        }
        #     model_flags_dict = {}

        #     for k in FLAGS.__flags:
        #         model_flags_dict[k] = FLAGS[k].value
        flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
        if os.path.exists(flags_json_path):
            existing_flags = json.load(open(flags_json_path))
            if existing_flags != model_flags_dict:
                logging.error(
                    "Model flags do not match existing file %s. Please "
                    "delete the file, change --train_dir, or pass flag "
                    "--start_new_model", flags_json_path)
                logging.error("Ran model with flags: %s",
                              str(model_flags_dict))
                logging.error("Previously ran with flags: %s",
                              str(existing_flags))
                exit(1)
        else:
            # Write the file.
            with open(flags_json_path, "w") as fout:
                fout.write(json.dumps(model_flags_dict))

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:
            if meta_filename:
                saver = self.recover_model(meta_filename)

            with tf.device(device_fn):
                if not meta_filename:
                    saver = self.build_model(self.model, self.reader)

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                init_op = tf.global_variables_initializer()

        sv = tf.train.Supervisor(graph,
                                 logdir=self.train_dir,
                                 init_op=init_op,
                                 is_chief=self.is_master,
                                 global_step=global_step,
                                 save_model_secs=15 * 60,
                                 save_summaries_secs=120,
                                 saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:
            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while (not sv.should_stop()) and (not self.max_steps_reached):
                    batch_start_time = time.time()
                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels])
                    seconds_per_batch = time.time() - batch_start_time
                    examples_per_second = labels_val.shape[
                        0] / seconds_per_batch

                    if self.max_steps and self.max_steps <= global_step_val:
                        self.max_steps_reached = True

                    if self.is_master and global_step_val % 10 == 0 and self.train_dir:
                        eval_start_time = time.time()
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)
                        eval_end_time = time.time()
                        eval_time = eval_end_time - eval_start_time

                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second) +
                                     " | Hit@1: " + ("%.2f" % hit_at_one) +
                                     " PERR: " + ("%.2f" % perr) + " GAP: " +
                                     ("%.2f" % gap))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

                        # Exporting the model every x steps
                        time_to_export = (
                            (self.last_model_export_step == 0)
                            or (global_step_val - self.last_model_export_step
                                >= self.export_model_steps))

                        if self.is_master and time_to_export:
                            self.export_model(global_step_val, sv.saver,
                                              sv.save_path, sess)
                            self.last_model_export_step = global_step_val
                    else:
                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second))
            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
Example #31
0
def train_model(model_name,
                model,
                row_start=None,
                row_end=None,
                step=None,
                initial_epoch=0,
                end_epoch=1,
                time_limit=None):
    if initial_epoch >= end_epoch:
        logging.error('initial_epoch(%d) >= end_epoch(%d).', initial_epoch, end_epoch)
        return None
    if 'batch_size' not in config:
        config['batch_size'] = default_batch_size
    if 'does_shuffle' not in config:
        config['does_shuffle'] = default_does_shuffle
    if 'callbacks' not in config:
        config['callbacks'] = default_callbacks
    if 'monitored_loss_name' not in config:
        config['monitored_loss_name'] = default_monitored_loss_name
    if 'max_queue_size' not in config:
        config['max_queue_size'] = default_max_queue_size
    if 'does_use_multiprocessing' not in config:
        config['does_use_multiprocessing'] = default_does_use_multiprocessing
    if 'worker_number' not in config:
        config['worker_number'] = default_worker_number
    if 'verbose' not in config:
        config['verbose'] = default_verbose
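    # (Illustrative, assumption) config['callbacks'] may mix Callback instances with
    # spec strings of the form 'name: param=value, param2=value2', e.g.
    #   ['earlystopping: monitor=val_loss, patience=3', 'tensorboard', 'modelsaver: period=1']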
    callbacks = list() if config['callbacks'] is not None else None
    if callbacks is not None:
        for cb in config['callbacks']:
            if isinstance(cb, keras.callbacks.Callback):
                if isinstance(cb, TimeLimiter) and time_limit is not None:
                    logging.warning(
                        'train_model: parameter time_limit is not None, ignored TimeLimiter in config.'
                    )
                    continue
                callbacks.append(cb)
            elif isinstance(cb, str):
                cb_str = cb.lower()
                cb_str = re.sub(pattern=_remove_pattern,
                                repl='',
                                string=cb_str)
                sep_idx = cb_str.find(':')
                cb_params = dict()
                if sep_idx >= 0:
                    cb_name = cb_str[:sep_idx]
                    cb_params_strs = cb_str[sep_idx + 1:].split(',')
                    for cb_param_str in cb_params_strs:
                        eq_idx = cb_param_str.find('=')
                        if eq_idx >= 0:
                            cb_params[
                                cb_param_str[:eq_idx]] = cb_param_str[eq_idx +
                                                                      1:]
                        else:
                            cb_params[cb_param_str] = '1'
                else:
                    cb_name = cb_str
                if cb_name == 'earlystopping':
                    es_monitor = config[
                        'monitored_loss_name'] if 'monitor' not in cb_params else cb_params[
                            'monitor']
                    if 'baseline' not in cb_params:
                        _, es_baseline = load_best_info(
                            model_name=model_name, monitor_name=es_monitor)
                    else:
                        es_baseline = float(cb_params['baseline'])
                    callbacks.append(
                        keras.callbacks.EarlyStopping(
                            monitor=es_monitor,
                            min_delta=EPSILON if 'min_delta' not in cb_params
                            else float(cb_params['min_delta']),
                            patience=2 if 'patience' not in cb_params else int(
                                cb_params['patience']),
                            verbose=1 if 'verbose' not in cb_params else int(
                                cb_params['verbose']),
                            mode='min'
                            if 'mode' not in cb_params else cb_params['mode'],
                            baseline=es_baseline,
                        ))
                elif cb_name == 'tensorboard':
                    callbacks.append(
                        keras.callbacks.TensorBoard(
                            log_dir=os.path.join(LOG_DIRECTORY, model_name)
                            if 'log_dir' not in cb_params else
                            cb_params['log_dir'],
                            batch_size=config['batch_size'],
                            write_graph=True if 'write_graph' not in cb_params
                            else str_to_bool(cb_params['write_graph']),
                        ))
                elif cb_name == 'modelsaver':
                    callbacks.append(
                        ModelSaver(
                            model_name=model_name,
                            period=1 if 'period' not in cb_params else int(
                                cb_params['period']),
                            verbose=1 if 'verbose' not in cb_params else int(
                                cb_params['verbose']),
                        ))
                elif cb_name == 'epochnumbersaver':
                    callbacks.append(
                        EpochNumberSaver(
                            model_name=model_name,
                            verbose=1 if 'verbose' not in cb_params else int(
                                cb_params['verbose']),
                        ))
                elif cb_name == 'bestinfosaver':
                    bi_monitor = config[
                        'monitored_loss_name'] if 'monitor' not in cb_params else cb_params[
                            'monitor']
                    if 'baseline' not in cb_params:
                        _, bi_baseline = load_best_info(
                            model_name=model_name, monitor_name=bi_monitor)
                    else:
                        bi_baseline = float(cb_params['baseline'])
                    callbacks.append(
                        BestInfoSaver(
                            model_name=model_name,
                            monitor=bi_monitor,
                            mode='min'
                            if 'mode' not in cb_params else cb_params['mode'],
                            baseline=bi_baseline,
                            verbose=1 if 'verbose' not in cb_params else int(
                                cb_params['verbose']),
                        ))
                elif cb_name == 'timelimiter':
                    if time_limit is not None:
                        logging.warning(
                            'train_model: parameter time_limit is not None, ignored TimeLimiter in config.'
                        )
                        continue
                    if 'limit' not in cb_params:
                        raise ValueError(
                            "TimeLimiter's parameter limit is missed.")
                    callbacks.append(
                        TimeLimiter(
                            limit=cb_params['limit'],
                            verbose=1 if 'verbose' not in cb_params else int(
                                cb_params['verbose']),
                        ))
                else:
                    raise UnknownCallbackNameException(cb)
            else:
                raise TypeError(
                    'Callback must be an instance of keras.callbacks.Callback or a callback name(string).'
                )
    if time_limit is not None:
        callbacks.append(TimeLimiter(limit=time_limit, verbose=1))
    rolling_window_size = get_rolling_window_size(model_name)
    generator = SquareExDataGenerator(
        dataset_name=DATASET_NAME_TRAIN,
        rolling_window_size=rolling_window_size,
        row_start=row_start,
        row_end=row_end,
        step=step,
        max_batch_size=config['batch_size'],
        does_shuffle=config['does_shuffle'],
    )
    history = model.fit_generator(
        generator=generator,
        epochs=end_epoch,
        verbose=config['verbose'],
        callbacks=callbacks,
        max_queue_size=config['max_queue_size'],
        use_multiprocessing=config['does_use_multiprocessing'],
        workers=config['worker_number'],
        initial_epoch=initial_epoch,
    )
    return history