Example #1
    def after_run(self, run_context, run_values):

        predictions_val = run_values.results['predictions']
        labels_val = run_values.results['labels']
        step = run_values.results['step']
        hit_at_one = eval_util.calculate_hit_at_one(predictions_val,
                                                    labels_val)
        perr = eval_util.calculate_precision_at_equal_recall_rate(
            predictions_val, labels_val)
        gap = eval_util.calculate_gap(predictions_val, labels_val)

        self.writer.add_summary(
            utils.MakeSummary("model/Training_Hit@1", hit_at_one), step)
        self.writer.add_summary(utils.MakeSummary("model/Training_Perr", perr),
                                step)
        self.writer.add_summary(utils.MakeSummary("model/Training_GAP", gap),
                                step)
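
The after_run hook in Example #1 only receives predictions, labels, and step because a matching before_run requested them. Below is a minimal sketch of how such a tf.train.SessionRunHook could be wired up in TF 1.x; the before_run wiring and the MakeSummary helper body are assumptions inferred from the example, not the project's original code.

import tensorflow as tf

def MakeSummary(name, value):
    # Wrap a Python float in a tf.Summary proto so it can be written with
    # SummaryWriter.add_summary (presumably what utils.MakeSummary does).
    return tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=value)])

class TrainingMetricsHook(tf.train.SessionRunHook):
    # Hypothetical hook name; eval_util is assumed to be importable as in the example.
    def __init__(self, predictions, labels, global_step, writer):
        self.predictions = predictions
        self.labels = labels
        self.global_step = global_step
        self.writer = writer

    def before_run(self, run_context):
        # Whatever is fetched here shows up in run_values.results in after_run.
        return tf.train.SessionRunArgs({
            'predictions': self.predictions,
            'labels': self.labels,
            'step': self.global_step,
        })

    def after_run(self, run_context, run_values):
        step = run_values.results['step']
        # Compute Hit@1 / PERR / GAP from run_values.results as in Example #1
        # and write them with self.writer.add_summary(MakeSummary(...), step).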
Example #2
  def run(self, start_new_model=False):
    """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
    if self.is_master and start_new_model:
      self.remove_training_directory(self.train_dir)

    if not os.path.exists(self.train_dir):
      os.makedirs(self.train_dir)

    model_flags_dict = {
        "model": FLAGS.model,
        "feature_sizes": FLAGS.feature_sizes,
        "feature_names": FLAGS.feature_names,
        "frame_features": FLAGS.frame_features,
        "label_loss": FLAGS.label_loss,
    }
    flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
    if os.path.exists(flags_json_path):
      with open(flags_json_path) as flags_file:
        existing_flags = json.load(flags_file)
      if existing_flags != model_flags_dict:
        logging.error("Model flags do not match existing file %s. Please "
                      "delete the file, change --train_dir, or pass flag "
                      "--start_new_model",
                      flags_json_path)
        logging.error("Ran model with flags: %s", str(model_flags_dict))
        logging.error("Previously ran with flags: %s", str(existing_flags))
        exit(1)
    else:
      # Write the file.
      with open(flags_json_path, "w") as fout:
        fout.write(json.dumps(model_flags_dict))

    target, device_fn = self.start_server_if_distributed()

    meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

    with tf.Graph().as_default() as graph:
      if meta_filename:
        saver = self.recover_model(meta_filename)

      with tf.device(device_fn):
        if not meta_filename:
          saver = self.build_model(self.model, self.reader)

        global_step = tf.get_collection("global_step")[0]
        loss = tf.get_collection("loss")[0]
        predictions = tf.get_collection("predictions")[0]
        labels = tf.get_collection("labels")[0]
        labels_origin = tf.get_collection("original_labels")[0]
        sampling_distribution = tf.get_collection("sampling_probs")[0]
        train_op = tf.get_collection("train_op")[0]
        init_op = tf.global_variables_initializer()

    sv = tf.train.Supervisor(
        graph,
        logdir=self.train_dir,
        init_op=init_op,
        is_chief=self.is_master,
        global_step=global_step,
        #save_model_secs=15 * 60,
        save_model_secs=int(FLAGS.save_checkpoint_every_n_hour * 3600),
        #save_summaries_secs=120,
        save_summaries_secs=int(FLAGS.save_checkpoint_every_n_hour * 3600),
        saver=saver)
    logging.info("%s: Starting managed session.", task_as_string(self.task))
    with sv.managed_session(target, config=self.config) as sess:
      try:
        logging.info("%s: Entering training loop.", task_as_string(self.task))
        while (not sv.should_stop()) and (not self.max_steps_reached):
          batch_start_time = time.time()
          _, global_step_val, loss_val, predictions_val, labels_val, labels_origin_val, sampling_distribution_val = sess.run(
              [train_op, global_step, loss, predictions, labels, labels_origin, sampling_distribution])
          seconds_per_batch = time.time() - batch_start_time
          examples_per_second = labels_val.shape[0] / seconds_per_batch

          ## check on the correctness of label sampling
          #if not np.all(labels_val[0]==labels_origin_val[0]):
          #    print(np.where(labels_val[0])[0], " --- ", np.where(labels_origin_val[0])[0])
          #print(np.all(labels_val == labels_origin_val, 1).mean())

          #temp_bool = np.sum(labels_val, 1)<1
          #if np.any(temp_bool):
          #  print("\n\n\nFOUND!!", np.where(labels_val[temp_bool])[0], "\n\n\n")
          #print(sampling_distribution_val[:5, :5])


          if self.max_steps and self.max_steps <= global_step_val:
            self.max_steps_reached = True

          #if self.is_master and global_step_val % 10 == 0 and self.train_dir:
          if self.is_master and global_step_val % FLAGS.validate_every_n_training_steps == 0 and self.train_dir:
            eval_start_time = time.time()
            hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_origin_val)
            perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
                                                                      labels_origin_val)
            gap = eval_util.calculate_gap(predictions_val, labels_origin_val)
            eval_end_time = time.time()
            eval_time = eval_end_time - eval_start_time

            logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) +
              " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " +
              ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) +
              " GAP: " + ("%.2f" % gap))

            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Perr", perr), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_GAP", gap), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("global_step/Examples/Second",
                                  examples_per_second), global_step_val)
            sv.summary_writer.flush()

            with open(FLAGS.train_dir + '/global_step_{%d}_training_GAP_{%.6f}.txt' % (global_step_val, gap), 'w') as f:
              f.write('\n')

            # Exporting the model every x steps
            time_to_export = ((self.last_model_export_step == 0) or
                (global_step_val - self.last_model_export_step
                 >= self.export_model_steps))

            if self.is_master and time_to_export:
              self.export_model(global_step_val, sv.saver, sv.save_path, sess)
              self.last_model_export_step = global_step_val
          else:
            #logging.info("training step " + str(global_step_val) + " | Loss: " +
              #("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second))
            continue
      except tf.errors.OutOfRangeError:
        logging.info("%s: Done training -- epoch limit reached.",
                     task_as_string(self.task))

    logging.info("%s: Exited training loop.", task_as_string(self.task))
    sv.Stop()
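
Examples #2 and #5 both guard the training directory with a model_flags.json consistency check. Below is a hedged, standalone sketch of that check (check_or_write_model_flags is a hypothetical helper name), with the file handling tightened into with blocks:

import json
import logging
import os
import sys

def check_or_write_model_flags(train_dir, model_flags_dict):
    # Refuse to reuse a train_dir that was created with different model flags.
    flags_json_path = os.path.join(train_dir, "model_flags.json")
    if os.path.exists(flags_json_path):
        with open(flags_json_path) as flags_file:
            existing_flags = json.load(flags_file)
        if existing_flags != model_flags_dict:
            logging.error("Model flags do not match existing file %s. Please delete "
                          "the file, change --train_dir, or pass --start_new_model.",
                          flags_json_path)
            logging.error("Ran model with flags: %s", model_flags_dict)
            logging.error("Previously ran with flags: %s", existing_flags)
            sys.exit(1)
    else:
        with open(flags_json_path, "w") as fout:
            json.dump(model_flags_dict, fout)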
Example #3
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            if meta_filename:
                saver = self.recover_model(meta_filename)

            with tf.device(device_fn):
                if not meta_filename:
                    saver = self.build_model(self.model, self.reader)

                global_step = tf.get_collection("global_step")[0]
                restart_learning_rate = tf.get_collection(
                    "restart_learning_rate")[0]
                layers_keep_probs = tf.get_collection("layers_keep_probs")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                if FLAGS.use_ema:
                    ema_op = tf.get_collection("ema_op")[0]

                e_loss = tf.get_collection("e_loss")[0]
                e_labels = tf.get_collection("e_labels")[0]
                e_predictions = tf.get_collection("e_predictions")[0]

                init_op = tf.global_variables_initializer()
                restart_op = tf.assign(restart_learning_rate,
                                       FLAGS.restart_learning_rate)
                # Get a proper number of keep_prob parameters for dropout.
                # The maximum is 10, so we pad the vector with 1.0s.
                # Not the nicest solution, but it works.
                tmp_layers = []
                if FLAGS.layers_keep_probs is not None:
                    tmp_layers = [
                        float(x) for x in FLAGS.layers_keep_probs.replace(
                            ' ', '').split(',')
                    ]

                tmp_layers_padded = tmp_layers + [
                    1.0 for x in range(10 - len(tmp_layers))
                ]
                with tf.variable_scope("tower", reuse=True) as scope:
                    keep_op = tf.assign(layers_keep_probs, tmp_layers_padded)

        sv = tf.train.Supervisor(graph,
                                 logdir=self.train_dir,
                                 init_op=init_op,
                                 is_chief=self.is_master,
                                 global_step=global_step,
                                 save_model_secs=FLAGS.save_model_minutes * 60,
                                 save_summaries_secs=120,
                                 saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:
            try:
                if FLAGS.restart_learning_rate > 0.0:
                    sess.run(restart_op)
                    logging.info("restart learning rate: %f\n" %
                                 FLAGS.restart_learning_rate)
                if FLAGS.layers_keep_probs != "1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0":
                    logging.info("============")
                    sess.run(keep_op)
                    logging.info("layers keep probabilites: %s" %
                                 FLAGS.layers_keep_probs)
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while (not sv.should_stop()) and (not self.max_steps_reached):
                    batch_start_time = time.time()
                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels])
                    seconds_per_batch = time.time() - batch_start_time
                    examples_per_second = labels_val.shape[
                        0] / seconds_per_batch

                    if self.max_steps and self.max_steps <= global_step_val:
                        self.max_steps_reached = True

                    if self.is_master and global_step_val % 50 == 0 and self.train_dir:
                        eval_start_time = time.time()
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)
                        eval_end_time = time.time()
                        eval_time = eval_end_time - eval_start_time

                        logging.info("training step " + str(global_step_val) +
                                     " |  Loss: " + ("%.2f" % loss_val) +
                                     " | Hit@1: " + ("%.2f" % hit_at_one) +
                                     "  PERR: " + ("%.2f" % perr) + "  GAP: " +
                                     ("%.4f" % gap))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)

                        #also do eval
                        e_loss_val, e_predictions_val, e_labels_val = sess.run(
                            [e_loss, e_predictions, e_labels])
                        e_hit_at_one = eval_util.calculate_hit_at_one(
                            e_predictions_val, e_labels_val)
                        e_perr = eval_util.calculate_precision_at_equal_recall_rate(
                            e_predictions_val, e_labels_val)
                        e_gap = eval_util.calculate_gap(
                            e_predictions_val, e_labels_val)
                        logging.info("training step " + str(global_step_val) +
                                     " | eLoss: " + ("%.2f" % e_loss_val) +
                                     " |eHit@1: " + ("%.2f" % e_hit_at_one) +
                                     " ePERR: " + ("%.2f" % e_perr) +
                                     " eGAP: " + ("%.4f" % e_gap))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Eval_Hit@1",
                                              e_hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Eval_Perr", e_perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Eval_GAP", e_gap),
                            global_step_val)

                        sv.summary_writer.flush()

                        # Exporting the model every x steps
                        time_to_export = (
                            (self.last_model_export_step == 0)
                            or (global_step_val - self.last_model_export_step
                                >= self.export_model_steps))

                        if self.is_master and time_to_export:
                            self.export_model(global_step_val, sv.saver,
                                              sv.save_path, sess)
                            self.last_model_export_step = global_step_val
                    else:
                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second))
            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
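
The comments in Example #3 describe padding the --layers_keep_probs flag out to ten dropout values. Here is a small sketch of just that step (parse_keep_probs is a hypothetical name):

def parse_keep_probs(flag_value, max_layers=10):
    # Parse a comma-separated string such as "0.9, 0.8" into floats and pad
    # with 1.0 (i.e., no dropout) up to max_layers entries.
    probs = []
    if flag_value:
        probs = [float(x) for x in flag_value.replace(' ', '').split(',')]
    return probs + [1.0] * (max_layers - len(probs))

# parse_keep_probs("0.9, 0.8") -> [0.9, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]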
Example #4
def model_fn(features, labels, mode, params):

    is_training = mode == learn.ModeKeys.TRAIN
    optimizer_class = find_class_by_name(params.optimizer, [tf.train])
    label_loss_fn = find_class_by_name(params.label_loss, [losses])()
    model = find_class_by_name(params.model,
                               [frame_level_models, video_level_models])()

    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.exponential_decay(
        params.base_learning_rate,
        global_step * params.batch_size * params.num_towers,
        params.learning_rate_decay_examples,
        params.learning_rate_decay,
        staircase=True,
    )

    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)

    tf.summary.histogram("model/input_raw", features['model_input'])

    feature_dim = len(features['model_input'].get_shape()) - 1

    model_input = tf.nn.l2_normalize(features['model_input'], feature_dim)

    tower_inputs = tf.split(model_input, params.num_towers)

    if mode == learn.ModeKeys.INFER:
        # ***
        #  this is a quick hack so that the existing model_fn code,
        #  taken from train.py, doesn't break in inference (or serving) mode.
        #  Normally, we would write model_fn so that the 'labels' input arg
        #  can be None in inference mode, but this existing model code was not
        #  written that way. See the serving_input_fn() defined below for where
        #  'labels_batch' is added to the features dict just to make this code work.
        labels = features['labels_batch']

    tower_labels = tf.split(labels, params.num_towers)

    tower_num_frames = tf.split(features['num_frames'], params.num_towers)
    tower_gradients = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []

    for i in range(params.num_towers):
        # For some reason these 'with' statements can't be combined onto the same
        # line. They have to be nested.
        with tf.device(params.device_string % i):
            with (tf.variable_scope(("tower"), reuse=True if i > 0 else None)):
                with (slim.arg_scope([slim.model_variable, slim.variable],
                                     device="/cpu:0"
                                     if params.num_gpus != 1 else "/gpu:0")):
                    result = model.create_model(
                        tower_inputs[i],
                        num_frames=tower_num_frames[i],
                        vocab_size=params.reader.num_classes,
                        labels=tower_labels[i],
                        is_training=is_training)
                    for variable in slim.get_model_variables():
                        tf.summary.histogram(variable.op.name, variable)

                    predictions = result["predictions"]

                    tower_predictions.append(predictions)

                    if "loss" in result.keys():
                        label_loss = result["loss"]
                    else:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions, tower_labels[i])

                    if "regularization_loss" in result.keys():
                        reg_loss = result["regularization_loss"]
                    else:
                        reg_loss = tf.constant(0.0)

                    reg_losses = tf.losses.get_regularization_losses()
                    if reg_losses:
                        reg_loss += tf.add_n(reg_losses)

                    tower_reg_losses.append(reg_loss)

                    # Adds update_ops (e.g., moving average updates in batch normalization) as
                    # a dependency to the train_op.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if "update_ops" in result.keys():
                        update_ops += result["update_ops"]
                    if update_ops:
                        with tf.control_dependencies(update_ops):
                            barrier = tf.no_op(name="gradient_barrier")
                            with tf.control_dependencies([barrier]):
                                label_loss = tf.identity(label_loss)

                    tower_label_losses.append(label_loss)

                    final_loss = params.regularization_penalty * reg_loss + label_loss
                    gradients = optimizer.compute_gradients(
                        final_loss, colocate_gradients_with_ops=False)
                    tower_gradients.append(gradients)

    pred_dict = {}
    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    predictions = tf.concat(tower_predictions, 0)
    pred_dict['predictions'] = predictions
    tf.summary.scalar("label_loss", label_loss)
    if params.regularization_penalty != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)

    if is_training:
        # Incorporate the L2 weight penalties, etc.

        merged_gradients = utils.combine_gradients(tower_gradients)
        if params.clip_gradient_norm > 0:
            with tf.name_scope('clip_grads'):
                merged_gradients = utils.clip_gradient_norms(
                    merged_gradients, params.clip_gradient_norm)
        train_op = optimizer.apply_gradients(merged_gradients,
                                             global_step=global_step)
    else:
        train_op = None

    eval_metric_ops = {}
    if mode == learn.ModeKeys.EVAL or is_training:

        eval_metric_ops['hit_at_one'] = metrics.streaming_mean(
            tf.py_func(
                lambda x, y: np.float32(eval_util.calculate_hit_at_one(x, y)),
                [predictions, labels],
                tf.float32,
                stateful=False,
            ))
        eval_metric_ops['perr'] = metrics.streaming_mean(
            tf.py_func(
                lambda x, y: np.float32(
                    eval_util.calculate_precision_at_equal_recall_rate(x, y)),
                [predictions, labels],
                tf.float32,
                stateful=False,
            ))
        eval_metric_ops['gap'] = metrics.streaming_mean(
            tf.py_func(
                lambda x, y: np.float32(eval_util.calculate_gap(x, y)),
                [predictions, labels],
                tf.float32,
                stateful=False,
            ))

    else:
        pass
    top_predictions, top_indices = tf.nn.top_k(predictions,
                                               _TOP_PREDICTIONS_IN_OUTPUT)

    pred_dict['top_predictions'] = top_predictions
    pred_dict['top_indices'] = top_indices

    # Add eval summaries and update ops for training.
    for key, val in eval_metric_ops.items():
        tf.summary.scalar(key, val[0])  # create a summary for each eval op
        tf.add_to_collection(
            tf.GraphKeys.UPDATE_OPS, val[1]
        )  # add each eval op's update op to UPDATE_OPS so it runs with every train_op call

    #  tf.add_to_collection("global_step", global_step)
    #  tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    #  tf.add_to_collection("input_batch_raw", model_input_raw)
    #  tf.add_to_collection("input_batch", model_input)
    #  tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels, tf.float32))
    #  tf.add_to_collection("train_op", train_op)
    tf.summary.scalar("loss", label_loss)

    export_outputs = {
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
        tf.estimator.export.PredictOutput(pred_dict)
    }

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=pred_dict,
                                      loss=label_loss,
                                      train_op=train_op,
                                      export_outputs=export_outputs,
                                      eval_metric_ops=eval_metric_ops)
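
Example #4 turns the NumPy metric helpers into streaming Estimator metrics by wrapping them in tf.py_func. The same pattern in isolation, as a hedged sketch that uses tf.metrics.mean in place of the contrib streaming_mean seen above:

import numpy as np
import tensorflow as tf

def numpy_metric_as_streaming_mean(numpy_fn, predictions, labels):
    # tf.py_func runs the NumPy function inside the graph; tf.metrics.mean then
    # averages the per-batch scalar across evaluation batches.
    batch_value = tf.py_func(
        lambda p, l: np.float32(numpy_fn(p, l)),
        [predictions, labels],
        tf.float32,
        stateful=False)
    return tf.metrics.mean(batch_value)

# Usage, assuming eval_util is importable as in the example:
# eval_metric_ops['gap'] = numpy_metric_as_streaming_mean(
#     eval_util.calculate_gap, predictions, labels)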
Example #5
  def run(self, start_new_model=False):
    """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
    if self.is_master and start_new_model:
      self.remove_training_directory(self.train_dir)

    if not os.path.exists(self.train_dir):
      os.makedirs(self.train_dir)

    model_flags_dict = {
        "model": FLAGS.model,
        "feature_sizes": FLAGS.feature_sizes,
        "feature_names": FLAGS.feature_names,
        "frame_features": FLAGS.frame_features,
        "label_loss": FLAGS.label_loss,
    }
    flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
    if os.path.exists(flags_json_path):
      with open(flags_json_path) as flags_file:
        existing_flags = json.load(flags_file)
      if existing_flags != model_flags_dict:
        logging.error("Model flags do not match existing file %s. Please "
                      "delete the file, change --train_dir, or pass flag "
                      "--start_new_model",
                      flags_json_path)
        logging.error("Ran model with flags: %s", str(model_flags_dict))
        logging.error("Previously ran with flags: %s", str(existing_flags))
        exit(1)
    else:
      # Write the file.
      with open(flags_json_path, "w") as fout:
        fout.write(json.dumps(model_flags_dict))

    target, device_fn = self.start_server_if_distributed()

    meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

    with tf.Graph().as_default() as graph:
      if meta_filename:
        saver = self.recover_model(meta_filename)

      with tf.device(device_fn):
        if not meta_filename:
          saver = self.build_model(self.model, self.reader)

        global_step = tf.get_collection("global_step")[0]
        loss = tf.get_collection("loss")[0]
        predictions = tf.get_collection("predictions")[0]
        labels = tf.get_collection("labels")[0]
        train_op = tf.get_collection("train_op")[0]
        init_op = tf.global_variables_initializer()
        if FLAGS.ema_source:
            # Here the variables still exist
            ema_op = tf.get_collection("ema_op")[0]
            def_vars = tf.get_collection("updatable_vars")
            ema_vars = tf.get_collection("ema_vars")

    sv = tf.train.Supervisor(
        graph,
        logdir=self.train_dir,
        init_op=init_op,
        is_chief=self.is_master,
        global_step=global_step,
        save_model_secs=15 * 60,
        save_summaries_secs=120,
        saver=saver)

    logging.info("%s: Starting managed session.", task_as_string(self.task))
    with sv.managed_session(target, config=self.config) as sess:

      if FLAGS.ema_source:
        logging.info("%s: Entering training loop.", task_as_string(self.task))
        sess.graph._unsafe_unfinalize()
        ckpt_reader = pywrap_tensorflow.NewCheckpointReader(FLAGS.ema_source)
        for xtensor, ematensor in zip(def_vars, ema_vars):
            src_tensor = ckpt_reader.get_tensor(xtensor.name.split(":")[0])
            # Loading does not take up graph space
            xtensor.load(src_tensor, session=sess)
            ematensor.load(src_tensor, session=sess)
            # sess.run(tf.assign(xtensor, src_tensor))
            # sess.run(tf.assign(ematensor, src_tensor))
      try:
        logging.info("%s: Entering training loop.", task_as_string(self.task))
        while (not sv.should_stop()) and (not self.max_steps_reached):
          batch_start_time = time.time()
          _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
              [train_op, global_step, loss, predictions, labels])
          if FLAGS.ema_source:  # Update EMA if needed
              _ = sess.run(ema_op)

          seconds_per_batch = time.time() - batch_start_time
          examples_per_second = labels_val.shape[0] / seconds_per_batch

          if self.max_steps and self.max_steps <= global_step_val:
            self.max_steps_reached = True

          if self.is_master and global_step_val % 10 == 0 and self.train_dir:
            eval_start_time = time.time()
            hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val)
            perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
                                                                      labels_val)
            gap = eval_util.calculate_gap(predictions_val, labels_val)
            eval_end_time = time.time()
            eval_time = eval_end_time - eval_start_time

            logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) +
              " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " +
              ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) +
              " GAP: " + ("%.2f" % gap))

            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Perr", perr), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_GAP", gap), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("global_step/Examples/Second",
                                  examples_per_second), global_step_val)
            sv.summary_writer.flush()

            # Exporting the model every x steps
            time_to_export = ((self.last_model_export_step == 0) or
                (global_step_val - self.last_model_export_step
                 >= self.export_model_steps))

            if self.is_master and time_to_export:
              # self.export_model(global_step_val, sv.saver, sv.save_path, sess)
              self.last_model_export_step = global_step_val
          else:
            logging.info("training step " + str(global_step_val) + " | Loss: " +
              ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second))
      except tf.errors.OutOfRangeError:
        save_name = "{0}/model.ckpt".format(self.train_dir)
        saver.save(sess, save_name, global_step_val)
        logging.info("Final model export.")
        logging.info("%s: Done training -- epoch limit reached.",
                     task_as_string(self.task))

    logging.info("%s: Exited training loop.", task_as_string(self.task))
    sv.Stop()
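
Example #5 warm-starts both the regular variables and their EMA shadows from a checkpoint without adding assign ops to the finalized graph. Below is a compact sketch of that step, assuming the variable names match the checkpoint's tensor names (load_weights_into_vars is a hypothetical name):

from tensorflow.python import pywrap_tensorflow

def load_weights_into_vars(checkpoint_path, variables, ema_variables, sess):
    reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
    for var, ema_var in zip(variables, ema_variables):
        value = reader.get_tensor(var.name.split(":")[0])
        # Variable.load feeds the value through the variable's initializer
        # placeholder, so no new nodes are added to the graph.
        var.load(value, session=sess)
        ema_var.load(value, session=sess)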
Example #6
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            if meta_filename:
                saver = self.recover_model(meta_filename)

            with tf.device(device_fn):
                if not meta_filename:
                    saver = self.build_model(self.model, self.reader)

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                input_batch_raw = tf.get_collection("input_batch_raw")[0]
                input_batch = tf.get_collection("input_batch")[0]
                model_input_raw_ph = tf.get_collection("model_input_raw_ph")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                num_frames = tf.get_collection("num_frames")[0]
                num_frames_ph = tf.get_collection("num_frames_ph")[0]
                learning_rate = tf.get_collection("learning_rate")[0]

                init_op = tf.global_variables_initializer()

        sv = tf.train.Supervisor(graph,
                                 logdir=self.train_dir,
                                 init_op=init_op,
                                 is_chief=self.is_master,
                                 global_step=global_step,
                                 save_model_secs=15 * 60,
                                 save_summaries_secs=120,
                                 saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:
            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while (not sv.should_stop()) and (not self.max_steps_reached):
                    batch_start_time = time.time()

                    model_input_raw_val, num_frames_val, learning_rate_val = sess.run(
                        [input_batch_raw, num_frames, learning_rate])

                    pr_feature = []
                    pr_num = []

                    for i in range(model_input_raw_val.shape[0]):
                        if num_frames_val[i] / FLAGS.max_scene <= 2:
                            num_tmp = num_frames_val[i] * np.ceil(
                                1 + FLAGS.max_scene / num_frames_val[i])
                            input_tmp = model_input_raw_val[
                                i][:num_frames_val[i]]
                            input_tmp = np.repeat(
                                input_tmp,
                                np.ceil(1 +
                                        FLAGS.max_scene / num_frames_val[i]),
                                0)
                        else:
                            num_tmp = num_frames_val[i]
                            input_tmp = model_input_raw_val[i][:num_tmp]

                        numvec = (input_tmp[1:] *
                                  input_tmp[:-1]).sum(axis=1) / (np.sqrt(
                                      (input_tmp[1:]**2).sum(1)) * (np.sqrt(
                                          (input_tmp[:-1]**2).sum(1))))
                        idx = np.sort(
                            numvec.argpartition(FLAGS.max_scene -
                                                1)[:FLAGS.max_scene - 1] + 1)

                        example_splits = np.split(input_tmp, idx, 0)

                        example_splits_mean = [
                            np.mean(example_split, 0)
                            for example_split in example_splits
                        ]
                        example_splits_mean = np.stack(example_splits_mean, 0)
                        pr_num.append(FLAGS.max_scene)
                        pr_feature.append(example_splits_mean)
                    pr_feature = np.stack(pr_feature, 0)
                    pr_num = np.stack(pr_num, 0)

                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels],
                        feed_dict={
                            model_input_raw_ph: pr_feature,
                            num_frames_ph: pr_num
                        })

                    seconds_per_batch = time.time() - batch_start_time
                    examples_per_second = labels_val.shape[
                        0] / seconds_per_batch

                    if self.max_steps and self.max_steps <= global_step_val:
                        self.max_steps_reached = True

                    if self.is_master and global_step_val % 10 == 0 and self.train_dir:
                        eval_start_time = time.time()
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)
                        eval_end_time = time.time()
                        eval_time = eval_end_time - eval_start_time

                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second) +
                                     " | Hit@1: " + ("%.2f" % hit_at_one) +
                                     " PERR: " + ("%.2f" % perr) + " GAP: " +
                                     ("%.2f" % gap))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

                        # Exporting the model every x steps
                        time_to_export = (
                            (self.last_model_export_step == 0)
                            or (global_step_val - self.last_model_export_step
                                >= self.export_model_steps))

                        if self.is_master and time_to_export:
                            self.export_model(global_step_val, sv.saver,
                                              sv.save_path, sess)
                            self.last_model_export_step = global_step_val
                    else:
                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second))
            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
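
Example #6 collapses each video to FLAGS.max_scene scene features by cutting the frame sequence at the least-similar consecutive-frame pairs (cosine similarity) and averaging each segment. The core of that preprocessing as a standalone NumPy sketch (segment_into_scenes is a hypothetical name; frames is assumed to have more than max_scene rows, as the repeat logic above guarantees):

import numpy as np

def segment_into_scenes(frames, max_scene):
    # Cosine similarity between each frame and the next one.
    dots = (frames[1:] * frames[:-1]).sum(axis=1)
    norms = np.sqrt((frames[1:] ** 2).sum(1)) * np.sqrt((frames[:-1] ** 2).sum(1))
    similarity = dots / norms
    # The (max_scene - 1) least-similar transitions become the scene boundaries.
    cuts = np.sort(similarity.argpartition(max_scene - 1)[:max_scene - 1] + 1)
    segments = np.split(frames, cuts, 0)
    # One mean feature vector per scene: shape (max_scene, feature_dim).
    return np.stack([seg.mean(0) for seg in segments], 0)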
Example #7
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            if meta_filename:
                saver = self.recover_model(meta_filename)

            with tf.device(device_fn):

                if not meta_filename:
                    saver = self.build_model()

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                init_op = tf.global_variables_initializer()

        sv = tf.train.Supervisor(graph,
                                 logdir=self.train_dir,
                                 init_op=init_op,
                                 is_chief=self.is_master,
                                 global_step=global_step,
                                 save_model_secs=15 * 60,
                                 save_summaries_secs=120,
                                 saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:

            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while not sv.should_stop():

                    batch_start_time = time.time()
                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels])
                    seconds_per_batch = time.time() - batch_start_time

                    if self.is_master:
                        examples_per_second = labels_val.shape[
                            0] / seconds_per_batch
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)

                        logging.info(
                            "%s: training step " + str(global_step_val) +
                            "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " +
                            ("%.2f" % perr) + " GAP: " +
                            ("%.2f" % gap) + " Loss: " + str(loss_val),
                            task_as_string(self.task))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
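
Examples #2 through #8 all share the same tf.train.Supervisor structure, and Example #7 is the most stripped-down version of it. For reference, the bare skeleton looks roughly like the sketch below; the logdir and the empty master string are placeholders, and graph construction is elided.

import tensorflow as tf

graph = tf.Graph()
with graph.as_default():
    global_step = tf.train.get_or_create_global_step()
    # ... build the model, loss, and train_op here ...
    init_op = tf.global_variables_initializer()

sv = tf.train.Supervisor(graph,
                         logdir="/tmp/train_dir",  # placeholder path
                         init_op=init_op,
                         is_chief=True,
                         global_step=global_step,
                         save_model_secs=15 * 60,
                         save_summaries_secs=120)
with sv.managed_session("", config=tf.ConfigProto()) as sess:
    try:
        while not sv.should_stop():
            # sess.run([train_op, ...]) plus periodic logging and summary
            # writing go here; stop when the step limit is reached.
            break
    except tf.errors.OutOfRangeError:
        pass  # input queues exhausted: the epoch limit was reached
sv.Stop()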
Example #8
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            if meta_filename:
                saver = self.recover_model(meta_filename)

            with tf.device(device_fn):

                if not meta_filename:
                    saver = self.build_model()

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                init_op = tf.global_variables_initializer()

                if FLAGS.dropout:
                    keep_prob_tensor = tf.get_collection("keep_prob")[0]
                if FLAGS.noise_level > 0:
                    noise_level_tensor = tf.get_collection("noise_level")[0]
                if FLAGS.reweight:
                    weights_input, weights_assignment = None, None
                    if len(tf.get_collection("weights_input")) > 0:
                        weights_input = tf.get_collection("weights_input")[0]
                        weights_assignment = tf.get_collection(
                            "weights_assignment")[0]

        sv = tf.train.Supervisor(graph,
                                 logdir=self.train_dir,
                                 init_op=init_op,
                                 is_chief=self.is_master,
                                 global_step=global_step,
                                 save_model_secs=6 * 60,
                                 save_summaries_secs=120,
                                 saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:

            # re-assign weights
            if FLAGS.reweight:
                optional_assign_weights(sess, weights_input,
                                        weights_assignment)

            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while not sv.should_stop():

                    batch_start_time = time.time()
                    custom_feed = {}
                    if FLAGS.dropout:
                        custom_feed[keep_prob_tensor] = FLAGS.keep_prob
                    if FLAGS.noise_level > 0:
                        custom_feed[noise_level_tensor] = FLAGS.noise_level

                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels],
                        feed_dict=custom_feed)
                    seconds_per_batch = time.time() - batch_start_time

                    if self.is_master:
                        examples_per_second = labels_val.shape[
                            0] / seconds_per_batch
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        recall = "N/A"
                        if False:
                            recall = eval_util.calculate_recall_at_n(
                                predictions_val, labels_val, FLAGS.recall_at_n)
                            sv.summary_writer.add_summary(
                                utils.MakeSummary(
                                    "model/Training_Recall@%d" %
                                    FLAGS.recall_at_n, recall),
                                global_step_val)
                            recall = "%.2f" % recall
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)

                        logging.info(
                            "%s: training step " + str(global_step_val) +
                            "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " +
                            ("%.2f" % perr) + " GAP: " + ("%.2f" % gap) +
                            " Recall@%d: " % FLAGS.recall_at_n +
                            recall + " Loss: " + str(loss_val),
                            task_as_string(self.task))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
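
Example #8's distinguishing detail is the optional feed_dict: placeholders for the dropout keep_prob and the input-noise level are stored in graph collections at build time and fed back only when the corresponding flags are set. A minimal sketch of that pattern (build_custom_feed is a hypothetical helper; the collection names follow the example):

import tensorflow as tf

def build_custom_feed(keep_prob=None, noise_level=None):
    # Fetch the placeholders back out of the collections they were stashed in
    # and feed them only when a value was actually requested.
    feed = {}
    if keep_prob is not None:
        feed[tf.get_collection("keep_prob")[0]] = keep_prob
    if noise_level is not None and noise_level > 0:
        feed[tf.get_collection("noise_level")[0]] = noise_level
    return feed

# sess.run([train_op, loss], feed_dict=build_custom_feed(keep_prob=0.8))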
Example #9
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        target, device_fn = self.start_server_if_distributed()

        meta_filename = []
        for filename in self.train_dir.split(','):
            logging.info("filename:%s", str(filename))
            meta_filename.append(
                self.get_meta_filename(start_new_model, filename))

        label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])()
        optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train])

        local_device_protos = device_lib.list_local_devices()
        gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
        num_gpus = len(gpus)

        if num_gpus > 0:
            logging.info("Using the following GPUs to train: " + str(gpus))
            num_towers = num_gpus
            device_string = '/gpu:%d'
        else:
            logging.info("No GPUs found. Training on CPU.")
            num_towers = 1
            device_string = '/cpu:%d'
        # build_graph_retrain(
        #     reader=self.reader,
        #     model=self.model,
        #     train_data_pattern=FLAGS.train_data_pattern,
        #     label_loss_fn=label_loss_fn,
        #     num_readers=FLAGS.num_readers,
        #     batch_size=FLAGS.batch_size)

        # with tf.variable_scope("net2"):

        ####

        global_step = tf.Variable(0, trainable=False, name="global_step")
        learning_rate = tf.train.exponential_decay(
            FLAGS.base_learning_rate,
            global_step * FLAGS.batch_size * num_towers,
            FLAGS.learning_rate_decay_examples,
            FLAGS.learning_rate_decay,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)
        video_id_batch, model_input_raw, labels_batch, num_frames = get_input_data_tensors(
            # pylint: disable=g-line-too-long
            self.reader,
            FLAGS.train_data_pattern,
            batch_size=FLAGS.batch_size,
            num_readers=FLAGS.num_readers)
        tf.summary.histogram("model_input_raw", model_input_raw)

        feature_dim = len(model_input_raw.get_shape()) - 1

        # Normalize input features.
        model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)
        # with tf.variable_scope("net1"):
        with tf.variable_scope("tower"):

            result1 = self.model[0].create_model(
                model_input,
                num_frames=num_frames,
                vocab_size=self.reader.num_classes,
                is_training=False)
            #####

            result1 = tf.stop_gradient(result1)
            result2 = self.model[1].create_model(
                model_input,
                num_frames=num_frames,
                vocab_size=self.reader.num_classes,
                labels=labels_batch,
                is_training=False)
            result2 = tf.stop_gradient(result2)
            all_vars = tf.global_variables()
            # for v in all_vars:
            #   print v.name
            # for i in v_vars:
            #   logging.info(str(i))
            for i, v in enumerate(all_vars):
                logging.info(str(v.name))
                if 'rnn' in v.name:
                    vars1 = all_vars[:i]
                    vars2 = all_vars[i:]
                    break
            # v_vars0 = [v for v in all_vars if v.name == 'tower/input_bn/beta:0'
            #           or v.name == 'tower/input_bn/gamma:0'
            #           or v.name == 'tower/input_bn/beta:0'
            #           or v.name == 'tower/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/biases:0']
            # v_vars = [v for v in all_vars if v.name == 'tower/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/weights:0'
            #           or v.name == 'tower/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/biases:0'
            #           or v.name == 'tower/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/weights:0'
            #           or v.name == 'tower/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/biases:0']

            result1 = tf.nn.l2_normalize(result1, dim=1)
            result2 = tf.nn.l2_normalize(result2, dim=1)
            embeddings = tf.concat([result1, result2], axis=1)
            model_concat = find_class_by_name('MoeModel',
                                              [video_level_models])()
            result = model_concat.create_model(
                embeddings, vocab_size=self.reader.num_classes, num_mixtures=4)
            predictions = result["predictions"]
            # predictions=(result1["predictions"]+result2["predictions"])/2
            tf.summary.histogram("model_activations", predictions)
            # if "loss" in result.keys():
            #   label_loss = result["loss"]
            # else:
            label_loss = label_loss_fn.calculate_loss(predictions,
                                                      labels_batch)
            tf.summary.scalar("label_loss", label_loss)
            if "regularization_loss" in result.keys():
                reg_loss = result["regularization_loss"]
            reg_losses = tf.losses.get_regularization_losses()
            if "regularization_loss" in result.keys():
                reg_loss = result["regularization_loss"]
            else:
                reg_loss = tf.constant(0.0)
            final_loss = FLAGS.regularization_penalty * reg_loss + label_loss

            optimizer = optimizer_class(learning_rate)
            gradients = optimizer.compute_gradients(
                final_loss, colocate_gradients_with_ops=False)

            with tf.name_scope('clip_grads'):
                merged_gradients = utils.clip_gradient_norms(gradients, 1.0)
            train_op = optimizer.apply_gradients(merged_gradients,
                                                 global_step=global_step)

            tf.add_to_collection("global_step", global_step)
            tf.add_to_collection("loss", label_loss)
            tf.add_to_collection("predictions", predictions)
            tf.add_to_collection("input_batch", model_input)
            tf.add_to_collection("video_id_batch", video_id_batch)
            tf.add_to_collection("num_frames", num_frames)
            tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
            tf.add_to_collection("summary_op", tf.summary.merge_all())
            tf.add_to_collection("train_op", train_op)

            video_id_batch = tf.get_collection("video_id_batch")[0]
            prediction_batch = tf.get_collection("predictions")[0]
            label_batch = tf.get_collection("labels")[0]
            loss = tf.get_collection("loss")[0]
            summary_op = tf.get_collection("summary_op")[0]
            # saver = tf.train.Saver(tf.global_variables())
            # saver=tf.train.Saver(result1)
            summary_writer = tf.summary.FileWriter(
                FLAGS.ensemble_dir, graph=tf.get_default_graph())

            config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
            config.gpu_options.allow_growth = True

            with tf.Session(config=config) as sess:
                train_dirs = FLAGS.train_dir.split(',')
                latest_checkpoint0 = tf.train.latest_checkpoint(train_dirs[0])
                latest_checkpoint1 = tf.train.latest_checkpoint(train_dirs[1])
                sess.run(tf.global_variables_initializer())

                if latest_checkpoint0:
                    logging.info("Loading checkpoint for eval: " +
                                 latest_checkpoint0)
                    saver1 = tf.train.Saver(vars1)

                    saver1.restore(sess, latest_checkpoint0)

                if latest_checkpoint1:
                    saver2 = tf.train.Saver(vars2)
                    logging.info("Loading checkpoint for eval: " +
                                 latest_checkpoint1)

                    saver2.restore(sess, latest_checkpoint1)

                saver = tf.train.Saver()
                fetches = [
                    learning_rate, global_step, train_op, video_id_batch,
                    prediction_batch, label_batch, loss, summary_op
                ]

                coord = tf.train.Coordinator()

                threads = []
                for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
                    threads.extend(
                        qr.create_threads(sess,
                                          coord=coord,
                                          daemon=True,
                                          start=True))

                while not coord.should_stop():
                    # batch_start_time = time.time()
                    learning_rate_val, global_step_val, _, vid_val, predictions_val, labels_val, loss_val, summary_val = sess.run(
                        fetches)
                    # hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val)
                    # perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
                    #                                                           labels_val)
                    # gap = eval_util.calculate_gap(predictions_val, labels_val)
                    # logging.info( "training step " + str(global_step_val)+" | Loss: " + ("%.2f" % loss_val) +" | Hit@1: " +
                    #              ("%.4f" % hit_at_one) + " PERR: " + ("%.4f" % perr) +
                    #              " GAP: " + ("%.4f" % gap))

                    if self.is_master and global_step_val % self.disp_batches == 0 and self.train_dir:
                        eval_start_time = time.time()
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)
                        eval_end_time = time.time()
                        eval_time = eval_end_time - eval_start_time
                        logging.info("training step " + str(global_step_val) +
                                     "| learning rate: " +
                                     ("%.4f" % learning_rate_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " | Hit@1: " + ("%.4f" % hit_at_one) +
                                     " PERR: " + ("%.4f" % perr) + " GAP: " +
                                     ("%.4f" % gap))
                        summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        summary_writer.add_summary(
                            utils.MakeSummary("model/loss", loss_val),
                            global_step_val)
                        summary_writer.add_summary(
                            utils.MakeSummary("model/lr", learning_rate_val),
                            global_step_val)
                        summary_writer.flush()
                        if global_step_val % FLAGS.export_model_steps == 0:
                            saver.save(sess,
                                       FLAGS.ensemble_dir,
                                       global_step=global_step_val)

                coord.request_stop()
                coord.join(threads, stop_grace_period_secs=10)
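
The two savers above restore disjoint subsets of the graph (vars1 and vars2), which are defined earlier in the snippet. A minimal sketch of how such lists are typically collected, assuming each sub-model was built under its own variable scope (the scope names below are hypothetical, not taken from the snippet):

import tensorflow as tf

def split_ensemble_variables(scope1="model1", scope2="model2"):
    # Group the trainable variables by the scope prefix of the sub-model
    # that created them, so each checkpoint can be restored independently.
    all_vars = tf.trainable_variables()
    vars1 = [v for v in all_vars if v.name.startswith(scope1)]
    vars2 = [v for v in all_vars if v.name.startswith(scope2)]
    return vars1, vars2

Each list then gets its own tf.train.Saver, exactly as in the restore block above.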
Ejemplo n.º 10
0
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.
    
        Returns:
          A tuple of the training Hit@1 and the training PERR.
        """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        target, device_fn = self.start_server_if_distributed()

        # meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            # if meta_filename:
            #     saver = self.recover_model(meta_filename)



            with tf.device(device_fn):
                # if not meta_filename:
                self.build_model(self.model, self.reader, self.downloader)


                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                # init_op = tf.global_variables_initializer()
                saver = tf.train.Saver()
        config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config, graph=graph) as sess:
            enqueue_thread = threading.Thread(target=self.downloader.enqueuing, args=(sess,),
                                              kwargs={'num_epochs': FLAGS.num_epochs})
            enqueue_thread.daemon = True
            enqueue_thread.start()


            latest_checkpoint = tf.train.latest_checkpoint(self.train_dir)
            if latest_checkpoint:
                logging.info("Loading checkpoint for train: " + latest_checkpoint)
                # Restores from checkpoint
                saver.restore(sess, latest_checkpoint)
                # Assuming model_checkpoint_path looks something like:
                # /my-favorite-path/yt8m_train/model.ckpt-0, extract global_step from it.
                # global_step_val = latest_checkpoint.split("/")[-1].split("-")[-1]
            else:
                logging.info("No checkpoint file found.")

            sess.run([tf.local_variables_initializer()])

            logging.info("%s: Entering training loop.", task_as_string(self.task))
            while (not self.max_steps_reached):
                batch_start_time = time.time()
                _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                    [train_op, global_step, loss, predictions, labels])
                seconds_per_batch = time.time() - batch_start_time
                examples_per_second = labels_val.shape[0] / seconds_per_batch

                if self.max_steps and self.max_steps <= global_step_val:
                    self.max_steps_reached = True

                if self.is_master and global_step_val % self.disp_batches == 0 and self.train_dir:
                    eval_start_time = time.time()
                    hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val)
                    perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
                                                                              labels_val)
                    gap = eval_util.calculate_gap(predictions_val, labels_val)
                    eval_end_time = time.time()
                    eval_time = eval_end_time - eval_start_time

                    logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) +
                                 " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " +
                                 ("%.4f" % hit_at_one) + " PERR: " + ("%.4f" % perr) +
                                 " GAP: " + ("%.4f" % gap))

                    # sv.summary_writer.add_summary(
                    #     utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                    #     global_step_val)
                    # sv.summary_writer.add_summary(
                    #     utils.MakeSummary("model/Training_Perr", perr), global_step_val)
                    # sv.summary_writer.add_summary(
                    #     utils.MakeSummary("model/Training_GAP", gap), global_step_val)
                    # sv.summary_writer.add_summary(
                    #     utils.MakeSummary("global_step/Examples/Second",
                    #                       examples_per_second), global_step_val)
                    # sv.summary_writer.flush()

                    # Exporting the model every x steps
                    time_to_export = ((self.last_model_export_step == 0) or
                                      (global_step_val - self.last_model_export_step
                                       >= self.export_model_steps))

                    if self.is_master and time_to_export:
                        self.export_model(global_step_val, saver, self.train_dir, sess)
                        self.last_model_export_step = global_step_val
                # else:
                logging.info("training step " + str(global_step_val) + " | Loss: " +
                             ("%.4f" % loss_val) + " Examples/sec: " + ("%.4f" % examples_per_second))



            self.downloader.stop()
            logging.info('Downloader stopped')

            logging.info("%s: Exited training loop.", task_as_string(self.task))
Ejemplo n.º 11
0
  def run(self, start_new_model=False):
    """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
    if self.is_master and start_new_model:
      self.remove_training_directory(self.train_dir)

    if not os.path.exists(self.train_dir):
      os.makedirs(self.train_dir)

    model_flags_dict = {
        "model": FLAGS.model,
        "feature_sizes": FLAGS.feature_sizes,
        "feature_names": FLAGS.feature_names,
        "frame_features": FLAGS.frame_features,
        "label_loss": FLAGS.label_loss,
    }
    flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
    if os.path.exists(flags_json_path):
      existing_flags = json.load(open(flags_json_path))
      if existing_flags != model_flags_dict:
        logging.error("Model flags do not match existing file %s. Please "
                      "delete the file, change --train_dir, or pass flag "
                      "--start_new_model",
                      flags_json_path)
        logging.error("Ran model with flags: %s", str(model_flags_dict))
        logging.error("Previously ran with flags: %s", str(existing_flags))
        exit(1)
    else:
      # Write the file.
      with open(flags_json_path, "w") as fout:
        fout.write(json.dumps(model_flags_dict))

    target, device_fn = self.start_server_if_distributed()

    meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

    with tf.Graph().as_default() as graph:
      if meta_filename:
        saver = self.recover_model(meta_filename)

      with tf.device(device_fn):
        if not meta_filename:
          saver = self.build_model(self.model, self.reader)

        global_step = tf.get_collection("global_step")[0]
        loss = tf.get_collection("loss")[0]
        predictions = tf.get_collection("predictions")[0]
        labels = tf.get_collection("labels")[0]
        train_op = tf.get_collection("train_op")[0]
        init_op = tf.global_variables_initializer()

    sv = tf.train.Supervisor(
        graph,
        logdir=self.train_dir,
        init_op=init_op,
        is_chief=self.is_master,
        global_step=global_step,
        save_model_secs=15 * 60,
        save_summaries_secs=120,
        saver=saver)

    logging.info("%s: Starting managed session.", task_as_string(self.task))
    with sv.managed_session(target, config=self.config) as sess:
      try:
        logging.info("%s: Entering training loop.", task_as_string(self.task))
        while (not sv.should_stop()) and (not self.max_steps_reached):
          batch_start_time = time.time()
          _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
              [train_op, global_step, loss, predictions, labels])
          seconds_per_batch = time.time() - batch_start_time
          examples_per_second = labels_val.shape[0] / seconds_per_batch

          if self.max_steps and self.max_steps <= global_step_val:
            self.max_steps_reached = True

          if self.is_master and global_step_val % 10 == 0 and self.train_dir:
            eval_start_time = time.time()
            hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val)
            perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
                                                                      labels_val)
            gap = eval_util.calculate_gap(predictions_val, labels_val)
            eval_end_time = time.time()
            eval_time = eval_end_time - eval_start_time

            logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) +
              " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " +
              ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) +
              " GAP: " + ("%.2f" % gap))

            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Perr", perr), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_GAP", gap), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("global_step/Examples/Second",
                                  examples_per_second), global_step_val)
            sv.summary_writer.flush()

            # Exporting the model every x steps
            time_to_export = ((self.last_model_export_step == 0) or
                (global_step_val - self.last_model_export_step
                 >= self.export_model_steps))

            if self.is_master and time_to_export:
              self.export_model(global_step_val, sv.saver, sv.save_path, sess)
              self.last_model_export_step = global_step_val
          else:
            logging.info("training step " + str(global_step_val) + " | Loss: " +
              ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second))
      except tf.errors.OutOfRangeError:
        logging.info("%s: Done training -- epoch limit reached.",
                     task_as_string(self.task))

    logging.info("%s: Exited training loop.", task_as_string(self.task))
    sv.Stop()
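
Several of these loops log metrics with utils.MakeSummary instead of running a summary op. That helper is not shown here, but it presumably builds a scalar Summary proto by hand, roughly like this sketch:

import tensorflow as tf

def make_summary(tag, value):
    # Wrap a single scalar in a Summary proto so it can be passed straight
    # to summary_writer.add_summary() without evaluating any graph ops.
    return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])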
Ejemplo n.º 12
0
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

        Returns:
          A tuple of the training Hit@1 and the training PERR.
        """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            if meta_filename:
                saver = self.recover_model(meta_filename)

            with tf.device(device_fn):

                if not meta_filename:
                    saver = self.build_model(self.model, self.reader)

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                input_batch_raw = tf.get_collection("input_batch_raw")[0]
                is_neg = tf.get_collection("is_negative")[0]
                init_op = tf.global_variables_initializer()

                if FLAGS.model == "EmbeddingModel":
                    hidden_layer_activations = tf.get_collection("hidden_layer_activations")[0]

        sv = tf.train.Supervisor(
            graph,
            logdir=self.train_dir,
            init_op=init_op,
            is_chief=self.is_master,
            global_step=global_step,
            save_model_secs=15 * 60,
            save_summaries_secs=120,
            saver=saver)

        logging.info("%s: Starting managed session.", task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:

            try:
                logging.info("%s: Entering training loop.", task_as_string(self.task))
                batch_counter = 0
                while (not sv.should_stop()) and (not self.max_steps_reached):
                    batch_counter += 1
                    batch_start_time = time.time()
                    if FLAGS.model == "EmbeddingModel":
                        _, global_step_val, loss_val, predictions_val, labels_val, input_batch_raw_val, embeddings, is_neg_val = sess.run(
                            [train_op, global_step, loss, predictions, labels, input_batch_raw,
                             hidden_layer_activations, is_neg])
                    else:
                        _, global_step_val, loss_val, predictions_val, labels_val, input_batch_raw_val = sess.run(
                            [train_op, global_step, loss, predictions, labels, input_batch_raw])

                    seconds_per_batch = time.time() - batch_start_time

                    if self.max_steps and self.max_steps <= global_step_val:
                        self.max_steps_reached = True

                    if self.is_master:
                        k = 10
                        examples_per_second = labels_val.shape[0] / seconds_per_batch
                        predictions_val = predictions_val[:, 0:4716]
                        hit_at_one = eval_util.calculate_hit_at_one(predictions_val,
                                                                    labels_val)

                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val, labels_val)

                        if FLAGS.model == "EmbeddingModel" \
                                          "":
                            logging.info(is_neg_val[1])
                            hit_emb = eval_util.calculate_hit_at_k_embedding(embeddings, k)
                            logging.info(numpy.sum(numpy.multiply(embeddings[1, 0:FLAGS.embedding_size], embeddings[1, FLAGS.embedding_size:2 * FLAGS.embedding_size])))
                            logging.info("%s Training step " + str(global_step_val) + "| Hit@1: " +
                                         ("%.2f" % hit_at_one) + " HitEmbedding@" + ("%.0f: " % k) + (
                                         "%.2f" % hit_emb) + " GAP: " +
                                         ("%.2f" % gap) + " Loss: " + str(loss_val), task_as_string(self.task))
                            sv.summary_writer.add_summary(
                                utils.MakeSummary("model/Training_HitEmbedding@10", hit_emb),
                                global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second), global_step_val)
                        sv.summary_writer.flush()

                        # Exporting the model every x steps
                        time_to_export = ((self.last_model_export_step == 0) or
                                          (global_step_val - self.last_model_export_step
                                           >= self.export_model_steps))

                        if self.is_master and time_to_export:
                            self.export_model(global_step_val, sv.saver, sv.save_path, sess)
                            self.last_model_export_step = global_step_val

                    if FLAGS.model == "EmbeddingModel":

                        if FLAGS.image_server and (batch_counter == 9000):
                            pred_audio = np.asarray(predictions_val[1, 0:FLAGS.embedding_size])
                            pred_frames = np.asarray(predictions_val[1, FLAGS.embedding_size:2 * FLAGS.embedding_size])
                            # plt.bar(range(1, 129), pred_audio / np.linalg.norm(pred_audio))
                            # plt.savefig("embedding_audio2.png")
                            # plt.cla()
                            # plt.bar(range(1, 129), pred_frames / np.linalg.norm(pred_frames))
                            # plt.savefig("embedding_frames2.png")
                            # plt.cla()
                            # plt.bar(range(1, 129),
                            #         pred_frames / np.linalg.norm(pred_frames) - pred_audio / np.linalg.norm(pred_audio))
                            # plt.savefig("embedding_diferencia2.png")
                            # plt.cla()
                            # logging.info("Imatges guardades")

                # Exporting the final model
                if self.is_master:
                    self.export_model(global_step_val, sv.saver, sv.save_path, sess)


            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
        print("Hem acabat")
Ejemplo n.º 13
0
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

        Returns:
          A tuple of the training Hit@1 and the training PERR.
        """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        if not os.path.exists(self.train_dir):
            os.makedirs(self.train_dir)

        model_flags_dict = {
            "model": FLAGS.model,
            "feature_sizes": FLAGS.feature_sizes,
            "feature_names": FLAGS.feature_names,
            "frame_features": FLAGS.frame_features,
            "label_loss": FLAGS.label_loss,
        }
        flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
        if file_io.file_exists(flags_json_path):
            existing_flags = json.load(
                file_io.FileIO(flags_json_path, mode="r"))
            if existing_flags != model_flags_dict:
                logging.error(
                    "Model flags do not match existing file %s. Please "
                    "delete the file, change --train_dir, or pass flag "
                    "--start_new_model", flags_json_path)
                logging.error("Ran model with flags: %s",
                              str(model_flags_dict))
                logging.error("Previously ran with flags: %s",
                              str(existing_flags))
                exit(1)
        else:
            # Write the file.
            with file_io.FileIO(flags_json_path, mode="w") as fout:
                fout.write(json.dumps(model_flags_dict))

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            if meta_filename:
                saver = self.recover_model(meta_filename)

            with tf.device(device_fn):

                if not meta_filename:
                    saver = self.build_model()

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                init_op = tf.global_variables_initializer()

                if FLAGS.dropout:
                    keep_prob_tensor = tf.get_collection("keep_prob")[0]
                if FLAGS.noise_level > 0:
                    noise_level_tensor = tf.get_collection("noise_level")[0]
                if FLAGS.reweight:
                    weights_input, weights_assignment = None, None
                    if len(tf.get_collection("weights_input")) > 0:
                        weights_input = tf.get_collection("weights_input")[0]
                        weights_assignment = tf.get_collection(
                            "weights_assignment")[0]

        sv = tf.train.Supervisor(
            graph,
            logdir=self.train_dir,
            init_op=init_op,
            is_chief=self.is_master,
            global_step=global_step,
            save_model_secs=FLAGS.keep_checkpoint_interval * 60,
            save_summaries_secs=120,
            saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:

            # re-assign weights
            if FLAGS.reweight:
                optional_assign_weights(sess, weights_input,
                                        weights_assignment)

            steps = 0
            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while not sv.should_stop():

                    steps += 1
                    batch_start_time = time.time()
                    custom_feed = {}
                    if FLAGS.dropout:
                        custom_feed[keep_prob_tensor] = FLAGS.keep_prob
                    if FLAGS.noise_level > 0:
                        custom_feed[noise_level_tensor] = FLAGS.noise_level

                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels],
                        feed_dict=custom_feed)
                    seconds_per_batch = time.time() - batch_start_time

                    if self.is_master:
                        examples_per_second = labels_val.shape[
                            0] / seconds_per_batch
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        recall = "N/A"
                        if False:
                            recall = eval_util.calculate_recall_at_n(
                                predictions_val, labels_val, FLAGS.recall_at_n)
                            sv.summary_writer.add_summary(
                                utils.MakeSummary(
                                    "model/Training_Recall@%d" %
                                    FLAGS.recall_at_n, recall),
                                global_step_val)
                            recall = "%.2f" % recall
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)

                        logging.info(
                            "%s: training step " + str(global_step_val) +
                            "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " +
                            ("%.2f" % perr) + " GAP: " + ("%.2f" % gap) +
                            " Recall@%d: " % FLAGS.recall_at_n +
                            recall + " Loss: " + str(loss_val),
                            task_as_string(self.task))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

                    if FLAGS.max_steps is not None and steps > FLAGS.max_steps:
                        logging.info(
                            "%s: Done training -- max_steps limit reached.",
                            task_as_string(self.task))
                        break

            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
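
The dropout and noise handling above works by pulling placeholders out of named collections and feeding them only when the corresponding flag is set. A minimal sketch of that pattern (the placeholder name and feed value here are illustrative):

import tensorflow as tf

# At graph-construction time, register the placeholder under a known key.
keep_prob = tf.placeholder_with_default(1.0, shape=[], name="keep_prob")
tf.add_to_collection("keep_prob", keep_prob)

# In the training loop, recover it from the collection and feed it only
# when dropout is enabled; otherwise the default of 1.0 applies.
keep_prob_tensor = tf.get_collection("keep_prob")[0]
custom_feed = {keep_prob_tensor: 0.5}  # e.g. FLAGS.keep_prob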
Ejemplo n.º 14
0
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        if not os.path.exists(self.train_dir):
            os.makedirs(self.train_dir)

        logging.info(
            '############## PARAMETERS ##############################')
        logging.info("feature_names {}".format(FLAGS.feature_names))
        logging.info("feature_sizes {}".format(FLAGS.feature_sizes))
        logging.info("frame_features {}".format(FLAGS.frame_features))
        logging.info("model {}".format(FLAGS.model))
        logging.info("start_new_model {}".format(FLAGS.start_new_model))
        logging.info("num_gpu {}".format(FLAGS.num_gpu))
        logging.info("batch_size {}".format(FLAGS.batch_size))
        logging.info("label_loss {}".format(FLAGS.label_loss))
        logging.info("regularization_penalty {}".format(
            FLAGS.regularization_penalty))
        logging.info("base_learning_rate {}".format(FLAGS.base_learning_rate))
        logging.info("learning_rate_decay {}".format(
            FLAGS.learning_rate_decay))
        logging.info("learning_rate_decay_examples {}".format(
            FLAGS.learning_rate_decay_examples))
        logging.info("num_epochs {}".format(FLAGS.num_epochs))
        logging.info("max_steps {}".format(FLAGS.max_steps))
        logging.info("export_model_steps {}".format(FLAGS.export_model_steps))
        logging.info("num_readers {}".format(FLAGS.num_readers))
        logging.info("optimizer {}".format(FLAGS.optimizer))
        logging.info("clip_gradient_norm {}".format(FLAGS.clip_gradient_norm))
        logging.info(
            '########################################################')
        logging.info(' '.join([x for x in sys.argv]))

        model_flags_dict = {
            "model": FLAGS.model,
            "feature_sizes": FLAGS.feature_sizes,
            "feature_names": FLAGS.feature_names,
            "frame_features": FLAGS.frame_features,
            "label_loss": FLAGS.label_loss,
        }
        flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
        if os.path.exists(flags_json_path):
            existing_flags = json.load(open(flags_json_path))
            if existing_flags != model_flags_dict:
                logging.error(
                    "Model flags do not match existing file %s. Please "
                    "delete the file, change --train_dir, or pass flag "
                    "--start_new_model", flags_json_path)
                logging.error("Ran model with flags: %s",
                              str(model_flags_dict))
                logging.error("Previously ran with flags: %s",
                              str(existing_flags))
                exit(1)
        else:
            # Write the file.
            with open(flags_json_path, "w") as fout:
                fout.write(json.dumps(model_flags_dict))

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            with tf.device(device_fn):

                saver = self.build_model(self.model, self.reader)

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]

                init_op, init_fn = None, None
                if meta_filename:
                    saver = tf.train.Saver(tf.global_variables(),
                                           max_to_keep=0,
                                           keep_checkpoint_every_n_hours=0.25)

                    def init_fn(sess):
                        return saver.restore(sess, meta_filename)
                else:
                    init_op = tf.global_variables_initializer()

        sv = tf.train.Supervisor(graph,
                                 logdir=self.train_dir,
                                 init_op=init_op,
                                 init_fn=init_fn,
                                 is_chief=self.is_master,
                                 global_step=global_step,
                                 save_model_secs=40 * 60,
                                 save_summaries_secs=120,
                                 saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:
            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while (not sv.should_stop()) and (not self.max_steps_reached):
                    batch_start_time = time.time()
                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels])
                    seconds_per_batch = time.time() - batch_start_time
                    examples_per_second = labels_val.shape[
                        0] / seconds_per_batch

                    if self.max_steps and self.max_steps <= global_step_val:
                        self.max_steps_reached = True

                    if self.is_master and global_step_val % 10 == 0 and self.train_dir:
                        eval_start_time = time.time()
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)
                        eval_end_time = time.time()
                        eval_time = eval_end_time - eval_start_time

                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second) +
                                     " | Hit@1: " + ("%.2f" % hit_at_one) +
                                     " PERR: " + ("%.2f" % perr) + " GAP: " +
                                     ("%.2f" % gap))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

                        # Exporting the model every x steps
                        time_to_export = (
                            (self.last_model_export_step == 0)
                            or (global_step_val - self.last_model_export_step
                                >= self.export_model_steps))

                        if self.is_master and time_to_export:
                            self.export_model(global_step_val, sv.saver,
                                              sv.save_path, sess)
                            self.last_model_export_step = global_step_val
                    else:
                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second))
            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
Ejemplo n.º 15
0
def main():
    env = json.loads(os.environ.get("TF_CONFIG", "{}"))

    task_data = env.get("task", None) or {"type": "master", "index": 0}
    task = type("TaskSpec", (object, ), task_data)

    logging.set_verbosity(tf.logging.INFO)
    logging.info("%s: Tensorflow version: %s.", task_as_string(task),
                 tf.__version__)

    video_ids, video_features, video_labels, video_frames = gen_input(
        data_pattern,
        reader_batch_size=reader_batch_size,
        num_classes=num_classes,
        num_readers=num_readers,
        mini_batch_size=mini_batch_size)

    result = gen_model(model_input=video_features,
                       vocab_size=num_classes,
                       labels=video_labels,
                       num_frames=video_frames)

    predictions = result["predictions"]

    global_step = tf.Variable(0, trainable=False, name="global_step")

    label_loss = label_loss_fn.calculate_loss(predictions, video_labels)

    if "regularization_loss" in result.keys():
        reg_loss = result["regularization_loss"]
    else:
        reg_loss = tf.constant(0.0)

    reg_losses = tf.losses.get_regularization_losses()
    if reg_losses:
        reg_loss += tf.add_n(reg_losses)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if "update_ops" in result.keys():
        update_ops += result["update_ops"]

    if update_ops:
        with tf.control_dependencies(update_ops):
            barrier = tf.no_op(name="gradient_barrier")
            with tf.control_dependencies([barrier]):
                label_loss = tf.identity(label_loss)

    final_loss = regularization_penalty * reg_loss + label_loss

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * mini_batch_size *
                                               num_towers,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)

    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)

    gradients = optimizer.compute_gradients(final_loss,
                                            colocate_gradients_with_ops=False)

    tf.summary.scalar("label_loss", label_loss)

    tf.summary.scalar("reg_loss", reg_loss)

    if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
            gradients = utils.clip_gradient_norms(gradients,
                                                  clip_gradient_norm)

    train_op = optimizer.apply_gradients(gradients, global_step=global_step)

    # Cast the labels once here; calling tf.cast inside the training loop
    # would add a new op to the graph on every iteration.
    labels_float = tf.cast(video_labels, tf.float32)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()

        #init_local_op = tf.local_variables_initializer()
        #sess.run(init_local_op)

        coord = tf.train.Coordinator()

        threads = tf.train.start_queue_runners(coord=coord)

        total_step = 0

        try:
            while total_step < 100000:
                batch_start_time = time.time()

                # v_ids, v_features, v_labels, v_frames = sess.run([video_ids, video_features, video_labels, video_frames])

                _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                    [
                        train_op, global_step, label_loss, predictions,
                        labels_float
                    ])

                seconds_per_batch = time.time() - batch_start_time
                examples_per_second = labels_val.shape[0] / seconds_per_batch

                # if max_steps <= global_step_val:
                #    max_steps_reached = True
                # print(v_features.shape)
                # print(v_ids)

                if total_step % 10 == 0:
                    eval_start_time = time.time()
                    hit_at_one = eval_util.calculate_hit_at_one(
                        predictions_val, labels_val)
                    perr = eval_util.calculate_precision_at_equal_recall_rate(
                        predictions_val, labels_val)
                    gap = eval_util.calculate_gap(predictions_val, labels_val)
                    eval_end_time = time.time()
                    eval_time = eval_end_time - eval_start_time

                    logging.info("training step " + str(global_step_val) +
                                 " | Loss: " + ("%.2f" % loss_val) +
                                 " Examples/sec: " +
                                 ("%.2f" % examples_per_second) +
                                 " | Hit@1: " + ("%.2f" % hit_at_one) +
                                 " PERR: " + ("%.2f" % perr) + " GAP: " +
                                 ("%.2f" % gap))

                else:
                    logging.info("training step " + str(global_step_val) +
                                 " | Loss: " + ("%.2f" % loss_val) +
                                 " Examples/sec: " +
                                 ("%.2f" % examples_per_second))

                total_step = total_step + 1

        except tf.errors.OutOfRangeError:
            logging.info("%s: Done training -- epoch limit reached.",
                         task_as_string(task))

        coord.request_stop()

        coord.join(threads)
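
This example clips gradients with utils.clip_gradient_norms before applying them. The helper is not reproduced in the snippet; a sketch of what such a per-gradient norm clip usually looks like (modeled on the common slim-style implementation, which may differ from the project's utils):

import tensorflow as tf

def clip_gradient_norms(gradients_to_variables, max_norm):
    # Clip every gradient in a (gradient, variable) list to max_norm,
    # handling sparse IndexedSlices gradients as well as dense tensors.
    clipped = []
    for grad, var in gradients_to_variables:
        if grad is not None:
            if isinstance(grad, tf.IndexedSlices):
                values = tf.clip_by_norm(grad.values, max_norm)
                grad = tf.IndexedSlices(values, grad.indices, grad.dense_shape)
            else:
                grad = tf.clip_by_norm(grad, max_norm)
        clipped.append((grad, var))
    return clipped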
Ejemplo n.º 16
0
def train():
    '''
        Model training task.
        :return:
    '''
    valid_max_accuracy = -9999
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False)  # prefer the GPU if one is available, otherwise fall back to the CPU
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:

        data_object = DataProcessing(args)
        train_batch_example = data_object.input_frame_data(
            frame_path=args.train_path,
            batch_size=args.batch_size,
            num_epoch=args.epoch)

        model = creat_model(sess, args, isTraining=True)  # build the model computation graph
        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=3)  # max_to_keep retains only the 3 most recent checkpoints
        print("Begin training..")

        train_cate1_perr_list = []
        train_cate1_gap_list = []
        train_cate2_perr_list = []
        train_cate2_gap_list = []
        train_total_loss_list = []

        try:
            while True:
                context_parsed, sequence_parsed = sess.run(train_batch_example)

                batch_origin_labels = [
                    np.nonzero(row)[0].tolist()
                    for row in context_parsed['labels']
                ]
                cate1_multilabel, cate2_multilabel, batch_origin_cate1, batch_origin_cate2 = data_object.get_cate1_cate2_label(
                    batch_origin_labels)
                batch_vid_name = np.asarray(context_parsed['id'])
                batch_num_audio_rgb_true_frame = np.asarray(
                    context_parsed['num_audio_rgb_true_frame'])
                batch_cate1_label_multiHot = np.asarray(
                    cate1_multilabel)  # batch,cate1_nums
                batch_cate2_label_multiHot = np.asarray(
                    cate2_multilabel)  # batch,cate2_nums
                batch_rgb_fea_float_list = np.asarray(
                    sequence_parsed['rgb'])  # batch,max_frame,1024
                batch_audio_fea_float_list = np.asarray(
                    sequence_parsed['audio'])  # batch,max_frame,128
                # print("batch_vid_name.shape:",batch_vid_name.shape)
                # print("batch_cate1_label_multiHot.shape: ",batch_cate1_label_multiHot.shape)
                # print("batch_cate2_label_multiHot.shape: ",batch_cate2_label_multiHot.shape)
                # print("batch_rgb_fea_float_list.shape: ",batch_rgb_fea_float_list.shape)
                # print("batch_audio_fea_float_list.shape: ",batch_audio_fea_float_list.shape)
                # print("batch_num_audio_rgb_true_frame: ",batch_num_audio_rgb_true_frame)
                # print("batch_num_audio_rgb_true_frame: ", np.asarray(batch_num_audio_rgb_true_frame).shape)
                # assert 1==2

                feed = dict(
                    zip([
                        model.input_video_vidName,
                        model.input_cate1_multilabel,
                        model.input_cate2_multilabel,
                        model.input_video_RGB_feature,
                        model.input_video_Audio_feature,
                        model.input_rgb_audio_true_frame,
                        model.dropout_keep_prob
                    ], [
                        batch_vid_name, batch_cate1_label_multiHot,
                        batch_cate2_label_multiHot, batch_rgb_fea_float_list,
                        batch_audio_fea_float_list,
                        batch_num_audio_rgb_true_frame, 0.5
                    ]))

                cate1_probs, cate2_probs, total_loss, _ = sess.run([
                    model.cate1_probs, model.cate2_probs, model.total_loss,
                    model.optimizer
                ], feed)

                train_cate1_perr = eval_util.calculate_precision_at_equal_recall_rate(
                    cate1_probs, batch_cate1_label_multiHot)
                train_cate1_gap = eval_util.calculate_gap(
                    cate1_probs, batch_cate1_label_multiHot)
                train_cate2_perr = eval_util.calculate_precision_at_equal_recall_rate(
                    cate2_probs, batch_cate2_label_multiHot)
                train_cate2_gap = eval_util.calculate_gap(
                    cate2_probs, batch_cate2_label_multiHot)

                train_cate1_perr_list.append(train_cate1_perr)
                train_cate1_gap_list.append(train_cate1_gap)
                train_cate2_perr_list.append(train_cate2_perr)
                train_cate2_gap_list.append(train_cate2_gap)

                train_total_loss_list.append(total_loss)

                if model.global_step.eval() % args.report_freq == 0:
                    print("report_freq: ", args.report_freq)

                    print(
                        'cate1_train: Step:{} ; aver_train_cate1_perr:{} ; aver_train_cate1_gap_list:{} ; aver_total_loss:{}'
                        .format(
                            model.global_step.eval(),
                            1.0 * np.sum(train_cate1_perr_list) /
                            len(train_cate1_perr_list),
                            1.0 * np.sum(train_cate1_gap_list) /
                            len(train_cate1_gap_list),
                            1.0 * np.sum(train_total_loss_list) /
                            len(train_total_loss_list)))

                    print(
                        'cate2_train: Step:{} ; aver_train_cate2_perr:{} ; aver_train_cate2_gap_list:{} ; aver_total_loss:{}'
                        .format(
                            model.global_step.eval(),
                            1.0 * np.sum(train_cate2_perr_list) /
                            len(train_cate2_perr_list),
                            1.0 * np.sum(train_cate2_gap_list) /
                            len(train_cate2_gap_list),
                            1.0 * np.sum(train_total_loss_list) /
                            len(train_total_loss_list)))

                    train_cate1_perr_list = []
                    train_cate1_gap_list = []
                    train_cate2_perr_list = []
                    train_cate2_gap_list = []
                    train_total_loss_list = []

                if (model.global_step.eval() > 1
                        and model.global_step.eval() % args.valid_freq == 0):
                    # compute accuracy on the validation set
                    print("Running validation inference...")
                    print('model.global_step.eval(): ',
                          model.global_step.eval())

                    valid_cate2_gap_aver_loss = eval()
                    # save the checkpoint with the best validation accuracy seen so far
                    if valid_cate2_gap_aver_loss > valid_max_accuracy:
                        print("save the model, step= : ",
                              model.global_step.eval())
                        valid_max_accuracy = valid_cate2_gap_aver_loss
                        checkpoint_path = os.path.join(args.model_dir,
                                                       'model.ckpt')
                        saver.save(sess=sess,
                                   save_path=checkpoint_path,
                                   global_step=model.global_step.eval())

        except tf.errors.OutOfRangeError:
            print("train processing is finished!")
Ejemplo n.º 17
0
def eval():
    g1 = tf.Graph()
    with g1.as_default():
        config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False)  # prefer the GPU if one is available, otherwise fall back to the CPU
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:

            data_object = DataProcessing(args)

            valid_batch_example = data_object.input_frame_data(
                frame_path=args.valid_path, batch_size=160, num_epoch=1)
            model = creat_model(sess, args, isTraining=False)  # build the model computation graph

            # app_tag_model: the concrete implementation of the model computation graph
            ckpt = tf.train.get_checkpoint_state(args.model_dir)

            # check whether a saved checkpoint exists
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                # print("Reloading model parameters..")
                print("valid step: Reading model parameters from {}".format(
                    ckpt.model_checkpoint_path))
                saver = tf.train.Saver(tf.global_variables())
                saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path
                              )  # restore the checkpoint values into the corresponding variables
                print(sess.run(model.global_step))
            else:
                if not os.path.exists(args.model_dir):
                    os.makedirs(args.model_dir)
                print("valid step: Created new model parameters..")
                sess.run(tf.global_variables_initializer())

            valid_cate2_perr_list = []
            valid_cate2_gap_list = []
            valid_total_loss_list = []

            try:
                while True:
                    context_parsed, sequence_parsed = sess.run(
                        valid_batch_example)
                    batch_origin_labels = [
                        np.nonzero(row)[0].tolist()
                        for row in context_parsed['labels']
                    ]
                    cate1_multilabel, cate2_multilabel, batch_origin_cate1, batch_origin_cate2 = data_object.get_cate1_cate2_label(
                        batch_origin_labels)
                    batch_vid_name = np.asarray(context_parsed['id'])
                    batch_cate1_label_multiHot = np.asarray(
                        cate1_multilabel)  # batch,cate1_nums
                    batch_cate2_label_multiHot = np.asarray(
                        cate2_multilabel)  # batch,cate2_nums
                    batch_rgb_fea_float_list = np.asarray(
                        sequence_parsed['rgb'])  # batch,max_frame,1024
                    batch_audio_fea_float_list = np.asarray(
                        sequence_parsed['audio'])  # batch,max_frame,128
                    batch_num_audio_rgb_true_frame = np.asarray(
                        context_parsed['num_audio_rgb_true_frame'])

                    feed = dict(
                        zip([
                            model.input_video_vidName,
                            model.input_cate1_multilabel,
                            model.input_cate2_multilabel,
                            model.input_video_RGB_feature,
                            model.input_video_Audio_feature,
                            model.input_rgb_audio_true_frame,
                            model.dropout_keep_prob
                        ], [
                            batch_vid_name, batch_cate1_label_multiHot,
                            batch_cate2_label_multiHot,
                            batch_rgb_fea_float_list,
                            batch_audio_fea_float_list,
                            batch_num_audio_rgb_true_frame, 1.0
                        ]))

                    cate2_probs, total_loss = sess.run(
                        [model.cate2_probs, model.total_loss], feed)

                    cate2_perr = eval_util.calculate_precision_at_equal_recall_rate(
                        cate2_probs, batch_cate2_label_multiHot)
                    cate2_gap = eval_util.calculate_gap(
                        cate2_probs, batch_cate2_label_multiHot)

                    valid_cate2_perr_list.append(cate2_perr)
                    valid_cate2_gap_list.append(cate2_gap)

                    valid_total_loss_list.append(total_loss)

            except tf.errors.OutOfRangeError:
                print("end!")

                valid_cate2_perr_aver_loss = 1.0 * np.sum(
                    valid_cate2_perr_list) / len(valid_cate2_perr_list)
                valid_cate2_gap_aver_loss = 1.0 * np.sum(
                    valid_cate2_gap_list) / len(valid_cate2_gap_list)

                valid_total_valid_aver_loss = 1.0 * np.sum(
                    valid_total_loss_list) / len(valid_total_loss_list)

                print('total valid cate2_perr_aver_loss: %0.4f' %
                      valid_cate2_perr_aver_loss)
                print('total valid cate2_gap_aver_loss: %0.4f' %
                      valid_cate2_gap_aver_loss)
                print('***********************')
                print('total valid total_valid_aver_loss: %0.4f' %
                      valid_total_valid_aver_loss)

            return valid_cate2_gap_aver_loss
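
The remaining two metrics logged throughout these snippets, Hit@1 and PERR, can be summarized in a few lines of numpy. These are sketches of the standard definitions, not the eval_util source:

import numpy as np

def calculate_hit_at_one(predictions, labels):
    # Fraction of examples whose single highest-scoring class is a true label.
    top1 = np.argmax(predictions, axis=1)
    return float(np.mean(labels[np.arange(len(labels)), top1] > 0))

def calculate_precision_at_equal_recall_rate(predictions, labels):
    # For each example, take as many predictions as it has true labels and
    # measure what fraction of them are correct, then average over examples.
    per_example = []
    for pred_row, label_row in zip(predictions, labels):
        n = int(np.sum(label_row > 0))
        if n == 0:
            continue
        top_n = np.argsort(pred_row)[::-1][:n]
        per_example.append(np.mean(label_row[top_n] > 0))
    return float(np.mean(per_example)) if per_example else 0.0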
Ejemplo n.º 18
0
  def run(self, start_new_model=False):
    """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
    if self.is_master and start_new_model:
      self.remove_training_directory(self.train_dir)

    target, device_fn = self.start_server_if_distributed()

    meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

    with tf.Graph().as_default() as graph:

      if meta_filename:
        saver = self.recover_model(meta_filename)

      with tf.device(device_fn):

        if not meta_filename:
          saver = self.build_model(self.model, self.reader)

        global_step = tf.get_collection("global_step")[0]
        loss = tf.get_collection("loss")[0]
        predictions = tf.get_collection("predictions")[0]
        labels = tf.get_collection("labels")[0]
        train_op = tf.get_collection("train_op")[0]
        init_op = tf.global_variables_initializer()

    sv = tf.train.Supervisor(
        graph,
        logdir=self.train_dir,
        init_op=init_op,
        is_chief=self.is_master,
        global_step=global_step,
        save_model_secs=15 * 60,
        save_summaries_secs=120,
        saver=saver)

    logging.info("%s: Starting managed session.", task_as_string(self.task))
    with sv.managed_session(target, config=self.config) as sess:

      try:
        logging.info("%s: Entering training loop.", task_as_string(self.task))
        while (not sv.should_stop()) and (not self.max_steps_reached):

          batch_start_time = time.time()
          _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
              [train_op, global_step, loss, predictions, labels])
          seconds_per_batch = time.time() - batch_start_time

          if self.max_steps and self.max_steps <= global_step_val:
            self.max_steps_reached = True

          if self.is_master:
            examples_per_second = labels_val.shape[0] / seconds_per_batch
            hit_at_one = eval_util.calculate_hit_at_one(predictions_val,
                                                        labels_val)
            perr = eval_util.calculate_precision_at_equal_recall_rate(
                predictions_val, labels_val)
            gap = eval_util.calculate_gap(predictions_val, labels_val)

            logging.info(
                "%s: training step %d | Hit@1: %.2f PERR: %.2f GAP: %.2f "
                "Loss: %s", task_as_string(self.task), global_step_val,
                hit_at_one, perr, gap, loss_val)

            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_Perr", perr), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("model/Training_GAP", gap), global_step_val)
            sv.summary_writer.add_summary(
                utils.MakeSummary("global_step/Examples/Second",
                                  examples_per_second), global_step_val)
            sv.summary_writer.flush()

            # Exporting the model every x steps
            time_to_export = ((self.last_model_export_step == 0) or 
                (global_step_val - self.last_model_export_step 
                 >= self.export_model_steps))

            if self.is_master and time_to_export:
              self.export_model(global_step_val, sv.saver, sv.save_path, sess)
              self.last_model_export_step = global_step_val

        # Exporting the final model
        if self.is_master:
          self.export_model(global_step_val, sv.saver, sv.save_path, sess)

      except tf.errors.OutOfRangeError:
        logging.info("%s: Done training -- epoch limit reached.",
                     task_as_string(self.task))

    logging.info("%s: Exited training loop.", task_as_string(self.task))
    sv.Stop()
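All of these training loops push scalar metrics to TensorBoard through utils.MakeSummary. Its definition is not part of these excerpts, but a minimal TF 1.x helper would plausibly look like the following sketch (an assumption, not necessarily the repository's exact code):

import tensorflow as tf

def MakeSummary(name, value):
    # Wrap a plain Python scalar in a tf.Summary proto so it can be
    # handed to a summary writer outside of the graph.
    return tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=value)])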
Example #19
def train_loop(train_dir=None,
               saver=None,
               is_chief=True,
               master="",
               start_supervisor_services=True):
    """Performs training on the currently defined tensorflow graph.

  Args:
    train_dir: Where to save the model checkpoints.
    saver: The class to use for serializing the graph variables.
    is_chief: Whether this worker is the primary worker (which is responsible
    for writing checkpoints and summaries), or an anonymous member of the flock.
    master: Which Tensorflow master to listen to.
    start_supervisor_services: Whether to start threads for writing summaries
      and checkpoints.

  Returns:
  A tuple of the training Hit@1 and the training PERR.
  """
    global_step = tf.get_collection("global_step")[0]
    loss = tf.get_collection("loss")[0]
    predictions = tf.get_collection("predictions")[0]
    labels = tf.get_collection("labels")[0]
    train_op = tf.get_collection("train_op")[0]

    sv = tf.train.Supervisor(logdir=train_dir,
                             is_chief=is_chief,
                             global_step=global_step,
                             save_model_secs=60,
                             save_summaries_secs=60,
                             saver=saver)
    sess = sv.prepare_or_wait_for_session(
        master,
        start_standard_services=start_supervisor_services,
        config=tf.ConfigProto(log_device_placement=False))

    logging.info("prepared session")
    sv.start_queue_runners(sess)
    logging.info("started queue runners")

    try:
        logging.info("entering training loop")
        while not sv.should_stop():
            batch_start_time = time.time()
            _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                [train_op, global_step, loss, predictions, labels])
            seconds_per_batch = time.time() - batch_start_time
            examples_per_second = labels_val.shape[0] / seconds_per_batch

            hit_at_one = eval_util.calculate_hit_at_one(
                predictions_val, labels_val)
            perr = eval_util.calculate_precision_at_equal_recall_rate(
                predictions_val, labels_val)
            gap = eval_util.calculate_gap(predictions_val, labels_val)

            logging.info("training step " + str(global_step_val) +
                         "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " +
                         ("%.2f" % perr) + " GAP: " + ("%.2f" % gap) +
                         " Loss: " + str(loss_val))
            if is_chief and global_step_val % 10 == 0 and train_dir:
                sv.summary_writer.add_summary(
                    utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                    global_step_val)
                sv.summary_writer.add_summary(
                    utils.MakeSummary("model/Training_Perr", perr),
                    global_step_val)
                sv.summary_writer.add_summary(
                    utils.MakeSummary("global_step/Examples/Second",
                                      examples_per_second), global_step_val)
                sv.summary_writer.flush()
    except tf.errors.OutOfRangeError:
        logging.info("Done training -- epoch limit reached")
    logging.info("exited training loop")
    sv.Stop()
    return hit_at_one, perr
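train_loop returns the last Hit@1 and PERR computed inside the loop. Hit@1 simply checks whether the single highest-scoring class is among the true labels; a minimal NumPy sketch of the idea, assuming dense multi-hot label arrays (illustrative only, not the actual eval_util code):

import numpy as np

def calculate_hit_at_one(predictions, actuals):
    # Index of the top-scoring class for each example in the batch.
    top_prediction = np.argmax(predictions, axis=1)
    # 1.0 where that class is a true label, 0.0 otherwise.
    hits = actuals[np.arange(actuals.shape[0]), top_prediction]
    return float(np.mean(hits))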
Example #20
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        target, device_fn = self.start_server_if_distributed()

        latest_checkpoint, meta_filename = self.get_meta_filename(
            start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:

            if meta_filename:
                if not FLAGS.change_file:
                    saver = self.recover_model(meta_filename)

            with tf.device(device_fn):
                if not meta_filename:
                    saver = self.build_model(self.model, self.reader)
                if FLAGS.change_file:
                    saver = self.build_model(self.model, self.reader)

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                init_op = tf.global_variables_initializer()

        sv = tf.train.Supervisor(graph,
                                 logdir=self.train_dir,
                                 init_op=init_op,
                                 is_chief=self.is_master,
                                 global_step=global_step,
                                 save_model_secs=60 * FLAGS.time_to_save_model,
                                 save_summaries_secs=120,
                                 saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:
            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))

                logging.info("TANG:restoring")
                saver.restore(sess, latest_checkpoint)

                while (not sv.should_stop()) and (not self.max_steps_reached):
                    batch_start_time = time.time()
                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels])
                    seconds_per_batch = time.time() - batch_start_time
                    examples_per_second = labels_val.shape[
                        0] / seconds_per_batch

                    if self.max_steps and self.max_steps <= global_step_val:
                        self.max_steps_reached = True

                    if self.is_master and global_step_val % FLAGS.eval_loop == 0 and self.train_dir:
                        eval_start_time = time.time()
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)
                        eval_end_time = time.time()
                        eval_time = eval_end_time - eval_start_time

                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second) +
                                     " | Hit@1: " + ("%.2f" % hit_at_one) +
                                     " PERR: " + ("%.2f" % perr) + " GAP: " +
                                     ("%.2f" % gap))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

                        # Exporting the model every x steps
                        time_to_export = (
                            (self.last_model_export_step == 0)
                            or (global_step_val - self.last_model_export_step
                                >= self.export_model_steps))

                        if self.is_master and time_to_export:
                            self.export_model(global_step_val, sv.saver,
                                              sv.save_path, sess)
                            self.last_model_export_step = global_step_val
                    else:
                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second))
            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
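This variant restores weights explicitly inside the managed session via saver.restore(sess, latest_checkpoint). As a stand-alone illustration of that pattern, a hedged sketch built on tf.train.latest_checkpoint (the helper name restore_latest is made up for this example, not part of the repository's API):

import tensorflow as tf

def restore_latest(sess, saver, train_dir):
    # Find the newest checkpoint under train_dir, if one exists.
    latest_checkpoint = tf.train.latest_checkpoint(train_dir)
    if latest_checkpoint:
        saver.restore(sess, latest_checkpoint)
    return latest_checkpoint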
Example #21
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.
    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        if not os.path.exists(self.train_dir):
            os.makedirs(self.train_dir)

        model_flags_dict = {
            "model": FLAGS.model,
            "feature_sizes": FLAGS.feature_sizes,
            "feature_names": FLAGS.feature_names,
            "frame_features": FLAGS.frame_features,
            "label_loss": FLAGS.label_loss,
            "netvlad_cluster_size": FLAGS.netvlad_cluster_size,
            "netvlad_hidden_size": FLAGS.netvlad_hidden_size,
            "moe_l2": FLAGS.moe_l2,
            "iterations": FLAGS.iterations,
            "netvlad_relu": FLAGS.netvlad_relu,
            "gating": FLAGS.gating,
            "moe_num_mixtures": FLAGS.moe_num_mixtures,
            "moe_prob_gating": FLAGS.moe_prob_gating,
        }
        #     model_flags_dict = {}

        #     for k in FLAGS.__flags:
        #         model_flags_dict[k] = FLAGS[k].value
        flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
        if os.path.exists(flags_json_path):
            existing_flags = json.load(open(flags_json_path))
            if existing_flags != model_flags_dict:
                logging.error(
                    "Model flags do not match existing file %s. Please "
                    "delete the file, change --train_dir, or pass flag "
                    "--start_new_model", flags_json_path)
                logging.error("Ran model with flags: %s",
                              str(model_flags_dict))
                logging.error("Previously ran with flags: %s",
                              str(existing_flags))
                exit(1)
        else:
            # Write the file.
            with open(flags_json_path, "w") as fout:
                fout.write(json.dumps(model_flags_dict))

        target, device_fn = self.start_server_if_distributed()

        meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

        with tf.Graph().as_default() as graph:
            if meta_filename:
                saver = self.recover_model(meta_filename)

            with tf.device(device_fn):
                if not meta_filename:
                    saver = self.build_model(self.model, self.reader)

                global_step = tf.get_collection("global_step")[0]
                loss = tf.get_collection("loss")[0]
                predictions = tf.get_collection("predictions")[0]
                labels = tf.get_collection("labels")[0]
                train_op = tf.get_collection("train_op")[0]
                init_op = tf.global_variables_initializer()

        sv = tf.train.Supervisor(graph,
                                 logdir=self.train_dir,
                                 init_op=init_op,
                                 is_chief=self.is_master,
                                 global_step=global_step,
                                 save_model_secs=15 * 60,
                                 save_summaries_secs=120,
                                 saver=saver)

        logging.info("%s: Starting managed session.",
                     task_as_string(self.task))
        with sv.managed_session(target, config=self.config) as sess:
            try:
                logging.info("%s: Entering training loop.",
                             task_as_string(self.task))
                while (not sv.should_stop()) and (not self.max_steps_reached):
                    batch_start_time = time.time()
                    _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                        [train_op, global_step, loss, predictions, labels])
                    seconds_per_batch = time.time() - batch_start_time
                    examples_per_second = labels_val.shape[
                        0] / seconds_per_batch

                    if self.max_steps and self.max_steps <= global_step_val:
                        self.max_steps_reached = True

                    if self.is_master and global_step_val % 10 == 0 and self.train_dir:
                        eval_start_time = time.time()
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)
                        eval_end_time = time.time()
                        eval_time = eval_end_time - eval_start_time

                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second) +
                                     " | Hit@1: " + ("%.2f" % hit_at_one) +
                                     " PERR: " + ("%.2f" % perr) + " GAP: " +
                                     ("%.2f" % gap))

                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        sv.summary_writer.add_summary(
                            utils.MakeSummary("global_step/Examples/Second",
                                              examples_per_second),
                            global_step_val)
                        sv.summary_writer.flush()

                        # Exporting the model every x steps
                        time_to_export = (
                            (self.last_model_export_step == 0)
                            or (global_step_val - self.last_model_export_step
                                >= self.export_model_steps))

                        if self.is_master and time_to_export:
                            self.export_model(global_step_val, sv.saver,
                                              sv.save_path, sess)
                            self.last_model_export_step = global_step_val
                    else:
                        logging.info("training step " + str(global_step_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " Examples/sec: " +
                                     ("%.2f" % examples_per_second))
            except tf.errors.OutOfRangeError:
                logging.info("%s: Done training -- epoch limit reached.",
                             task_as_string(self.task))

        logging.info("%s: Exited training loop.", task_as_string(self.task))
        sv.Stop()
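PERR (precision at equal recall rate) scores each video by the precision of its top-n predictions, where n is that video's number of true labels, and averages over the batch. A rough NumPy sketch of the idea, with a guard for label-free examples (illustrative only, not the actual eval_util implementation):

import numpy as np

def precision_at_equal_recall_rate(predictions, actuals):
    num_videos = predictions.shape[0]
    aggregated_precision = 0.0
    for row in range(num_videos):
        num_labels = int(np.sum(actuals[row]))
        if num_labels == 0:
            continue  # no positive labels for this example
        # Indices of the n highest-scoring classes for this example.
        top_indices = np.argpartition(predictions[row], -num_labels)[-num_labels:]
        aggregated_precision += np.sum(actuals[row][top_indices]) / num_labels
    return aggregated_precision / num_videos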
Example #22
def train():
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        global_step = tf.Variable(0, trainable=False)
        video_id, labels, rgb, audio, num_frames = readers.input(True)
        coord = tf.train.Coordinator()

        lr = tf.train.exponential_decay(FLAGS.lr,
                                        global_step,
                                        FLAGS.decay_steps,
                                        FLAGS.learning_decay_rate,
                                        staircase=True)
        tf.summary.scalar('learning_rate', lr)
        opt = tf.train.AdamOptimizer(lr,
                                     beta1=FLAGS.beta1,
                                     beta2=FLAGS.beta2,
                                     epsilon=1e-08,
                                     use_locking=False,
                                     name='Adam')
        grads = inference(rgb, audio, num_frames, label=labels, train=True)
        loss = tf.get_collection("loss")[0]
        predict = tf.get_collection("predict")[0]

        tvars = tf.trainable_variables()

        for var in tvars:
            tf.summary.histogram(var.op.name, var)

        for grad, var in grads:
            print(var.op.name)
            if grad is not None and not isinstance(grad, tf.IndexedSlices):
                tf.summary.histogram(var.op.name + '/gradients', grad)
            elif isinstance(grad, tf.IndexedSlices):
                print("This is an IndexedSlices gradient")
                print(grad.dense_shape)
            else:
                print("There is a None gradient")

        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        train_op = tf.group(apply_gradient_op, variables_averages_op)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())

        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.intra_op_parallelism_threads = 10
        config.inter_op_parallelism_threads = 16
        sess = tf.Session(config=config)
        sess.run(init)
        tf.train.start_queue_runners(sess=sess, coord=coord)
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if not FLAGS.new_model:
            saver.restore(sess, ckpt.model_checkpoint_path)

        #loader.restore(sess, ckpt.model_checkpoint_path)

        for step in range(FLAGS.max_steps):

            start_time = time.time()
            _, loss_value, predict_value, labels_value, num_frames_value = sess.run(
                [train_op, loss, predict, labels, num_frames])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                hit_at_one = eval_util.calculate_hit_at_one(
                    predict_value, labels_value)
                perr = eval_util.calculate_precision_at_equal_recall_rate(
                    predict_value, labels_value)
                gap = eval_util.calculate_gap(predict_value, labels_value)

                format_str = (
                    '%s: step %d, loss = %.2f, hit@one = %.2f, perr = %.2f, gap = %.2f, (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str %
                      (datetime.now(), step, loss_value, hit_at_one, perr, gap,
                       examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
                summary_writer.add_summary(
                    utils.MakeSummary("Hit@1", hit_at_one), step)
                summary_writer.add_summary(utils.MakeSummary("Perr", perr),
                                           step)
                summary_writer.add_summary(utils.MakeSummary("Gap", gap), step)
                summary_writer.add_summary(
                    utils.MakeSummary("example per second", examples_per_sec),
                    step)

            if (step % 1000 == 0
                    and step != 0) or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
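The last example maintains an exponential moving average of the trainable variables and folds it into train_op. At evaluation time the averaged (shadow) weights are usually the ones worth loading; a minimal TF 1.x sketch of that counterpart, where the helper name, the default decay of 0.9999, and the assumption that the same graph has already been built are all illustrative choices:

import tensorflow as tf

def restore_averaged_weights(train_dir, decay=0.9999):
    # Map every variable in the current graph to its EMA shadow name so
    # the averaged weights are the ones restored from the checkpoint.
    variable_averages = tf.train.ExponentialMovingAverage(decay)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    sess = tf.Session()
    ckpt = tf.train.get_checkpoint_state(train_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    return sess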