Example 1
def optimizing(optimizer, final_loss, clip_gradient_norm, global_step, prefix, scope):
  # Accumulate several batches before gradient descent options
  # to make larger batch than the memory could be able to hold

  tvs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
  # tvs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
  print "tvs", tvs

  if FLAGS.accumulate_gradients:
    assert FLAGS.apply_every_n_batches > 0, "apply_every_n_batches should be > 0"
    scale = 1.0 / FLAGS.apply_every_n_batches
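    # Gradients are summed over FLAGS.apply_every_n_batches micro-batches, so
    # scaling by 1/N makes the applied update the mean gradient, as if one
    # batch N times larger had been processed.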

    accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in tvs] 
    init_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]
    gvs = optimizer.compute_gradients(final_loss, tvs)
    accum_ops = [accum_vars[i].assign_add(gv[0]) for i, gv in enumerate(gvs)]

    if clip_gradient_norm > 0:
      with tf.name_scope('clip_grads'):
        clipped_accum_vars = utils.clip_variable_norms(accum_vars, 
                max_norm = clip_gradient_norm, scale = scale)
        apply_op = optimizer.apply_gradients([(clipped_accum_vars[i], gv[1]) 
                for i, gv in enumerate(gvs)], global_step=global_step)
        
    else:
      apply_op = optimizer.apply_gradients([(accum_vars[i] * scale, gv[1])
              for i, gv in enumerate(gvs)], global_step=global_step)
    tf.get_collection_ref(prefix + "_train/init_ops").extend(init_ops)
    tf.get_collection_ref(prefix + "_train/accum_ops").extend(accum_ops)
    tf.add_to_collection(prefix + "_train/apply_op", apply_op)

  # the original way, apply every batch
  else:
    gradients = optimizer.compute_gradients(final_loss,
        colocate_gradients_with_ops=False, var_list=tvs)
    print(gradients)
    if clip_gradient_norm > 0:
      with tf.name_scope('clip_grads'):
        gradients = utils.clip_gradient_norms(gradients, clip_gradient_norm)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)
    tf.add_to_collection(prefix + "_train/train_op", train_op)
  return None
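When FLAGS.accumulate_gradients is set, the function only registers ops in collections; a driver loop has to run them in the right order. A minimal sketch of such a loop (the session setup, num_steps, and the driver itself are assumptions, not part of the original code):

# Hypothetical driver for the accumulate-gradients path registered above;
# `prefix` is the same string that was passed to optimizing().
init_ops = tf.get_collection(prefix + "_train/init_ops")
accum_ops = tf.get_collection(prefix + "_train/accum_ops")
apply_op = tf.get_collection(prefix + "_train/apply_op")[0]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(num_steps):                   # num_steps is assumed
        sess.run(init_ops)                          # zero the accumulators
        for _ in range(FLAGS.apply_every_n_batches):
            sess.run(accum_ops)                     # add one micro-batch's gradients
        sess.run(apply_op)                          # apply the averaged update once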
Example 2
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the Tensorflow graph.
  This will only be called once in the life of
  a training model, because after the graph is created the model will be
  restored from a meta graph file rather than being recreated.
  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
           from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
                from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
                            compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.
  """

    global_step = tf.Variable(0, trainable=False, name="global_step")

    local_device_protos = device_lib.list_local_devices()
    gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
    gpus = gpus[:FLAGS.num_gpu]
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = '/gpu:%d'
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = '/cpu:%d'

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * batch_size *
                                               num_towers,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
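    # With staircase=True this evaluates to
    #   base_learning_rate * learning_rate_decay ** floor(examples_seen / learning_rate_decay_examples),
    # where examples_seen = global_step * batch_size * num_towers.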
    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)
    unused_video_id, model_input_raw, labels_batch, num_frames = (
        get_input_data_tensors(reader,
                               train_data_pattern,
                               batch_size=batch_size * num_towers,
                               num_readers=num_readers,
                               num_epochs=num_epochs))
    tf.summary.histogram("model/input_raw", model_input_raw)

    feature_dim = len(model_input_raw.get_shape()) - 1

    model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

    tower_inputs = tf.split(model_input, num_towers)
    tower_labels = tf.split(labels_batch, num_towers)
    tower_num_frames = tf.split(num_frames, num_towers)
    tower_gradients = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []
    for i in range(num_towers):
        # For some reason these 'with' statements can't be combined onto the same
        # line. They have to be nested.
        with tf.device(device_string % i):
            with (tf.variable_scope(("tower"), reuse=True if i > 0 else None)):
                with (slim.arg_scope(
                    [slim.model_variable, slim.variable],
                        device="/cpu:0" if num_gpus != 1 else "/gpu:0")):
                    result = model.create_model(tower_inputs[i],
                                                num_frames=tower_num_frames[i],
                                                vocab_size=reader.num_classes,
                                                labels=tower_labels[i])
                    for variable in slim.get_model_variables():
                        tf.summary.histogram(variable.op.name, variable)

                    predictions = result["predictions"]
                    tower_predictions.append(predictions)

                    if "loss" in result.keys():
                        label_loss = result["loss"]
                    else:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions, tower_labels[i])

                    if "regularization_loss" in result.keys():
                        reg_loss = result["regularization_loss"]
                    else:
                        reg_loss = tf.constant(0.0)

                    reg_losses = tf.losses.get_regularization_losses()
                    if reg_losses:
                        reg_loss += tf.add_n(reg_losses)

                    tower_reg_losses.append(reg_loss)

                    # Adds update_ops (e.g., moving average updates in batch normalization) as
                    # a dependency to the train_op.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if "update_ops" in result.keys():
                        update_ops += result["update_ops"]
                    if update_ops:
                        with tf.control_dependencies(update_ops):
                            barrier = tf.no_op(name="gradient_barrier")
                            with tf.control_dependencies([barrier]):
                                label_loss = tf.identity(label_loss)

                    tower_label_losses.append(label_loss)

                    # Incorporate the L2 weight penalties etc.
                    final_loss = regularization_penalty * reg_loss + label_loss
                    gradients = optimizer.compute_gradients(
                        final_loss, colocate_gradients_with_ops=False)
                    tower_gradients.append(gradients)
    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    tf.summary.scalar("label_loss", label_loss)
    if regularization_penalty != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)
    merged_gradients = utils.combine_gradients(tower_gradients)

    if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
            merged_gradients = utils.clip_gradient_norms(
                merged_gradients, clip_gradient_norm)

    train_op = optimizer.apply_gradients(merged_gradients,
                                         global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", train_op)
Example 3
def build_graph(all_readers,
                all_train_data_patterns,
                input_reader,
                input_data_pattern,
                model,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=256,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_epochs=None):
    """Creates the Tensorflow graph.

  This will only be called once in the life of
  a training model, because after the graph is created the model will be
  restored from a meta graph file rather than being recreated.

  Args:
    all_readers: The data file readers. Every element in it should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
           from BaseModel.
    all_train_data_patterns: glob paths to the training data files, one per reader.
    input_reader: The data file reader for the optional input_data_pattern stream.
    input_data_pattern: glob path to an additional input stream that is fed to the
                model as original_input.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
                from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
                            compared to the label loss.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.
  """

    global_step = tf.Variable(0, trainable=False, name="global_step")

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * batch_size,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    original_input = None
    if input_data_pattern is not None:
        original_video_id, original_input, unused_labels_batch, unused_num_frames = (
            get_input_data_tensors(input_reader,
                                   input_data_pattern,
                                   batch_size=batch_size,
                                   num_epochs=num_epochs))

    optimizer = optimizer_class(learning_rate)
    model_input_raw_tensors = []
    labels_batch_tensor = None
    for reader, data_pattern in zip(all_readers, all_train_data_patterns):
        video_id, model_input_raw, labels_batch, unused_num_frames = (
            get_input_data_tensors(reader,
                                   data_pattern,
                                   batch_size=batch_size,
                                   num_epochs=num_epochs))
        if labels_batch_tensor is None:
            labels_batch_tensor = labels_batch
        model_input_raw_tensors.append(tf.expand_dims(model_input_raw, axis=2))

        if original_input is not None:
            id_match = tf.ones_like(original_video_id, dtype=tf.float32)
            id_match = id_match * tf.cast(
                tf.equal(original_video_id, video_id), dtype=tf.float32)
            tf.summary.scalar("model/id_match", tf.reduce_mean(id_match))

    model_input = tf.concat(model_input_raw_tensors, axis=2)
    labels_batch = labels_batch_tensor
    tf.summary.histogram("model/input", model_input)

    with tf.name_scope("model"):
        if FLAGS.noise_level > 0:
            noise_level_tensor = tf.placeholder_with_default(
                0.0, shape=[], name="noise_level")
        else:
            noise_level_tensor = None

        if FLAGS.dropout:
            keep_prob_tensor = tf.placeholder_with_default(1.0,
                                                           shape=[],
                                                           name="keep_prob")
            result = model.create_model(model_input,
                                        labels=labels_batch,
                                        vocab_size=reader.num_classes,
                                        original_input=original_input,
                                        dropout=FLAGS.dropout,
                                        keep_prob=keep_prob_tensor,
                                        noise_level=noise_level_tensor)
        else:
            result = model.create_model(model_input,
                                        labels=labels_batch,
                                        vocab_size=reader.num_classes,
                                        original_input=original_input,
                                        noise_level=noise_level_tensor)

        for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

        predictions = result["predictions"]
        if "loss" in result.keys():
            label_loss = result["loss"]
        else:
            video_weights_batch = None
            if FLAGS.reweight:
                video_weights_batch = get_video_weights(video_id)

            if FLAGS.multitask:
                print "using multitask loss"
                support_predictions = result["support_predictions"]
                tf.summary.histogram("model/support_predictions",
                                     support_predictions)
                print "support_predictions", support_predictions
                label_loss = label_loss_fn.calculate_loss(
                    predictions,
                    support_predictions,
                    labels_batch,
                    weights=video_weights_batch)
            else:
                print "using original loss"
                label_loss = label_loss_fn.calculate_loss(
                    predictions, labels_batch, weights=video_weights_batch)

        tf.summary.histogram("model/predictions", predictions)
        tf.summary.scalar("label_loss", label_loss)

        if "regularization_loss" in result.keys():
            reg_loss = result["regularization_loss"]
        else:
            reg_loss = tf.constant(0.0)

        reg_losses = tf.losses.get_regularization_losses()
        if reg_losses:
            reg_loss += tf.add_n(reg_losses)

        if regularization_penalty != 0:
            tf.summary.scalar("reg_loss", reg_loss)

        # Adds update_ops (e.g., moving average updates in batch normalization) as
        # a dependency to the train_op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if "update_ops" in result.keys():
            update_ops += result["update_ops"]
        if update_ops:
            with tf.control_dependencies(update_ops):
                barrier = tf.no_op(name="gradient_barrier")
                with tf.control_dependencies([barrier]):
                    label_loss = tf.identity(label_loss)

        # Incorporate the L2 weight penalties etc.
        final_loss = regularization_penalty * reg_loss + label_loss

        if FLAGS.training:
            gradients = optimizer.compute_gradients(
                final_loss, colocate_gradients_with_ops=False)
            if clip_gradient_norm > 0:
                with tf.name_scope('clip_grads'):
                    gradients = utils.clip_gradient_norms(
                        gradients, clip_gradient_norm)
            train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)
        else:
            train_op = tf.no_op()

        tf.add_to_collection("global_step", global_step)
        tf.add_to_collection("loss", label_loss)
        tf.add_to_collection("predictions", predictions)
        tf.add_to_collection("input_batch_raw", model_input)
        tf.add_to_collection("input_batch", model_input)
        tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
        tf.add_to_collection("train_op", train_op)
        if FLAGS.dropout:
            tf.add_to_collection("keep_prob", keep_prob_tensor)
        if FLAGS.noise_level > 0:
            tf.add_to_collection("noise_level", noise_level_tensor)
Example 4
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                transformer_class=feature_transform.DefaultTransformer,
                augmenter_class=data_augmentation.DefaultAugmenter,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the Tensorflow graph.

  This will only be called once in the life of
  a training model, because after the graph is created the model will be
  restored from a meta graph file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
           from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
                from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
                            compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.
  """

    global_step = tf.Variable(0, trainable=False, name="global_step")

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * batch_size,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)
    if FLAGS.distillation_features:
        video_id, model_input_raw, labels_batch, num_frames, distill_labels_batch = (
            get_input_data_tensors(reader,
                                   train_data_pattern,
                                   batch_size=batch_size,
                                   num_readers=num_readers,
                                   num_epochs=num_epochs))
        if FLAGS.distillation_features and FLAGS.distillation_type == 2:
            p = FLAGS.distillation_percent
            print "distillation_percent =", p, "reforming labels"
            float_labels = tf.cast(labels_batch, dtype=tf.float32)
            sum_float_labels = tf.reduce_sum(float_labels,
                                             axis=1,
                                             keep_dims=True)
            sum_distill_labels = tf.reduce_sum(
                distill_labels_batch, axis=1, keep_dims=True) + 1e-6
            distill_labels_batch = float_labels + distill_labels_batch * (
                sum_float_labels / sum_distill_labels * p)
            distill_labels_batch = tf.clip_by_value(distill_labels_batch,
                                                    clip_value_min=0.0,
                                                    clip_value_max=1.0)
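            # The reformed target is clip(y + d * (sum(y) / sum(d)) * p, 0, 1), so the
            # distillation labels add at most a fraction p of each example's
            # ground-truth label mass.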
    else:
        video_id, model_input_raw, labels_batch, num_frames = (
            get_input_data_tensors(reader,
                                   train_data_pattern,
                                   batch_size=batch_size,
                                   num_readers=num_readers,
                                   num_epochs=num_epochs))

    # data augmentation, will not persist in inference
    data_augmenter = augmenter_class()
    model_input_raw, labels_batch, num_frames = data_augmenter.augment(
        model_input_raw, num_frames=num_frames, labels_batch=labels_batch)

    tf.summary.histogram("model/input_raw", model_input_raw)

    feature_transformer = transformer_class()
    model_input, num_frames = feature_transformer.transform(
        model_input_raw, num_frames=num_frames)

    tf.summary.histogram("model/input", model_input)

    with tf.name_scope("model"):
        if FLAGS.noise_level > 0:
            noise_level_tensor = tf.placeholder_with_default(
                0.0, shape=[], name="noise_level")
        else:
            noise_level_tensor = None

        if FLAGS.distillation_as_input:
            distillation_predictions = distill_labels_batch
        else:
            distillation_predictions = None

        if FLAGS.dropout:
            keep_prob_tensor = tf.placeholder_with_default(1.0,
                                                           shape=[],
                                                           name="keep_prob")
            result = model.create_model(
                model_input,
                num_frames=num_frames,
                vocab_size=reader.num_classes,
                labels=labels_batch,
                dropout=FLAGS.dropout,
                keep_prob=keep_prob_tensor,
                distillation_predictions=distillation_predictions,
                noise_level=noise_level_tensor)
        else:
            result = model.create_model(
                model_input,
                num_frames=num_frames,
                vocab_size=reader.num_classes,
                labels=labels_batch,
                distillation_predictions=distillation_predictions,
                noise_level=noise_level_tensor)

        for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

        print "result", result
        predictions = result["predictions"]
        if "loss" in result.keys():
            label_loss = result["loss"]
        else:
            video_weights_batch = None
            if FLAGS.reweight:
                video_weights_batch = get_video_weights(video_id)

            if FLAGS.distillation_as_boosting:
                video_weights_batch = get_weights_by_predictions(
                    labels_batch, distillation_predictions)

            if FLAGS.multitask:
                support_predictions = result["support_predictions"]
                tf.summary.histogram("model/support_predictions",
                                     support_predictions)
                print "support_predictions", support_predictions
                if FLAGS.distillation_features and FLAGS.distillation_type == 1:
                    p = FLAGS.distillation_percent
                    print "distillation_percent =", p
                    if p <= 0:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions,
                            support_predictions,
                            labels_batch,
                            weights=video_weights_batch)
                    elif p >= 1:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions,
                            support_predictions,
                            distill_labels_batch,
                            weights=video_weights_batch)
                    else:
                        label_loss = label_loss_fn.calculate_loss(predictions, support_predictions, labels_batch, weights=video_weights_batch) * (1.0 - p) \
                                    + label_loss_fn.calculate_loss(predictions, support_predictions, distill_labels_batch, weights=video_weights_batch) * p
                elif FLAGS.distillation_features and FLAGS.distillation_type == 2:
                    print "using pure distillation loss"
                    label_loss = label_loss_fn.calculate_loss(
                        predictions,
                        support_predictions,
                        distill_labels_batch,
                        weights=video_weights_batch)
                else:
                    print "using original loss"
                    label_loss = label_loss_fn.calculate_loss(
                        predictions,
                        support_predictions,
                        labels_batch,
                        weights=video_weights_batch)
            else:
                if FLAGS.distillation_features and FLAGS.distillation_type == 1:
                    p = FLAGS.distillation_percent
                    print "distillation_percent =", p
                    if p <= 0:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions,
                            labels_batch,
                            weights=video_weights_batch)
                    elif p >= 1:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions,
                            distill_labels_batch,
                            weights=video_weights_batch)
                    else:
                        label_loss = label_loss_fn.calculate_loss(predictions, labels_batch, weights=video_weights_batch) * (1.0 - p) \
                                     + label_loss_fn.calculate_loss(predictions, distill_labels_batch, weights=video_weights_batch) * p
                elif FLAGS.distillation_features and FLAGS.distillation_type == 2:
                    print "using pure distillation loss"
                    label_loss = label_loss_fn.calculate_loss(
                        predictions,
                        distill_labels_batch,
                        weights=video_weights_batch)
                else:
                    print "using original loss"
                    label_loss = label_loss_fn.calculate_loss(
                        predictions, labels_batch, weights=video_weights_batch)

        tf.summary.histogram("model/predictions", predictions)
        tf.summary.scalar("label_loss", label_loss)

        if "regularization_loss" in result.keys():
            reg_loss = result["regularization_loss"]
        else:
            reg_loss = tf.constant(0.0)

        reg_losses = tf.losses.get_regularization_losses()
        if reg_losses:
            reg_loss += tf.add_n(reg_losses)

        if regularization_penalty != 0:
            tf.summary.scalar("reg_loss", reg_loss)

        # Adds update_ops (e.g., moving average updates in batch normalization) as
        # a dependency to the train_op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if "update_ops" in result.keys():
            update_ops += result["update_ops"]
        if update_ops:
            with tf.control_dependencies(update_ops):
                barrier = tf.no_op(name="gradient_barrier")
                with tf.control_dependencies([barrier]):
                    label_loss = tf.identity(label_loss)

        # Incorporate the L2 weight penalties etc.
        final_loss = regularization_penalty * reg_loss + label_loss

        gradients = optimizer.compute_gradients(
            final_loss, colocate_gradients_with_ops=False)
        if clip_gradient_norm > 0:
            with tf.name_scope('clip_grads'):
                gradients = utils.clip_gradient_norms(gradients,
                                                      clip_gradient_norm)
        train_op = optimizer.apply_gradients(gradients,
                                             global_step=global_step)

        tf.add_to_collection("global_step", global_step)
        tf.add_to_collection("loss", label_loss)
        tf.add_to_collection("predictions", predictions)
        tf.add_to_collection("input_batch_raw", model_input_raw)
        tf.add_to_collection("input_batch", model_input)
        tf.add_to_collection("num_frames", num_frames)
        tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
        tf.add_to_collection("train_op", train_op)
        if FLAGS.dropout:
            tf.add_to_collection("keep_prob", keep_prob_tensor)
        if FLAGS.noise_level > 0:
            tf.add_to_collection("noise_level", noise_level_tensor)
Example 5
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.compat.v1.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the Tensorflow graph.

  This will only be called once in the life of
  a training model, because after the graph is created the model will be
  restored from a meta graph file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit from
      BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
      from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
      compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an unlimited
      number of passes.
  """

    global_step = tf.Variable(0, trainable=False, name="global_step")

    local_device_protos = device_lib.list_local_devices()
    gpus = [x.name for x in local_device_protos if x.device_type == "GPU"]
    gpus = gpus[:FLAGS.num_gpu]
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = "/gpu:%d"
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = "/cpu:%d"

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * batch_size *
                                               num_towers,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar("learning_rate", learning_rate)

    optimizer = optimizer_class(learning_rate)
    input_data_dict = (get_input_data_tensors(reader,
                                              train_data_pattern,
                                              batch_size=batch_size *
                                              num_towers,
                                              num_readers=num_readers,
                                              num_epochs=num_epochs))
    print('input_data_dict', input_data_dict)
    model_input_raw = input_data_dict["video_matrix"]
    labels_batch = input_data_dict["labels"]
    num_frames = input_data_dict["num_frames"]
    print("model_input_shape, ", model_input_raw.shape)
    print("labels_batch, ", labels_batch)

    import numpy as np
    import pandas as pd

    # Mark the whitelisted segment-level classes listed in segment_label_ids.csv.
    whitelisted_cls_mask = np.zeros((3862,), dtype=np.float32)
    segment_label_ids = pd.read_csv('segment_label_ids.csv', header=None)
    for line in segment_label_ids.iloc[:, 0]:
        try:
            cls_id = int(line)
            whitelisted_cls_mask[cls_id] = 1.
        except ValueError:
            # Simply skip any non-integer line (e.g. a header row).
            continue

    # url2 = 'http://storage.googleapis.com/youtube8m-lijun-mlengine/classCount.csv'
    # response2 = urllib2.urlopen(url2)
    # fobj2 = csv.reader(response2)
    # for line in fobj2:
    #   try:
    #     cls_id = int(line[0])
    #     whitelisted_cls_mask[cls_id] = (15-np.log(int(line[1])))**2
    #   except ValueError:
    #       # Simply skip the non-integer line.
    #     continue
    # response2.close()
    # select=tf.matmul(tf.cast(labels_batch, tf.float32),tf.reshape(whitelisted_cls_mask,[3862,1]))>0
    # select=tf.squeeze(select)
    # model_input_raw = model_input_raw[select,:,:]
    # labels_batch = labels_batch[select,:]
    # num_frames = num_frames[select]

    tf.summary.histogram("model/input_raw", model_input_raw)
    feature_dim = len(model_input_raw.get_shape()) - 1
    model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

    tower_inputs = tf.split(model_input, num_towers)
    tower_labels = tf.split(labels_batch, num_towers)
    tower_num_frames = tf.split(num_frames, num_towers)
    tower_gradients = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []

    # import csv
    # import urllib2
    # import numpy as np
    # whitelisted_cls_mask = np.zeros((3862,),
    #                               dtype=np.float32)
    # url = 'http://storage.googleapis.com/youtube8m-lijun-mlengine/segment_label_ids.csv'
    # response = urllib2.urlopen(url)
    # fobj = csv.reader(response)
    # for line in fobj:
    #   try:
    #     cls_id = int(line[0])
    #     whitelisted_cls_mask[cls_id] = 1.
    #   except ValueError:
    #       # Simply skip the non-integer line.
    #     continue
    # response.close()
    # whitelisted_cls_mask=whitelisted_cls_mask+np.ones((3862,),dtype=np.float32)
    whitelisted_cls_mask = whitelisted_cls_mask * 4 + np.ones(
        (3862, ), dtype=np.float32)
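    # Whitelisted (segment-level) classes now carry a label weight of 5.0 in the
    # loss; every other class keeps a weight of 1.0.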
    # whitelisted_cls_mask=0.05*(whitelisted_cls_mask*99+np.ones((3862,),dtype=np.float32))
    # print('whitelisted_cls_mask',np.amin(whitelisted_cls_mask))
    for i in range(num_towers):
        # For some reason these 'with' statements can't be combined onto the same
        # line. They have to be nested.
        with tf.device(device_string % i):
            with (tf.variable_scope(("tower"), reuse=True if i > 0 else
                                    None)):  # reuse=True if i > 0 else None
                with (slim.arg_scope(
                    [slim.model_variable, slim.variable],
                        device="/cpu:0" if num_gpus != 1 else "/gpu:0")):
                    result = model.create_model(tower_inputs[i],
                                                num_frames=tower_num_frames[i],
                                                vocab_size=reader.num_classes,
                                                labels=tower_labels[i])
                    for variable in slim.get_model_variables():
                        tf.summary.histogram(variable.op.name, variable)
                    # print('result predictions',result["predictions"])
                    predictions = result["predictions"]
                    tower_predictions.append(predictions)

                    if "loss" in result.keys():
                        label_loss = result["loss"]
                    else:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions,
                            tower_labels[i],
                            label_weights=whitelisted_cls_mask)
                        if "aux_predictions" in result.keys():
                            for pred in result["aux_predictions"]:
                                label_loss += label_loss_fn.calculate_loss(
                                    pred,
                                    tower_labels[i],
                                    label_weights=whitelisted_cls_mask)
                    # print('label_loss',label_loss)
                    if "regularization_loss" in result.keys():
                        reg_loss = result["regularization_loss"]
                    else:
                        reg_loss = tf.constant(0.0)

                    reg_losses = tf.losses.get_regularization_losses()
                    if reg_losses:
                        reg_loss += tf.add_n(reg_losses)

                    tower_reg_losses.append(reg_loss)

                    # Adds update_ops (e.g., moving average updates in batch normalization) as
                    # a dependency to the train_op.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if "update_ops" in result.keys():
                        update_ops += result["update_ops"]
                    if update_ops:
                        with tf.control_dependencies(update_ops):
                            barrier = tf.no_op(name="gradient_barrier")
                            with tf.control_dependencies([barrier]):
                                label_loss = tf.identity(label_loss)

                    tower_label_losses.append(label_loss)

                    # Incorporate the L2 weight penalties etc.
                    final_loss = regularization_penalty * reg_loss + label_loss
                    gradients = optimizer.compute_gradients(
                        final_loss, colocate_gradients_with_ops=False)
                    tower_gradients.append(gradients)
    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    tf.summary.scalar("label_loss", label_loss)
    if regularization_penalty != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)
    merged_gradients = utils.combine_gradients(tower_gradients)

    if clip_gradient_norm > 0:
        with tf.name_scope("clip_grads"):
            merged_gradients = utils.clip_gradient_norms(
                merged_gradients, clip_gradient_norm)

    train_op = optimizer.apply_gradients(merged_gradients,
                                         global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", train_op)
Example 6
def build_graph(reader,
                model,
                train_data_pattern,
                train_data_pattern2,
                train_data_pattern3,
                eval_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None,
                l2_penalty=1e-8,
                gpu_only=1):
    """Creates the Tensorflow graph.

  This will only be called once in the life of
  a training model, because after the graph is created the model will be
  restored from a meta graph file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
           from BaseModel.
    train_data_pattern: glob path to the training data files.
    train_data_pattern2: glob path to a second set of training data files.
    train_data_pattern3: glob path to a third set of training data files.
    eval_data_pattern: glob path to the evaluation data files used to monitor
                out-of-sample performance.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
                from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
                            compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.
  """
    # data files
    files1 = gfile.Glob(train_data_pattern)
    files2 = gfile.Glob(train_data_pattern2)
    files3 = gfile.Glob(train_data_pattern3)
    files = files1 + files2 + files3
    if not files:
        raise IOError("Unable to find training files. data_pattern='" +
                      train_data_pattern + "'.")
    logging.info("Total number of training files: %s + %s + %s =  %s.",
                 str(len(files1)), str(len(files2)), str(len(files3)),
                 str(len(files)))

    files4 = gfile.Glob(eval_data_pattern)
    logging.info("Total number of eval files: %s.", str(len(files4)))

    if FLAGS.fold == -1:
        validate_files = files4
        train_files = files
    else:
        validate_files = files[FLAGS.fold::5]
        train_files = [x for x in files if x not in validate_files]

    logging.info("train files: {}, first is: {}.".format(
        len(train_files), train_files[0].split('/')[-1]))
    logging.info("eval files: {}, first is: {}.".format(
        len(validate_files), validate_files[0].split('/')[-1]))

    # label weights for loss function. ugly hard coded for now.
    wgts_np = np.ones(FLAGS.truncated_num_classes)
    over_weight_labels = False
    if over_weight_labels:
        labels_to_overwgt = [
            38, 47, 49, 55, 72, 76, 86, 89, 93, 94, 95, 98, 99, 101, 102, 110,
            111, 113, 114, 115, 120, 121
        ]
        wgts_np[labels_to_overwgt] = 2.0
    wgts_4_lossfn = tf.constant(wgts_np, dtype=tf.float32)

    global_step = tf.Variable(0, trainable=False, name="global_step")
    restart_learning_rate = tf.Variable(base_learning_rate,
                                        trainable=False,
                                        name="restart_learning_rate")

    local_device_protos = device_lib.list_local_devices()
    gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = '/gpu:%d'
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = '/cpu:%d'

    learning_rate = tf.train.exponential_decay(restart_learning_rate,
                                               global_step * batch_size *
                                               num_towers,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)
    unused_video_id, model_input_raw, labels_batch, num_frames = (
        get_input_data_tensors(reader,
                               train_files,
                               batch_size=batch_size * num_towers,
                               num_readers=num_readers,
                               num_epochs=num_epochs))
    tf.summary.histogram("model/input_raw", model_input_raw)

    # model params
    # Probabilities of keeping a neuron in each layer (up to 10 layers assumed);
    # the defaults below keep everything.
    with tf.variable_scope("tower", reuse=True) as scope:
        layers_keep_probs = tf.Variable(
            [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            trainable=False,
            name="layers_keep_probs")
    model_input = model_input_raw
    if FLAGS.apply_global_normalization:
        g_mean, g_std = model_utils.load_global_moments()
        g_inv_std = 1.0 / g_std
        global_mean = tf.constant(g_mean, dtype=tf.float32)
        # expand global mean to match new dimension and fill rest with zeros
        new_dim = tf.cast(model_input_raw.shape[1], tf.int32)
        zero_padding = tf.zeros(new_dim - tf.shape(global_mean), tf.float32)
        global_mean_padded = tf.concat([global_mean, zero_padding], 0)
        # expand global inv std to match new dimension and fill rest with ones
        global_inv_std = tf.constant(g_inv_std, dtype=tf.float32)
        one_padding = tf.ones(new_dim - tf.shape(global_inv_std), tf.float32)
        global_inv_std_padded = tf.concat([global_inv_std, one_padding], 0)
        # apply normalizations (can do both) if requested
        # global L2 normalization
        model_input = tf.multiply(tf.subtract(model_input, global_mean_padded),
                                  global_inv_std_padded)
    # regular L2 normalization
    if FLAGS.apply_batch_l2_normalization:
        feature_dim = len(model_input.get_shape()) - 1
        model_input = tf.nn.l2_normalize(model_input, feature_dim)

    tower_inputs = tf.split(model_input, num_towers)
    tower_labels = tf.split(labels_batch, num_towers)
    tower_num_frames = tf.split(num_frames, num_towers)
    tower_gradients = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []

    # eval graph - to monitor performance out of sample during training
    e_video_id, e_input_raw, e_labels_batch, e_num_frames = (
        get_input_data_tensors(reader,
                               validate_files,
                               batch_size=batch_size * num_towers,
                               num_readers=num_readers,
                               num_epochs=2 * num_epochs))
    e_input = e_input_raw
    if FLAGS.apply_global_normalization:
        e_input = tf.multiply(tf.subtract(e_input, global_mean_padded),
                              global_inv_std_padded)
    if FLAGS.apply_batch_l2_normalization:
        feature_dim = len(model_input.get_shape()) - 1
        e_input = tf.nn.l2_normalize(e_input, feature_dim)

    e_tower_inputs = tf.split(e_input, num_towers)
    e_tower_labels = tf.split(e_labels_batch, num_towers)
    e_tower_num_frames = tf.split(e_num_frames, num_towers)
    e_tower_predictions = []
    e_tower_layers_keep_probs = tf.Variable(
        [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        trainable=False,
        name="layers_keep_probs")
    logging.info(e_tower_inputs)
    # end eval
    for i in range(num_towers):
        # For some reason these 'with' statements can't be combined onto the same
        # line. They have to be nested.
        logging.info('For tower: ' + str(i))
        with tf.device(device_string % i):
            with (tf.variable_scope(("tower"), reuse=True if i > 0 else None)):
                with (slim.arg_scope(
                    [slim.model_variable, slim.variable],
                        device="/cpu:0" if num_gpus != 1 else "/gpu:0")):
                    logging.info(layers_keep_probs)
                    result = model.create_model(
                        tower_inputs[i],
                        num_frames=tower_num_frames[i],
                        vocab_size=reader.num_classes,
                        labels=tower_labels[i],
                        layers_keep_probs=layers_keep_probs,
                        l2_penalty=l2_penalty,
                        is_training=True)
                    for variable in slim.get_model_variables():
                        logging.info(variable)
                        tf.summary.histogram(variable.op.name, variable)

                    # create shadow moving average model variables
                    if FLAGS.use_ema:
                        model_vars = [x for x in slim.get_model_variables()]
                        ema = tf.train.ExponentialMovingAverage(
                            decay=1.0 - 1.0 / FLAGS.ema_halflife)
                        ema_op = ema.apply(model_vars)
                        logging.info("model_vars:")
                        logging.info(" || ".join([str(x) for x in model_vars]))
                        ema_vars = [ema.average(x) for x in model_vars]
                        ema_vars_pair_dict = {
                            ema.average_name(x): x.op.name
                            for x in model_vars
                        }
                        logging.info("ema_vars_pair_dict:")
                        for x, y in ema_vars_pair_dict.items():
                            logging.info(x + ': ' + y)
                        for v in ema_vars:
                            tf.summary.histogram(v.op.name, v)
                        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)
                        tf.add_to_collection("ema_op", ema_op)

                    predictions = result["predictions"]
                    tower_predictions.append(predictions)

                    if "loss" in result.keys():
                        label_loss = result["loss"]
                    else:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions, tower_labels[i], FLAGS.loss_epsilon)

                    if "regularization_loss" in result.keys():
                        reg_loss = result["regularization_loss"]
                    else:
                        reg_loss = tf.constant(0.0)

                    reg_losses = tf.losses.get_regularization_losses()
                    if reg_losses:
                        reg_loss += tf.add_n(reg_losses)

                    tower_reg_losses.append(reg_loss)

                    # Adds update_ops (e.g., moving average updates in batch normalization) as
                    # a dependency to the train_op.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if "update_ops" in result.keys():
                        update_ops += result["update_ops"]
                    if update_ops:
                        with tf.control_dependencies(update_ops):
                            barrier = tf.no_op(name="gradient_barrier")
                            with tf.control_dependencies([barrier]):
                                label_loss = tf.identity(label_loss)

                    tower_label_losses.append(label_loss)

                    # Incorporate the L2 weight penalties etc.
                    final_loss = regularization_penalty * reg_loss + label_loss
                    gradients = optimizer.compute_gradients(
                        final_loss, colocate_gradients_with_ops=False)
                    tower_gradients.append(gradients)

                    # eval ops
                    logging.info("eval ops")
                    e_result = model.create_model(
                        e_tower_inputs[i],
                        num_frames=e_tower_num_frames[i],
                        vocab_size=reader.num_classes,
                        labels=e_tower_labels[i],
                        layers_keep_probs=
                        e_tower_layers_keep_probs,  #tf.Variable([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], tf.float32, name="layers_keep_probs")
                        l2_penalty=l2_penalty,
                        is_training=False)

                    e_predictions = e_result["predictions"]
                    e_tower_predictions.append(e_predictions)
                    # end eval ops

    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    tf.summary.scalar("label_loss", label_loss)
    if regularization_penalty != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)
    merged_gradients = utils.combine_gradients(tower_gradients)

    if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
            merged_gradients = utils.clip_gradient_norms(
                merged_gradients, clip_gradient_norm)

    train_op = optimizer.apply_gradients(merged_gradients,
                                         global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("restart_learning_rate", restart_learning_rate)
    tf.add_to_collection("layers_keep_probs", layers_keep_probs)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", train_op)
    #tf.add_to_collection("ema_op", ema_op)

    # add eval graph
    e_label_loss = label_loss_fn.calculate_loss(
        tf.concat(e_tower_predictions, 0), e_labels_batch, FLAGS.loss_epsilon)
    tf.summary.scalar("e_label_loss", e_label_loss)

    tf.add_to_collection("e_predictions", tf.concat(e_tower_predictions, 0))
    tf.add_to_collection("e_labels", tf.cast(e_labels_batch, tf.float32))
    tf.add_to_collection("e_loss", e_label_loss)
Example 7
def model_fn(features, labels, mode, params):

    is_training = mode == learn.ModeKeys.TRAIN
    optimizer_class = find_class_by_name(params.optimizer, [tf.train])
    label_loss_fn = find_class_by_name(params.label_loss, [losses])()
    model = find_class_by_name(params.model,
                               [frame_level_models, video_level_models])()

    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.exponential_decay(
        params.base_learning_rate,
        global_step * params.batch_size * params.num_towers,
        params.learning_rate_decay_examples,
        params.learning_rate_decay,
        staircase=True,
    )

    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)

    tf.summary.histogram("model/input_raw", features['model_input'])

    feature_dim = len(features['model_input'].get_shape()) - 1

    model_input = tf.nn.l2_normalize(features['model_input'], feature_dim)

    tower_inputs = tf.split(model_input, params.num_towers)

    if mode == learn.ModeKeys.INFER:
        # ***
        #  this is a quick hack so that the existing model_fn code,
        #  taken from train.py, doesn't break in inference (or serving) mode.
        #  Normally, we would write model_fn such that the 'labels' input arg
        #  can be None in inference mode, but this existing model code was not written this
        #  way.  See the serving_input_fn() defined below, to see where 'labels_batch'
        # is added to the features dict, just to make this code work properly
        labels = features['labels_batch']

    tower_labels = tf.split(labels, params.num_towers)

    tower_num_frames = tf.split(features['num_frames'], params.num_towers)
    tower_gradients = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []

    for i in range(params.num_towers):
        # For some reason these 'with' statements can't be combined onto the same
        # line. They have to be nested.
        with tf.device(params.device_string % i):
            with (tf.variable_scope(("tower"), reuse=True if i > 0 else None)):
                with (slim.arg_scope([slim.model_variable, slim.variable],
                                     device="/cpu:0"
                                     if params.num_gpus != 1 else "/gpu:0")):
                    result = model.create_model(
                        tower_inputs[i],
                        num_frames=tower_num_frames[i],
                        vocab_size=params.reader.num_classes,
                        labels=tower_labels[i],
                        is_training=is_training)
                    for variable in slim.get_model_variables():
                        tf.summary.histogram(variable.op.name, variable)

                    predictions = result["predictions"]

                    tower_predictions.append(predictions)

                    if "loss" in result.keys():
                        label_loss = result["loss"]
                    else:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions, tower_labels[i])

                    if "regularization_loss" in result.keys():
                        reg_loss = result["regularization_loss"]
                    else:
                        reg_loss = tf.constant(0.0)

                    reg_losses = tf.losses.get_regularization_losses()
                    if reg_losses:
                        reg_loss += tf.add_n(reg_losses)

                    tower_reg_losses.append(reg_loss)

                    # Adds update_ops (e.g., moving average updates in batch normalization) as
                    # a dependency to the train_op.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if "update_ops" in result.keys():
                        update_ops += result["update_ops"]
                    if update_ops:
                        with tf.control_dependencies(update_ops):
                            barrier = tf.no_op(name="gradient_barrier")
                            with tf.control_dependencies([barrier]):
                                label_loss = tf.identity(label_loss)

                    tower_label_losses.append(label_loss)

                    final_loss = params.regularization_penalty * reg_loss + label_loss
                    gradients = optimizer.compute_gradients(
                        final_loss, colocate_gradients_with_ops=False)
                    tower_gradients.append(gradients)

    pred_dict = {}
    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    predictions = tf.concat(tower_predictions, 0)
    pred_dict['predictions'] = predictions
    tf.summary.scalar("label_loss", label_loss)
    if params.regularization_penalty != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)

    if is_training:
        # Incorporate the L2 weight penalties, etc.

        merged_gradients = utils.combine_gradients(tower_gradients)
        if params.clip_gradient_norm > 0:
            with tf.name_scope('clip_grads'):
                merged_gradients = utils.clip_gradient_norms(
                    merged_gradients, params.clip_gradient_norm)
        train_op = optimizer.apply_gradients(merged_gradients,
                                             global_step=global_step)
    else:
        train_op = None

    eval_metric_ops = {}
    if mode == learn.ModeKeys.EVAL or is_training:

        eval_metric_ops['hit_at_one'] = metrics.streaming_mean(
            tf.py_func(
                lambda x, y: np.float32(eval_util.calculate_hit_at_one(x, y)),
                [predictions, labels],
                tf.float32,
                stateful=False,
            ))
        eval_metric_ops['perr'] = metrics.streaming_mean(
            tf.py_func(
                lambda x, y: np.float32(
                    eval_util.calculate_precision_at_equal_recall_rate(x, y)),
                [predictions, labels],
                tf.float32,
                stateful=False,
            ))
        eval_metric_ops['gap'] = metrics.streaming_mean(
            tf.py_func(
                lambda x, y: np.float32(eval_util.calculate_gap(x, y)),
                [predictions, labels],
                tf.float32,
                stateful=False,
            ))

    top_predictions, top_indices = tf.nn.top_k(predictions,
                                               _TOP_PREDICTIONS_IN_OUTPUT)

    pred_dict['top_predictions'] = top_predictions
    pred_dict['top_indices'] = top_indices

    # Add eval summaries and update ops for training.
    for key, val in eval_metric_ops.items():
        tf.summary.scalar(key, val[0])  # create a summary for each eval op
        # Add each metric's update op to the UPDATE_OPS collection so that it
        # runs with every train_op call.
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, val[1])

    #  tf.add_to_collection("global_step", global_step)
    #  tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    #  tf.add_to_collection("input_batch_raw", model_input_raw)
    #  tf.add_to_collection("input_batch", model_input)
    #  tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels, tf.float32))
    #  tf.add_to_collection("train_op", train_op)
    tf.summary.scalar("loss", label_loss)

    export_outputs = {
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
        tf.estimator.export.PredictOutput(pred_dict)
    }

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=pred_dict,
                                      loss=label_loss,
                                      train_op=train_op,
                                      export_outputs=export_outputs,
                                      eval_metric_ops=eval_metric_ops)
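
# A minimal sketch (not part of the original example) of how a model_fn like
# the one above could be wired into tf.estimator. ModelParams mirrors the
# params fields read by model_fn; the concrete values, the "LogisticModel"
# name, and train_input_fn (an input_fn returning the expected features/labels)
# are illustrative assumptions, and reader must expose num_classes.
import tensorflow as tf

class ModelParams(object):
    optimizer = "AdamOptimizer"
    label_loss = "CrossEntropyLoss"
    model = "LogisticModel"
    base_learning_rate = 0.01
    learning_rate_decay_examples = 1000000
    learning_rate_decay = 0.95
    batch_size = 1024
    num_towers = 1
    num_gpus = 0
    device_string = "/cpu:%d"
    regularization_penalty = 1.0
    clip_gradient_norm = 1.0
    reader = None  # must be set to a reader exposing num_classes before use

def make_estimator(model_dir, reader, train_input_fn, max_steps=100000):
    ModelParams.reader = reader
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=model_dir,
                                       params=ModelParams)
    estimator.train(input_fn=train_input_fn, max_steps=max_steps)
    return estimator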
Ejemplo n.º 8
0
def build_graph(reader,
                generator_model,
                discriminator_model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the Tensorflow graph.

  This will only be called once in the life of
  a training model, because after the graph is created the model will be
  restored from a meta graph file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    generator_model: The core model for generator. It should inherit from
                     BaseModel.
    discriminator_model: The core model for discriminator. It should inherit from
                         BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
                from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
                            compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.
  """

    global_step = tf.Variable(0, trainable=False, name="global_step")

    gpus = get_gpus()
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = '/gpu:%d'
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = '/cpu:%d'

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * batch_size *
                                               num_towers,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)

    model_input_raw, _ = (get_input_data_tensors(reader,
                                                 train_data_pattern,
                                                 batch_size=batch_size *
                                                 num_towers,
                                                 num_readers=num_readers,
                                                 num_epochs=num_epochs))
    tf.summary.histogram("model/input_raw", model_input_raw)
    model_input = model_input_raw

    noise_input = tf.placeholder(
        tf.float32, shape=[None, random_noise_generator.get_dim()])

    image_width, image_height = reader.get_image_size()

    tower_inputs = tf.split(model_input, num_towers)
    tower_noise_input = tf.split(noise_input, num_towers)
    tower_D_gradients = []
    tower_G_gradients = []
    tower_generated_images = []
    tower_predictions_for_fake = []
    tower_predictions_for_real = []
    tower_D_losses = []
    tower_G_losses = []

    for i in range(num_towers):
        # For some reason these 'with' statements can't be combined onto the same
        # line. They have to be nested.
        with tf.device(device_string % i):
            with (tf.variable_scope(("tower"), reuse=True if i > 0 else None)):
                with (slim.arg_scope(
                    [slim.model_variable, slim.variable],
                        device="/cpu:0" if num_gpus != 1 else "/gpu:0")):
                    generator_model.create_model(image_width * image_height)
                    discriminator_model.create_model(image_width *
                                                     image_height)

                    generated_result = generator_model.run_model(
                        tower_noise_input[i])
                    generated_images = generated_result["output"]

                    generated_images_shaped = tf.reshape(
                        generated_images, [-1, image_height, image_width, 1])
                    tf.summary.image('generated_images',
                                     generated_images_shaped, 10)
                    tower_generated_images.append(generated_images)

                    result_from_fake = discriminator_model.run_model(
                        generated_images)
                    result_from_real = discriminator_model.run_model(
                        tower_inputs[i])
                    for variable in slim.get_model_variables():
                        tf.summary.histogram(variable.op.name, variable)

                    predictions_for_fake = result_from_fake["predictions"]
                    predictions_for_real = result_from_real["predictions"]
                    tower_predictions_for_fake.append(predictions_for_fake)
                    tower_predictions_for_real.append(predictions_for_real)

                    logits_for_fake = result_from_fake["logits"]
                    logits_for_real = result_from_real["logits"]
                    D_loss_fake = label_loss_fn.calculate_loss(
                        logits_for_fake, tf.zeros_like(logits_for_fake))
                    D_loss_real = label_loss_fn.calculate_loss(
                        logits_for_real, tf.ones_like(logits_for_real))
                    D_loss = D_loss_fake + D_loss_real
                    tower_D_losses.append(D_loss)

                    G_loss = label_loss_fn.calculate_loss(
                        logits_for_fake, tf.ones_like(logits_for_fake))
                    tower_G_losses.append(G_loss)

                    D_var = discriminator_model.get_variables()
                    D_gradients = optimizer.compute_gradients(D_loss,
                                                              var_list=D_var)
                    tower_D_gradients.append(D_gradients)

                    G_var = generator_model.get_variables()
                    G_gradients = optimizer.compute_gradients(G_loss,
                                                              var_list=G_var)
                    tower_G_gradients.append(G_gradients)

    D_loss = tf.reduce_mean(tf.stack(tower_D_losses))
    G_loss = tf.reduce_mean(tf.stack(tower_G_losses))
    tf.summary.scalar("D_loss", D_loss)
    tf.summary.scalar("G_loss", G_loss)
    merged_D_gradients = utils.combine_gradients(tower_D_gradients)
    merged_G_gradients = utils.combine_gradients(tower_G_gradients)

    if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
            merged_D_gradients = utils.clip_gradient_norms(
                merged_D_gradients, clip_gradient_norm)
            merged_G_gradients = utils.clip_gradient_norms(
                merged_G_gradients, clip_gradient_norm)

    # Attach global_step to only one of the two apply_gradients calls so that
    # it is incremented exactly once per training step.
    D_train_op = optimizer.apply_gradients(merged_D_gradients)
    G_train_op = optimizer.apply_gradients(merged_G_gradients,
                                           global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("D_loss", D_loss)
    tf.add_to_collection("G_loss", G_loss)
    tf.add_to_collection("p_for_fake", tf.concat(tower_predictions_for_fake,
                                                 0))
    tf.add_to_collection("p_for_data", tf.concat(tower_predictions_for_real,
                                                 0))
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("generated_images",
                         tf.concat(tower_generated_images, 0))
    tf.add_to_collection("D_train_op", D_train_op)
    tf.add_to_collection("G_train_op", G_train_op)
    tf.add_to_collection("noise_input_placeholder", noise_input)
Ejemplo n.º 9
0
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):

    global_step = tf.Variable(0, trainable=False, name="global_step")

    local_device_protos = device_lib.list_local_devices()
    gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = '/gpu:%d'
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = '/cpu:%d'

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * batch_size *
                                               num_towers,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)
    unused_video_id, model_input_raw, labels_batch, num_frames = (
        get_input_data_tensors(reader,
                               train_data_pattern,
                               batch_size=batch_size * num_towers,
                               num_readers=num_readers,
                               num_epochs=num_epochs))
    tf.summary.histogram("model/input_raw", model_input_raw)

    feature_dim = len(model_input_raw.get_shape()) - 1

    model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

    tower_inputs = tf.split(model_input, num_towers)
    tower_labels = tf.split(labels_batch, num_towers)
    tower_num_frames = tf.split(num_frames, num_towers)
    tower_gradients = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []
    for i in range(num_towers):
        # For some reason these 'with' statements can't be combined onto the same
        # line. They have to be nested.
        with tf.device(device_string % i):
            with (tf.variable_scope(("tower"), reuse=True if i > 0 else None)):
                with (slim.arg_scope(
                    [slim.model_variable, slim.variable],
                        device="/cpu:0" if num_gpus != 1 else "/gpu:0")):
                    result = model.create_model(tower_inputs[i],
                                                num_frames=tower_num_frames[i],
                                                vocab_size=reader.num_classes,
                                                labels=tower_labels[i])
                    for variable in slim.get_model_variables():
                        tf.summary.histogram(variable.op.name, variable)

                    predictions = result["predictions"]
                    tower_predictions.append(predictions)

                    if "loss" in result.keys():
                        label_loss = result["loss"]
                    else:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions, tower_labels[i])

                    if "regularization_loss" in result.keys():
                        reg_loss = result["regularization_loss"]
                    else:
                        reg_loss = tf.constant(0.0)

                    reg_losses = tf.losses.get_regularization_losses()
                    if reg_losses:
                        reg_loss += tf.add_n(reg_losses)

                    tower_reg_losses.append(reg_loss)

                    # Adds update_ops (e.g., moving average updates in batch normalization) as
                    # a dependency to the train_op.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if "update_ops" in result.keys():
                        update_ops += result["update_ops"]
                    if update_ops:
                        with tf.control_dependencies(update_ops):
                            barrier = tf.no_op(name="gradient_barrier")
                            with tf.control_dependencies([barrier]):
                                label_loss = tf.identity(label_loss)

                    tower_label_losses.append(label_loss)

                    # Incorporate the L2 weight penalties etc.
                    final_loss = regularization_penalty * reg_loss + label_loss
                    gradients = optimizer.compute_gradients(
                        final_loss, colocate_gradients_with_ops=False)
                    tower_gradients.append(gradients)
    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    tf.summary.scalar("label_loss", label_loss)
    if regularization_penalty != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)
    merged_gradients = utils.combine_gradients(tower_gradients)

    if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
            merged_gradients = utils.clip_gradient_norms(
                merged_gradients, clip_gradient_norm)

    train_op = optimizer.apply_gradients(merged_gradients,
                                         global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", train_op)
Ejemplo n.º 10
0
    def run(self, start_new_model=False):
        """Performs training on the currently defined Tensorflow graph.

    Returns:
      A tuple of the training Hit@1 and the training PERR.
    """
        if self.is_master and start_new_model:
            self.remove_training_directory(self.train_dir)

        target, device_fn = self.start_server_if_distributed()

        meta_filename = []
        for filename in self.train_dir.split(','):
            logging.info("filename:%s", str(filename))
            meta_filename.append(
                self.get_meta_filename(start_new_model, filename))

        label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])()
        optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train])

        local_device_protos = device_lib.list_local_devices()
        gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
        num_gpus = len(gpus)

        if num_gpus > 0:
            logging.info("Using the following GPUs to train: " + str(gpus))
            num_towers = num_gpus
            device_string = '/gpu:%d'
        else:
            logging.info("No GPUs found. Training on CPU.")
            num_towers = 1
            device_string = '/cpu:%d'
        # build_graph_retrain(
        #     reader=self.reader,
        #     model=self.model,
        #     train_data_pattern=FLAGS.train_data_pattern,
        #     label_loss_fn=label_loss_fn,
        #     num_readers=FLAGS.num_readers,
        #     batch_size=FLAGS.batch_size)

        # with tf.variable_scope("net2"):

        ####

        global_step = tf.Variable(0, trainable=False, name="global_step")
        learning_rate = tf.train.exponential_decay(
            FLAGS.base_learning_rate,
            global_step * FLAGS.batch_size * num_towers,
            FLAGS.learning_rate_decay_examples,
            FLAGS.learning_rate_decay,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)
        video_id_batch, model_input_raw, labels_batch, num_frames = get_input_data_tensors(
            # pylint: disable=g-line-too-long
            self.reader,
            FLAGS.train_data_pattern,
            batch_size=FLAGS.batch_size,
            num_readers=FLAGS.num_readers)
        tf.summary.histogram("model_input_raw", model_input_raw)

        feature_dim = len(model_input_raw.get_shape()) - 1

        # Normalize input features.
        model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)
        # with tf.variable_scope("net1"):
        with tf.variable_scope("tower"):

            result1 = self.model[0].create_model(
                model_input,
                num_frames=num_frames,
                vocab_size=self.reader.num_classes,
                is_training=False)
            #####

            result1 = tf.stop_gradient(result1)
            result2 = self.model[1].create_model(
                model_input,
                num_frames=num_frames,
                vocab_size=self.reader.num_classes,
                labels=labels_batch,
                is_training=False)
            result2 = tf.stop_gradient(result2)
            all_vars = tf.global_variables()
            # for v in all_vars:
            #   print v.name
            # for i in v_vars:
            #   logging.info(str(i))
            # Split the global variable list at the first RNN variable so the
            # two source checkpoints can be restored into disjoint subsets below.
            for i, v in enumerate(all_vars):
                logging.info(str(v.name))
                if 'rnn' in v.name:
                    vars1 = all_vars[:i]
                    vars2 = all_vars[i:]
                    break
            # v_vars0 = [v for v in all_vars if v.name == 'tower/input_bn/beta:0'
            #           or v.name == 'tower/input_bn/gamma:0'
            #           or v.name == 'tower/input_bn/beta:0'
            #           or v.name == 'tower/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/biases:0']
            # v_vars = [v for v in all_vars if v.name == 'tower/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/weights:0'
            #           or v.name == 'tower/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/biases:0'
            #           or v.name == 'tower/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/weights:0'
            #           or v.name == 'tower/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/biases:0']

            result1 = tf.nn.l2_normalize(result1, dim=1)
            result2 = tf.nn.l2_normalize(result2, dim=1)
            embeddings = tf.concat([result1, result2], axis=1)
            model_concat = find_class_by_name('MoeModel',
                                              [video_level_models])()
            result = model_concat.create_model(
                embeddings, vocab_size=self.reader.num_classes, num_mixtures=4)
            predictions = result["predictions"]
            # predictions=(result1["predictions"]+result2["predictions"])/2
            tf.summary.histogram("model_activations", predictions)
            # if "loss" in result.keys():
            #   label_loss = result["loss"]
            # else:
            label_loss = label_loss_fn.calculate_loss(predictions,
                                                      labels_batch)
            tf.summary.scalar("label_loss", label_loss)
            if "regularization_loss" in result.keys():
                reg_loss = result["regularization_loss"]
            reg_losses = tf.losses.get_regularization_losses()
            if "regularization_loss" in result.keys():
                reg_loss = result["regularization_loss"]
            else:
                reg_loss = tf.constant(0.0)
            final_loss = FLAGS.regularization_penalty * reg_loss + label_loss

            optimizer = optimizer_class(learning_rate)
            gradients = optimizer.compute_gradients(
                final_loss, colocate_gradients_with_ops=False)

            with tf.name_scope('clip_grads'):
                merged_gradients = utils.clip_gradient_norms(gradients, 1.0)
            train_op = optimizer.apply_gradients(merged_gradients,
                                                 global_step=global_step)

            tf.add_to_collection("global_step", global_step)
            tf.add_to_collection("loss", label_loss)
            tf.add_to_collection("predictions", predictions)
            tf.add_to_collection("input_batch", model_input)
            tf.add_to_collection("video_id_batch", video_id_batch)
            tf.add_to_collection("num_frames", num_frames)
            tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
            tf.add_to_collection("summary_op", tf.summary.merge_all())
            tf.add_to_collection("train_op", train_op)

            video_id_batch = tf.get_collection("video_id_batch")[0]
            prediction_batch = tf.get_collection("predictions")[0]
            label_batch = tf.get_collection("labels")[0]
            loss = tf.get_collection("loss")[0]
            summary_op = tf.get_collection("summary_op")[0]
            # saver = tf.train.Saver(tf.global_variables())
            # saver=tf.train.Saver(result1)
            summary_writer = tf.summary.FileWriter(
                FLAGS.ensemble_dir, graph=tf.get_default_graph())

            config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
            config.gpu_options.allow_growth = True

            with tf.Session(config=config) as sess:
                train_dirs = FLAGS.train_dir.split(',')
                latest_checkpoint0 = tf.train.latest_checkpoint(train_dirs[0])
                latest_checkpoint1 = tf.train.latest_checkpoint(train_dirs[1])
                sess.run(tf.global_variables_initializer())

                if latest_checkpoint0:
                    logging.info("Loading checkpoint for eval: " +
                                 latest_checkpoint0)
                    saver1 = tf.train.Saver(vars1)

                    saver1.restore(sess, latest_checkpoint0)

                if latest_checkpoint1:
                    saver2 = tf.train.Saver(vars2)
                    logging.info("Loading checkpoint for eval: " +
                                 latest_checkpoint1)

                    saver2.restore(sess, latest_checkpoint1)

                saver = tf.train.Saver()
                fetches = [
                    learning_rate, global_step, train_op, video_id_batch,
                    prediction_batch, label_batch, loss, summary_op
                ]

                coord = tf.train.Coordinator()

                threads = []
                for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
                    threads.extend(
                        qr.create_threads(sess,
                                          coord=coord,
                                          daemon=True,
                                          start=True))

                while not coord.should_stop():
                    # batch_start_time = time.time()
                    learning_rate_val, global_step_val, _, vid_val, predictions_val, labels_val, loss_val, summary_val = sess.run(
                        fetches)
                    # hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val)
                    # perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
                    #                                                           labels_val)
                    # gap = eval_util.calculate_gap(predictions_val, labels_val)
                    # logging.info( "training step " + str(global_step_val)+" | Loss: " + ("%.2f" % loss_val) +" | Hit@1: " +
                    #              ("%.4f" % hit_at_one) + " PERR: " + ("%.4f" % perr) +
                    #              " GAP: " + ("%.4f" % gap))

                    if self.is_master and global_step_val % self.disp_batches == 0 and self.train_dir:
                        eval_start_time = time.time()
                        hit_at_one = eval_util.calculate_hit_at_one(
                            predictions_val, labels_val)
                        perr = eval_util.calculate_precision_at_equal_recall_rate(
                            predictions_val, labels_val)
                        gap = eval_util.calculate_gap(predictions_val,
                                                      labels_val)
                        eval_end_time = time.time()
                        eval_time = eval_end_time - eval_start_time
                        logging.info("training step " + str(global_step_val) +
                                     "| learning rate: " +
                                     ("%.4f" % learning_rate_val) +
                                     " | Loss: " + ("%.2f" % loss_val) +
                                     " | Hit@1: " + ("%.4f" % hit_at_one) +
                                     " PERR: " + ("%.4f" % perr) + " GAP: " +
                                     ("%.4f" % gap))
                        summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Hit@1",
                                              hit_at_one), global_step_val)
                        summary_writer.add_summary(
                            utils.MakeSummary("model/Training_Perr", perr),
                            global_step_val)
                        summary_writer.add_summary(
                            utils.MakeSummary("model/Training_GAP", gap),
                            global_step_val)
                        summary_writer.add_summary(
                            utils.MakeSummary("model/loss", loss_val),
                            global_step_val)
                        summary_writer.add_summary(
                            utils.MakeSummary("model/lr", learning_rate_val),
                            global_step_val)
                        summary_writer.flush()
                        if global_step_val % FLAGS.export_model_steps == 0:
                            saver.save(sess,
                                       FLAGS.ensemble_dir,
                                       global_step=global_step_val)

                coord.request_stop()
                coord.join(threads, stop_grace_period_secs=10)
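
# A minimal sketch (not part of the original example) of the partial-restore
# pattern used above: load different subsets of variables from two checkpoint
# directories by giving each tf.train.Saver its own var_list. The directory
# arguments and the name-based split predicate are illustrative; the example
# above instead splits the variable list at the first 'rnn' variable.
import tensorflow as tf

def restore_from_two_checkpoints(sess, dir_a, dir_b):
    all_vars = tf.global_variables()
    vars_a = [v for v in all_vars if "rnn" not in v.name]
    vars_b = [v for v in all_vars if "rnn" in v.name]
    ckpt_a = tf.train.latest_checkpoint(dir_a)
    ckpt_b = tf.train.latest_checkpoint(dir_b)
    if ckpt_a:
        tf.train.Saver(vars_a).restore(sess, ckpt_a)
    if ckpt_b:
        tf.train.Saver(vars_b).restore(sess, ckpt_b)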
Ejemplo n.º 11
0
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the Tensorflow graph.

    This will only be called once in the life of
    a training model, because after the graph is created the model will be
    restored from a meta graph file rather than being recreated.

    Args:
      reader: The data file reader. It should inherit from BaseReader.
      model: The core model (e.g. logistic or neural net). It should inherit from
        BaseModel.
      train_data_pattern: glob path to the training data files.
      label_loss_fn: What kind of loss to apply to the model. It should inherit
        from BaseLoss.
      batch_size: How many examples to process at a time.
      base_learning_rate: What learning rate to initialize the optimizer with.
      optimizer_class: Which optimization algorithm to use.
      clip_gradient_norm: Magnitude of the gradient to clip to.
      regularization_penalty: How much weight to give the regularization loss
        compared to the label loss.
      num_readers: How many threads to use for I/O operations.
      num_epochs: How many passes to make over the data. 'None' means an unlimited
        number of passes.
    """

    global_step = tf.Variable(0, trainable=False, name="global_step")

    local_device_protos = device_lib.list_local_devices()
    gpus = [x.name for x in local_device_protos if x.device_type == "GPU"]
    print(gpus)
    gpus = gpus[:FLAGS.num_gpu]
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = "/gpu:%d"
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = "/cpu:%d"

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * batch_size,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar("learning_rate", learning_rate)

    optimizer = optimizer_class(learning_rate)
    input_data_dict = (get_input_data_tensors(reader,
                                              train_data_pattern,
                                              batch_size=batch_size,
                                              num_readers=num_readers,
                                              num_epochs=num_epochs))
    model_input_raw = input_data_dict["video_matrix"]
    labels_batch = input_data_dict["labels"]
    num_frames = input_data_dict["num_frames"]

    print("model_input_shape, ", model_input_raw.shape)
    tf.summary.histogram("model/input_raw", model_input_raw)

    feature_dim = len(model_input_raw.get_shape()) - 1

    # Rescale the 1024 video dimensions: subtract a fixed offset and multiply
    # by the square roots of the PCA eigenvalues loaded from
    # yt8m_pca/eigenvals.npy; the 128 audio dimensions pass through unchanged.
    offset = np.array([4. / 512] * 1024 + [0] * 128)
    offset = tf.constant(offset, dtype=tf.float32)

    eigen_val = tf.constant(np.sqrt(
        np.load("yt8m_pca/eigenvals.npy")[:1024, 0]),
                            dtype=tf.float32)

    model_input = tf.multiply(
        model_input_raw - offset,
        tf.pad(eigen_val + 1e-4, [[0, 128]], constant_values=1.))

    # model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

    if FLAGS.segment_labels:
        label_weights = input_data_dict["label_weights"]
    else:
        label_weights = None
    tower_logits = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []
    print("flag1!!!!", device_string)
    for i in range(num_towers):
        # For some reason these 'with' statements can't be combined onto the same
        # line. They have to be nested.
        with tf.device(device_string % i):
            with tf.variable_scope("tower_%d" % i, reuse=False):
                result = model.create_model(model_input,
                                            num_frames=num_frames,
                                            vocab_size=reader.num_classes,
                                            labels=labels_batch,
                                            is_training=True)
                for variable in slim.get_model_variables():
                    tf.summary.histogram(variable.op.name, variable)

                predictions = result["predictions"]
                tower_predictions.append(predictions)
                logits = result["logits"]
                tower_logits.append(logits)

                if "loss" in result.keys():
                    label_loss = result["loss"]
                else:
                    label_loss = label_loss_fn.calculate_loss(
                        predictions, labels_batch, label_weights=label_weights)
                    if "aux_predictions" in result.keys():
                        for pred in result["aux_predictions"]:
                            label_loss += label_loss_fn.calculate_loss(
                                pred,
                                labels_batch,
                                label_weights=label_weights)

                if "regularization_loss" in result.keys():
                    reg_loss = result["regularization_loss"]
                else:
                    reg_loss = tf.constant(0.0)

                reg_losses = tf.losses.get_regularization_losses()
                if reg_losses:
                    reg_loss += tf.add_n(reg_losses)

                tower_reg_losses.append(reg_loss)

                # Adds update_ops (e.g., moving average updates in batch normalization) as
                # a dependency to the train_op.
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                if "update_ops" in result.keys():
                    update_ops += result["update_ops"]
                if update_ops:
                    with tf.control_dependencies(update_ops):
                        barrier = tf.no_op(name="gradient_barrier")
                        with tf.control_dependencies([barrier]):
                            label_loss = tf.identity(label_loss)
                tower_label_losses.append(label_loss)

    with tf.device("/gpu:%d" % 0):
        with tf.variable_scope("ensemble"):
            ftr_mean = tf.reduce_mean(model_input, axis=1)
            print("ftr mean shape: ", ftr_mean.get_shape().as_list())
            ftr_mean = slim.batch_norm(ftr_mean,
                                       center=True,
                                       scale=True,
                                       fused=False,
                                       is_training=True,
                                       scope="mix_weights_bn")
            mix_weights = slim.fully_connected(
                ftr_mean,
                num_towers,
                activation_fn=None,
                weights_initializer=slim.variance_scaling_initializer(),
                scope="mix_weights")
            mix_weights = tf.nn.softmax(mix_weights, axis=-1)
            tf.summary.histogram("mix_weights", mix_weights)
            logits = tf.stack(tower_logits, axis=1)
            final_logit = tf.reduce_sum(tf.multiply(
                logits, tf.expand_dims(mix_weights, axis=-1)),
                                        axis=1,
                                        keepdims=False)
            final_predictions = tf.nn.sigmoid(final_logit)
        print("flag2!!!", FLAGS.final_temperature, FLAGS.final_lambda)
        rank_pred = tf.expand_dims(tf.nn.softmax(tf.div(
            final_logit, FLAGS.final_temperature),
                                                 axis=-1),
                                   axis=1)
        aux_rank_preds = tf.nn.softmax(tf.div(logits, FLAGS.final_temperature),
                                       axis=-1)
        epsilon = 1e-8
        kl_loss = tf.reduce_sum(
            rank_pred *
            (tf.log(rank_pred + epsilon) - tf.log(aux_rank_preds + epsilon)),
            axis=-1)
        regularization_loss = FLAGS.final_lambda * tf.reduce_mean(
            tf.reduce_sum(kl_loss, axis=-1), axis=-1)

        final_label_loss = label_loss_fn.calculate_loss(
            final_predictions, labels_batch, label_weights=label_weights)

        label_loss = tf.reduce_sum(
            tf.stack(tower_label_losses)) + final_label_loss
        tf.summary.scalar("label_loss", label_loss)
        reg_loss = tf.reduce_sum(
            tf.stack(tower_reg_losses)) + regularization_loss
        tf.summary.scalar("reg_loss", reg_loss)
        final_loss = label_loss + regularization_penalty * reg_loss

    gradients = optimizer.compute_gradients(final_loss,
                                            colocate_gradients_with_ops=True)
    if clip_gradient_norm > 0:
        gradients = utils.clip_gradient_norms(gradients, clip_gradient_norm)
    final_train_op = optimizer.apply_gradients(gradients,
                                               global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", final_predictions)
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", final_train_op)
Ejemplo n.º 12
0
def main():
    env = json.loads(os.environ.get("TF_CONFIG", "{}"))

    task_data = env.get("task", None) or {"type": "master", "index": 0}
    task = type("TaskSpec", (object, ), task_data)

    logging.set_verbosity(tf.logging.INFO)
    logging.info("%s: Tensorflow version: %s.", task_as_string(task),
                 tf.__version__)

    video_ids, video_features, video_labels, video_frames = gen_input(
        data_pattern,
        reader_batch_size=reader_batch_size,
        num_classes=num_classes,
        num_readers=num_readers,
        mini_batch_size=mini_batch_size)

    result = gen_model(model_input=video_features,
                       vocab_size=num_classes,
                       labels=video_labels,
                       num_frames=video_frames)

    predictions = result["predictions"]

    global_step = tf.Variable(0, trainable=False, name="global_step")

    label_loss = label_loss_fn.calculate_loss(predictions, video_labels)

    if "regularization_loss" in result.keys():
        reg_loss = result["regularization_loss"]
    else:
        reg_loss = tf.constant(0.0)

    reg_losses = tf.losses.get_regularization_losses()
    if reg_losses:
        reg_loss += tf.add_n(reg_losses)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if "update_ops" in result.keys():
        update_ops += result["update_ops"]

    if update_ops:
        with tf.control_dependencies(update_ops):
            barrier = tf.no_op(name="gradient_barrier")
            with tf.control_dependencies([barrier]):
                label_loss = tf.identity(label_loss)

    final_loss = regularization_penalty * reg_loss + label_loss

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * mini_batch_size *
                                               num_towers,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)

    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)

    gradients = optimizer.compute_gradients(final_loss,
                                            colocate_gradients_with_ops=False)

    tf.summary.scalar("label_loss", label_loss)

    tf.summary.scalar("reg_loss", reg_loss)

    if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
            gradients = utils.clip_gradient_norms(gradients,
                                                  clip_gradient_norm)

    train_op = optimizer.apply_gradients(gradients, global_step=global_step)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()

        #init_local_op = tf.local_variables_initializer()
        #sess.run(init_local_op)

        coord = tf.train.Coordinator()

        threads = tf.train.start_queue_runners(coord=coord)

        total_step = 0

        try:
            while total_step < 100000:
                batch_start_time = time.time()

                # v_ids, v_features, v_labels, v_frames = sess.run([video_ids, video_features, video_labels, video_frames])

                _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                    [
                        train_op, global_step, label_loss, predictions,
                        tf.cast(video_labels, tf.float32)
                    ])

                seconds_per_batch = time.time() - batch_start_time
                examples_per_second = labels_val.shape[0] / seconds_per_batch

                # if max_steps <= global_step_val:
                #    max_steps_reached = True
                # print(v_features.shape)
                # print(v_ids)

                if total_step % 10 == 0:
                    eval_start_time = time.time()
                    hit_at_one = eval_util.calculate_hit_at_one(
                        predictions_val, labels_val)
                    perr = eval_util.calculate_precision_at_equal_recall_rate(
                        predictions_val, labels_val)
                    gap = eval_util.calculate_gap(predictions_val, labels_val)
                    eval_end_time = time.time()
                    eval_time = eval_end_time - eval_start_time

                    logging.info("training step " + str(global_step_val) +
                                 " | Loss: " + ("%.2f" % loss_val) +
                                 " Examples/sec: " +
                                 ("%.2f" % examples_per_second) +
                                 " | Hit@1: " + ("%.2f" % hit_at_one) +
                                 " PERR: " + ("%.2f" % perr) + " GAP: " +
                                 ("%.2f" % gap))

                else:
                    logging.info("training step " + str(global_step_val) +
                                 " | Loss: " + ("%.2f" % loss_val) +
                                 " Examples/sec: " +
                                 ("%.2f" % examples_per_second))

                total_step = total_step + 1

        except tf.errors.OutOfRangeError:
            logging.info("%s: Done training -- epoch limit reached.",
                         task_as_string(task))

        coord.request_stop()

        coord.join(threads)
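
# A minimal sketch (not part of the original example) of the TF_CONFIG parsing
# trick used at the top of main(): the cluster description is read from the
# TF_CONFIG environment variable, and type() turns the task dict into an object
# with .type and .index attributes. The JSON value below is illustrative only.
import json
import os

os.environ.setdefault(
    "TF_CONFIG",
    json.dumps({"task": {"type": "worker", "index": 1},
                "cluster": {"worker": ["host0:2222", "host1:2222"]}}))

env = json.loads(os.environ.get("TF_CONFIG", "{}"))
task_data = env.get("task", None) or {"type": "master", "index": 0}
task = type("TaskSpec", (object,), task_data)
print("task type: %s, index: %d" % (task.type, task.index))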
Ejemplo n.º 13
0
def build_graph(reader,
                model,
                train_data_list,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=16,
                base_learning_rate=0.01,
                learning_rate_decay_examples=4000,
                learning_rate_decay=0.99,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                prediction_threshold=0.5,
                regularization_penalty=1,
                num_readers=2,
                num_epochs=None):
    """Creates the Tensorflow graph.

  This will only be called once in the life of
  a training model, because after the graph is created the model will be
  restored from a meta graph file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
           from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
                from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
                            compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.
  """

    global_step = tf.Variable(0, trainable=False, name="global_step")

    if FLAGS.accumulate_gradients:
        actual_batch_size = batch_size * FLAGS.apply_every_n_batches
    else:
        actual_batch_size = batch_size

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * actual_batch_size,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)

    image_id, image_data, image_mask = (get_input_data_tensors(
        reader,
        train_data_list,
        train_data_pattern,
        batch_size=batch_size,
        num_readers=num_readers,
        num_epochs=num_epochs))

    model_input = image_data
    tf.summary.histogram("model/input", model_input)

    with tf.name_scope("model"):
        result = model.create_model(model_input, l2_penalty=FLAGS.l2_penalty)
        print "result", result

        for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

        predictions = result["predictions"]
        if "loss" in result.keys():
            label_loss = result["loss"]
        else:
            if FLAGS.multitask:
                support_predictions = result["support_predictions"]
                tf.summary.histogram("model/support_predictions",
                                     support_predictions)
                print "support_predictions", support_predictions
                label_loss = label_loss_fn.calculate_loss(
                    predictions, support_predictions, image_mask)
            else:
                label_loss = label_loss_fn.calculate_loss(
                    predictions, image_mask)

        tf.summary.histogram("model/predictions", predictions)
        tf.summary.scalar("label_loss", label_loss)

        if "regularization_loss" in result.keys():
            reg_loss = result["regularization_loss"]
        else:
            reg_loss = tf.constant(0.0)

        reg_losses = tf.losses.get_regularization_losses()
        if reg_losses:
            reg_loss += tf.add_n(reg_losses)

        if regularization_penalty != 0:
            tf.summary.scalar("reg_loss", reg_loss)

        # Adds update_ops (e.g., moving average updates in batch normalization) as
        # a dependency to the train_op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if "update_ops" in result.keys():
            update_ops += result["update_ops"]
        if update_ops:
            with tf.control_dependencies(update_ops):
                barrier = tf.no_op(name="gradient_barrier")
                with tf.control_dependencies([barrier]):
                    label_loss = tf.identity(label_loss)

        # Incorporate the L2 weight penalties etc.
        final_loss = regularization_penalty * reg_loss + label_loss

        # Accumulate gradients over several batches before applying them, to
        # emulate a larger effective batch size than memory can hold.
        if FLAGS.accumulate_gradients:
            assert FLAGS.apply_every_n_batches > 0, "apply_every_n_batches should be > 0"
            scale = 1.0 / FLAGS.apply_every_n_batches

            tvs = tf.trainable_variables()
            accum_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in tvs
            ]
            init_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]
            gvs = optimizer.compute_gradients(final_loss, tvs)
            accum_ops = [
                accum_vars[i].assign_add(gv[0]) for i, gv in enumerate(gvs)
            ]

            if clip_gradient_norm > 0:
                with tf.name_scope('clip_grads'):
                    clipped_accum_vars = utils.clip_variable_norms(
                        accum_vars, max_norm=clip_gradient_norm, scale=scale)
                    apply_op = optimizer.apply_gradients(
                        [(clipped_accum_vars[i], gv[1])
                         for i, gv in enumerate(gvs)],
                        global_step=global_step)

            else:
                apply_op = optimizer.apply_gradients(
                    [(accum_vars[i] * scale, gv[1])
                     for i, gv in enumerate(gvs)],
                    global_step=global_step)
            tf.get_collection_ref("train/init_ops").extend(init_ops)
            tf.get_collection_ref("train/accum_ops").extend(accum_ops)
            tf.add_to_collection("train/apply_op", apply_op)

        # the original way, apply every batch
        else:
            gradients = optimizer.compute_gradients(
                final_loss, colocate_gradients_with_ops=False)
            if clip_gradient_norm > 0:
                with tf.name_scope('clip_grads'):
                    gradients = utils.clip_gradient_norms(
                        gradients, clip_gradient_norm)
            train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)
            tf.add_to_collection("train/train_op", train_op)

        labels = tf.cast(image_mask, tf.int32)
        float_labels = tf.cast(image_mask, tf.float32)

        auc, _ = tf.metrics.auc(labels, predictions, num_thresholds=40)

        bool_predictions = tf.greater(predictions, prediction_threshold)
        true_pos = tf.cast(
            tf.reduce_sum(
                tf.cast(labels > 0, tf.int32) *
                tf.cast(predictions > prediction_threshold, tf.int32)),
            tf.float32)
        false_pos = tf.cast(
            tf.reduce_sum(
                tf.cast(labels <= 0, tf.int32) *
                tf.cast(predictions > prediction_threshold, tf.int32)),
            tf.float32)
        false_neg = tf.cast(
            tf.reduce_sum(
                tf.cast(labels > 0, tf.int32) *
                tf.cast(predictions <= prediction_threshold, tf.int32)),
            tf.float32)
        # Note: despite the name, this is the Dice coefficient
        # 2*TP / (2*TP + FP + FN), with a small epsilon for numerical stability.
        mean_iou = (2.0 * true_pos + 1e-7) / (2 * true_pos + false_pos +
                                              false_neg + 1e-7)
        print "mean_iou", mean_iou

        num_examples = tf.shape(labels)[0]

        tf.add_to_collection("global_step", global_step)
        tf.add_to_collection("loss", label_loss)
        tf.add_to_collection("id_batch", image_id)
        tf.add_to_collection("predictions", predictions)
        tf.add_to_collection("model_input", model_input)
        tf.add_to_collection("num_examples", num_examples)
        tf.add_to_collection("labels", labels)
        tf.add_to_collection("float_labels", float_labels)
        tf.add_to_collection("bool_predictions", bool_predictions)
        tf.add_to_collection("auc", auc)
        tf.add_to_collection("mean_iou", mean_iou)