Example #1
def image_embedding(images,
                    model_fn=resnet_v1_152,
                    trainable=True,
                    is_training=True,
                    weight_decay=0.0001,
                    batch_norm_decay=0.997,
                    batch_norm_epsilon=1e-5,
                    batch_norm_scale=True,
                    add_summaries=False,
                    reuse=False):
  """Extract image features from pretrained resnet model."""

  is_resnet_training = trainable and is_training

  batch_norm_params = {
      "is_training": is_resnet_training,
      "trainable": trainable,
      "decay": batch_norm_decay,
      "epsilon": batch_norm_epsilon,
      "scale": batch_norm_scale,
  }

  if trainable:
    weights_regularizer = tf.contrib.layers.l2_regularizer(weight_decay)
  else:
    weights_regularizer = None

  with tf.variable_scope(model_fn.__name__, [images], reuse=reuse) as scope:
    with slim.arg_scope(
        [slim.conv2d],
        weights_regularizer=weights_regularizer,
        trainable=trainable):
      with slim.arg_scope(
          [slim.conv2d],
          weights_initializer=slim.variance_scaling_initializer(),
          activation_fn=tf.nn.relu,
          normalizer_fn=slim.batch_norm,
          normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm],
                            is_training=is_resnet_training,
                            trainable=trainable):
          with slim.arg_scope([slim.max_pool2d], padding="SAME"):
            net, end_points = model_fn(
                images, num_classes=None, global_pool=False,
                is_training=is_resnet_training,
                reuse=reuse, scope=scope)

  if add_summaries:
    for v in end_points.values():
      tf.contrib.layers.summaries.summarize_activation(v)

  return net
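
A hedged usage sketch for the function above (assuming the standard TF1 contrib-slim imports; the placeholder shape is an assumption):

# Hypothetical usage sketch, not part of the original example.
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.nets import resnet_v1

images = tf.placeholder(tf.float32, [None, 224, 224, 3], name='images')
# Freeze the backbone: neither conv weights nor BN statistics are updated.
features = image_embedding(images,
                           model_fn=resnet_v1.resnet_v1_152,
                           trainable=False,
                           is_training=False)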
Example #2
def resnet_arg_scope(weight_decay=0.0001,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
  """Defines the default ResNet arg scope.

  TODO(gpapan): The batch-normalization related default values above are
    appropriate for use in conjunction with the reference ResNet models
    released at https://github.com/KaimingHe/deep-residual-networks. When
    training ResNets from scratch, they might need to be tuned.

  Args:
    weight_decay: The weight decay to use for regularizing the model.
    batch_norm_decay: The moving average decay when estimating layer activation
      statistics in batch normalization.
    batch_norm_epsilon: Small constant to prevent division by zero when
      normalizing activations by their variance in batch normalization.
    batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
      activations in the batch normalization layer.

  Returns:
    An `arg_scope` to use for the resnet models.
  """
  batch_norm_params = {
      'decay': batch_norm_decay,
      'epsilon': batch_norm_epsilon,
      'scale': batch_norm_scale,
      'updates_collections': tf.GraphKeys.UPDATE_OPS,
  }

  with slim.arg_scope(
      [slim.conv2d],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      weights_initializer=slim.variance_scaling_initializer(),
      activation_fn=tf.nn.relu,
      normalizer_fn=slim.batch_norm,
      normalizer_params=batch_norm_params):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      # The following implies padding='SAME' for pool1, which makes feature
      # alignment easier for dense prediction tasks. This is also used in
      # https://github.com/facebook/fb.resnet.torch. However the accompanying
      # code of 'Deep Residual Learning for Image Recognition' uses
      # padding='VALID' for pool1. You can switch to that choice by setting
      # slim.arg_scope([slim.max_pool2d], padding='VALID').
      with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc:
        return arg_sc
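
A hedged usage sketch: the returned arg_sc is only a captured scope dict, so it has to be re-entered with slim.arg_scope before building a model (resnet_v1 from tf.contrib.slim.nets is an assumption):

# Hypothetical usage sketch, not part of the original example.
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.nets import resnet_v1

images = tf.placeholder(tf.float32, [None, 224, 224, 3])
with slim.arg_scope(resnet_arg_scope(weight_decay=0.0001)):
    # To get the paper's pool1 behaviour instead, nest an overriding scope:
    # with slim.arg_scope([slim.max_pool2d], padding='VALID'): ...
    net, end_points = resnet_v1.resnet_v1_50(images, num_classes=1000,
                                             is_training=True)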
Example #3
def resnet_arg_scope(is_training=True,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
  batch_norm_params = {
    'is_training': False,
    'decay': batch_norm_decay,
    'epsilon': batch_norm_epsilon,
    'scale': batch_norm_scale,
    'trainable': False,
    'updates_collections': tf.GraphKeys.UPDATE_OPS
  }

  with arg_scope(
      [slim.conv2d],
      weights_regularizer=slim.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
      weights_initializer=slim.variance_scaling_initializer(),
      trainable=is_training,
      activation_fn=tf.nn.relu,
      normalizer_fn=slim.batch_norm,
      normalizer_params=batch_norm_params):
    with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc:
      return arg_sc
Example #4
  def _create_baseline(self, n_output=1, n_hidden=100,
                       is_zero_init=False,
                       collection='BASELINE'):
    # center input
    h = self._x
    if self.mean_xs is not None:
      h -= self.mean_xs

    if is_zero_init:
      initializer = init_ops.zeros_initializer()
    else:
      initializer = slim.variance_scaling_initializer()

    with slim.arg_scope([slim.fully_connected],
                        variables_collections=[collection, Q_COLLECTION],
                        trainable=False,
                        weights_initializer=initializer):
      h = slim.fully_connected(h, n_hidden, activation_fn=tf.nn.tanh)
      baseline = slim.fully_connected(h, n_output, activation_fn=None)

      if n_output == 1:
        baseline = tf.reshape(baseline, [-1])  # very important to reshape
    return baseline
Example #5
def _extra_conv_arg_scope_with_bn(weight_decay=0.00001,
                                  activation_fn=None,
                                  batch_norm_decay=0.997,
                                  batch_norm_epsilon=1e-5,
                                  batch_norm_scale=True):

  batch_norm_params = {
      'decay': batch_norm_decay,
      'epsilon': batch_norm_epsilon,
      'scale': batch_norm_scale,
      'updates_collections': tf.GraphKeys.UPDATE_OPS_EXTRA,
  }

  with slim.arg_scope(
      [slim.conv2d],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      weights_initializer=slim.variance_scaling_initializer(),
      activation_fn=tf.nn.relu,
      normalizer_fn=slim.batch_norm,
      normalizer_params=batch_norm_params):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc:
        return arg_sc
Example #6
def resnet_arg_scope(is_training=True,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
    batch_norm_params = {
        'is_training': False,
        'decay': batch_norm_decay,
        'epsilon': batch_norm_epsilon,
        'scale': batch_norm_scale,
        'trainable': False,
        'updates_collections': ops.GraphKeys.UPDATE_OPS
    }
    with arg_scope(
            [slim.conv2d, slim.fully_connected],
            weights_regularizer=tf.contrib.layers.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
            weights_initializer=slim.variance_scaling_initializer(),
            biases_regularizer=tf.contrib.layers.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
            biases_initializer=tf.constant_initializer(0.0),
            trainable=is_training,
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc:
            return arg_sc
Example #7
def resnet_arg_scope_bn_trainable(is_training=True,
                                  batch_norm_decay=0.997,
                                  batch_norm_epsilon=1e-5,
                                  batch_norm_scale=True):
    batch_norm_params = {
        'is_training': True,  # Should always be True; otherwise the outputs are very strange.
        'decay': batch_norm_decay,
        'epsilon': batch_norm_epsilon,
        'scale': batch_norm_scale,
        'trainable': True,
        'updates_collections': tf.GraphKeys.UPDATE_OPS
    }

    with arg_scope([slim.conv2d],
                   weights_regularizer=slim.l2_regularizer(
                       cfg.TRAIN.WEIGHT_DECAY),
                   weights_initializer=slim.variance_scaling_initializer(),
                   trainable=is_training,
                   activation_fn=tf.nn.relu,
                   normalizer_fn=slim.batch_norm,
                   normalizer_params=batch_norm_params):
        with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc:
            return arg_sc
Example #8
    def resnet_arg_scope(self, is_training=True):
        '''
        By default, do not use BN when training ResNet, since the batch size
        is too small. So is_training and trainable are both False in the
        batch_norm params.
        '''
        batch_norm_params = {
            'is_training': False,
            'decay': 0.997,
            'epsilon': 1e-5,
            'scale': True,
            'trainable': False,
            'updates_collections': tf.GraphKeys.UPDATE_OPS
        }
        with slim.arg_scope(
                [slim.conv2d],
                weights_regularizer=slim.l2_regularizer(self.weight_decay),
                weights_initializer=slim.variance_scaling_initializer(),
                trainable=is_training,
                activation_fn=tf.nn.relu,
                normalizer_fn=slim.batch_norm,
                normalizer_params=batch_norm_params):
            with slim.arg_scope([slim.batch_norm],
                                **batch_norm_params) as arg_sc:
                return arg_sc
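
A hedged sketch of the frozen-BN pattern this method encodes; `model` stands in for an instance of the surrounding (unshown) class, and the backbone choice is an assumption:

# Hypothetical usage sketch: conv weights train while batch norm stays
# frozen, because the scope pins 'is_training' and 'trainable' to False.
from tensorflow.contrib.slim.nets import resnet_v1

image = tf.placeholder(tf.float32, [1, None, None, 3])
with slim.arg_scope(model.resnet_arg_scope(is_training=True)):
    net, end_points = resnet_v1.resnet_v1_101(image, num_classes=None,
                                              global_pool=False,
                                              is_training=False)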
Example #9
def resnet_arg_scope(is_training=True,
                     weight_decay=cfg.TRAIN.WEIGHT_DECAY,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
  batch_norm_params = {
    'is_training': cfg.TRAIN.BN_TRAIN and is_training,
    'decay': batch_norm_decay,
    'epsilon': batch_norm_epsilon,
    'scale': batch_norm_scale,
    'trainable': cfg.TRAIN.BN_TRAIN,
    'updates_collections': tf.GraphKeys.UPDATE_OPS
  }

  with arg_scope(
      [slim.conv2d],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      weights_initializer=slim.variance_scaling_initializer(),
      trainable=is_training,
      activation_fn=tf.nn.relu,
      normalizer_fn=slim.batch_norm,
      normalizer_params=batch_norm_params):
    with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc:
      return arg_sc
Example #10
    def build_fastrcnn(self, feature_to_cropped, rois, img_shape, scope):

        with tf.variable_scope('Fast-RCNN_{}'.format(scope)):
            # 5. ROI Pooling
            with tf.variable_scope('rois_pooling'):
                pooled_features = self.roi_pooling(
                    feature_maps=feature_to_cropped,
                    rois=rois,
                    img_shape=img_shape)

            # 6. inference on rois in Fast-RCNN to obtain fc_flatten features
            if self.base_network_name.startswith('resnet'):
                fc_flatten = resnet.restnet_head(
                    input=pooled_features,
                    is_training=self.is_training,
                    scope_name=self.base_network_name,
                    stage=scope)

            else:
                raise NotImplementedError('only support resnet and mobilenet')

            # 7. cls and reg in Fast-RCNN
            # tf.variance_scaling_initializer()
            # tf.VarianceScaling()
            with slim.arg_scope([slim.fully_connected],
                                weights_regularizer=slim.l2_regularizer(
                                    cfgs.WEIGHT_DECAY)):
                if not scope == 'stage3':

                    cls_score = slim.fully_connected(
                        fc_flatten,
                        num_outputs=cfgs.CLASS_NUM + 1,
                        weights_initializer=slim.variance_scaling_initializer(
                            factor=1.0, mode='FAN_AVG', uniform=True),
                        activation_fn=None,
                        trainable=self.is_training,
                        scope='cls_fc_h')

                    bbox_pred = slim.fully_connected(
                        fc_flatten,
                        num_outputs=(cfgs.CLASS_NUM + 1) * 5,
                        weights_initializer=slim.variance_scaling_initializer(
                            factor=1.0, mode='FAN_AVG', uniform=True),
                        activation_fn=None,
                        trainable=self.is_training,
                        scope='reg_fc_h')

                    # For convenience: it also produces (CLASS_NUM + 1) bboxes.
                    cls_score = tf.reshape(cls_score, [-1, cfgs.CLASS_NUM + 1])
                    bbox_pred = tf.reshape(bbox_pred,
                                           [-1, 5 * (cfgs.CLASS_NUM + 1)])
                    bbox_pred_ins = tf.reshape(bbox_pred,
                                               [-1, cfgs.CLASS_NUM + 1, 5])

                    # Only keep the box whose score is the biggest.
                    keep_abox = tf.argmax(cls_score, axis=1)
                    keep_inds = tf.reshape(
                        tf.transpose(
                            tf.stack([
                                tf.cumsum(tf.ones_like(keep_abox)) - 1,
                                keep_abox
                            ])), [-1, 2])
                    bbox_pred_fliter = tf.reshape(
                        tf.gather_nd(bbox_pred_ins, keep_inds), [-1, 5])

                    return bbox_pred_fliter, bbox_pred, cls_score
                else:
                    cls_score = slim.fully_connected(
                        fc_flatten,
                        num_outputs=cfgs.CLASS_NUM + 1,
                        weights_initializer=slim.variance_scaling_initializer(
                            factor=1.0, mode='FAN_AVG', uniform=True),
                        activation_fn=None,
                        trainable=self.is_training,
                        scope='cls_fc_r')

                    bbox_pred = slim.fully_connected(
                        fc_flatten,
                        num_outputs=(cfgs.CLASS_NUM + 1) * 5,
                        weights_initializer=slim.variance_scaling_initializer(
                            factor=1.0, mode='FAN_AVG', uniform=True),
                        activation_fn=None,
                        trainable=self.is_training,
                        scope='reg_fc_r')
                    cls_score = tf.reshape(cls_score, [-1, cfgs.CLASS_NUM + 1])
                    bbox_pred = tf.reshape(bbox_pred,
                                           [-1, 5 * (cfgs.CLASS_NUM + 1)])
                    return bbox_pred, cls_score
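
The row-index/argmax stacking above is compact but opaque; here is a small standalone sketch (made-up values) of the same tf.gather_nd pattern used to keep only the best-scoring box per RoI:

# Hypothetical standalone sketch of the argmax-row gather used above.
import tensorflow as tf

scores = tf.constant([[0.1, 0.9], [0.8, 0.2]])      # (N, C) class scores
boxes = tf.constant([[[1.], [2.]], [[3.], [4.]]])   # (N, C, 1) per-class boxes
best = tf.argmax(scores, axis=1)                    # best class per row
rows = tf.cumsum(tf.ones_like(best)) - 1            # row indices 0..N-1
idx = tf.stack([rows, best], axis=1)                # (N, 2) gather indices
picked = tf.gather_nd(boxes, idx)                   # yields [[2.], [3.]]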
Example #11
  def model_fn(self, is_training=True, *args, **kwargs):
    # write your own model code
    
    # for tensorflow
    # step 1: unwrap data
    batch_data = None
    batch_label = None
    if len(args) > 0:
      # for method 2
      # at train or test stage, unwrap data from args (which comes from model_input())
      if is_training:
        batch_data, batch_label = args[0].dequeue()
      else:
        batch_data = args[0]
    else:
      # for method 1
      # use placeholder
      batch_data = tf.placeholder(tf.uint8,
                                  shape=[ctx.params.batch_size, ctx.params.input_size, ctx.params.input_size, 1],
                                  name='data_node')

      if not is_training:
        batch_label = tf.placeholder(tf.int32,
                                     shape=[ctx.params.batch_size],
                                     name='label_node')

    # Convert the data type
    batch_data = tf.cast(batch_data, tf.float32)

    # step 2: building model
    # Implement the LeNet-5 convolutional neural network
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        weights_regularizer=slim.l2_regularizer(0.0001),
                        normalizer_fn=None,
                        weights_initializer=slim.variance_scaling_initializer()):

        # Convolution layer: input tensor: batch x 28 x 28 x 1; output tensor: batch x 24 x 24 x 6
        conv_1 = slim.conv2d(batch_data, 6, [5, 5], stride=1, activation_fn=tf.nn.relu, padding='VALID')
        print(conv_1)

        # Pooling layer: input tensor: batch x 24 x 24 x 6; output tensor: batch x 12 x 12 x 6
        pool_1 = slim.max_pool2d(conv_1, [2, 2], stride=2, padding='VALID')

        # Convolution layer: input tensor: batch x 12 x 12 x 6; output tensor: batch x 8 x 8 x 16
        conv_2 = slim.conv2d(pool_1, 16, [5, 5], stride=1, activation_fn=tf.nn.relu, padding='VALID')
        print(conv_2)

        # Pooling layer: input tensor: batch x 8 x 8 x 16; output tensor: batch x 4 x 4 x 16
        pool_2 = slim.max_pool2d(conv_2, [2, 2], stride=2, padding='VALID')

        # Flatten into a 1-D tensor: input tensor: batch x 4 x 4 x 16; output tensor: batch x 256
        fc_1 = tf.contrib.layers.flatten(pool_2)

        # Fully connected layer: input tensor: batch x 256; output tensor: batch x 120
        fc_1 = slim.fully_connected(fc_1, 120)

        # ReLU activation layer
        fc_1 = tf.nn.relu(fc_1)

        # Fully connected layer: input tensor: batch x 120; output tensor: batch x 84
        fc_2 = slim.fully_connected(fc_1, 84)
        # ReLU activation layer
        fc_2 = tf.nn.relu(fc_2)

        # Fully connected layer: input tensor: batch x 84; output tensor: batch x 10 (MNIST has 10 classes in total)
        logits = slim.fully_connected(fc_2, 10)

        # step 3: output
        if is_training:
            # use logits to compute loss
            # Compute the cross-entropy loss from the logits
            batch_label_one_hot = slim.one_hot_encoding(batch_label, 10)
            loss = tf.losses.softmax_cross_entropy(batch_label_one_hot, logits)
            return loss
        else:
            # use logits to compute model predict
            # Compute the classification probabilities from the logits
            predict = tf.nn.softmax(logits)
            return predict
Example #12
def main():
    args = parser.parse_args()

    # We store all arguments in a json file. This has two advantages:
    # 1. We can always get back and see what exactly that experiment was
    # 2. We can resume an experiment as-is without needing to remember flags.
    if args.resume or args.auto_resume:
        args.experiment_root = utils.select_existing_root(args.experiment_root)
        args_file = os.path.join(args.experiment_root, 'args.json')
        if not os.path.isfile(args_file) and not args.auto_resume:
            # We are not auto_resuming and no existing file was found. This is
            # an error.
            raise IOError('`args.json` not found in {}'.format(args_file))
        elif not os.path.isfile(args_file) and args.auto_resume:
            # No existing args file was found, but we are auto resuming, so we
            # just start a new run.
            new_run = True
        else:
            # We found an existing args file, this can just be used.
            new_run = False
            print('Loading args from {}.'.format(args_file))
            with open(args_file, 'r') as f:
                args_resumed = json.load(f)
            args_resumed['resume'] = True  # This would be overwritten.

            # When resuming, we not only want to populate the args object with
            # the values from the file, but we also want to check for some
            # possible conflicts between loaded and given arguments.
            for key, value in args.__dict__.items():
                if key in args_resumed:
                    resumed_value = args_resumed[key]
                    if resumed_value != value:
                        print('Warning: For the argument `{}` we are using the'
                              ' loaded value `{}`. The provided value was `{}`'
                              '.'.format(key, resumed_value, value))
                        args.__dict__[key] = resumed_value
                else:
                    print('Warning: A new argument was added since the last run'
                          ': `{}`. Using the new value: `{}`.'
                          ''.format(key, value))
    else:
        # No resuming requested at all.
        new_run = True

    if new_run:
        # If the experiment directory exists already and we are not auto
        # resuming, we bail in fear.
        args.experiment_root = utils.select_existing_root(
                args.experiment_root, check_only_basedir=True)
        if os.path.exists(args.experiment_root) and not args.auto_resume:
            if os.listdir(args.experiment_root):
                print('The directory {} already exists and is not empty.'
                      ' If you want to resume training, append --resume or'
                      ' --auto_resume to your call.'
                      ''.format(args.experiment_root))
                exit(1)
        elif os.path.exists(args.experiment_root) and args.auto_resume:
            # If we are auto resuming, it is okay if the directory exists.
            pass
        else:
            # We create a new one if it does not exist.
            os.makedirs(args.experiment_root)
        args_file = os.path.join(args.experiment_root, 'args.json')


        # Make sure the required arguments are provided:
        # train_set, dataset_root, dataset_config
        if not args.train_set:
            parser.print_help()
            print('You did not specify the `train_set` argument!')
            exit(1)
        if not args.dataset_root:
            parser.print_help()
            print('You did not specify the required `dataset_root` argument!')
            exit(1)
        if not args.dataset_config:
            parser.print_help()
            print('You did not specify the required `dataset_config` argument!')
            exit(1)

        # Since multiple datasets can be used, we need to check that we
        # got lists of the same length.
        train_set_len = len(args.train_set)
        dataset_root_len = len(args.dataset_root)
        dataset_config_len = len(args.dataset_config)
        if args.dataset_weights is not None:
            dataset_weight_len = len(args.dataset_weights)
        else:
            # We'll set this manually later so just use a valid length here.
            dataset_weight_len = dataset_config_len

        if (train_set_len != dataset_root_len or
                train_set_len != dataset_config_len or
                train_set_len != dataset_weight_len):
            parser.print_help()
            print('The dataset specific argument lengths didn\'t match.')
            exit(1)


        # Parse the model parameters. This could be a bit cleaner in the future,
        # but it will do for now.
        if args.model_params is not None:
            #model_params = args.model_params.split(';')
            #if len(model_params) % 2 != 0:
            #    raise ValueError('`model_params` has to be a comma separated '
            #                     'list of even length.')
            #it = iter(model_params)
            #args.model_params = {p: eval(v) for p, v in zip(it,it)}
            args.model_params = eval(args.model_params)
        else:
            args.model_params = {}

        # Check some parameter clashes.
        if args.crop_augment > 0 and (args.fixed_crop_augment_width > 0 or
                                      args.fixed_crop_augment_height > 0):
            print('You cannot specify both types of crop augmentation. '
                  'Either use the `crop_augment` argument to remove a fixed '
                  'number of pixels from the borders, or use the '
                  '`fixed_crop_augment_height`/`fixed_crop_augment_width` '
                  'arguments to provide a fixed-size window that will be '
                  'cropped from the input images.')
            exit(1)
        if ((args.fixed_crop_augment_height > 0) !=
                (args.fixed_crop_augment_width > 0)):
            print('You need to specify both the `fixed_crop_augment_width` and '
                  '`fixed_crop_augment_height` arguments for a valid '
                  'augmentation.')
            exit(1)

        # Store the passed arguments for later resuming and grepping in a nice
        # and readable format.
        with open(args_file, 'w') as f:
            # Make sure not to store the auto_resume forever though.
            if 'auto_resume' in args.__dict__:
                del args.__dict__['auto_resume']
            json.dump(
                vars(args), f, ensure_ascii=False, indent=2, sort_keys=True)

    log_file = os.path.join(args.experiment_root, 'train')
    logging.config.dictConfig(utils.get_logging_dict(log_file))
    log = logging.getLogger('train')

    # Also show all parameter values at the start, for ease of reading logs.
    log.info('Training using the following parameters:')
    for key, value in sorted(vars(args).items()):
        log.info('{}: {}'.format(key, value))


    # Preload all the filenames and mappings.
    file_lists = []
    dataset_configs = []
    for i, (train_set, dataset_root, config) in enumerate(
            zip(args.train_set, args.dataset_root, args.dataset_config)):

        # Load the config for the dataset.
        with open(config, 'r') as f:
            dataset_configs.append(json.load(f))
        log.info('Training set {} based on a `{}` configuration.'.format(
            i, dataset_configs[-1]['dataset_name']))

        # Load the data from the CSV file.
        file_list = utils.load_dataset(train_set, dataset_root)
        file_lists.append(file_list)

    # If no dataset weights were given, weight each dataset by its size.
    if args.dataset_weights is None:
        dataset_weights = [len(fl) for fl in file_lists]
    else:
        dataset_weights = args.dataset_weights

    # In order to keep the loading of images inside TensorFlow, we need some
    # quite ugly hacks that merge all the datasets' original-to-train mappings
    # into one tensor. Not nice, but it works.
    mappings = [d.get('original_to_train_mapping') for d in dataset_configs]
    mapping = np.zeros(
        (len(mappings), np.max([len(m) for m in mappings])), dtype=np.int32)
    for i, m in enumerate(mappings):
        mapping[i, :len(m)] = m
    original_to_train_mapping = tf.constant(mapping)

    dataset = tf.data.Dataset.from_generator(
        generator=functools.partial(
            utils.mixed_dataset_generator, file_lists, dataset_weights
        ),
        output_types=(tf.string, tf.string, tf.int32))

    # Convert filenames to actual image and label id tensors.
    dataset = dataset.map(
        lambda x, y, z: tf_utils.string_tuple_to_image_pair(
            x, y, tf.gather(original_to_train_mapping, z)) + (z,),
        num_parallel_calls=args.loading_threads)

    # Possible augmentations
    if args.flip_augment:
        dataset = dataset.map(
            lambda x, y, z: tf_utils.flip_augment(x, y) + (z,))
    if args.gamma_augment:
        dataset = dataset.map(
            lambda x, y, z: tf_utils.gamma_augment(x, y) + (z,))

    # TODO deprecate this. It doesn't work with many datasets. This needs to go.
    if args.crop_augment > 0:
        dataset = dataset.map(
            lambda x, y, z: tf_utils.crop_augment(
                x, y, args.crop_augment, args.crop_augment) + (z,))
    # TODO end

    if args.fixed_crop_augment_width > 0 and args.fixed_crop_augment_height > 0:
        dataset = dataset.map(
            lambda x, y, z: tf_utils.fixed_crop_augment(
                x, y, args.fixed_crop_augment_height,
                args.fixed_crop_augment_width) + (z,))

    # Rescale the input images to [-1, 1].
    dataset = dataset.map(lambda x, y, z: ((x - 128.0) / 128.0, y, z))

    # Group it into batches.
    dataset = dataset.batch(args.batch_size)

    # Overlap producing and consuming for parallelism.
    dataset = dataset.prefetch(1)

    # Since we repeat the data infinitely, we only need a one-shot iterator.
    image_batch, label_batch, dataset_ids = (
        dataset.make_one_shot_iterator().get_next())

    # This needs a fixed shape.
    dataset_ids.set_shape([args.batch_size])

    # Feed the image through a model.
    model = import_module('networks.' + args.model_type)
    with tf.name_scope('model'):
        net = model.network(image_batch, is_training=True, **args.model_params)

    # Generate a logit for every dataset.
    with tf.name_scope('logits'):
        logits = []
        for d in dataset_configs:
            logits.append(slim.conv2d(
                net, len(d['class_names']),[3,3],
                scope='output_conv_{}'.format(d['dataset_name']),
                activation_fn=None,
                weights_initializer=slim.variance_scaling_initializer(),
                biases_initializer=tf.zeros_initializer()))

    # Create the loss for every dataset.
    with tf.name_scope('losses'):
        loss_function = getattr(output_losses, args.loss_type)
        weighted_losses = []
        for i, dataset_config in enumerate(dataset_configs):
            mask = tf.equal(dataset_ids, i)
            weight = tf.cast(tf.reduce_sum(tf.cast(mask, tf.int32)), tf.float32)
            logit_subset = tf.boolean_mask(logits[i], mask)
            label_subset = tf.boolean_mask(label_batch, mask)

            # Do not evaluate the loss for those datasets without images in the
            # batch.
            zero_mask = tf.equal(weight, 0)
            loss = tf.cond(
                zero_mask,
                lambda: 0.0,
                lambda: tf.reduce_mean(
                    loss_function(logit_subset, label_subset,
                                  void=dataset_config['void_label'])))

            # Normalize with prior
            # loss = tf.divide(
            #    loss, tf.log(float(len(dataset_config['class_names']))))

            summary_loss = tf.cond(zero_mask, lambda: np.nan, lambda: loss)

            tf.summary.scalar(
                'loss_{}'.format(dataset_config['dataset_name']), summary_loss)
            tf.summary.scalar(
                'weight_{}'.format(dataset_config['dataset_name']), weight)

            weighted_losses.append(tf.multiply(loss, weight))

    # Merge all the losses together based on how frequent the underlying
    # datasets are in this batch.
    loss_mean = tf.divide(tf.add_n(weighted_losses), args.batch_size)

    # Some logging for tensorboard.
    tf.summary.scalar('loss', loss_mean)

    # Define the optimizer and the learning-rate schedule.
    # Unfortunately, we get NaNs if we don't handle no-decay separately.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    if 0 <= args.decay_start_iteration < args.train_iterations:
        learning_rate = tf.train.exponential_decay(
            args.learning_rate,
            tf.maximum(0, global_step - args.decay_start_iteration),
            args.train_iterations - args.decay_start_iteration,
            args.decay_multiplier)
    else:
        learning_rate = args.learning_rate
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate)

    # Update_ops are used to update batchnorm stats.
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = optimizer.minimize(loss_mean, global_step=global_step)

    # Define a saver for the complete model.
    checkpoint_saver = tf.train.Saver(max_to_keep=0)

    with tf.Session() as sess:
        if args.resume:
            # In case we're resuming, simply load the full checkpoint to init.
            last_checkpoint = tf.train.latest_checkpoint(args.experiment_root)
            log.info('Restoring from checkpoint: {}'.format(last_checkpoint))
            checkpoint_saver.restore(sess, last_checkpoint)
        else:
            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # We also store this initialization as a checkpoint, such that we
            # could run exactly reproducible experiments.
            checkpoint_saver.save(sess, os.path.join(
                args.experiment_root, 'checkpoint'), global_step=0)

        merged_summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(args.experiment_root, sess.graph)

        start_step = sess.run(global_step)
        log.info('Starting training from iteration {}.'.format(start_step))

        # Finally, here comes the main-loop. This `Uninterrupt` is a handy
        # utility such that an iteration still finishes on Ctrl+C and we can
        # stop the training cleanly.
        with utils.Uninterrupt(sigs=[SIGINT, SIGTERM], verbose=True) as u:
            for i in range(start_step, args.train_iterations):

                # Compute gradients, update weights, store logs!
                start_time = time.time()
                _, summary, step = sess.run(
                    [train_op, merged_summary, global_step])
                elapsed_time = time.time() - start_time

                # Compute the iteration speed and add it to the summary.
                # We did observe some weird spikes that we couldn't track down.
                summary2 = tf.Summary()
                summary2.value.add(
                    tag='secs_per_iter', simple_value=elapsed_time)
                summary_writer.add_summary(summary2, step)
                summary_writer.add_summary(summary, step)

                # Save a checkpoint of training every so often.
                if (args.checkpoint_frequency > 0 and
                        step % args.checkpoint_frequency == 0):
                    checkpoint_saver.save(sess, os.path.join(
                        args.experiment_root, 'checkpoint'), global_step=step)

                # Stop the main-loop at the end of the step, if requested.
                if u.interrupted:
                    log.info('Interrupted on request!')
                    break

        # Store one final checkpoint. This might be redundant, but it is crucial
        # in case intermediate storing was disabled and it saves a checkpoint
        # when the process was interrupted.
        checkpoint_saver.save(sess, os.path.join(
            args.experiment_root, 'checkpoint'), global_step=step)
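
utils.Uninterrupt itself is not shown in this listing; a minimal sketch of such a signal-deferring context manager, assuming only the interface used above (an illustration, not the repo's actual implementation):

# Hypothetical sketch of an Uninterrupt-style helper.
import signal

class Uninterrupt(object):
    """Defer the given signals so the current iteration can finish."""
    def __init__(self, sigs=(signal.SIGINT,), verbose=False):
        self.sigs, self.verbose, self.interrupted = sigs, verbose, False
    def __enter__(self):
        self._old = [signal.signal(s, self._handler) for s in self.sigs]
        return self
    def _handler(self, signum, frame):
        self.interrupted = True
        if self.verbose:
            print('Interrupt received, finishing this iteration first.')
    def __exit__(self, *exc):
        for s, h in zip(self.sigs, self._old):
            signal.signal(s, h)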
Example #13
def main():
    args = parser.parse_args()

    # We store all arguments in a json file. This has two advantages:
    # 1. We can always get back and see what exactly that experiment was
    # 2. We can resume an experiment as-is without needing to remember all flags.
    args_file = os.path.join(args.experiment_root, 'args.json')
    if args.resume:
        if not os.path.isfile(args_file):
            raise IOError('`args.json` not found in {}'.format(args_file))

        print('Loading args from {}.'.format(args_file))
        with open(args_file, 'r') as f:
            args_resumed = json.load(f)
        args_resumed['resume'] = True  # This would be overwritten.

        # When resuming, we not only want to populate the args object with the
        # values from the file, but we also want to check for some possible
        # conflicts between loaded and given arguments.
        for key, value in args.__dict__.items():
            if key in args_resumed:
                resumed_value = args_resumed[key]
                if resumed_value != value:
                    print('Warning: For the argument `{}` we are using the'
                          ' loaded value `{}`. The provided value was `{}`'
                          '.'.format(key, resumed_value, value))
                    args.__dict__[key] = resumed_value
            else:
                print('Warning: A new argument was added since the last run:'
                      ' `{}`. Using the new value: `{}`.'.format(key, value))

    else:
        # Make sure the required arguments are provided:
        # train_set, dataset_root, dataset_config
        if not args.train_set:
            parser.print_help()
            print('You did not specify the `train_set` argument!')
            exit(1)
        if not args.dataset_root:
            parser.print_help()
            print('You did not specify the required `dataset_root` argument!')
            exit(1)
        if not args.dataset_config:
            parser.print_help()
            print(
                'You did not specify the required `dataset_config` argument!')
            exit(1)

        # If the experiment directory exists already, we bail in fear.
        if os.path.exists(args.experiment_root):
            if os.listdir(args.experiment_root):
                print('The directory {} already exists and is not empty.'
                      ' If you want to resume training, append --resume to'
                      ' your call.'.format(args.experiment_root))
                exit(1)
        else:
            os.makedirs(args.experiment_root)

        # Parse the model parameters. This could be a bit cleaner in the future,
        # but it will do for now.
        if args.model_params is not None:
            model_params = args.model_params.split(',')
            if len(model_params) % 2 != 0:
                raise ValueError('`model_params` has to be a comma separated '
                                 'list of even length.')
            it = iter(model_params)
            args.model_params = {p: int(v) for p, v in zip(it, it)}
        else:
            args.model_params = {}

        # Check some parameter clashes.
        if args.crop_augment > 0 and (args.fixed_crop_augment_width > 0
                                      or args.fixed_crop_augment_height > 0):
            print(
                'You cannot specify both types of crop augmentation. '
                'Either use the `crop_augment` argument to remove a fixed '
                'number of pixels from the borders, or use the '
                '`fixed_crop_augment_height`/`fixed_crop_augment_width` '
                'arguments to provide a fixed-size window that will be '
                'cropped from the input images.')
            exit(1)
        if ((args.fixed_crop_augment_height > 0) !=
            (args.fixed_crop_augment_width > 0)):
            print(
                'You need to specify both the `fixed_crop_augment_width` and '
                '`fixed_crop_augment_height` arguments for a valid '
                'augmentation.')
            exit(1)

        # Store the passed arguments for later resuming and grepping in a nice
        # and readable format.
        with open(args_file, 'w') as f:
            json.dump(vars(args),
                      f,
                      ensure_ascii=False,
                      indent=2,
                      sort_keys=True)

    log_file = os.path.join(args.experiment_root, 'train')
    logging.config.dictConfig(utils.get_logging_dict(log_file))
    log = logging.getLogger('train')

    # Also show all parameter values at the start, for ease of reading logs.
    log.info('Training using the following parameters:')
    for key, value in sorted(vars(args).items()):
        log.info('{}: {}'.format(key, value))

    # Load the config for the dataset.
    with open(args.dataset_config, 'r') as f:
        dataset_config = json.load(f)
    log.info('Training based on a `{}` configuration.'.format(
        dataset_config['dataset_name']))

    # Load the data from the CSV file.
    image_files, label_files = utils.load_dataset(args.train_set,
                                                  args.dataset_root)

    # Set up a tf.data.Dataset where one "epoch" loops over all images.
    # Images are reshuffled after every epoch and repeat indefinitely.
    images = tf.data.Dataset.from_tensor_slices(image_files)
    labels = tf.data.Dataset.from_tensor_slices(label_files)
    dataset = tf.data.Dataset.zip((images, labels))
    dataset = dataset.shuffle(len(image_files))

    dataset = dataset.repeat(None)  # Repeat forever.

    # Convert filenames to actual image and label id tensors.
    dataset = dataset.map(lambda x, y: tf_utils.string_tuple_to_image_pair(
        x, y, dataset_config.get('original_to_train_mapping', None)),
                          num_parallel_calls=args.loading_threads)

    # Possible augmentations
    if args.flip_augment:
        dataset = dataset.map(tf_utils.flip_augment)
    if args.gamma_augment:
        dataset = dataset.map(tf_utils.gamma_augment)
    if args.crop_augment > 0:
        dataset = dataset.map(lambda x, y: tf_utils.crop_augment(
            x, y, args.crop_augment, args.crop_augment))
    if args.fixed_crop_augment_width > 0 and args.fixed_crop_augment_height > 0:
        dataset = dataset.map(lambda x, y: tf_utils.fixed_crop_augment(
            x, y, args.fixed_crop_augment_height, args.fixed_crop_augment_width
        ))

    # Rescale the input images to [-1, 1].
    dataset = dataset.map(lambda x, y: ((x - 128.0) / 128.0, y))

    # Group it into batches.
    dataset = dataset.batch(args.batch_size)

    # Overlap producing and consuming for parallelism.
    dataset = dataset.prefetch(1)

    # Since we repeat the data infinitely, we only need a one-shot iterator.
    image_batch, label_batch = dataset.make_one_shot_iterator().get_next()

    model = import_module('networks.' + args.model_type)

    # Feed the image through a model.
    with tf.name_scope('model'):
        net = model.network(image_batch, is_training=True, **args.model_params)
        logits = slim.conv2d(
            net,
            len(dataset_config['class_names']), [3, 3],
            scope='output_conv',
            activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer(),
            biases_initializer=tf.zeros_initializer())

    # Create the loss, for now we use a simple cross entropy loss.
    with tf.name_scope('loss'):
        loss_function = getattr(output_losses, args.loss_type)
        losses = loss_function(logits,
                               label_batch,
                               void=dataset_config['void_label'])

    # Count the total batch loss.
    loss_mean = tf.reduce_mean(losses)

    # Some logging for tensorboard.
    tf.summary.histogram('loss_distribution', losses)
    tf.summary.scalar('loss', loss_mean)

    # Define the optimizer and the learning-rate schedule.
    # Unfortunately, we get NaNs if we don't handle no-decay separately.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    if 0 <= args.decay_start_iteration < args.train_iterations:
        learning_rate = tf.train.exponential_decay(
            args.learning_rate,
            tf.maximum(0, global_step - args.decay_start_iteration),
            args.train_iterations - args.decay_start_iteration,
            args.decay_multiplier)
    else:
        learning_rate = args.learning_rate
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate)

    # Update_ops are used to update batchnorm stats.
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = optimizer.minimize(loss_mean, global_step=global_step)

    # Define a saver for the complete model.
    checkpoint_saver = tf.train.Saver(max_to_keep=0)

    with tf.Session() as sess:
        if args.resume:
            # In case we're resuming, simply load the full checkpoint to init.
            last_checkpoint = tf.train.latest_checkpoint(args.experiment_root)
            log.info('Restoring from checkpoint: {}'.format(last_checkpoint))
            checkpoint_saver.restore(sess, last_checkpoint)
        else:
            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # We also store this initialization as a checkpoint, such that we
            # could run exactly reproducible experiments.
            checkpoint_saver.save(sess,
                                  os.path.join(args.experiment_root,
                                               'checkpoint'),
                                  global_step=0)

        merged_summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(args.experiment_root,
                                               sess.graph)

        start_step = sess.run(global_step)
        log.info('Starting training from iteration {}.'.format(start_step))

        # Finally, here comes the main-loop. This `Uninterrupt` is a handy
        # utility such that an iteration still finishes on Ctrl+C and we can
        # stop the training cleanly.
        with utils.Uninterrupt(sigs=[SIGINT, SIGTERM], verbose=True) as u:
            for i in range(start_step, args.train_iterations):

                # Compute gradients, update weights, store logs!
                start_time = time.time()
                _, summary, step = sess.run(
                    [train_op, merged_summary, global_step])
                elapsed_time = time.time() - start_time

                # Compute the iteration speed and add it to the summary.
                # We did observe some weird spikes that we couldn't track down.
                summary2 = tf.Summary()
                summary2.value.add(tag='secs_per_iter',
                                   simple_value=elapsed_time)
                summary_writer.add_summary(summary2, step)
                summary_writer.add_summary(summary, step)

                # Save a checkpoint of training every so often.
                if (args.checkpoint_frequency > 0
                        and step % args.checkpoint_frequency == 0):
                    checkpoint_saver.save(sess,
                                          os.path.join(args.experiment_root,
                                                       'checkpoint'),
                                          global_step=step)

                # Stop the main-loop at the end of the step, if requested.
                if u.interrupted:
                    log.info('Interrupted on request!')
                    break

        # Store one final checkpoint. This might be redundant, but it is crucial
        # in case intermediate storing was disabled and it saves a checkpoint
        # when the process was interrupted.
        checkpoint_saver.save(sess,
                              os.path.join(args.experiment_root, 'checkpoint'),
                              global_step=step)
Example #14
def head(endpoints, embedding_dim, is_training):

    batch_norm_params = {
        'decay': 0.9,
        'epsilon': 1e-5,
        'scale': True,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
        'fused': None,
    }
    with slim.arg_scope(
        [slim.conv2d],
            weights_regularizer=slim.l2_regularizer(0.0),
            weights_initializer=slim.variance_scaling_initializer(),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            # attention_projection = slim.conv2d(endpoints['Mixed_7d'], 512, [1, 1], scope='attention_projection')
            masks = []
            masked_maps = []
            for i in range(head_num):
                attention_branch_mask = attention_branch(
                    endpoints['Mixed_7d'], i)
                # attention_branch_mask = attention_branch(attention_projection, i)
                masks.append(attention_branch_mask)
                endpoints['attention_mask{}'.format(i)] = attention_branch_mask
                masked_map = (1 +
                              attention_branch_mask) * endpoints['Mixed_7d']
                # masked_map = (1 + attention_branch_mask) * attention_projection
                masked_maps.append(masked_map)

            for i in range(head_num):
                for j in range(i + 1, head_num):
                    cosine_similarity(masks[i], masks[j],
                                      'constraint_{}{}'.format(i, j))

    _masked = tf.concat(masked_maps, axis=3, name='concated_mask')

    endpoints['model_output'] = endpoints['global_pool'] = tf.reduce_mean(
        _masked, [1, 2], name='_pool5', keep_dims=False)

    endpoints['head_output'] = slim.fully_connected(
        endpoints['model_output'],
        1024,
        normalizer_fn=slim.batch_norm,
        normalizer_params={
            'decay': 0.9,
            'epsilon': 1e-5,
            'scale': True,
            'is_training': is_training,
            'updates_collections': tf.GraphKeys.UPDATE_OPS,
        })

    endpoints['emb'] = endpoints['emb_raw'] = slim.fully_connected(
        endpoints['head_output'],
        embedding_dim,
        activation_fn=None,
        weights_initializer=tf.orthogonal_initializer(),
        scope='emb')

    return endpoints
Example #15
    def _recognition_network(self, sampler=None, log_likelihood_func=None):
        """x values -> samples from Q and return log Q(h|x)."""
        samples = {}
        reuse = None if not self.run_recognition_network else True

        # Set defaults
        if sampler is None:
            sampler = self._random_sample

        if log_likelihood_func is None:
            log_likelihood_func = lambda sample, log_params: (
                U.binary_log_likelihood(sample['activation'], log_params))

        logQ = []

        if self.hparams.task in ['sbn', 'omni']:
            # Initialize the edge case
            samples[-1] = {'activation': self._x}
            if self.mean_xs is not None:
                samples[-1]['activation'] -= self.mean_xs  # center the input
            samples[-1]['activation'] = (samples[-1]['activation'] + 1) / 2.0

            with slim.arg_scope(
                [slim.fully_connected],
                    weights_initializer=slim.variance_scaling_initializer(),
                    variables_collections=[Q_COLLECTION]):
                for i in xrange(self.hparams.n_layer):
                    # Set up the input to the layer
                    input = 2.0 * samples[i - 1]['activation'] - 1.0

                    # Create the conditional distribution (output is the logits)
                    h = self._create_transformation(
                        input,
                        n_output=self.hparams.n_hidden,
                        reuse=reuse,
                        scope_prefix='q_%d' % i)

                    samples[i] = sampler(h, self.uniform_samples[i], i)
                    logQ.append(log_likelihood_func(samples[i], h))

            self.run_recognition_network = True
            return logQ, samples
        elif self.hparams.task == 'sp':
            # Initialize the edge case
            samples[-1] = {
                'activation': tf.split(self._x, num_or_size_splits=2,
                                       axis=1)[0]
            }  # top half of digit
            if self.mean_xs is not None:
                samples[-1]['activation'] -= np.split(self.mean_xs, 2,
                                                      0)[0]  # center the input
            samples[-1]['activation'] = (samples[-1]['activation'] + 1) / 2.0

            with slim.arg_scope(
                [slim.fully_connected],
                    weights_initializer=slim.variance_scaling_initializer(),
                    variables_collections=[Q_COLLECTION]):
                for i in xrange(self.hparams.n_layer):
                    # Set up the input to the layer
                    input = 2.0 * samples[i - 1]['activation'] - 1.0

                    # Create the conditional distribution (output is the logits)
                    h = self._create_transformation(
                        input,
                        n_output=self.hparams.n_hidden,
                        reuse=reuse,
                        scope_prefix='q_%d' % i)

                    samples[i] = sampler(h, self.uniform_samples[i], i)
                    logQ.append(log_likelihood_func(samples[i], h))

            self.run_recognition_network = True
            return logQ, samples
Example #16
def head(endpoints, embedding_dim, is_training):

    M = 5
    L = M * M
    D = 64
    dim = [L, D]
    attention_steps = 16

    # endpoints['resnet_v2_50/block4'] is in shape of (?, 7, 7, 2048)
    batch_norm_params = {
        'decay': 0.9,
        'epsilon': 1e-5,
        'scale': True,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
        'fused': None,
    }
    with slim.arg_scope(
        [slim.conv2d, slim.fully_connected],
            weights_regularizer=slim.l2_regularizer(0.0),
            weights_initializer=slim.variance_scaling_initializer(),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            attention_branch_conv = slim.conv2d(endpoints['Mixed_7d'],
                                                dim[1], [1, 1],
                                                scope='attention_branch_conv')
            # create a BasicRNNCell
            features = tf.reshape(attention_branch_conv, [-1, dim[0], dim[1]],
                                  name='attention_branch_features')
            a_i = tf.reshape(features, [-1, dim[1]])
            a_i = slim.fully_connected(inputs=a_i,
                                       num_outputs=dim[1],
                                       biases_initializer=None,
                                       scope='a_i')
            a_i = tf.reshape(a_i, [-1, dim[0], dim[1]])
            gru_cell = tf.contrib.rnn.GRUCell(num_units=dim[1])

            # defining initial state
            # state = gru_cell.zero_state(tf.shape(endpoints['resnet_v2_50/block4'])[0], dtype=tf.float32)
            _input = tf.reduce_mean(features, 1)
            state = slim.fully_connected(inputs=tf.reduce_mean(features, 1),
                                         num_outputs=D,
                                         biases_initializer=None,
                                         scope='init_state')

            attention_maps = []
            _masked = []
            _masked.append(features)

            with tf.variable_scope("GRU_Attention"):
                for i in range(attention_steps):
                    if i > 0: tf.get_variable_scope().reuse_variables()
                    # state is in shape (?, 64)
                    output, state = gru_cell(_input, state)
                    h = tf.expand_dims(
                        slim.fully_connected(inputs=state,
                                             num_outputs=dim[1],
                                             biases_initializer=None,
                                             scope='hidden2h'), 1)
                    e = tf.reshape(tf.add(a_i, h), [-1, dim[1]])
                    _att = slim.fully_connected(inputs=e,
                                                num_outputs=1,
                                                scope='e2attention')
                    _alpha = tf.nn.softmax(tf.reshape(_att, [-1, dim[0]]))
                    attention_maps.append(_alpha)
                    _mask = tf.multiply(features, tf.expand_dims(_alpha, 2))
                    _masked.append(_mask)
                    _input = tf.reduce_sum(_mask, 1)
            '''
            for i in range(attention_steps - 1):
                if i > 0: tf.get_variable_scope().reuse_variables()
                _inputs.append(_input)
                output, state = gru_cell(_input, state)
                h = tf.expand_dims(slim.fully_connected(inputs=state, num_outputs=dim[1], biases_initializer=None, scope='hidden2h'), 1)
                e = tf.reshape(tf.add(a_i, h), [-1, dim[1]])
                _att = slim.fully_connected(inputs=e, num_outputs=1, scope='e2attention')
                _alpha = tf.nn.softmax(tf.reshape(_att, [-1, dim[0]]))
                attention_maps.append(_alpha)
                _input = tf.reduce_sum(tf.multiply(features, tf.expand_dims(_alpha, 2)), 1) 
            '''

    _mask_concat = tf.concat(_masked[:-1], 2)
    _masked = tf.reshape(_mask_concat, [-1, M, M, attention_steps * dim[1]])

    endpoints['model_output'] = endpoints['global_pool'] = tf.reduce_mean(
        _masked, [1, 2], name='_pool5', keep_dims=False)

    endpoints['head_output'] = slim.fully_connected(
        endpoints['model_output'],
        1024,
        normalizer_fn=slim.batch_norm,
        normalizer_params={
            'decay': 0.9,
            'epsilon': 1e-5,
            'scale': True,
            'is_training': is_training,
            'updates_collections': tf.GraphKeys.UPDATE_OPS,
        })

    endpoints['emb'] = endpoints['emb_raw'] = slim.fully_connected(
        endpoints['head_output'],
        embedding_dim,
        activation_fn=None,
        weights_initializer=tf.orthogonal_initializer(),
        scope='emb')

    return endpoints
Example #17
def build_bisenet3(inputs,
                   num_classes,
                   preset_model='DepthwiseAAFF',
                   frontend="xception",
                   weight_decay=1e-5,
                   is_training=True,
                   pretrained_dir="models"):

    initializer = slim.variance_scaling_initializer(factor=2.0,
                                                    mode='FAN_IN',
                                                    uniform=False)

    ### The spatial path
    ### The paper does not specify the number of feature maps per convolution;
    ### it was chosen here to match the feature-map count of a classification
    ### model at each corresponding stage

    # Filters for the depthwise-separable convolutions below
    point_filter1 = tf.get_variable(name="point_filter1",
                                    shape=(1, 1, 64, 128),
                                    initializer=initializer)
    point_filter2 = tf.get_variable(name="point_filter2",
                                    shape=(1, 1, 128, 256),
                                    initializer=initializer)
    filter1 = tf.get_variable(name="filter1",
                              shape=(3, 3, 64, 1),
                              initializer=initializer)
    filter2 = tf.get_variable(name="filter2",
                              shape=(3, 3, 128, 1),
                              initializer=initializer)
    # spatial path
    spatial_net = ConvBlock(inputs,
                            n_filters=64,
                            kernel_size=[3, 3],
                            strides=2)
    spatial_net = tf.nn.separable_conv2d(input=spatial_net,
                                         depthwise_filter=filter1,
                                         pointwise_filter=point_filter1,
                                         strides=[1, 2, 2, 1],
                                         rate=[1, 1],
                                         padding='SAME')
    spatial_net = tf.nn.separable_conv2d(input=spatial_net,
                                         depthwise_filter=filter2,
                                         pointwise_filter=point_filter2,
                                         strides=[1, 2, 2, 1],
                                         rate=[1, 1],
                                         padding='SAME')
    spatial_net = ConvBlock(spatial_net, n_filters=32, kernel_size=[1, 1])

    # Context path
    logits, end_points, frontend_scope, init_fn = frontend_builder.build_frontend(
        inputs,
        frontend,
        pretrained_dir=pretrained_dir,
        is_training=is_training)

    size = tf.shape(end_points['pool5'])[1:3]

    net_1 = AttentionAndFeatureFussion(end_points['pool3'],
                                       end_points['pool4'], 64)
    net_2 = AttentionAndFeatureFussion(net_1, end_points['pool5'], 128)
    net_2 = Upsampling(net_2, scale=2)
    net_1_2 = tf.concat([net_1, net_2], axis=-1)
    net_1_2 = Upsampling(net_1_2, scale=2)
    net_1_2_3 = tf.concat([net_1_2, end_points['pool3']], axis=-1)
    net_1_2_3 = ConvBlock(net_1_2_3,
                          n_filters=128,
                          kernel_size=[1, 1],
                          strides=1)
    context_path_left = AttentionRefinementModule(net_1_2_3, n_filters=128)

    net_3 = AttentionAndFeatureFussion(end_points['pool3'],
                                       end_points['pool4'], 64)
    net_4 = AttentionAndFeatureFussion(net_3, end_points['pool5'], 128)
    net_4 = Upsampling(net_4, scale=2)
    net_3_4 = tf.concat([net_3, net_4], axis=-1)
    net_3_4 = Upsampling(net_3_4, scale=2)
    net_3_4_5 = tf.concat([net_3_4, end_points['pool3']], axis=-1)
    net_3_4_5 = ConvBlock(net_3_4_5,
                          n_filters=128,
                          kernel_size=[1, 1],
                          strides=1)
    context_path_right = AttentionRefinementModule(net_3_4_5, n_filters=128)

    ### Combining the paths
    net = FeatureFusionModule(input_1=context_path_left,
                              input_2=context_path_right,
                              input_3=spatial_net,
                              n_filters=256)
    net = ConvBlock(net, n_filters=64, kernel_size=[3, 3])

    ### Final upsampling and prediction (upsampling + dilated conv, or upsampling only)
    net = Upsampling(net, scale=2)
    net = slim.conv2d(net,
                      64, [3, 3],
                      rate=2,
                      activation_fn=tf.nn.relu,
                      biases_initializer=None,
                      normalizer_fn=slim.batch_norm)
    net = slim.conv2d(net,
                      num_classes, [1, 1],
                      activation_fn=None,
                      scope='logits')
    net = Upsampling(net, 4)

    return net, init_fn
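
# A hedged usage sketch for the builder above. The returned init_fn is assumed
# to follow the slim convention of a callable taking a session and restoring
# the pretrained frontend weights; input shape and class count are illustrative.
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 512, 512, 3])
logits, init_fn = build_bisenet3(inputs, num_classes=12, is_training=True)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    init_fn(sess)   # load the pretrained frontend weights
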
Beispiel #18
0
def convolutional_alexnet_arg_scope(embed_config,
                                    trainable=True,
                                    is_training=False):
    """Defines the default arg scope.

  Args:
    embed_config: A dictionary which contains configurations for the embedding function.
    trainable: If the weights in the embedding function is trainable.
    is_training: If the embedding function is built for training.

  Returns:
    An `arg_scope` to use for the convolutional_alexnet models.
  """
    # Only consider the model to be in training mode if it's trainable.
    # This is vital for batch_norm since moving_mean and moving_variance
    # will get updated even if not trainable.
    is_model_training = trainable and is_training

    if get(embed_config, 'use_bn', True):
        #print("========= use bn")
        batch_norm_scale = get(embed_config, 'bn_scale', True)
        batch_norm_decay = 1 - get(embed_config, 'bn_momentum', 3e-4)
        batch_norm_epsilon = get(embed_config, 'bn_epsilon', 1e-6)
        batch_norm_params = {
            "scale": batch_norm_scale,
            # Decay for the moving averages.
            "decay": batch_norm_decay,
            # Epsilon to prevent 0s in variance.
            "epsilon": batch_norm_epsilon,
            "trainable": trainable,
            "is_training": is_model_training,
            # Collection containing the moving mean and moving variance.
            "variables_collections": {
                "beta": None,
                "gamma": None,
                "moving_mean": ["moving_vars"],
                "moving_variance": ["moving_vars"],
            },
            'updates_collections':
            None,  # Ensure that updates are done within a frame
        }
        normalizer_fn = slim.batch_norm
    else:
        batch_norm_params = {}
        normalizer_fn = None

    weight_decay = get(embed_config, 'weight_decay', 5e-4)
    if trainable:
        weights_regularizer = slim.l2_regularizer(weight_decay)
    else:
        weights_regularizer = None

    init_method = get(embed_config, 'init_method', 'kaiming_normal')
    if is_model_training:
        logging.info('embedding init method -- {}'.format(init_method))
    if init_method == 'kaiming_normal':
        # The same setting as siamese-fc
        initializer = slim.variance_scaling_initializer(factor=2.0,
                                                        mode='FAN_OUT',
                                                        uniform=False)
    else:
        initializer = slim.xavier_initializer()

    with slim.arg_scope(
        [slim.conv2d],  # no slim.separable_conv2d
            weights_regularizer=weights_regularizer,
            weights_initializer=initializer,
            padding='VALID',
            trainable=trainable,
            activation_fn=tf.nn.relu,
            normalizer_fn=normalizer_fn,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            with slim.arg_scope([slim.batch_norm],
                                is_training=is_model_training) as arg_sc:
                return arg_sc
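
# A hedged usage sketch: applying the arg scope above so that every
# slim.conv2d call inherits its initializer, regularizer, padding and
# batch-norm settings. The input shape and conv parameters are illustrative.
import tensorflow as tf
slim = tf.contrib.slim

images = tf.placeholder(tf.float32, [None, 255, 255, 3])
embed_config = {'use_bn': True, 'weight_decay': 5e-4}
with slim.arg_scope(convolutional_alexnet_arg_scope(embed_config,
                                                    trainable=True,
                                                    is_training=True)):
    net = slim.conv2d(images, 96, [11, 11], stride=2, scope='conv1')
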
def inception(input, is_training):
    weight_decay = 0.0005
    keep_prob = 0.5
    ## batch normalization parameter definitions
    batch_norm_decay = 0.996
    batch_norm_epsilon = 1e-5
    batch_norm_scale = True
    batch_norm_params = {
        'decay': batch_norm_decay,
        'epsilon': batch_norm_epsilon,
        'scale': batch_norm_scale,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
        'is_training': is_training
    }
    ## CNN architecture
    with slim.arg_scope(
        [slim.conv2d, slim.fully_connected],
            activation_fn=tf.nn.relu,
            weights_regularizer=slim.l2_regularizer(weight_decay),
            weights_initializer=slim.variance_scaling_initializer(),
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.dropout],
                            keep_prob=keep_prob,
                            is_training=is_training):
            with slim.arg_scope([slim.max_pool2d],
                                kernel_size=[2, 2],
                                stride=[2, 2]):
                with slim.arg_scope([slim.conv2d], padding='SAME'):
                    net = slim.conv2d(input, 4, [3, 3])
                    net = slim.conv2d(net, 8, [3, 3])
                    net = slim.conv2d(net, 16, [3, 3])
                    net = slim.max_pool2d(net)
            with slim.arg_scope(
                [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                    stride=1,
                    padding='SAME'):
                with tf.variable_scope('Mixed_1'):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 8, [1, 1])
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 16, [1, 1])
                        branch_1 = slim.conv2d(branch_1, 32, [3, 3])
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 8, [1, 1])
                        branch_2 = slim.conv2d(branch_2, 16, [5, 5])
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [2, 2],
                                                   stride=[1, 1],
                                                   padding='SAME')
                        branch_3 = slim.conv2d(branch_3, 16, [1, 1])
                    net = tf.concat([branch_0, branch_1, branch_2, branch_3],
                                    axis=3)
                with tf.variable_scope('Mixed_2'):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 16, [1, 1])
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 32, [1, 1])
                        branch_1 = slim.conv2d(branch_1, 64, [3, 3])
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 32, [1, 1])
                        branch_2 = slim.conv2d(branch_2, 64, [5, 5])
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [2, 2],
                                                   stride=[1, 1],
                                                   padding='SAME')
                        branch_3 = slim.conv2d(branch_3, 32, [1, 1])
                    net = tf.concat([branch_0, branch_1, branch_2, branch_3],
                                    axis=3)
                net = slim.conv2d(net, 2, [1, 1], activation_fn=None)
                net = slim.avg_pool2d(net,
                                      kernel_size=[net.shape[1], net.shape[2]],
                                      stride=[1, 1],
                                      padding='VALID')
                net = tf.reshape(net, [-1, 2])
                probabilities = tf.nn.softmax(net)

                return probabilities
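
# A minimal driving sketch for the classifier above. Note that it returns
# softmax probabilities, so a cross-entropy loss should be built from the
# pre-softmax tensor (or from log-probabilities). Input shape is illustrative.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 64, 64, 1])
probabilities = inception(x, is_training=True)      # shape [batch, 2]
predicted_class = tf.argmax(probabilities, axis=1)
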
def network(input,
            is_training,
            base_channel_count=48,
            bottleneck_blocks=False,
            separable_conv=False,
            gn_groups=None,
            gn_channels=None):
    '''ResNet v2 style semantic segmentation network with long range skips.

    Args:
        input: Input image tensor of shape [batch, height, width, channels].
        is_training: Whether the network is built for training (used by
            batch normalization).
        base_channel_count: Base channel count; blocks use multiples of it.
        bottleneck_blocks: If True, use bottleneck residual blocks.
        separable_conv: If True, use depthwise-separable convolutions.
        gn_groups: Optional group count for group normalization; if this or
            gn_channels is set, group norm replaces batch norm.
        gn_channels: Optional channels-per-group for group normalization.

    Returns:
        The final feature tensor (after batch norm and ReLU) at input
        resolution, ready for a prediction head.
    '''
    conv2d_params = {
        'padding': 'SAME',
        'weights_initializer': slim.variance_scaling_initializer(),
        'biases_initializer': None,
        'activation_fn': None,
        'normalizer_fn': None
    }

    if gn_groups is not None or gn_channels is not None:
        normalization_params = {
            'group_count': gn_groups,
            'channel_count': gn_channels
        }
        norm_op = tf_utils.group_normalization
    else:
        normalization_params = {
            'center': True,
            'scale': True,
            'decay': 0.9,
            'epsilon': 1e-5,
            'is_training': is_training
        }
        norm_op = slim.batch_norm

    if separable_conv:
        separable_conv2d_params = dict(conv2d_params)
        separable_conv2d_params['depth_multiplier'] = 1
        conv_op = slim.separable_conv2d
    else:
        separable_conv2d_params = {}
        conv_op = slim.conv2d

    with slim.arg_scope([slim.conv2d], **conv2d_params):
        with slim.arg_scope([slim.separable_conv2d],
                            **separable_conv2d_params):
            with slim.arg_scope([norm_op], **normalization_params):
                # First convolution to increase the channel count.
                net = slim.conv2d(input,
                                  base_channel_count, [3, 3],
                                  scope='input_conv')

                # 2 ResBlocks, store the output for the skip connection
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=1,
                                   scope='resblock_v2_1',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=1,
                                   scope='resblock_v2_2',
                                   bottleneck=bottleneck_blocks)
                skip0 = net

                # Pooling -> 1/2 res
                net = slim.max_pool2d(net, [2, 2], padding='SAME')

                # 3 ResBlocks, store the output for the skip connection
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=2,
                                   scope='resblock_v2_3',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=2,
                                   scope='resblock_v2_4',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=2,
                                   scope='resblock_v2_5',
                                   bottleneck=bottleneck_blocks)
                skip1 = net

                # Pooling -> 1/4 res
                net = slim.max_pool2d(net, [2, 2], padding='SAME')

                # 4 ResBlocks, store the output for the skip connection
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=4,
                                   scope='resblock_v2_6',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=4,
                                   scope='resblock_v2_7',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=4,
                                   scope='resblock_v2_8',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=4,
                                   scope='resblock_v2_9',
                                   bottleneck=bottleneck_blocks)
                skip2 = net

                # Pooling -> 1/8 res
                net = slim.max_pool2d(net, [2, 2], padding='SAME')

                # 2 ResBlocks, store the output for the skip connection
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=8,
                                   scope='resblock_v2_10',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=8,
                                   scope='resblock_v2_11',
                                   bottleneck=bottleneck_blocks)
                skip3 = net

                # Pooling -> 1/16 res
                net = slim.max_pool2d(net, [2, 2], padding='SAME')

                # 2 ResBlocks
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=8,
                                   scope='resblock_v2_12',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=8,
                                   scope='resblock_v2_13',
                                   bottleneck=bottleneck_blocks)

                # Unpool, crop and concatenate the skip connection
                net = tf.image.resize_nearest_neighbor(
                    net, [tf.shape(net)[1] * 2,
                          tf.shape(net)[2] * 2])
                net = net[:, :tf.shape(skip3)[1], :tf.shape(skip3)[2], :]
                net = tf.concat([net, skip3], axis=-1)

                # 2 ResBlocks, store the output for the skip connection
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=4,
                                   scope='resblock_v2_14',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=4,
                                   scope='resblock_v2_15',
                                   bottleneck=bottleneck_blocks)

                # Unpool, crop and concatenate the skip connection
                net = tf.image.resize_nearest_neighbor(
                    net, [tf.shape(net)[1] * 2,
                          tf.shape(net)[2] * 2])
                net = net[:, :tf.shape(skip2)[1], :tf.shape(skip2)[2], :]
                net = tf.concat([net, skip2], axis=-1)

                # 2 ResBlocks, store the output for the skip connection
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=4,
                                   scope='resblock_v2_16',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=4,
                                   scope='resblock_v2_17',
                                   bottleneck=bottleneck_blocks)

                # Unpool, crop and concatenate the skip connection
                net = tf.image.resize_nearest_neighbor(
                    net, [tf.shape(net)[1] * 2,
                          tf.shape(net)[2] * 2])
                net = net[:, :tf.shape(skip1)[1], :tf.shape(skip1)[2], :]
                net = tf.concat([net, skip1], axis=-1)

                # 2 ResBlocks, store the output for the skip connection
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=2,
                                   scope='resblock_v2_18',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=2,
                                   scope='resblock_v2_19',
                                   bottleneck=bottleneck_blocks)

                # Unpool, crop and concatenate the skip connection
                net = tf.image.resize_nearest_neighbor(
                    net, [tf.shape(net)[1] * 2,
                          tf.shape(net)[2] * 2])
                net = net[:, :tf.shape(skip0)[1], :tf.shape(skip0)[2], :]
                net = tf.concat([net, skip0], axis=-1)

                # 2 ResBlocks, store the output for the skip connection
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=1,
                                   scope='resblock_v2_20',
                                   bottleneck=bottleneck_blocks)
                net = res_block_v2(net,
                                   base_channel_count,
                                   conv_op,
                                   norm_op,
                                   channel_multiplier=1,
                                   scope='resblock_v2_21',
                                   bottleneck=bottleneck_blocks)

                # Final batchnorm and relu before the prediction.
                net = slim.batch_norm(net)
                net = tf.nn.relu(net)

                return net
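
# A hedged sketch of attaching a prediction head to the features returned by
# network(); this mirrors the output_conv used in the evaluation script below.
# The input shape and class count (19, e.g. Cityscapes) are illustrative.
import tensorflow as tf
slim = tf.contrib.slim

images = tf.placeholder(tf.float32, [None, 256, 256, 3])
features = network(images, is_training=False)
logits = slim.conv2d(features, 19, [3, 3], scope='output_conv',
                     activation_fn=None,
                     weights_initializer=slim.variance_scaling_initializer())
predictions = tf.nn.softmax(logits)   # per-pixel class probabilities
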
def build_graph(reader,
                model,
                eval_data_pattern,
                label_loss_fn,
                batch_size=1024,
                num_readers=1):
    """Creates the Tensorflow graph for evaluation.

    Args:
      reader: The data file reader. It should inherit from BaseReader.
      model: The core model (e.g. logistic or neural net). It should inherit from
        BaseModel.
      eval_data_pattern: glob path to the evaluation data files.
      label_loss_fn: What kind of loss to apply to the model. It should inherit
        from BaseLoss.
      batch_size: How many examples to process at a time.
      num_readers: How many threads to use for I/O operations.
    """

    global_step = tf.Variable(0, trainable=False, name="global_step")
    input_data_dict = get_input_evaluation_tensors(reader,
                                                   eval_data_pattern,
                                                   batch_size=batch_size,
                                                   num_readers=num_readers)
    video_id_batch = input_data_dict["video_ids"]
    model_input_raw = input_data_dict["video_matrix"]
    labels_batch = input_data_dict["labels"]
    num_frames = input_data_dict["num_frames"]
    tf.summary.histogram("model_input_raw", model_input_raw)

    local_device_protos = device_lib.list_local_devices()
    gpus = [x.name for x in local_device_protos if x.device_type == "GPU"]
    gpus = gpus[:FLAGS.num_gpu]
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = "/gpu:%d"
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = "/cpu:%d"
    print("flags!!!", device_string)
    # feature_dim = len(model_input_raw.get_shape()) - 1

    # Normalize input features.
    # model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

    if FLAGS.segment_labels:
        label_weights = input_data_dict["label_weights"]
    else:
        label_weights = None

    offset = np.array([4. / 512] * 1024 + [0] * 128)
    offset = tf.constant(offset, dtype=tf.float32)

    eigen_val = tf.constant(np.sqrt(
        np.load("yt8m_pca/eigenvals.npy")[:1024, 0]),
                            dtype=tf.float32)
    model_input = tf.multiply(
        model_input_raw - offset,
        tf.pad(eigen_val + 1e-4, [[0, 128]], constant_values=1.))

    tower_logits = []

    for i in range(num_towers):
        with tf.device(device_string % i):
            with tf.variable_scope("tower_%d" % i, reuse=False):
                result = model.create_model(model_input,
                                            num_frames=num_frames,
                                            vocab_size=reader.num_classes,
                                            labels=labels_batch,
                                            is_training=False)
                logits = result["logits"]
                tower_logits.append(logits)

    with tf.device(device_string % 0):
        with tf.variable_scope("ensemble"):
            ftr_mean = tf.reduce_mean(model_input, axis=1)
            print("ftr mean shape: ", ftr_mean.get_shape().as_list())
            ftr_mean = slim.batch_norm(ftr_mean,
                                       center=True,
                                       scale=True,
                                       fused=False,
                                       is_training=False,
                                       scope="mix_weights_bn")
            mix_weights = slim.fully_connected(
                ftr_mean,
                num_towers,
                activation_fn=None,
                weights_initializer=slim.variance_scaling_initializer(),
                scope="mix_weights")
            mix_weights = tf.nn.softmax(mix_weights, axis=-1)
            tf.summary.histogram("mix_weights", mix_weights)

            logits = tf.stack(tower_logits, axis=1)
            final_logit = tf.reduce_sum(tf.multiply(
                logits, tf.expand_dims(mix_weights, axis=-1)),
                                        axis=1,
                                        keepdims=False)
            final_predictions = tf.nn.sigmoid(final_logit)

        final_label_loss = label_loss_fn.calculate_loss(
            final_predictions, labels_batch, label_weights=label_weights)

        tf.summary.scalar("label_loss", final_label_loss)
        tf.add_to_collection("global_step", global_step)
        tf.add_to_collection("loss", final_label_loss)
        tf.add_to_collection("predictions", final_predictions)
        tf.add_to_collection("input_batch", model_input)
        tf.add_to_collection("input_batch_raw", model_input_raw)
        tf.add_to_collection("video_id_batch", video_id_batch)
        tf.add_to_collection("num_frames", num_frames)
        tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
        if FLAGS.segment_labels:
            tf.add_to_collection("label_weights",
                                 input_data_dict["label_weights"])
        tf.add_to_collection("summary_op", tf.summary.merge_all())
def net_structure(img1, img2):
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                        # He (aka MSRA) weight initialization
                        weights_initializer=slim.variance_scaling_initializer(),
                        activation_fn=LeakyReLU,
                        # We will do our own padding to match the original Caffe code
                        padding='VALID'):
        weights_regularizer = slim.l2_regularizer(weight_decay)
        with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer):
            with slim.arg_scope([slim.conv2d], stride=2):
                conv_a_1 = slim.conv2d(pad(img1, 3), 64, 7, scope='conv1')
                conv_a_2 = slim.conv2d(pad(conv_a_1, 2), 128, 5, scope='conv2')
                conv_a_3 = slim.conv2d(pad(conv_a_2, 2), 256, 5, scope='conv3')

                conv_b_1 = slim.conv2d(pad(img2, 3), 64, 7, scope='conv1', reuse=True)
                conv_b_2 = slim.conv2d(pad(conv_b_1, 2), 128, 5, scope='conv2', reuse=True)
                conv_b_3 = slim.conv2d(pad(conv_b_2, 2), 256, 5, scope='conv3', reuse=True)

                # Compute cross correlation with leaky relu activation
                cc = correlation.correlation(conv_a_3, conv_b_3, 1, 20, 1, 2, 20)
                cc_relu = LeakyReLU(cc)

            # Combine cross correlation results with convolution of feature map A
            netA_conv = slim.conv2d(conv_a_3, 32, 1, scope='conv_redir')
            # Concatenate along the channels axis
            net = tf.concat([netA_conv, cc_relu], axis=3)

            conv3_1 = slim.conv2d(pad(net), 256, 3, scope='conv3_1')
            with slim.arg_scope([slim.conv2d], num_outputs=512, kernel_size=3):
                conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4')
                conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1')
                conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5')
                conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1')
            conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6')
            conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1')

            """ START: Refinement Network """
            with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None):
                predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3,
                                            scope='predict_flow6',
                                            activation_fn=None)
                deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4,
                                                        stride=2,
                                                        scope='deconv5'))
                upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4,
                                                                  stride=2,
                                                                  scope='upsample_flow6to5',
                                                                  activation_fn=None))
                concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3)

                predict_flow5 = slim.conv2d(pad(concat5), 2, 3,
                                            scope='predict_flow5',
                                            activation_fn=None)
                deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4,
                                                        stride=2,
                                                        scope='deconv4'))
                upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4,
                                                                  stride=2,
                                                                  scope='upsample_flow5to4',
                                                                  activation_fn=None))
                concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3)

                predict_flow4 = slim.conv2d(pad(concat4), 2, 3,
                                            scope='predict_flow4',
                                            activation_fn=None)
                deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4,
                                                        stride=2,
                                                        scope='deconv3'))
                upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4,
                                                                  stride=2,
                                                                  scope='upsample_flow4to3',
                                                                  activation_fn=None))
                concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3)

                predict_flow3 = slim.conv2d(pad(concat3), 2, 3,
                                            scope='predict_flow3',
                                            activation_fn=None)
                deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4,
                                                        stride=2,
                                                        scope='deconv2'))
                upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4,
                                                                  stride=2,
                                                                  scope='upsample_flow3to2',
                                                                  activation_fn=None))
                concat2 = tf.concat([conv_a_2, deconv2, upsample_flow3to2], axis=3)

                predict_flow2 = slim.conv2d(pad(concat2), 2, 3,
                                            scope='predict_flow2',
                                            activation_fn=None)
            """ END: Refinement Network """

            '''new loss'''
            # target_height, target_width = int(predict_flow2.shape[1].value), int(predict_flow2.shape[2].value)
            # predict_flow6 = tf.image.resize_bilinear(predict_flow6,
            #                                          tf.stack([target_height, target_width]),
            #                                          align_corners=True)
            # predict_flow5 = tf.image.resize_bilinear(predict_flow5,
            #                                          tf.stack([target_height, target_width]),
            #                                          align_corners=True)
            # predict_flow4 = tf.image.resize_bilinear(predict_flow4,
            #                                          tf.stack([target_height, target_width]),
            #                                          align_corners=True)
            # predict_flow3 = tf.image.resize_bilinear(predict_flow3,
            #                                          tf.stack([target_height, target_width]),
            #                                          align_corners=True)
            # predict = tf.concat([predict_flow5, predict_flow4, predict_flow3, predict_flow2], axis=3)
            # flow = predict * 20.0
            # flow_temp0 = slim.conv2d(pad(predict), num_outputs=2, kernel_size=2, stride=1, scope='flow_temp0')
            # flow_temp = tf.image.resize_bilinear(flow_temp0,
            #                                      tf.stack([img_height, img_width]),
            #                                      align_corners=True)
            # flow = flow_temp * 20.0

            flow = predict_flow2 * 20.0
            # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different
            flow = tf.image.resize_bilinear(flow,
                                            tf.stack([img_height, img_width]),
                                            align_corners=True)

            return {
                'predict_flow6': predict_flow6,
                'predict_flow5': predict_flow5,
                'predict_flow4': predict_flow4,
                'predict_flow3': predict_flow3,
                'predict_flow2': predict_flow2,
                'flow': flow,
            }
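
# A hedged sketch of an endpoint-error loss on the returned 'flow' tensor,
# assuming a ground-truth flow of the same [batch, height, width, 2] shape.
import tensorflow as tf

def average_endpoint_error(flow_pred, flow_gt):
    # L2 distance between predicted and true flow vectors, averaged over
    # all pixels; the epsilon keeps the sqrt gradient finite at zero.
    return tf.reduce_mean(
        tf.sqrt(tf.reduce_sum(tf.square(flow_pred - flow_gt), axis=-1) + 1e-12))
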
def netbody(img, reuse=False):
    with tf.variable_scope('ResNet', reuse=reuse):
        net = img
        with slim.arg_scope(
            [slim.conv2d],
                padding='SAME',
                kernel_size=[3, 3],
                activation_fn=tf.nn.relu,
                weights_initializer=slim.variance_scaling_initializer(),
                normalizer_fn=self.BN if self.bn else None,
                normalizer_params={
                    'is_training': is_training,
                    'decay': self.bn_decay,
                    'reuse': reuse
                } if self.bn else None):
            net = slim.conv2d(net, self.chnl['block1'], scope='conv1')
            shortcut = net
            # ep['conv1'] = net
            for blk, name in enumerate(self.block):
                n = self.n[name]
                chnl = self.chnl[name]

                with tf.variable_scope(name):
                    self.prune[name] = tf.Variable(
                        np.ones((n, 1, 1, chnl), dtype=np.float32),
                        trainable=False, name='prune')
                    self.prune['ph' + name] = tf.placeholder(
                        tf.float32, shape=[n, 1, 1, chnl])
                    self.prune['asn' + name] = tf.assign(
                        self.prune[name], self.prune['ph' + name])
                    prune = tf.split(self.prune[self.block[blk]], n)
                    logger.info(name)
                    for i in range(n):
                        with tf.variable_scope('unit' + str(i), reuse=reuse):
                            if blk != 0 and i == 0:
                                # Parameter-free shortcut: average-pool, then
                                # zero-pad the channels to match.
                                shortcut = tf.nn.avg_pool(
                                    shortcut, [1, 2, 2, 1], [1, 2, 2, 1],
                                    'SAME')
                                shortcut = tf.concat(
                                    [shortcut, shortcut * 0.], 3)
                                net = shortcut * prune[i]
                                net = slim.conv2d(net, int(chnl / self.rate))
                            else:
                                net = net * prune[i]
                                net = slim.conv2d(net, int(chnl / self.rate))

                            net = slim.conv2d(net, chnl, activation_fn=None)
                            net = net * prune[i]
                            shortcut = shortcut + net
                            shortcut = tf.nn.relu(shortcut)
                            net = shortcut
        net = tf.reduce_mean(shortcut, [1, 2], keep_dims=False, name='pool')
        # ep['pool'] = net
        logit = slim.fully_connected(net,
                                     self.num_classes,
                                     activation_fn=None,
                                     normalizer_fn=None,
                                     scope='fc')
        return logit
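
# A hedged sketch of driving the pruning masks above: the paired placeholder
# and assign ops let new binary masks be swapped in without rebuilding the
# graph. `model` stands in for the object that owns the prune dict (the
# `self` in the snippet), and `sess` for an active tf.Session.
import numpy as np

name = model.block[0]
new_mask = np.ones((model.n[name], 1, 1, model.chnl[name]), dtype=np.float32)
new_mask[0, :, :, :model.chnl[name] // 2] = 0.  # prune half the channels of unit 0
sess.run(model.prune['asn' + name], {model.prune['ph' + name]: new_mask})
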
Beispiel #24
0
def main():
    args = parser.parse_args()

    # Parse original info from the experiment root and add new ones.
    args_file = os.path.join(args.experiment_root, 'args.json')
    if not os.path.isfile(args_file):
        raise IOError('`args.json` not found in {}'.format(args_file))
    print('Loading args from {}.'.format(args_file))
    with open(args_file, 'r') as f:
        args_resumed = json.load(f)
    for key, value in args_resumed.items():
        if key not in args.__dict__:
            args.__dict__[key] = value

    # Load the config for the dataset.
    with open(args.dataset_config, 'r') as f:
        dataset_config = json.load(f)

    # Compute the label to color map
    id_to_rgb = np.asarray(dataset_config['rgb_colors'] + [(0, 0, 0)],
                           dtype=np.uint8)[:, ::-1]

    # If we map from original labels to train labels we have to invert this.
    original_to_train_mapping = dataset_config.get('original_to_train_mapping',
                                                   None)
    if original_to_train_mapping is None:
        # This results in an identity mapping.
        train_to_label_id = np.arange(len(id_to_rgb) - 1, dtype=np.uint8)
    else:
        train_to_label_id = np.arange(len(id_to_rgb) - 1, dtype=np.uint8)
        for label_id, label_train in enumerate(original_to_train_mapping):
            if label_train != -1:
                train_to_label_id[label_train] = label_id

    # Setup the input data.
    image_files, label_files = utils.load_dataset(args.eval_set,
                                                  args.rgb_input_root,
                                                  args.full_res_label_root)

    images = tf.data.Dataset.from_tensor_slices(image_files)
    labels = tf.data.Dataset.from_tensor_slices(label_files)
    dataset = tf.data.Dataset.zip((images, labels))

    dataset = dataset.map(lambda x, y: tf_utils.string_tuple_to_image_pair(
        x, y, original_to_train_mapping),
                          num_parallel_calls=args.loading_threads)
    dataset = tf.data.Dataset.zip((dataset, labels))

    # Scale the input images
    dataset = dataset.map(lambda x, y: (((x[0] - 128.0) / 128.0, x[1]), y))

    dataset = dataset.batch(args.batch_size)

    # Overlap producing and consuming for parallelism.
    dataset = dataset.prefetch(1)

    # Since we repeat the data infinitely, we only need a one-shot iterator.
    (image_batch, label_batch
     ), label_name_batch = dataset.make_one_shot_iterator().get_next()

    # Setup the network.
    model = import_module('networks.' + args.model_type)
    with tf.name_scope('model'):
        net = model.network(image_batch,
                            is_training=False,
                            **args.model_params)
        logits = slim.conv2d(
            net,
            len(dataset_config['class_names']), [3, 3],
            scope='output_conv',
            activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer(),
            biases_initializer=tf.zeros_initializer())
        predictions = tf.nn.softmax(logits)

    with tf.Session() as sess:
        # Determine the checkpoint location.
        checkpoint_loader = tf.train.Saver()
        if args.checkpoint_iteration == -1:
            # The default TF way to do this fails when moving folders.
            checkpoint = os.path.join(
                args.experiment_root,
                'checkpoint-{}'.format(args.train_iterations))
        else:
            checkpoint = os.path.join(
                args.experiment_root,
                'checkpoint-{}'.format(args.checkpoint_iteration))
        iteration = int(checkpoint.split('-')[-1])
        print('Restoring from checkpoint: {}'.format(checkpoint))
        checkpoint_loader.restore(sess, checkpoint)

        # Setup storage if needed.
        result_directory = os.path.join(args.experiment_root,
                                        'results-{}'.format(iteration))
        if (not os.path.isdir(result_directory)
                and args.save_predictions != 'none'):
            os.makedirs(result_directory)

        # Initialize the evaluation.
        evaluation = confusion.Confusion(dataset_config['class_names'])

        # Loop over image batches.
        for start_idx in count(step=args.batch_size):
            try:
                print('\rEvaluating batch {}-{}/{}'.format(
                    start_idx, start_idx + args.batch_size, len(image_files)),
                      flush=True,
                      end='')
                preds_batch, gt_batch, gt_fn_batch = sess.run(
                    [predictions, label_batch, label_name_batch])
                for pred, gt, gt_fn in zip(preds_batch, gt_batch, gt_fn_batch):
                    # Compute the scores.
                    pred_full = np.argmax(cv2.resize(pred, gt.shape[:2][::-1]),
                                          -1)
                    evaluation.incremental_update(gt.squeeze(), pred_full)

                    # Possibly save result images.
                    if args.save_predictions == 'full':
                        pred_out = id_to_rgb[pred_full]

                    if args.save_predictions == 'out':
                        pred_out = id_to_rgb[np.argmax(pred, -1)]

                    if args.save_predictions == 'full_id':
                        pred_out = train_to_label_id[pred_full]

                    if args.save_predictions == 'out_id':
                        pred_out = train_to_label_id[np.argmax(pred, -1)]

                    if args.save_predictions != 'none':
                        out_filename = gt_fn.decode("utf-8").replace(
                            args.full_res_label_root, result_directory)
                        base_dir = os.path.dirname(out_filename)
                        if not os.path.isdir(base_dir):
                            os.makedirs(base_dir)
                        cv2.imwrite(out_filename, pred_out)

            except tf.errors.OutOfRangeError:
                print()  # Done!
                break

    # Print the evaluation.
    evaluation.print_confusion_matrix()

    # Save the results.
    result_file = os.path.join(args.experiment_root, 'results.json')
    try:
        with open(result_file, 'r') as f:
            result_log = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        result_log = {}

    result_log[str(iteration)] = {  # json keys cannot be integers.
        'confusion matrix' : evaluation.confusion_normalized_row.tolist(),
        'iou scores' : evaluation.iou_score.tolist(),
        'class scores' : evaluation.class_score.tolist(),
        'global score' : evaluation.global_score,
        'mean iou score' : evaluation.avg_iou_score,
        'mean class score' : evaluation.avg_score,
    }
    with open(result_file, 'w') as f:
        json.dump(result_log, f, ensure_ascii=False, indent=2, sort_keys=True)
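
# A hedged sketch of reading the accumulated results back; iteration keys are
# stored as strings, as noted above. The path is illustrative.
import json

with open('experiment_root/results.json', 'r') as f:
    result_log = json.load(f)
for iteration in sorted(result_log, key=int):
    print(iteration, result_log[iteration]['mean iou score'])
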
Beispiel #25
0
  def _recognition_network(self, sampler=None, log_likelihood_func=None):
    """x values -> samples from Q and return log Q(h|x)."""
    samples = {}
    reuse = None if not self.run_recognition_network else True

    # Set defaults
    if sampler is None:
      sampler = self._random_sample

    if log_likelihood_func is None:
      log_likelihood_func = lambda sample, log_params: (
        U.binary_log_likelihood(sample['activation'], log_params))

    logQ = []

    if self.hparams.task in ['sbn', 'omni']:
      # Initialize the edge case
      samples[-1] = {'activation': self._x}
      if self.mean_xs is not None:
        samples[-1]['activation'] -= self.mean_xs  # center the input
      samples[-1]['activation'] = (samples[-1]['activation'] + 1)/2.0

      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[Q_COLLECTION]):
        for i in xrange(self.hparams.n_layer):
          # Set up the input to the layer
          input = 2.0*samples[i-1]['activation'] - 1.0

          # Create the conditional distribution (output is the logits)
          h = self._create_transformation(input,
                                          n_output=self.hparams.n_hidden,
                                          reuse=reuse,
                                          scope_prefix='q_%d' % i)

          samples[i] = sampler(h, self.uniform_samples[i], i)
          logQ.append(log_likelihood_func(samples[i], h))

      self.run_recognition_network = True
      return logQ, samples
    elif self.hparams.task == 'sp':
      # Initialize the edge case
      samples[-1] = {'activation': tf.split(self._x,
                                            num_or_size_splits=2,
                                            axis=1)[0]}  # top half of digit
      if self.mean_xs is not None:
        samples[-1]['activation'] -= np.split(self.mean_xs, 2, 0)[0]  # center the input
      samples[-1]['activation'] = (samples[-1]['activation'] + 1)/2.0

      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[Q_COLLECTION]):
        for i in xrange(self.hparams.n_layer):
          # Set up the input to the layer
          input = 2.0*samples[i-1]['activation'] - 1.0

          # Create the conditional distribution (output is the logits)
          h = self._create_transformation(input,
                                          n_output=self.hparams.n_hidden,
                                          reuse=reuse,
                                          scope_prefix='q_%d' % i)

          samples[i] = sampler(h, self.uniform_samples[i], i)
          logQ.append(log_likelihood_func(samples[i], h))

      self.run_recognition_network = True
      return logQ, samples
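
# U.binary_log_likelihood is an external helper; a hedged sketch of the
# standard Bernoulli log-likelihood it presumably computes, given logits:
#   log p(b | l) = b * log sigmoid(l) + (1 - b) * log sigmoid(-l)
import tensorflow as tf

def binary_log_likelihood_sketch(b, log_params):
    # Numerically stable form; b is binary, log_params are logits.
    return tf.reduce_sum(
        b * tf.log_sigmoid(log_params) +
        (1. - b) * tf.log_sigmoid(-log_params),
        axis=-1)
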
Beispiel #26
0
    def create_model(
            self,
            model_input,
            vocab_size,
            num_frames,
            iterations=None,
            add_batch_norm=None,
            sample_random_frames=None,
            cluster_size=None,
            hidden_size=None,
            is_training=True,
            expansion=2,
            groups=None,
            #mask=None,
            drop_rate=0.5,
            gating_reduction=None,
            **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = (FLAGS.sample_random_frames
                         if sample_random_frames is None else sample_random_frames)
        cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
        hidden_size = hidden_size or FLAGS.nextvlad_hidden_size
        groups = groups or FLAGS.groups
        gating_reduction = gating_reduction or FLAGS.gating_reduction

        num_frames_exp = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames_exp,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input,
                                                     num_frames_exp,
                                                     iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        #reshaped_input = tf.reshape(model_input, [-1, feature_size])
        #tf.summary.histogram("input_hist", reshaped_input)

        mask = tf.sequence_mask(num_frames, max_frames, dtype=tf.float32)

        input = slim.fully_connected(
            model_input,
            expansion * feature_size,
            activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer())

        attention = slim.fully_connected(
            model_input,
            groups,
            activation_fn=tf.nn.sigmoid,
            weights_initializer=slim.variance_scaling_initializer())

        if mask is not None:
            attention = tf.multiply(attention, tf.expand_dims(mask, -1))
        attention = tf.reshape(attention, [-1, max_frames * groups, 1])
        tf.summary.histogram("sigmoid_attention", attention)
        reduce_size = expansion * feature_size // groups

        cluster_weights = tf.get_variable(
            "cluster_weights",
            [expansion * feature_size, groups * cluster_size],
            initializer=slim.variance_scaling_initializer())

        # tf.summary.histogram("cluster_weights", cluster_weights)
        reshaped_input = tf.reshape(input, [-1, expansion * feature_size])
        activation = tf.matmul(reshaped_input, cluster_weights)

        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="cluster_bn",
                                     fused=False)

        activation = tf.reshape(activation,
                                [-1, max_frames * groups, cluster_size])
        activation = tf.nn.softmax(activation, axis=-1)
        activation = tf.multiply(activation, attention)
        # tf.summary.histogram("cluster_output", activation)
        a_sum = tf.reduce_sum(activation, -2, keep_dims=True)

        cluster_weights2 = tf.get_variable(
            "cluster_weights2", [1, reduce_size, cluster_size],
            initializer=slim.variance_scaling_initializer())
        a = tf.multiply(a_sum, cluster_weights2)

        activation = tf.transpose(activation, perm=[0, 2, 1])

        reshaped_input = tf.reshape(input,
                                    [-1, max_frames * groups, reduce_size])
        vlad = tf.matmul(activation, reshaped_input)
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.subtract(vlad, a)

        vlad = tf.nn.l2_normalize(vlad, 1)

        vlad = tf.reshape(vlad, [-1, cluster_size * reduce_size])
        vlad = slim.batch_norm(vlad,
                               center=True,
                               scale=True,
                               is_training=is_training,
                               scope="vlad_bn",
                               fused=False)

        if drop_rate > 0.:
            vlad = slim.dropout(vlad,
                                keep_prob=1. - drop_rate,
                                is_training=is_training,
                                scope="vlad_dropout")

        vlad_dim = vlad.get_shape().as_list()[1]
        print("VLAD dimension", vlad_dim)
        hidden_weights = tf.get_variable(
            "hidden_weights", [vlad_dim, hidden_size],
            initializer=slim.variance_scaling_initializer())

        activation = tf.matmul(vlad, hidden_weights)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden_bn",
                                     fused=False)

        activation = tf.nn.relu(activation, name='embedding1')

        gating_weights_1 = tf.get_variable(
            "gating_weights_1", [hidden_size, hidden_size // gating_reduction],
            initializer=slim.variance_scaling_initializer())

        gates = tf.matmul(activation, gating_weights_1)

        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                activation_fn=slim.nn.relu,
                                scope="gating_bn")

        gating_weights_2 = tf.get_variable(
            "gating_weights_2", [hidden_size // gating_reduction, hidden_size],
            initializer=slim.variance_scaling_initializer())
        gates = tf.matmul(gates, gating_weights_2)

        gates = tf.sigmoid(gates)
        tf.summary.histogram("final_gates", gates)

        activation = tf.multiply(activation, gates, name="embedding2")

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
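
A note on the block above: the pair gating_weights_1 / gating_weights_2 implements a squeeze-and-excitation style context gate. The hidden activation is bottlenecked by a factor of gating_reduction, re-expanded, squashed with a sigmoid, and multiplied back onto itself. A minimal NumPy sketch of that data flow, with the batch norms omitted and random stand-in weights (everything here is illustrative, not the repository's helper):

import numpy as np

def context_gate_sketch(h, reduction=8, rng=np.random.default_rng(0)):
    """Squeeze-excitation style gate: h * sigmoid(relu(h @ W1) @ W2)."""
    hidden = h.shape[-1]
    w1 = rng.normal(size=(hidden, hidden // reduction))  # bottleneck
    w2 = rng.normal(size=(hidden // reduction, hidden))  # expand back
    gates = np.maximum(h @ w1, 0.0)                # ReLU (BN omitted)
    gates = 1.0 / (1.0 + np.exp(-(gates @ w2)))    # sigmoid in [0, 1]
    return h * gates                               # reweight each channel

h = np.random.randn(4, 1024)                       # [batch, hidden_size]
print(context_gate_sketch(h).shape)                # (4, 1024)
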
Example #27
0
def layer(val,
          num_outputs,
          name,
          act_fun=None,
          kernel_initializer=slim.variance_scaling_initializer(
              factor=1.0 / 3.0, mode='FAN_IN', uniform=True),
          layer_norm=False,
          batch_norm=False,
          phase=None,
          dropout=False,
          rate=None):
    """Create a fully-connected layer.

    Parameters
    ----------
    val : tf.Variable
        the input to the layer
    num_outputs : int
        number of outputs from the layer
    name : str
        the scope of the layer
    act_fun : tf.nn.* or None
        the activation function
    kernel_initializer : Any
        the initializing operation to the weights of the layer
    layer_norm : bool
        whether to enable layer normalization
    batch_norm : bool
        whether to enable batch normalization
    phase : tf.compat.v1.placeholder
        a placeholder that defines whether training is occurring for the batch
        normalization layer. Set to True in training and False in testing.
    dropout : bool
        whether to enable dropout
    rate : tf.compat.v1.placeholder
        the probability that each element is dropped if dropout is implemented

    Returns
    -------
    tf.Variable
        the output from the layer
    """
    val = tf.layers.dense(val,
                          num_outputs,
                          name=name,
                          kernel_initializer=kernel_initializer)

    if layer_norm:
        val = tf.contrib.layers.layer_norm(val, center=True, scale=True)

    if batch_norm:
        val = tf.contrib.layers.batch_norm(
            val,
            center=True,
            scale=True,
            is_training=phase,
            scope='bn_{}'.format(name),
        )

    if act_fun is not None:
        val = act_fun(val)

    if dropout:
        val = tf.nn.dropout(val, rate=rate)

    return val
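
A hedged usage sketch for the helper above; the placeholder names, shapes and feed values are illustrative assumptions, not part of the original:

# Illustrative only: a small MLP head built from the `layer` helper.
x = tf.compat.v1.placeholder(tf.float32, [None, 64], name='obs')
phase = tf.compat.v1.placeholder(tf.bool, name='phase')        # True when training
drop = tf.compat.v1.placeholder(tf.float32, name='drop_rate')  # dropout probability

h = layer(x, 256, 'fc0', act_fun=tf.nn.relu, batch_norm=True, phase=phase)
out = layer(h, 1, 'fc1', act_fun=None, dropout=True, rate=drop)
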
Example #28
0
    def _generator_network(self, samples, logQ, log_likelihood_func=None):
        '''Returns learning signal and function.

    This is the implementation for SBNs for the ELBO.

    Args:
      samples: dictionary of sampled latent variables
      logQ: list of log q(h_i) terms
      log_likelihood_func: function used to compute log probs for the latent
        variables

    Returns:
      learning_signal: the "reward" function
      function_term: part of the function that depends on the parameters
        and needs to have the gradient taken through
    '''
        reuse = None if not self.run_generator_network else True

        if self.hparams.task in ['sbn', 'omni']:
            if log_likelihood_func is None:
                log_likelihood_func = lambda sample, log_params: (
                    U.binary_log_likelihood(sample['activation'], log_params))

            logPPrior = log_likelihood_func(samples[self.hparams.n_layer - 1],
                                            tf.expand_dims(self.prior, 0))

            with slim.arg_scope(
                [slim.fully_connected],
                    weights_initializer=slim.variance_scaling_initializer(),
                    variables_collections=[P_COLLECTION]):

                for i in reversed(xrange(self.hparams.n_layer)):
                    if i == 0:
                        n_output = self.hparams.n_input
                    else:
                        n_output = self.hparams.n_hidden
                    input = 2.0 * samples[i]['activation'] - 1.0

                    h = self._create_transformation(input,
                                                    n_output,
                                                    reuse=reuse,
                                                    scope_prefix='p_%d' % i)

                    if i == 0:
                        # Assume output is binary
                        logP = U.binary_log_likelihood(self._x,
                                                       h + self.train_bias)
                    else:
                        logPPrior += log_likelihood_func(samples[i - 1], h)

            self.run_generator_network = True
            return logP + logPPrior - tf.add_n(logQ), logP + logPPrior
        elif self.hparams.task == 'sp':
            with slim.arg_scope(
                [slim.fully_connected],
                    weights_initializer=slim.variance_scaling_initializer(),
                    variables_collections=[P_COLLECTION]):
                n_output = int(self.hparams.n_input / 2)
                i = self.hparams.n_layer - 1  # use the last layer
                input = 2.0 * samples[i]['activation'] - 1.0

                h = self._create_transformation(input,
                                                n_output,
                                                reuse=reuse,
                                                scope_prefix='p_%d' % i)

                # Predict on the lower half of the image
                logP = U.binary_log_likelihood(
                    tf.split(self._x, num_or_size_splits=2, axis=1)[1],
                    h + np.split(self.train_bias, 2, 0)[1])

            self.run_generator_network = True
            return logP, logP
    def _build_graph(self):

        hidden1_size = self.NetVLADHiddenSize
        gating_reduction = 8
        model_input = tf.concat(
            [self.input_video_RGB_feature, self.input_video_Audio_feature],
            -1)  # [batch,max_frame,1024+128]
        mask = tf.sequence_mask(self.input_rgb_audio_true_frame,
                                300,
                                dtype=tf.float32)
        max_frames = model_input.get_shape().as_list()[1]
        video_nextvlad = NeXtVLAD(1024,
                                  max_frames,
                                  self.cluster_size,
                                  self.is_training,
                                  groups=self.groups,
                                  expansion=self.expansion)
        audio_nextvlad = NeXtVLAD(128,
                                  max_frames,
                                  self.cluster_size // 2,
                                  self.is_training,
                                  groups=self.groups // 2,
                                  expansion=self.expansion)

        with tf.variable_scope("video_VLAD"):
            vlad_video = video_nextvlad.forward(model_input[:, :, 0:1024],
                                                mask=mask)

        with tf.variable_scope("audio_VLAD"):
            vlad_audio = audio_nextvlad.forward(model_input[:, :, 1024:],
                                                mask=mask)

        vlad = tf.concat([vlad_video, vlad_audio], 1)

        vlad = slim.dropout(vlad,
                            keep_prob=self.dropout_keep_prob,
                            is_training=self.is_training,
                            scope="vlad_dropout")

        vlad_dim = vlad.get_shape().as_list()[1]
        print("VLAD dimension", vlad_dim)
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [vlad_dim, hidden1_size],
            initializer=slim.variance_scaling_initializer())

        activation = tf.matmul(vlad, hidden1_weights)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=self.is_training,
                                     scope="hidden1_bn",
                                     fused=False)

        gating_weights_1 = tf.get_variable(
            "gating_weights_1",
            [hidden1_size, hidden1_size // gating_reduction],
            initializer=slim.variance_scaling_initializer())

        gates = tf.matmul(activation, gating_weights_1)

        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=self.is_training,
                                activation_fn=slim.nn.relu,
                                scope="gating_bn")

        gating_weights_2 = tf.get_variable(
            "gating_weights_2",
            [hidden1_size // gating_reduction, hidden1_size],
            initializer=slim.variance_scaling_initializer())
        gates = tf.matmul(gates, gating_weights_2)

        gates = tf.sigmoid(gates)
        tf.summary.histogram("final_gates", gates)

        activation = tf.multiply(activation, gates)

        l2_penalty = 1e-8

        with tf.variable_scope("output_cate1"):
            self.cate1_logits = slim.fully_connected(
                activation,
                len(self.youtu_8m_cate1_dict),
                activation_fn=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                biases_regularizer=slim.l2_regularizer(l2_penalty),
                weights_initializer=slim.variance_scaling_initializer())
            self.cate1_probs = tf.nn.sigmoid(self.cate1_logits)

            self.cate1_top5_probs_value, self.cate1_top5_probs_index = tf.nn.top_k(
                self.cate1_probs, 5)

            # self.total_loss=self.calculate_loss(predictions=self.logits,labels=self.input_cate2_multilabel)

            self.cate1_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=self.input_cate1_multilabel,
                logits=self.cate1_logits,
                name="cate2_cross_loss")

            self.mean_cate1_loss = tf.reduce_mean(self.cate1_loss)

        self.cate1_embeddings = tf.cast(self.youtu_8m_cate1_embedding,
                                        dtype=tf.float32)

        with tf.variable_scope('attention'):
            self.U = tf.tanh(
                tc.layers.fully_connected(self.cate1_embeddings,
                                          num_outputs=512,
                                          activation_fn=None,
                                          biases_initializer=None) +
                tc.layers.fully_connected(tf.expand_dims(activation, 1),
                                          num_outputs=512,
                                          activation_fn=None))
            self.first_logits = tc.layers.fully_connected(self.U,
                                                          num_outputs=1,
                                                          activation_fn=None)
            self.first_scores = tf.nn.softmax(self.first_logits, 1)  # [batch, num_cate1, 1]

            self.cate1_embeddings_attention = tf.reduce_sum(
                self.cate1_embeddings * self.first_scores,
                axis=1)  # [batch, embed_dim]

        with tf.variable_scope("output_cate2"):
            self.cate2_logits = slim.fully_connected(
                tf.concat([activation, self.cate1_embeddings_attention], -1),
                3862,
                activation_fn=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                biases_regularizer=slim.l2_regularizer(l2_penalty),
                weights_initializer=slim.variance_scaling_initializer())
            self.cate2_probs = tf.nn.sigmoid(self.cate2_logits)

            self.cate2_top20_probs_value, self.cate2_top20_probs_index = tf.nn.top_k(
                self.cate2_probs, 20)
            self.cate2_top40_probs_value, self.cate2_top40_probs_index = tf.nn.top_k(
                self.cate2_probs, 40)

            # self.total_loss=self.calculate_loss(predictions=self.logits,labels=self.input_cate2_multilabel)

            self.cate2_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=self.input_cate2_multilabel,
                logits=self.cate2_logits,
                name="cate2_cross_loss")
            self.mean_cate2_loss = tf.reduce_mean(self.cate2_loss)

        self.total_loss = self.mean_cate1_loss + 2 * self.mean_cate2_loss
    def build_bisenet(self, reuse=False):
        """
        Builds the BiSeNet model.

        Arguments:
          reuse: Reuse variable or not

        Returns:
          BiSeNet model
        """

        ### The spatial path
        ### The number of feature maps for each convolution is not specified in the paper
        ### It was chosen here to be equal to the number of feature maps of a classification
        ### model at each corresponding stage
        batch_norm_params = self.model_config['batch_norm_params']
        init_method = self.model_config['conv_config']['init_method']

        if init_method == 'kaiming_normal':
            initializer = slim.variance_scaling_initializer(factor=2.0,
                                                            mode='FAN_IN',
                                                            uniform=False)
        else:
            initializer = slim.xavier_initializer()

        with tf.variable_scope('spatial_net', reuse=reuse):
            with slim.arg_scope([slim.conv2d],
                                biases_initializer=None,
                                weights_initializer=initializer):
                with slim.arg_scope([slim.batch_norm],
                                    is_training=self.is_training(),
                                    **batch_norm_params):
                    spatial_net = ConvBlock(self.images,
                                            n_filters=64,
                                            kernel_size=[7, 7],
                                            strides=2)
                    spatial_net = ConvBlock(spatial_net,
                                            n_filters=64,
                                            kernel_size=[3, 3],
                                            strides=2)
                    spatial_net = ConvBlock(spatial_net,
                                            n_filters=64,
                                            kernel_size=[3, 3],
                                            strides=2)
                    spatial_net = ConvBlock(spatial_net,
                                            n_filters=128,
                                            kernel_size=[1, 1])

        frontend_config = self.model_config['frontend_config']
        ### Context path
        logits, end_points, frontend_scope, init_fn = frontend_builder.build_frontend(
            self.images, frontend_config, self.is_training(), reuse)

        ### Combining the paths
        with tf.variable_scope('combine_path', reuse=reuse):
            with slim.arg_scope([slim.conv2d],
                                biases_initializer=None,
                                weights_initializer=initializer):
                with slim.arg_scope([slim.batch_norm],
                                    is_training=self.is_training(),
                                    **batch_norm_params):
                    # tail part
                    size = tf.shape(end_points['pool5'])[1:3]
                    global_context = tf.reduce_mean(end_points['pool5'],
                                                    [1, 2],
                                                    keep_dims=True)
                    global_context = slim.conv2d(global_context,
                                                 128,
                                                 1, [1, 1],
                                                 activation_fn=None)
                    global_context = tf.nn.relu(
                        slim.batch_norm(global_context, fused=True))
                    global_context = tf.image.resize_bilinear(global_context,
                                                              size=size)

                    net_5 = AttentionRefinementModule(end_points['pool5'],
                                                      n_filters=128)
                    net_4 = AttentionRefinementModule(end_points['pool4'],
                                                      n_filters=128)

                    net_5 = tf.add(net_5, global_context)
                    net_5 = Upsampling(net_5, scale=2)
                    net_5 = ConvBlock(net_5, n_filters=128, kernel_size=[3, 3])
                    net_4 = tf.add(net_4, net_5)
                    net_4 = Upsampling(net_4, scale=2)
                    net_4 = ConvBlock(net_4, n_filters=128, kernel_size=[3, 3])

                    context_net = net_4

                    net = FeatureFusionModule(input_1=spatial_net,
                                              input_2=context_net,
                                              n_filters=256)
                    net_5 = ConvBlock(net_5, n_filters=128, kernel_size=[3, 3])
                    net_4 = ConvBlock(net_4, n_filters=128, kernel_size=[3, 3])
                    net = ConvBlock(net, n_filters=64, kernel_size=[3, 3])

                    # Upsampling + dilation or only Upsampling
                    net = Upsampling(net, scale=2)
                    net = slim.conv2d(net,
                                      64, [3, 3],
                                      rate=2,
                                      activation_fn=tf.nn.relu,
                                      biases_initializer=None,
                                      normalizer_fn=slim.batch_norm)

                    net = slim.conv2d(net,
                                      self.num_classes, [1, 1],
                                      activation_fn=None,
                                      scope='logits')
                    self.net = Upsampling(net, 4)

                    # net = slim.conv2d(net, self.num_classes, [1, 1], activation_fn=None, scope='logits')
                    # self.net = Upsampling(net, scale=8)

                    if self.mode in ['train', 'validation', 'test']:
                        sup1 = slim.conv2d(net_5,
                                           self.num_classes, [1, 1],
                                           activation_fn=None,
                                           scope='supl1')
                        sup2 = slim.conv2d(net_4,
                                           self.num_classes, [1, 1],
                                           activation_fn=None,
                                           scope='supl2')
                        self.sup1 = Upsampling(sup1, scale=16)
                        self.sup2 = Upsampling(sup2, scale=8)
                        self.init_fn = init_fn
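
AttentionRefinementModule, ConvBlock, Upsampling and FeatureFusionModule are defined elsewhere in this repository. For orientation only: BiSeNet's attention refinement module gates a feature map with channel attention computed from its global average pool. A hypothetical slim sketch of that idea (the 3x3 projection and the exact normalization are assumptions, not the repository's code):

def attention_refinement_module(inputs, n_filters):
    # Hypothetical sketch: project, then gate channels by pooled context.
    net = slim.conv2d(inputs, n_filters, [3, 3], activation_fn=tf.nn.relu)
    context = tf.reduce_mean(net, [1, 2], keepdims=True)    # global pool
    context = slim.conv2d(context, n_filters, [1, 1], activation_fn=None)
    context = tf.sigmoid(slim.batch_norm(context))          # channel weights
    return net * context                                    # broadcast gate
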
    def forward(self, input, mask=None):
        input = slim.fully_connected(
            input,
            self.expansion * self.feature_size,
            activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer())

        attention = slim.fully_connected(
            input,
            self.groups,
            activation_fn=tf.nn.sigmoid,
            weights_initializer=slim.variance_scaling_initializer())
        if mask is not None:
            attention = tf.multiply(attention, tf.expand_dims(mask, -1))
        attention = tf.reshape(attention,
                               [-1, self.max_frames * self.groups, 1])
        tf.summary.histogram("sigmoid_attention", attention)
        feature_size = self.expansion * self.feature_size // self.groups

        cluster_weights = tf.get_variable(
            "cluster_weights", [
                self.expansion * self.feature_size,
                self.groups * self.cluster_size
            ],
            initializer=slim.variance_scaling_initializer())

        # tf.summary.histogram("cluster_weights", cluster_weights)
        reshaped_input = tf.reshape(input,
                                    [-1, self.expansion * self.feature_size])
        activation = tf.matmul(reshaped_input, cluster_weights)

        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=self.is_training,
                                     scope="cluster_bn",
                                     fused=False)

        activation = tf.reshape(
            activation, [-1, self.max_frames * self.groups, self.cluster_size])
        activation = tf.nn.softmax(activation, axis=-1)
        activation = tf.multiply(activation, attention)
        # tf.summary.histogram("cluster_output", activation)
        a_sum = tf.reduce_sum(activation, -2, keepdims=True)

        cluster_weights2 = tf.get_variable(
            "cluster_weights2", [1, feature_size, self.cluster_size],
            initializer=slim.variance_scaling_initializer())
        a = tf.multiply(a_sum, cluster_weights2)

        activation = tf.transpose(activation, perm=[0, 2, 1])

        reshaped_input = tf.reshape(
            input, [-1, self.max_frames * self.groups, feature_size])
        vlad = tf.matmul(activation, reshaped_input)
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.subtract(vlad, a)

        vlad = tf.nn.l2_normalize(vlad, 1)

        vlad = tf.reshape(vlad, [-1, self.cluster_size * feature_size])
        vlad = slim.batch_norm(vlad,
                               center=True,
                               scale=True,
                               is_training=self.is_training,
                               scope="vlad_bn",
                               fused=False)

        return vlad
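
The descriptor returned by forward() has dimension cluster_size * (expansion * feature_size // groups). A quick check of that arithmetic for the video branch, assuming expansion=2, groups=8 and cluster_size=64 (common NeXtVLAD settings, not values fixed by this file):

expansion, groups, cluster_size, feature_size = 2, 8, 64, 1024
reduced = expansion * feature_size // groups   # per-group feature size: 256
vlad_dim = cluster_size * reduced              # 16384-dim video descriptor
print(vlad_dim)
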
Example #32
0
def head(endpoints, embedding_dim, is_training):
    batch_norm_params = {
            'decay': 0.9,
            'epsilon': 1e-5,
            'scale': True,
            'updates_collections': tf.GraphKeys.UPDATE_OPS,
            'fused': None,
            }
    with slim.arg_scope(
            [slim.conv2d],
            weights_regularizer=slim.l2_regularizer(0.0),
            weights_initializer=slim.variance_scaling_initializer(),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            masked_maps = []
            projection_conv = slim.conv2d(endpoints['resnet_v2_50/block4'], 512, [1, 1], scope='projection_conv')
            attention_block4_conv1 = slim.conv2d(endpoints['resnet_v2_50/block4'], 64, [1, 1], scope='attention_block4_conv1')
            attention_block4_conv2 = slim.conv2d(attention_block4_conv1, 1, [1, 1], scope='attention_block4_conv2')
            attention_block4_mask = tf.sigmoid(attention_block4_conv2)
            masked_maps.append(projection_conv * attention_block4_mask)

            attention_block3_conv1 = slim.conv2d(endpoints['resnet_v2_50/block3'], 64, [1, 1], scope='attention_block3_conv1')
            attention_block3_conv2 = slim.conv2d(attention_block3_conv1, 1, [1, 1], scope='attention_block3_conv2')
            attention_block3_mask = tf.sigmoid(attention_block3_conv2)
            masked_maps.append(projection_conv * attention_block3_mask)

            attention_block2_conv1 = slim.conv2d(endpoints['resnet_v2_50/block2'], 64, [1, 1], scope='attention_block2_conv1')
            attention_block2_conv2 = slim.conv2d(attention_block2_conv1, 1, [1, 1], scope='attention_block2_conv2')
            attention_block2_pool = slim.max_pool2d(attention_block2_conv2, [2, 2], scope='attention_block2_pool')
            attention_block2_mask = tf.sigmoid(attention_block2_pool)
            masked_maps.append(projection_conv * attention_block2_mask)

            attention_block1_conv1 = slim.conv2d(endpoints['resnet_v2_50/block1'], 64, [1, 1], scope='attention_block1_conv1')
            attention_block1_pool1 = slim.max_pool2d(attention_block1_conv1, [2, 2], scope='attention_block1_pool1')
            attention_block1_conv2 = slim.conv2d(attention_block1_pool1, 1, [1, 1], scope='attention_block1_conv2')
            attention_block1_pool2 = slim.max_pool2d(attention_block1_conv2, [2, 2], scope='attention_block1_pool2')
            attention_block1_mask = tf.sigmoid(attention_block1_pool2)
            masked_maps.append(projection_conv * attention_block1_mask)

    endpoints['attention_mask_block1'] = attention_block1_mask
    endpoints['attention_mask_block2'] = attention_block2_mask
    endpoints['attention_mask_block3'] = attention_block3_mask
    endpoints['attention_mask_block4'] = attention_block4_mask
    _masked = tf.concat(masked_maps, 3)

    endpoints['model_output'] = endpoints['global_pool'] = tf.reduce_mean(
            _masked, [1, 2], name='_pool5', keep_dims=False)

    endpoints['head_output'] = slim.fully_connected(
        endpoints['model_output'], 1024, normalizer_fn=slim.batch_norm,
        normalizer_params={
            'decay': 0.9,
            'epsilon': 1e-5,
            'scale': True,
            'is_training': is_training,
            'updates_collections': tf.GraphKeys.UPDATE_OPS,
        })

    endpoints['emb'] = endpoints['emb_raw'] = slim.fully_connected(
        endpoints['head_output'], embedding_dim, activation_fn=None,
        weights_initializer=tf.orthogonal_initializer(), scope='emb')

    return endpoints
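
Each branch above squeezes a ResNet block down to a single-channel map, squashes it with a sigmoid, and broadcasts it over the shared 512-channel projection; the max-pools only exist to bring the earlier blocks down to block4's spatial size. A toy NumPy sketch of one mask application (the shapes are illustrative):

import numpy as np

proj = np.random.randn(2, 7, 7, 512)     # shared 1x1 projection of block4
logit = np.random.randn(2, 7, 7, 1)      # single-channel attention logits
mask = 1.0 / (1.0 + np.exp(-logit))      # sigmoid gate in [0, 1]
masked = proj * mask                     # broadcast over all 512 channels
print(masked.shape)                      # (2, 7, 7, 512)
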
Example #33
0
def conv_layer(val,
               filters,
               kernel_size,
               strides,
               name,
               act_fun=None,
               kernel_initializer=slim.variance_scaling_initializer(
                   factor=1.0 / 3.0, mode='FAN_IN', uniform=True),
               layer_norm=False,
               batch_norm=False,
               phase=None,
               dropout=False,
               rate=None):
    """Create a convolutional layer.

    Parameters
    ----------
    val : tf.Variable
        the input to the layer
    filters : int
        the number of channels in the convolutional kernel
    kernel_size : int or list of int
        the height and width of the convolutional filter
    strides : int or list of int
        the strides in each direction of convolution
    name : str
        the scope of the layer
    act_fun : tf.nn.* or None
        the activation function
    kernel_initializer : Any
        the initializing operation to the weights of the layer
    layer_norm : bool
        whether to enable layer normalization
    batch_norm : bool
        whether to enable batch normalization
    phase : tf.compat.v1.placeholder
        a placeholder that defines whether training is occurring for the batch
        normalization layer. Set to True in training and False in testing.
    dropout : bool
        whether to enable dropout
    rate : tf.compat.v1.placeholder
        the probability that each element is dropped if dropout is implemented

    Returns
    -------
    tf.Variable
        the output from the layer
    """
    val = tf.layers.conv2d(val,
                           filters,
                           kernel_size,
                           strides=strides,
                           padding='same',
                           name=name,
                           kernel_initializer=kernel_initializer)

    if layer_norm:
        val = tf.contrib.layers.layer_norm(val, center=True, scale=True)

    if batch_norm:
        val = tf.contrib.layers.batch_norm(
            val,
            center=True,
            scale=True,
            is_training=phase,
            scope='bn_{}'.format(name),
        )

    if act_fun is not None:
        val = act_fun(val)

    if dropout:
        val = tf.nn.dropout(val, rate=rate)

    return val
Example #34
0
  def _generator_network(self, samples, logQ, log_likelihood_func=None):
    '''Returns learning signal and function.

    This is the implementation for SBNs for the ELBO.

    Args:
      samples: dictionary of sampled latent variables
      logQ: list of log q(h_i) terms
      log_likelihood_func: function used to compute log probs for the latent
        variables

    Returns:
      learning_signal: the "reward" function
      function_term: part of the function that depends on the parameters
        and needs to have the gradient taken through
    '''
    reuse = None if not self.run_generator_network else True

    if self.hparams.task in ['sbn', 'omni']:
      if log_likelihood_func is None:
        log_likelihood_func = lambda sample, log_params: (
          U.binary_log_likelihood(sample['activation'], log_params))

      logPPrior = log_likelihood_func(
          samples[self.hparams.n_layer-1],
          tf.expand_dims(self.prior, 0))

      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[P_COLLECTION]):

        for i in reversed(xrange(self.hparams.n_layer)):
          if i == 0:
            n_output = self.hparams.n_input
          else:
            n_output = self.hparams.n_hidden
          input = 2.0*samples[i]['activation']-1.0

          h = self._create_transformation(input,
                                          n_output,
                                          reuse=reuse,
                                          scope_prefix='p_%d' % i)

          if i == 0:
            # Assume output is binary
            logP = U.binary_log_likelihood(self._x, h + self.train_bias)
          else:
            logPPrior += log_likelihood_func(samples[i-1], h)

      self.run_generator_network = True
      return logP + logPPrior - tf.add_n(logQ), logP + logPPrior
    elif self.hparams.task == 'sp':
      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[P_COLLECTION]):
        n_output = int(self.hparams.n_input/2)
        i = self.hparams.n_layer - 1  # use the last layer
        input = 2.0*samples[i]['activation']-1.0

        h = self._create_transformation(input,
                                        n_output,
                                        reuse=reuse,
                                        scope_prefix='p_%d' % i)

        # Predict on the lower half of the image
        logP = U.binary_log_likelihood(tf.split(self._x,
                                              num_or_size_splits=2,
                                              axis=1)[1],
                                     h + np.split(self.train_bias, 2, 0)[1])

      self.run_generator_network = True
      return logP, logP
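
U.binary_log_likelihood is not shown in this listing. Under the usual Bernoulli assumption it computes sum_i [x_i * log(sigmoid(l_i)) + (1 - x_i) * log(1 - sigmoid(l_i))]; a minimal TensorFlow sketch of that assumption:

def binary_log_likelihood(x, logits):
    # Numerically stable Bernoulli log-likelihood, summed over the last axis;
    # equivalent to sum(x * log(sigmoid(l)) + (1 - x) * log(1 - sigmoid(l))).
    return tf.reduce_sum(
        -tf.nn.sigmoid_cross_entropy_with_logits(labels=x, logits=logits),
        axis=-1)
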
Example #35
0
def atari_network(num_actions,
                  num_atoms,
                  support,
                  network_type,
                  state,
                  representation_layer=10):
    """The convolutional network used to compute agent's Q-value distributions.

  Args:
    num_actions: int, number of actions.
    num_atoms: int, the number of buckets of the value function distribution.
    support: tf.linspace, the support of the Q-value distribution.
    network_type: namedtuple, collection of expected values to return.
    state: `tf.Tensor`, contains the agent's current state.
    representation_layer: int, the layer which will be used as the
      representation for computing the bisimulation distances. Defaults to
      a high value, which results in using the penultimate layer.

  Returns:
    net: _network_type object containing the tensors output by the network.
  """
    weights_initializer = contrib_slim.variance_scaling_initializer(
        factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True)

    curr_layer = 1
    net = tf.cast(state, tf.float32)
    net = tf.div(net, 255.)
    representation = None
    if representation_layer <= curr_layer:
        representation = contrib_slim.flatten(net)
    net = contrib_slim.conv2d(net,
                              32, [8, 8],
                              stride=4,
                              weights_initializer=weights_initializer,
                              trainable=False)
    curr_layer += 1
    if representation is None and representation_layer <= curr_layer:
        representation = contrib_slim.flatten(net)
    net = contrib_slim.conv2d(net,
                              64, [4, 4],
                              stride=2,
                              weights_initializer=weights_initializer,
                              trainable=False)
    curr_layer += 1
    if representation is None and representation_layer <= curr_layer:
        representation = contrib_slim.flatten(net)
    net = contrib_slim.conv2d(net,
                              64, [3, 3],
                              stride=1,
                              weights_initializer=weights_initializer,
                              trainable=False)
    net = contrib_slim.flatten(net)
    curr_layer += 1
    if representation is None and representation_layer <= curr_layer:
        representation = net
    net = contrib_slim.fully_connected(net,
                                       512,
                                       weights_initializer=weights_initializer,
                                       trainable=False)
    curr_layer += 1
    if representation is None:
        representation = net
    net = contrib_slim.fully_connected(net,
                                       num_actions * num_atoms,
                                       activation_fn=None,
                                       weights_initializer=weights_initializer,
                                       trainable=False)

    logits = tf.reshape(net, [-1, num_actions, num_atoms])
    probabilities = contrib_layers.softmax(logits)
    q_values = tf.reduce_sum(support * probabilities, axis=2)
    return network_type(q_values, logits, probabilities, representation)
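
The last three lines implement the C51-style expectation: a softmax over num_atoms logits gives one value distribution per action, and the Q-value is its expectation under support. A NumPy sketch of the same reduction (the atom grid is an illustrative assumption):

import numpy as np

num_actions, num_atoms = 4, 51
support = np.linspace(-10.0, 10.0, num_atoms)   # illustrative atom grid
logits = np.random.randn(1, num_actions, num_atoms)
probs = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)  # softmax
q_values = (support * probs).sum(axis=2)        # [batch, num_actions]
print(q_values.shape)                           # (1, 4)
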
Example #36
0
def construct_network(frame_input, root_tags, reuse, is_training, title_input,
                      desc_input, ocr_input, cate_input):
    """
    :param frame_input:
    :param tags_input:
    :param reuse:
    :param is_training:
    :return:
    """
    with tf.variable_scope('text', reuse=reuse) as scope:
        with tf.device("/cpu:0"), tf.variable_scope('dict'):
            # word_embedding = tf.get_variable('initW', [vocab_size, embed_size], trainable=True)
            title_raw = tf.nn.embedding_lookup(word_embed, title_input)
            desc_raw = tf.nn.embedding_lookup(word_embed, desc_input)
            ocr_raw = tf.nn.embedding_lookup(word_embed, ocr_input)
            cate_raw = tf.nn.embedding_lookup(word_embed, cate_input)

        with tf.variable_scope("conv"):

            def txt_conv(t_input, d_input, conv_w, name):
                text = tf.concat([t_input, d_input], axis=1)
                conv = tf.layers.conv1d(text,
                                        filters=num_filter,
                                        kernel_size=conv_w,
                                        name=name)
                conv = slim.batch_norm(conv,
                                       decay=0.9997,
                                       epsilon=0.001,
                                       is_training=is_training)
                conv = tf.reduce_max(conv,
                                     reduction_indices=[1],
                                     name='global_pool_title_desc')
                return conv

            rep_2 = txt_conv(title_raw, desc_raw, 2, 'conv2')
            rep_3 = txt_conv(title_raw, desc_raw, 3, 'conv3')
            rep_4 = txt_conv(title_raw, desc_raw, 4, 'conv4')
            rep_5 = txt_conv(title_raw, desc_raw, 5, 'conv5')

            rep_cate_2 = txt_conv(cate_raw, ocr_raw, 2, 'conv2_1')
            rep_cate_3 = txt_conv(cate_raw, ocr_raw, 3, 'conv3_1')
            rep_cate_4 = txt_conv(cate_raw, ocr_raw, 4, 'conv4_1')
            rep_cate_5 = txt_conv(cate_raw, ocr_raw, 5, 'conv5_1')

            rep = tf.concat([rep_2, rep_3, rep_4, rep_5], 1)
            rep_cate = tf.concat(
                [rep_cate_2, rep_cate_3, rep_cate_4, rep_cate_5], 1)

            text_logits_1 = tf.layers.dense(rep, 256)  # 512
            text_logits_2 = tf.layers.dense(rep_cate, 256)
            text_logits = tf.concat([text_logits_1, text_logits_2], 1)

        with tf.variable_scope("transformer"):
            with tf.variable_scope('preprocess', reuse=reuse) as scope:
                frame_position_embeddings = tf.get_variable(
                    name='frame_position_embedding',
                    shape=[text_length, ATTENTION_EMBED_DIM],
                    initializer=tf.truncated_normal_initializer(stddev=0.02))
                frame_parts = tf.layers.conv1d(tf.concat([title_raw, desc_raw],
                                                         axis=1),
                                               filters=ATTENTION_EMBED_DIM,
                                               kernel_size=1,
                                               name='frame_feat_squeeze')
                frame_parts = slim.batch_norm(frame_parts,
                                              decay=0.9997,
                                              epsilon=0.001,
                                              is_training=is_training)
                frame_parts += frame_position_embeddings

            intermediate_size = 512
            hidden_size = ATTENTION_EMBED_DIM
            initializer_range = 0.02
            hidden_dropout_prob = 0.2

            prev_output = frame_parts
            for layer_idx in range(FLAGS.attention_layer_num):
                with tf.variable_scope("layer_%d" % layer_idx):
                    layer_input = prev_output

                    with tf.variable_scope("attention"):
                        with tf.variable_scope("self"):
                            attention_head = attention_layer(
                                from_tensor=layer_input,
                                to_tensor=layer_input,
                                attention_mask=None,
                                num_attention_heads=4,
                                size_per_head=64,
                                attention_probs_dropout_prob=0.2,
                                initializer_range=0.02,
                                do_return_2d_tensor=False,
                                batch_size=batch_size,
                                from_seq_length=text_length,
                                to_seq_length=text_length)

                            attention_output = attention_head

                        # Run a linear projection of `hidden_size` then add a residual
                        # with `layer_input`.
                        with tf.variable_scope("output"):
                            attention_output = tf.layers.dense(
                                attention_output,
                                hidden_size,
                                kernel_initializer=create_initializer(
                                    initializer_range))
                            attention_output = dropout(attention_output,
                                                       hidden_dropout_prob)
                            attention_output = slim.batch_norm(
                                attention_output + layer_input,
                                decay=0.9997,
                                epsilon=0.001,
                                is_training=is_training)

                    # The activation is only applied to the "intermediate" hidden layer.
                    with tf.variable_scope("intermediate"):
                        intermediate_output = tf.layers.dense(
                            attention_output,
                            intermediate_size,
                            activation=tf.nn.relu,
                            kernel_initializer=create_initializer(
                                initializer_range))

                    # Down-project back to `hidden_size` then add the residual.
                    with tf.variable_scope("output"):
                        layer_output = tf.layers.dense(
                            intermediate_output,
                            hidden_size,
                            kernel_initializer=create_initializer(
                                initializer_range))
                        layer_output = dropout(layer_output,
                                               hidden_dropout_prob)
                        layer_output = slim.batch_norm(layer_output +
                                                       attention_output,
                                                       decay=0.9997,
                                                       epsilon=0.001,
                                                       is_training=is_training)
                        prev_output = layer_output

            attention_final = tf.reduce_max(prev_output, [1],
                                            keep_dims=False,
                                            name='reduce_max')  # 256

    with tf.variable_scope('NeXtVLAD', reuse=reuse) as scope:
        # re_d = 512
        # frame_input_1 = tf.layers.dense(frame_input, re_d, activation=tf.nn.relu, name='re_d')

        video_nextvlad = NeXtVLAD(FRAME_FEAT_DIM,
                                  FRAME_FEAT_LEN,
                                  FLAGS.nextvlad_cluster_size,
                                  is_training,
                                  groups=FLAGS.groups,
                                  expansion=FLAGS.expansion)

        vlad = video_nextvlad.forward(frame_input, mask=None)

        vlad = slim.dropout(vlad,
                            keep_prob=1. - FLAGS.vlad_drop_rate,
                            is_training=is_training,
                            scope="vlad_dropout")

        # SE context gating
        vlad_dim = vlad.get_shape().as_list()[1]
        # print("VLAD dimension", vlad_dim)
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [vlad_dim, FLAGS.nextvlad_hidden_size],
            initializer=slim.variance_scaling_initializer())

        activation = tf.matmul(vlad, hidden1_weights)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn",
                                     fused=False)

        # activation = tf.nn.relu(activation)

        gating_weights_1 = tf.get_variable(
            "gating_weights_1", [
                FLAGS.nextvlad_hidden_size,
                FLAGS.nextvlad_hidden_size // FLAGS.gating_reduction
            ],
            initializer=slim.variance_scaling_initializer())

        gates = tf.matmul(activation, gating_weights_1)

        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                activation_fn=slim.nn.relu,
                                scope="gating_bn")

        gating_weights_2 = tf.get_variable(
            "gating_weights_2", [
                FLAGS.nextvlad_hidden_size // FLAGS.gating_reduction,
                FLAGS.nextvlad_hidden_size
            ],
            initializer=slim.variance_scaling_initializer())
        gates = tf.matmul(gates, gating_weights_2)

        gates = tf.sigmoid(gates)

        vlad_activation = tf.multiply(activation, gates)
        # vlad_activation = vlad

    with tf.variable_scope('frame', reuse=reuse) as scope:
        # layer 1 (batch * 200 * 1024)
        nets_frame = tf.layers.conv1d(frame_input,
                                      filters=1024,
                                      kernel_size=3,
                                      name='conv1d_1')
        nets_frame = slim.batch_norm(nets_frame,
                                     decay=0.9997,
                                     epsilon=0.001,
                                     is_training=is_training)
        nets_frame = tf.nn.relu(nets_frame)
        nets_frame = tf.layers.max_pooling1d(nets_frame,
                                             pool_size=2,
                                             strides=2,
                                             name='pool1d_1')

        # layer 2
        nets_frame = tf.layers.conv1d(nets_frame,
                                      filters=256,
                                      kernel_size=5,
                                      name='conv1d_2')
        nets_frame = slim.batch_norm(nets_frame,
                                     decay=0.9997,
                                     epsilon=0.001,
                                     is_training=is_training)
        nets_frame = tf.nn.relu(nets_frame)
        # layer 3
        nets_frame = tf.layers.conv1d(nets_frame,
                                      filters=256,
                                      kernel_size=5,
                                      name='conv1d_3')
        nets_frame = slim.batch_norm(nets_frame,
                                     decay=0.9997,
                                     epsilon=0.001,
                                     is_training=is_training)
        nets_frame = tf.nn.relu(nets_frame)  # 91 * 256
        # max pooling layer
        nets_frame = tf.layers.max_pooling1d(nets_frame,
                                             pool_size=4,
                                             strides=4,
                                             name='pool1d_2')

        # test flat
        nets_frame = tf.layers.flatten(nets_frame)  # 5632 = 22 * 256
        # nets_frame = tf.reduce_max(nets_frame, reduction_indices=[1], name='max_pool')

        fc_frame = tf.layers.dense(nets_frame, 512, name='fc1')  # 512
        # fc_frame = tf.nn.l2_normalize(fc_frame, dim=1)

    with tf.variable_scope('predict', reuse=reuse) as scope:
        video_vector = tf.concat(
            [fc_frame, text_logits, attention_final, vlad_activation],
            axis=1)  # 1280

        video_vector = tf.layers.dropout(video_vector,
                                         drop_rate,
                                         training=is_training)
        video_vector = tf.nn.relu(video_vector)

        video_vector = tf.layers.dense(video_vector, 512, name='dense_layer_3')
        total_vector = slim.batch_norm(video_vector,
                                       decay=0.9997,
                                       epsilon=0.001,
                                       is_training=is_training)
        tf.check_numerics(video_vector, 'video_vector is inf or nan')
        # -- root predict
        with tf.variable_scope('root_se_cg', reuse=reuse) as scope:
            root_vector = se_context_gate(total_vector,
                                          is_training=is_training,
                                          se_hidden_size=512)
        predict_root = tf.layers.dense(root_vector, TAG_NUM, name='pred_root')
        predict_root_label = tf.argmax(predict_root, dimension=-1)
        predict_root_confidence = tf.nn.softmax(predict_root, name='conf_root')
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=predict_root, labels=root_tags)
        loss_root = tf.reduce_mean(cross_entropy)

        L2_frame = tf.Variable(initial_value=0.,
                               trainable=False,
                               dtype=tf.float32)
        L2_text = tf.Variable(initial_value=0.,
                              trainable=False,
                              dtype=tf.float32)
        L2_w2v = tf.Variable(initial_value=0.,
                             trainable=False,
                             dtype=tf.float32)
        for w in tl.layers.get_variables_with_name('frame', True, True):
            L2_frame += tf.contrib.layers.l2_regularizer(1.0)(w)

        for w in tl.layers.get_variables_with_name('predict', True, True):
            L2_frame += tf.contrib.layers.l2_regularizer(1.0)(w)

        for w in tl.layers.get_variables_with_name('NeXtVLAD', True, True):
            L2_frame += tf.contrib.layers.l2_regularizer(1.0)(w)

        for w in tl.layers.get_variables_with_name('text', True, True):
            L2_text += tf.contrib.layers.l2_regularizer(1.0)(w)

        if FLAGS.train_w2v:
            for w in tl.layers.get_variables_with_name('initW', True, True):
                L2_w2v += tf.contrib.layers.l2_regularizer(1.0)(w)

    cost = FLAGS.root_weight * loss_root + FLAGS.frame_weight * L2_frame + \
           FLAGS.text_weight * L2_text + FLAGS.w2v_weight * L2_w2v
    result = dict()
    result['loss_root'] = loss_root
    result['cost'] = cost
    result['predict_root'] = predict_root
    result['predict_label_root'] = predict_root_label
    result['confidence_root'] = predict_root_confidence
    result['L2_frame'] = L2_frame
    result['L2_text'] = L2_text
    result['L2_w2v'] = L2_w2v
    return result
def head(endpoints, embedding_dim, is_training):

    batch_norm_params = {
        'decay': 0.9,
        'epsilon': 1e-5,
        'scale': True,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
        'fused': None,
    }
    with slim.arg_scope(
        [slim.conv2d],
            weights_regularizer=slim.l2_regularizer(0.0),
            weights_initializer=slim.variance_scaling_initializer(),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            masks = []
            masked_maps = []
            for i in range(head_num):
                attention_branch_mask = attention_branch(
                    endpoints['resnet_v2_50/block4'], i)
                masks.append(attention_branch_mask)
                masked_map = (1 + attention_branch_mask
                              ) * endpoints['resnet_v2_50/block4']
                endpoints['attention_map{}'.format(i)] = masked_map
                masked_maps.append(masked_map)
            endpoints['attention_masks'] = masks

            mbd_collect = []
            for i in range(head_num):
                for j in range(i + 1, head_num):
                    js_div = js_divergence(masks[i], masks[j],
                                           'constraint_{}{}'.format(i, j))
                    mbd_collect.append(js_div)
            endpoints['MBD_Constraint'] = tf.add_n(mbd_collect,
                                                   name='MBD_Constraint')

    _masked = tf.concat(masked_maps, axis=3, name='concat_mask')
    endpoints['masked'] = _masked

    endpoints['model_output'] = endpoints['global_pool'] = tf.reduce_mean(
        _masked, [1, 2], name='_pool5', keep_dims=False)

    endpoints['head_output'] = slim.fully_connected(
        endpoints['model_output'],
        1024,
        normalizer_fn=slim.batch_norm,
        normalizer_params={
            'decay': 0.9,
            'epsilon': 1e-5,
            'scale': True,
            'is_training': is_training,
            'updates_collections': tf.GraphKeys.UPDATE_OPS,
        })

    endpoints['emb'] = endpoints['emb_raw'] = slim.fully_connected(
        endpoints['head_output'],
        embedding_dim,
        activation_fn=None,
        weights_initializer=tf.orthogonal_initializer(),
        scope='emb')

    return endpoints
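
For reference, every example above relies on slim.variance_scaling_initializer. With its defaults (factor=2.0, mode='FAN_IN', uniform=False) it draws weights with variance factor / fan_in, i.e. He initialization; TensorFlow's version uses a truncated normal, while the NumPy sketch below uses the untruncated rule for simplicity:

import numpy as np

def variance_scaling_sketch(shape, factor=2.0, rng=np.random.default_rng(0)):
    # FAN_IN mode: stddev = sqrt(factor / fan_in) (He init when factor=2.0).
    fan_in = shape[0]
    return rng.normal(0.0, np.sqrt(factor / fan_in), size=shape)

w = variance_scaling_sketch((1024, 256))
print(w.std())   # close to sqrt(2 / 1024) ~ 0.044
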