def image_embedding(images,
                    model_fn=resnet_v1_152,
                    trainable=True,
                    is_training=True,
                    weight_decay=0.0001,
                    batch_norm_decay=0.997,
                    batch_norm_epsilon=1e-5,
                    batch_norm_scale=True,
                    add_summaries=False,
                    reuse=False):
    """Run `model_fn` over `images` and return its feature map.

    The network is built inside a variable scope named after
    `model_fn.__name__`, with `num_classes=None` and `global_pool=False`.

    Args:
        images: input passed straight to `model_fn`.
        model_fn: network constructor; must return `(net, end_points)`.
        trainable: whether conv / batch-norm variables are trainable. Also
            decides whether an L2 weight regularizer is attached.
        is_training: requested training mode. Batch norm only runs in
            training mode when `trainable` is true as well.
        weight_decay: L2 regularization strength (used only if `trainable`).
        batch_norm_decay: `decay` entry of the batch-norm parameters.
        batch_norm_epsilon: `epsilon` entry of the batch-norm parameters.
        batch_norm_scale: `scale` entry of the batch-norm parameters.
        add_summaries: if true, add an activation summary for every value in
            the `end_points` returned by `model_fn`.
        reuse: forwarded to both `tf.variable_scope` and `model_fn`.

    Returns:
        The first element (`net`) of what `model_fn` returns.
    """
    # A frozen backbone must never update its batch-norm statistics.
    bn_training = trainable and is_training

    bn_params = {
        "is_training": bn_training,
        "trainable": trainable,
        "decay": batch_norm_decay,
        "epsilon": batch_norm_epsilon,
        "scale": batch_norm_scale,
    }

    regularizer = (
        tf.contrib.layers.l2_regularizer(weight_decay) if trainable else None)

    with tf.variable_scope(model_fn.__name__, [images], reuse=reuse) as scope:
        # All conv2d defaults in one place; nested arg scopes for the same op
        # simply merge their keyword arguments.
        conv_defaults = {
            "weights_regularizer": regularizer,
            "trainable": trainable,
            "weights_initializer": slim.variance_scaling_initializer(),
            "activation_fn": tf.nn.relu,
            "normalizer_fn": slim.batch_norm,
            "normalizer_params": bn_params,
        }
        with slim.arg_scope([slim.conv2d], **conv_defaults), \
                slim.arg_scope([slim.batch_norm],
                               is_training=bn_training,
                               trainable=trainable), \
                slim.arg_scope([slim.max_pool2d], padding="SAME"):
            net, end_points = model_fn(
                images,
                num_classes=None,
                global_pool=False,
                is_training=bn_training,
                reuse=reuse,
                scope=scope)

    if add_summaries:
        for activation in end_points.values():
            tf.contrib.layers.summaries.summarize_activation(activation)

    return net
def resnet_arg_scope(weight_decay=0.0001,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
    """Build the default arg scope for the ResNet models.

    TODO(gpapan): The batch-normalization defaults above are appropriate for
    use with the reference ResNet models released at
    https://github.com/KaimingHe/deep-residual-networks. When training
    ResNets from scratch, they might need to be tuned.

    Args:
        weight_decay: strength of the L2 regularizer attached to conv weights.
        batch_norm_decay: moving-average decay used when estimating layer
            activation statistics in batch normalization.
        batch_norm_epsilon: small constant that prevents division by zero
            when normalizing activations by their variance.
        batch_norm_scale: if True, batch norm uses an explicit `gamma`
            multiplier to scale the activations.

    Returns:
        An `arg_scope` to use for the resnet models.
    """
    bn_kwargs = {
        'decay': batch_norm_decay,
        'epsilon': batch_norm_epsilon,
        'scale': batch_norm_scale,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
    }

    conv_kwargs = {
        'weights_regularizer': slim.l2_regularizer(weight_decay),
        'weights_initializer': slim.variance_scaling_initializer(),
        'activation_fn': tf.nn.relu,
        'normalizer_fn': slim.batch_norm,
        'normalizer_params': bn_kwargs,
    }

    # padding='SAME' for max pooling (and therefore pool1) makes feature
    # alignment easier for dense prediction tasks; it is also what
    # https://github.com/facebook/fb.resnet.torch uses. The code accompanying
    # 'Deep Residual Learning for Image Recognition' uses padding='VALID' for
    # pool1 instead; switch with
    # slim.arg_scope([slim.max_pool2d], padding='VALID').
    with slim.arg_scope([slim.conv2d], **conv_kwargs), \
            slim.arg_scope([slim.batch_norm], **bn_kwargs), \
            slim.arg_scope([slim.max_pool2d], padding='SAME') as scope_dict:
        return scope_dict
def resnet_arg_scope(is_training=True,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
    """Build a ResNet arg scope whose batch-norm layers are always frozen.

    Batch norm is configured with `is_training=False` and `trainable=False`
    regardless of the arguments; `is_training` only controls whether the
    conv2d variables are trainable.

    Args:
        is_training: value used for the `trainable` default of `slim.conv2d`.
        batch_norm_decay: `decay` passed to batch norm.
        batch_norm_epsilon: `epsilon` passed to batch norm.
        batch_norm_scale: `scale` passed to batch norm.

    Returns:
        The arg scope covering `slim.conv2d` and `slim.batch_norm`.
    """
    frozen_bn = dict(
        is_training=False,
        decay=batch_norm_decay,
        epsilon=batch_norm_epsilon,
        scale=batch_norm_scale,
        trainable=False,
        updates_collections=tf.GraphKeys.UPDATE_OPS)

    conv_defaults = dict(
        weights_regularizer=slim.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
        weights_initializer=slim.variance_scaling_initializer(),
        trainable=is_training,
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        normalizer_params=frozen_bn)

    with arg_scope([slim.conv2d], **conv_defaults), \
            arg_scope([slim.batch_norm], **frozen_bn) as scope_dict:
        return scope_dict
def _create_baseline(self, n_output=1, n_hidden=100, is_zero_init=False,
                     collection='BASELINE'):
    """Build a two-layer fully connected baseline on top of `self._x`.

    Args:
        n_output: width of the final (linear) layer.
        n_hidden: width of the hidden tanh layer.
        is_zero_init: use a zeros initializer for the weights instead of the
            variance-scaling initializer.
        collection: variable collection the layer variables are added to, in
            addition to `Q_COLLECTION`.

    Returns:
        The baseline tensor; flattened to rank 1 when `n_output == 1`.
    """
    # Center the input when a mean is available.
    net = self._x
    if self.mean_xs is not None:
        net -= self.mean_xs

    weights_init = (init_ops.zeros_initializer()
                    if is_zero_init
                    else slim.variance_scaling_initializer())

    fc_defaults = {
        'variables_collections': [collection, Q_COLLECTION],
        'trainable': False,
        'weights_initializer': weights_init,
    }
    with slim.arg_scope([slim.fully_connected], **fc_defaults):
        hidden = slim.fully_connected(net, n_hidden, activation_fn=tf.nn.tanh)
        baseline = slim.fully_connected(hidden, n_output, activation_fn=None)

        # Very important: drop the trailing unit dimension for scalar outputs.
        if n_output == 1:
            baseline = tf.reshape(baseline, [-1])

    return baseline
def _extra_conv_arg_scope_with_bn(weight_decay=0.00001,
                                  activation_fn=None,
                                  batch_norm_decay=0.997,
                                  batch_norm_epsilon=1e-5,
                                  batch_norm_scale=True):
    """Build an arg scope for extra conv layers that use batch norm.

    Args:
        weight_decay: strength of the L2 regularizer on conv weights.
        activation_fn: accepted but not used; conv2d always gets
            `tf.nn.relu` as its activation.
        batch_norm_decay: `decay` passed to batch norm.
        batch_norm_epsilon: `epsilon` passed to batch norm.
        batch_norm_scale: `scale` passed to batch norm.

    Returns:
        The arg scope covering `slim.conv2d`, `slim.batch_norm` and
        `slim.max_pool2d` (padding 'SAME').
    """
    bn_kwargs = dict(
        decay=batch_norm_decay,
        epsilon=batch_norm_epsilon,
        scale=batch_norm_scale,
        updates_collections=tf.GraphKeys.UPDATE_OPS_EXTRA)

    conv_kwargs = dict(
        weights_regularizer=slim.l2_regularizer(weight_decay),
        weights_initializer=slim.variance_scaling_initializer(),
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        normalizer_params=bn_kwargs)

    with slim.arg_scope([slim.conv2d], **conv_kwargs), \
            slim.arg_scope([slim.batch_norm], **bn_kwargs), \
            slim.arg_scope([slim.max_pool2d], padding='SAME') as scope_dict:
        return scope_dict
def resnet_arg_scope(is_training=True,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
    """Build a ResNet arg scope for conv and fully connected layers.

    Batch norm is always frozen (`is_training=False`, `trainable=False`).
    Weights and biases of `slim.conv2d` and `slim.fully_connected` both get
    an L2 regularizer of strength `cfg.TRAIN.WEIGHT_DECAY`.

    Args:
        is_training: value used for the `trainable` default of the layers.
        batch_norm_decay: `decay` passed to batch norm.
        batch_norm_epsilon: `epsilon` passed to batch norm.
        batch_norm_scale: `scale` passed to batch norm.

    Returns:
        The arg scope covering the layers and `slim.batch_norm`.
    """
    frozen_bn = dict(
        is_training=False,
        decay=batch_norm_decay,
        epsilon=batch_norm_epsilon,
        scale=batch_norm_scale,
        trainable=False,
        updates_collections=ops.GraphKeys.UPDATE_OPS)

    layer_defaults = dict(
        weights_regularizer=tf.contrib.layers.l2_regularizer(
            cfg.TRAIN.WEIGHT_DECAY),
        weights_initializer=slim.variance_scaling_initializer(),
        biases_regularizer=tf.contrib.layers.l2_regularizer(
            cfg.TRAIN.WEIGHT_DECAY),
        biases_initializer=tf.constant_initializer(0.0),
        trainable=is_training,
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        normalizer_params=frozen_bn)

    with arg_scope([slim.conv2d, slim.fully_connected], **layer_defaults), \
            arg_scope([slim.batch_norm], **frozen_bn) as scope_dict:
        return scope_dict
def resnet_arg_scope_bn_trainable(is_training=True,
                                  batch_norm_decay=0.997,
                                  batch_norm_epsilon=1e-5,
                                  batch_norm_scale=True):
    """Build a ResNet arg scope whose batch-norm layers are trainable.

    Batch norm always runs with `is_training=True` and `trainable=True`;
    `is_training` only controls whether the conv2d variables are trainable.

    Args:
        is_training: value used for the `trainable` default of `slim.conv2d`.
        batch_norm_decay: `decay` passed to batch norm.
        batch_norm_epsilon: `epsilon` passed to batch norm.
        batch_norm_scale: `scale` passed to batch norm.

    Returns:
        The arg scope covering `slim.conv2d` and `slim.batch_norm`.
    """
    live_bn = dict(
        # Should always be True, otherwise it would have very weird outputs.
        is_training=True,
        decay=batch_norm_decay,
        epsilon=batch_norm_epsilon,
        scale=batch_norm_scale,
        trainable=True,
        updates_collections=tf.GraphKeys.UPDATE_OPS)

    conv_defaults = dict(
        weights_regularizer=slim.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
        weights_initializer=slim.variance_scaling_initializer(),
        trainable=is_training,
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        normalizer_params=live_bn)

    with arg_scope([slim.conv2d], **conv_defaults), \
            arg_scope([slim.batch_norm], **live_bn) as scope_dict:
        return scope_dict
def resnet_arg_scope(self, is_training=True):
    """Build the ResNet arg scope with frozen batch normalization.

    By default BN is not used to train the resnet, since batch_size is too
    small. Hence both `is_training` and `trainable` are False in the
    batch-norm params; `is_training` only controls whether the conv2d
    variables are trainable.

    Args:
        is_training: value used for the `trainable` default of `slim.conv2d`.

    Returns:
        The arg scope covering `slim.conv2d` and `slim.batch_norm`.
    """
    frozen_bn = dict(
        is_training=False,
        decay=0.997,
        epsilon=1e-5,
        scale=True,
        trainable=False,
        updates_collections=tf.GraphKeys.UPDATE_OPS)

    conv_defaults = dict(
        weights_regularizer=slim.l2_regularizer(self.weight_decay),
        weights_initializer=slim.variance_scaling_initializer(),
        trainable=is_training,
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        normalizer_params=frozen_bn)

    with slim.arg_scope([slim.conv2d], **conv_defaults), \
            slim.arg_scope([slim.batch_norm], **frozen_bn) as scope_dict:
        return scope_dict
def resnet_arg_scope(is_training=True,
                     weight_decay=cfg.TRAIN.WEIGHT_DECAY,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
    """Build a ResNet arg scope whose batch-norm mode follows the config.

    Batch norm is trainable iff `cfg.TRAIN.BN_TRAIN` is set, and runs in
    training mode only when both `cfg.TRAIN.BN_TRAIN` and `is_training` are
    truthy.

    Args:
        is_training: value used for the `trainable` default of `slim.conv2d`;
            also gates batch-norm training mode (see above).
        weight_decay: strength of the L2 regularizer on conv weights. The
            default is read from `cfg.TRAIN.WEIGHT_DECAY` once, when the
            function is defined.
        batch_norm_decay: `decay` passed to batch norm.
        batch_norm_epsilon: `epsilon` passed to batch norm.
        batch_norm_scale: `scale` passed to batch norm.

    Returns:
        The arg scope covering `slim.conv2d` and `slim.batch_norm`.
    """
    bn_kwargs = dict(
        is_training=cfg.TRAIN.BN_TRAIN and is_training,
        decay=batch_norm_decay,
        epsilon=batch_norm_epsilon,
        scale=batch_norm_scale,
        trainable=cfg.TRAIN.BN_TRAIN,
        updates_collections=tf.GraphKeys.UPDATE_OPS)

    conv_defaults = dict(
        weights_regularizer=slim.l2_regularizer(weight_decay),
        weights_initializer=slim.variance_scaling_initializer(),
        trainable=is_training,
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        normalizer_params=bn_kwargs)

    with arg_scope([slim.conv2d], **conv_defaults), \
            arg_scope([slim.batch_norm], **bn_kwargs) as scope_dict:
        return scope_dict
def build_fastrcnn(self, feature_to_cropped, rois, img_shape, scope):
    """Build the Fast-RCNN head for one cascade stage.

    Args:
        feature_to_cropped: feature maps handed to `self.roi_pooling`.
        rois: regions of interest handed to `self.roi_pooling`.
        img_shape: image shape handed to `self.roi_pooling`.
        scope: stage name. It is used in the variable-scope name, forwarded
            to the resnet head as `stage`, and selects the output layout
            (`'stage3'` versus anything else).

    Returns:
        For `scope == 'stage3'`: `(bbox_pred, cls_score)`.
        Otherwise: `(bbox_pred_fliter, bbox_pred, cls_score)`, where
        `bbox_pred_fliter` keeps, per RoI, only the 5 box values of the
        highest-scoring class.

    Raises:
        NotImplementedError: if the base network is not a resnet.
    """
    with tf.variable_scope('Fast-RCNN_{}'.format(scope)):
        # 5. ROI pooling.
        with tf.variable_scope('rois_pooling'):
            pooled_features = self.roi_pooling(
                feature_maps=feature_to_cropped,
                rois=rois,
                img_shape=img_shape)

        # 6. Run the RoIs through the head to obtain flattened fc features.
        if not self.base_network_name.startswith('resnet'):
            raise NotImplementedError('only support resnet and mobilenet')
        fc_flatten = resnet.restnet_head(
            input=pooled_features,
            is_training=self.is_training,
            scope_name=self.base_network_name,
            stage=scope)

        # The two variants differ only in layer scope names and in whether
        # the per-RoI best box is extracted.
        is_final_stage = scope == 'stage3'
        if is_final_stage:
            cls_scope, reg_scope = 'cls_fc_r', 'reg_fc_r'
        else:
            cls_scope, reg_scope = 'cls_fc_h', 'reg_fc_h'

        # 7. Classification and regression layers.
        with slim.arg_scope(
                [slim.fully_connected],
                weights_regularizer=slim.l2_regularizer(cfgs.WEIGHT_DECAY)):
            cls_score = slim.fully_connected(
                fc_flatten,
                num_outputs=cfgs.CLASS_NUM + 1,
                weights_initializer=slim.variance_scaling_initializer(
                    factor=1.0, mode='FAN_AVG', uniform=True),
                activation_fn=None,
                trainable=self.is_training,
                scope=cls_scope)

            # For convenience this also produces (cls_num + 1) boxes.
            bbox_pred = slim.fully_connected(
                fc_flatten,
                num_outputs=(cfgs.CLASS_NUM + 1) * 5,
                weights_initializer=slim.variance_scaling_initializer(
                    factor=1.0, mode='FAN_AVG', uniform=True),
                activation_fn=None,
                trainable=self.is_training,
                scope=reg_scope)

            cls_score = tf.reshape(cls_score, [-1, cfgs.CLASS_NUM + 1])
            bbox_pred = tf.reshape(bbox_pred, [-1, 5 * (cfgs.CLASS_NUM + 1)])

            if is_final_stage:
                return bbox_pred, cls_score

            bbox_pred_ins = tf.reshape(bbox_pred, [-1, cfgs.CLASS_NUM + 1, 5])

            # Only keep the box whose class score is the biggest: pair each
            # row index with its arg-max class and gather those entries.
            keep_abox = tf.argmax(cls_score, axis=1)
            row_ids = tf.cumsum(tf.ones_like(keep_abox)) - 1
            stacked = tf.stack([row_ids, keep_abox])
            keep_inds = tf.reshape(tf.transpose(stacked), [-1, 2])
            gathered = tf.gather_nd(bbox_pred_ins, keep_inds)
            bbox_pred_fliter = tf.reshape(gathered, [-1, 5])

            return bbox_pred_fliter, bbox_pred, cls_score
def model_fn(self, is_training=True, *args, **kwargs):
    """Build a LeNet-5 style classifier; return its loss or its predictions.

    Args:
        is_training: if true the softmax cross-entropy loss is returned,
            otherwise the softmax class probabilities.
        *args: optional data source ("method 2"). When given, `args[0]` must
            offer `dequeue()` returning `(data, label)` in training mode, and
            is used directly as the data tensor otherwise. When omitted
            ("method 1"), placeholders named 'data_node' and 'label_node'
            are created from `ctx.params`.
        **kwargs: unused.

    Returns:
        The scalar loss tensor when `is_training`, else the softmax of the
        logits.
    """
    # Step 1: unwrap the data.
    batch_data = None
    batch_label = None
    if len(args) > 0:
        # Method 2: data comes from args (which comes from model_input()).
        if is_training:
            batch_data, batch_label = args[0].dequeue()
        else:
            batch_data = args[0]
    else:
        # Method 1: feed through placeholders.
        batch_data = tf.placeholder(
            tf.uint8,
            shape=[ctx.params.batch_size,
                   ctx.params.input_size,
                   ctx.params.input_size,
                   1],
            name='data_node')
        # The label placeholder used to be created only when NOT training,
        # leaving `batch_label` as None exactly where the loss needs it.
        # It is now created in both modes, so training works and the
        # inference graph still contains 'label_node' as before.
        batch_label = tf.placeholder(
            tf.int32, shape=[ctx.params.batch_size], name='label_node')

    # Convert the input to float.
    batch_data = tf.cast(batch_data, tf.float32)

    # Step 2: build the LeNet-5 convolutional network.
    # The tensor sizes in the comments below assume 28 x 28 inputs.
    with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            weights_regularizer=slim.l2_regularizer(0.0001),
            normalizer_fn=None,
            weights_initializer=slim.variance_scaling_initializer()):
        # Conv: batch x 28 x 28 x 1 -> batch x 24 x 24 x 6.
        conv_1 = slim.conv2d(batch_data, 6, [5, 5], stride=1,
                             activation_fn=tf.nn.relu, padding='VALID')
        print(conv_1)
        # Pool: batch x 24 x 24 x 6 -> batch x 12 x 12 x 6.
        pool_1 = slim.max_pool2d(conv_1, [2, 2], stride=2, padding='VALID')

        # Conv: batch x 12 x 12 x 6 -> batch x 8 x 8 x 16.
        conv_2 = slim.conv2d(pool_1, 16, [5, 5], stride=1,
                             activation_fn=tf.nn.relu, padding='VALID')
        print(conv_2)
        # Pool: batch x 8 x 8 x 16 -> batch x 4 x 4 x 16.
        pool_2 = slim.max_pool2d(conv_2, [2, 2], stride=2, padding='VALID')

        # Flatten: batch x 4 x 4 x 16 -> batch x 256.
        fc_1 = tf.contrib.layers.flatten(pool_2)

        # Fully connected: batch x 256 -> batch x 120.
        # slim.fully_connected already applies ReLU by default, so the
        # explicit ReLU that follows is redundant but harmless; it is kept
        # so the graph stays the same.
        fc_1 = slim.fully_connected(fc_1, 120)
        fc_1 = tf.nn.relu(fc_1)

        # Fully connected: batch x 120 -> batch x 84.
        fc_2 = slim.fully_connected(fc_1, 84)
        fc_2 = tf.nn.relu(fc_2)

        # Fully connected: batch x 84 -> batch x 10 (10 output classes).
        # Logits must stay linear: the default ReLU of slim.fully_connected
        # would clip every negative logit to zero before the softmax.
        logits = slim.fully_connected(fc_2, 10, activation_fn=None)

    # Step 3: output.
    if is_training:
        # Use the logits to compute the cross-entropy loss.
        batch_label_one_hot = slim.one_hot_encoding(batch_label, 10)
        loss = tf.losses.softmax_cross_entropy(batch_label_one_hot, logits)
        return loss

    # Use the logits to compute the class probabilities.
    predict = tf.nn.softmax(logits)
    return predict
def main():
    """Train a segmentation network on one or several datasets.

    Steps: parse the command line (optionally restoring the arguments of a
    previous run from `args.json`), validate the arguments, build a mixed
    `tf.data` input pipeline over all datasets, create one logit layer and
    one loss per dataset, then run the training loop with periodic
    checkpoints and TensorBoard summaries.

    Exits with status 1 on invalid arguments; raises IOError when `--resume`
    is requested but no `args.json` exists.
    """
    args = parser.parse_args()

    # We store all arguments in a json file. This has two advantages:
    # 1. We can always get back and see what exactly that experiment was
    # 2. We can resume an experiment as-is without needing to remember flags.
    if args.resume or args.auto_resume:
        args.experiment_root = utils.select_existing_root(args.experiment_root)
        args_file = os.path.join(args.experiment_root, 'args.json')
        if not os.path.isfile(args_file) and not args.auto_resume:
            # We are not auto_resuming and no existing file was found. This
            # is an error.
            raise IOError('`args.json` not found in {}'.format(args_file))
        elif not os.path.isfile(args_file) and args.auto_resume:
            # No existing args file was found, but we are auto resuming, so
            # we just start a new run.
            new_run = True
        else:
            # We found an existing args file, this can just be used.
            new_run = False
            print('Loading args from {}.'.format(args_file))
            with open(args_file, 'r') as f:
                args_resumed = json.load(f)
            args_resumed['resume'] = True  # This would be overwritten.

            # When resuming, we not only want to populate the args object
            # with the values from the file, but we also want to check for
            # some possible conflicts between loaded and given arguments.
            for key, value in args.__dict__.items():
                if key in args_resumed:
                    resumed_value = args_resumed[key]
                    if resumed_value != value:
                        print('Warning: For the argument `{}` we are using the'
                              ' loaded value `{}`. The provided value was `{}`'
                              '.'.format(key, resumed_value, value))
                        args.__dict__[key] = resumed_value
                else:
                    print('Warning: A new argument was added since the last run'
                          ': `{}`. Using the new value: `{}`.'
                          ''.format(key, value))
    else:
        # No resuming requested at all.
        new_run = True

    if new_run:
        # If the experiment directory exists already and we are not auto
        # resuming, we bail in fear.
        args.experiment_root = utils.select_existing_root(
            args.experiment_root, check_only_basedir=True)
        if os.path.exists(args.experiment_root) and not args.auto_resume:
            if os.listdir(args.experiment_root):
                print('The directory {} already exists and is not empty.'
                      ' If you want to resume training, append --resume or '
                      ' --auto_resume to your call.'
                      ''.format(args.experiment_root))
                exit(1)
        elif os.path.exists(args.experiment_root) and args.auto_resume:
            # If we are auto resuming, it is okay if the directory exists.
            pass
        else:
            # We create a new one if it does not exist.
            os.makedirs(args.experiment_root)
        args_file = os.path.join(args.experiment_root, 'args.json')

        # Make sure the required arguments are provided:
        # train_set, dataset_root, dataset_config
        if not args.train_set:
            parser.print_help()
            print('You did not specify the `train_set` argument!')
            exit(1)
        if not args.dataset_root:
            parser.print_help()
            print('You did not specify the required `dataset_root` argument!')
            exit(1)
        if not args.dataset_config:
            parser.print_help()
            print('You did not specify the required `dataset_config` argument!')
            exit(1)

        # Since multiple datasets can be used, we need to check that we got
        # lists of the same length.
        train_set_len = len(args.train_set)
        # This used to measure `args.dataset_config` a second time, so a
        # wrong number of dataset roots was never detected here and `zip`
        # below silently dropped the surplus datasets.
        dataset_root_len = len(args.dataset_root)
        dataset_config_len = len(args.dataset_config)
        if args.dataset_weights is not None:
            dataset_weight_len = len(args.dataset_weights)
        else:
            # We'll set this manually later so just use a valid length here.
            dataset_weight_len = dataset_config_len
        if (train_set_len != dataset_root_len or
                train_set_len != dataset_config_len or
                train_set_len != dataset_weight_len):
            parser.print_help()
            print('The dataset specific argument lengths didn\'t match.')
            exit(1)

        # Parse the model parameters. This could be a bit cleaner in the
        # future, but it will do for now. (An earlier variant parsed a
        # ';'-separated key/value list instead.)
        if args.model_params is not None:
            # SECURITY NOTE(review): `eval` executes arbitrary code taken
            # from the command line. Only run this with trusted arguments;
            # consider `ast.literal_eval` if plain literals are sufficient.
            args.model_params = eval(args.model_params)
        else:
            args.model_params = {}

        # Check some parameter clashes.
        if args.crop_augment > 0 and (args.fixed_crop_augment_width > 0 or
                                      args.fixed_crop_augment_height > 0):
            print('You cannot specified the use of both types of crop '
                  'augmentations. Either use the `crop_augment` argument to '
                  'remove a fixed amount of pixel from the borders, or use the '
                  '`fixed_crop_augment_height` arguments to provide a fixed '
                  'size window that will be cropped from the input images.')
            exit(1)

        if ((args.fixed_crop_augment_height > 0) !=
                (args.fixed_crop_augment_width > 0)):
            print('You need to specify both the `fixed_crop_augment_width` and '
                  '`fixed_crop_augment_height` arguments for a valid '
                  'augmentation.')
            exit(1)

        # Store the passed arguments for later resuming and grepping in a
        # nice and readable format.
        with open(args_file, 'w') as f:
            # Make sure not to store the auto_resume forever though.
            if 'auto_resume' in args.__dict__:
                del args.__dict__['auto_resume']
            json.dump(
                vars(args), f, ensure_ascii=False, indent=2, sort_keys=True)

    log_file = os.path.join(args.experiment_root, 'train')
    logging.config.dictConfig(utils.get_logging_dict(log_file))
    log = logging.getLogger('train')

    # Also show all parameter values at the start, for ease of reading logs.
    log.info('Training using the following parameters:')
    for key, value in sorted(vars(args).items()):
        log.info('{}: {}'.format(key, value))

    # Preload all the filenames and mappings.
    file_lists = []
    dataset_configs = []
    for i, (train_set, dataset_root, config) in enumerate(
            zip(args.train_set, args.dataset_root, args.dataset_config)):
        # Load the config for the dataset.
        with open(config, 'r') as f:
            dataset_configs.append(json.load(f))
        log.info('Training set {} based on a `{}` configuration.'.format(
            i, dataset_configs[-1]['dataset_name']))

        # Load the data from the CSV file.
        file_list = utils.load_dataset(train_set, dataset_root)
        file_lists.append(file_list)

    # If no weights were given, weight every dataset by its size.
    if args.dataset_weights is None:
        dataset_weights = [len(fl) for fl in file_lists]
    else:
        dataset_weights = args.dataset_weights

    # In order to keep the loading of images in tensorflow, we need to make
    # some quite ugly hacks where we merge all the dataset original to train
    # mappings into one tensor. Not nice but working.
    mappings = [d.get('original_to_train_mapping') for d in dataset_configs]
    mapping = np.zeros(
        (len(mappings), np.max([len(m) for m in mappings])), dtype=np.int32)
    for i, m in enumerate(mappings):
        mapping[i, :len(m)] = m
    original_to_train_mapping = tf.constant(mapping)

    dataset = tf.data.Dataset.from_generator(
        generator=functools.partial(
            utils.mixed_dataset_generator, file_lists, dataset_weights
        ),
        output_types=(tf.string, tf.string, tf.int32))

    # Convert filenames to actual image and label id tensors.
    dataset = dataset.map(
        lambda x, y, z: tf_utils.string_tuple_to_image_pair(
            x, y, tf.gather(original_to_train_mapping, z)) + (z,),
        num_parallel_calls=args.loading_threads)

    # Possible augmentations
    if args.flip_augment:
        dataset = dataset.map(
            lambda x, y, z: tf_utils.flip_augment(x, y) + (z,))
    if args.gamma_augment:
        dataset = dataset.map(
            lambda x, y, z: tf_utils.gamma_augment(x, y) + (z,))

    # TODO deprecate this. It doesn't fit with many datasets. This needs to go.
    if args.crop_augment > 0:
        dataset = dataset.map(
            lambda x, y, z: tf_utils.crop_augment(
                x, y, args.crop_augment, args.crop_augment) + (z,))
    # TODO end

    if args.fixed_crop_augment_width > 0 and args.fixed_crop_augment_height > 0:
        dataset = dataset.map(
            lambda x, y, z: tf_utils.fixed_crop_augment(
                x, y, args.fixed_crop_augment_height,
                args.fixed_crop_augment_width) + (z,))

    # Re scale the input images
    dataset = dataset.map(lambda x, y, z: ((x - 128.0) / 128.0, y, z))

    # Group it into batches.
    dataset = dataset.batch(args.batch_size)

    # Overlap producing and consuming for parallelism.
    dataset = dataset.prefetch(1)

    # Since we repeat the data infinitely, we only need a one-shot iterator.
    image_batch, label_batch, dataset_ids = (
        dataset.make_one_shot_iterator().get_next())

    # This needs a fixed shape.
    dataset_ids.set_shape([args.batch_size])

    # Feed the image through a model.
    model = import_module('networks.' + args.model_type)
    with tf.name_scope('model'):
        net = model.network(image_batch, is_training=True, **args.model_params)

    # Generate a logit for every dataset.
    with tf.name_scope('logits'):
        logits = []
        for d in dataset_configs:
            logits.append(slim.conv2d(
                net, len(d['class_names']), [3, 3],
                scope='output_conv_{}'.format(d['dataset_name']),
                activation_fn=None,
                weights_initializer=slim.variance_scaling_initializer(),
                biases_initializer=tf.zeros_initializer()))

    # Create the loss for every dataset.
    # NOTE(review): the nesting of the name scopes around here was
    # reconstructed; confirm it against the project's original layout.
    with tf.name_scope('losses'):
        loss_function = getattr(output_losses, args.loss_type)
        weighted_losses = []
        for i, dataset_config in enumerate(dataset_configs):
            mask = tf.equal(dataset_ids, i)
            weight = tf.cast(tf.reduce_sum(tf.cast(mask, tf.int32)), tf.float32)
            logit_subset = tf.boolean_mask(logits[i], mask)
            label_subset = tf.boolean_mask(label_batch, mask)

            # Do not evaluate the loss for those datasets without images in
            # the batch.
            zero_mask = tf.equal(weight, 0)
            loss = tf.cond(
                zero_mask,
                lambda: 0.0,
                lambda: tf.reduce_mean(
                    loss_function(logit_subset, label_subset,
                                  void=dataset_config['void_label'])))

            # Normalizing with the prior is currently disabled:
            # loss = tf.divide(
            #     loss, tf.log(float(len(dataset_config['class_names']))))

            # Log NaN instead of 0 for datasets that are absent from the batch.
            summary_loss = tf.cond(zero_mask, lambda: np.nan, lambda: loss)
            tf.summary.scalar(
                'loss_{}'.format(dataset_config['dataset_name']), summary_loss)
            tf.summary.scalar(
                'weight_{}'.format(dataset_config['dataset_name']), weight)
            weighted_losses.append(tf.multiply(loss, weight))

        # Merge all the losses together based on how frequent the underlying
        # datasets are in this batch.
        loss_mean = tf.divide(tf.add_n(weighted_losses), args.batch_size)

    # Some logging for tensorboard.
    tf.summary.scalar('loss', loss_mean)

    # Define the optimizer and the learning-rate schedule.
    # Unfortunately, we get NaNs if we don't handle no-decay separately.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    if 0 <= args.decay_start_iteration < args.train_iterations:
        learning_rate = tf.train.exponential_decay(
            args.learning_rate,
            tf.maximum(0, global_step - args.decay_start_iteration),
            args.train_iterations - args.decay_start_iteration,
            args.decay_multiplier)
    else:
        learning_rate = args.learning_rate
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate)

    # Update_ops are used to update batchnorm stats.
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = optimizer.minimize(loss_mean, global_step=global_step)

    # Define a saver for the complete model.
    checkpoint_saver = tf.train.Saver(max_to_keep=0)

    with tf.Session() as sess:
        if args.resume:
            # In case we're resuming, simply load the full checkpoint to init.
            last_checkpoint = tf.train.latest_checkpoint(args.experiment_root)
            log.info('Restoring from checkpoint: {}'.format(last_checkpoint))
            checkpoint_saver.restore(sess, last_checkpoint)
        else:
            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # We also store this initialization as a checkpoint, such that we
            # could run exactly reproducible experiments.
            checkpoint_saver.save(sess, os.path.join(
                args.experiment_root, 'checkpoint'), global_step=0)

        merged_summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(args.experiment_root, sess.graph)

        start_step = sess.run(global_step)
        log.info('Starting training from iteration {}.'.format(start_step))

        # `step` is read after the loop for the final checkpoint. Seed it so
        # that a loop with zero iterations (e.g. resuming an already finished
        # run) does not end in a NameError.
        step = start_step

        # Finally, here comes the main-loop. This `Uninterrupt` is a handy
        # utility such that an iteration still finishes on Ctrl+C and we can
        # stop the training cleanly.
        with utils.Uninterrupt(sigs=[SIGINT, SIGTERM], verbose=True) as u:
            for i in range(start_step, args.train_iterations):
                # Compute gradients, update weights, store logs!
                start_time = time.time()
                _, summary, step = sess.run(
                    [train_op, merged_summary, global_step])
                elapsed_time = time.time() - start_time

                # Compute the iteration speed and add it to the summary.
                # We did observe some weird spikes that we couldn't track down.
                summary2 = tf.Summary()
                summary2.value.add(
                    tag='secs_per_iter', simple_value=elapsed_time)
                summary_writer.add_summary(summary2, step)
                summary_writer.add_summary(summary, step)

                # Save a checkpoint of training every so often.
                if (args.checkpoint_frequency > 0 and
                        step % args.checkpoint_frequency == 0):
                    checkpoint_saver.save(sess, os.path.join(
                        args.experiment_root, 'checkpoint'), global_step=step)

                # Stop the main-loop at the end of the step, if requested.
                if u.interrupted:
                    log.info('Interrupted on request!')
                    break

        # Store one final checkpoint. This might be redundant, but it is
        # crucial in case intermediate storing was disabled and it saves a
        # checkpoint when the process was interrupted.
        checkpoint_saver.save(sess, os.path.join(
            args.experiment_root, 'checkpoint'), global_step=step)
def main():
    """Train a segmentation network on a single dataset.

    Steps: parse the command line (restoring the arguments of a previous run
    from `args.json` when `--resume` is given), validate the arguments,
    build a shuffled, endlessly repeating `tf.data` pipeline, create the
    logit layer and loss, then run the training loop with periodic
    checkpoints and TensorBoard summaries.

    Exits with status 1 on invalid arguments; raises IOError when `--resume`
    is requested but no `args.json` exists, and ValueError when
    `model_params` has an odd number of entries.
    """
    args = parser.parse_args()

    # We store all arguments in a json file. This has two advantages:
    # 1. We can always get back and see what exactly that experiment was
    # 2. We can resume an experiment as-is without needing to remember all
    #    flags.
    args_file = os.path.join(args.experiment_root, 'args.json')
    if args.resume:
        if not os.path.isfile(args_file):
            raise IOError('`args.json` not found in {}'.format(args_file))

        print('Loading args from {}.'.format(args_file))
        with open(args_file, 'r') as f:
            args_resumed = json.load(f)
        args_resumed['resume'] = True  # This would be overwritten.

        # When resuming, we not only want to populate the args object with
        # the values from the file, but we also want to check for some
        # possible conflicts between loaded and given arguments.
        for key, value in args.__dict__.items():
            if key in args_resumed:
                resumed_value = args_resumed[key]
                if resumed_value != value:
                    print('Warning: For the argument `{}` we are using the'
                          ' loaded value `{}`. The provided value was `{}`'
                          '.'.format(key, resumed_value, value))
                    args.__dict__[key] = resumed_value
            else:
                print('Warning: A new argument was added since the last run:'
                      ' `{}`. Using the new value: `{}`.'.format(key, value))
    else:
        # Make sure the required arguments are provided:
        # train_set, dataset_root, dataset_config
        if not args.train_set:
            parser.print_help()
            print('You did not specify the `train_set` argument!')
            exit(1)
        if not args.dataset_root:
            parser.print_help()
            print('You did not specify the required `dataset_root` argument!')
            exit(1)
        if not args.dataset_config:
            parser.print_help()
            print(
                'You did not specify the required `dataset_config` argument!')
            exit(1)

        # If the experiment directory exists already, we bail in fear.
        if os.path.exists(args.experiment_root):
            if os.listdir(args.experiment_root):
                print('The directory {} already exists and is not empty.'
                      ' If you want to resume training, append --resume to'
                      ' your call.'.format(args.experiment_root))
                exit(1)
        else:
            os.makedirs(args.experiment_root)

        # Parse the model parameters. This could be a bit cleaner in the
        # future, but it will do for now. The flat list
        # "name,value,name,value,..." is consumed pairwise.
        if args.model_params is not None:
            model_params = args.model_params.split(',')
            if len(model_params) % 2 != 0:
                raise ValueError('`model_params` has to be a comma separated '
                                 'list of even length.')
            it = iter(model_params)
            args.model_params = {p: int(v) for p, v in zip(it, it)}
        else:
            args.model_params = {}

        # Check some parameter clashes.
        if args.crop_augment > 0 and (args.fixed_crop_augment_width > 0 or
                                      args.fixed_crop_augment_height > 0):
            print(
                'You cannot specified the use of both types of crop '
                'augmentations. Either use the `crop_augment` argument to '
                'remove a fixed amount of pixel from the borders, or use the '
                '`fixed_crop_augment_height` arguments to provide a fixed '
                'size window that will be cropped from the input images.')
            exit(1)

        if ((args.fixed_crop_augment_height > 0) !=
                (args.fixed_crop_augment_width > 0)):
            print(
                'You need to specify both the `fixed_crop_augment_width` and '
                '`fixed_crop_augment_height` arguments for a valid '
                'augmentation.')
            exit(1)

        # Store the passed arguments for later resuming and grepping in a
        # nice and readable format.
        with open(args_file, 'w') as f:
            json.dump(vars(args), f, ensure_ascii=False, indent=2,
                      sort_keys=True)

    log_file = os.path.join(args.experiment_root, 'train')
    logging.config.dictConfig(utils.get_logging_dict(log_file))
    log = logging.getLogger('train')

    # Also show all parameter values at the start, for ease of reading logs.
    log.info('Training using the following parameters:')
    for key, value in sorted(vars(args).items()):
        log.info('{}: {}'.format(key, value))

    # Load the config for the dataset.
    with open(args.dataset_config, 'r') as f:
        dataset_config = json.load(f)
    log.info('Training based on a `{}` configuration.'.format(
        dataset_config['dataset_name']))

    # Load the data from the CSV file.
    image_files, label_files = utils.load_dataset(args.train_set,
                                                  args.dataset_root)

    # Setup a tf.Dataset where one "epoch" loops over all images.
    # Images are shuffled after every epoch and continue indefinitely.
    images = tf.data.Dataset.from_tensor_slices(image_files)
    labels = tf.data.Dataset.from_tensor_slices(label_files)
    dataset = tf.data.Dataset.zip((images, labels))
    dataset = dataset.shuffle(len(image_files))
    dataset = dataset.repeat(None)  # Repeat forever.

    # Convert filenames to actual image and label id tensors.
    dataset = dataset.map(
        lambda x, y: tf_utils.string_tuple_to_image_pair(
            x, y, dataset_config.get('original_to_train_mapping', None)),
        num_parallel_calls=args.loading_threads)

    # Possible augmentations
    if args.flip_augment:
        dataset = dataset.map(tf_utils.flip_augment)
    if args.gamma_augment:
        dataset = dataset.map(tf_utils.gamma_augment)
    if args.crop_augment > 0:
        dataset = dataset.map(lambda x, y: tf_utils.crop_augment(
            x, y, args.crop_augment, args.crop_augment))
    if args.fixed_crop_augment_width > 0 and args.fixed_crop_augment_height > 0:
        dataset = dataset.map(lambda x, y: tf_utils.fixed_crop_augment(
            x, y, args.fixed_crop_augment_height,
            args.fixed_crop_augment_width))

    # Re scale the input images
    dataset = dataset.map(lambda x, y: ((x - 128.0) / 128.0, y))

    # Group it into batches.
    dataset = dataset.batch(args.batch_size)

    # Overlap producing and consuming for parallelism.
    dataset = dataset.prefetch(1)

    # Since we repeat the data infinitely, we only need a one-shot iterator.
    image_batch, label_batch = dataset.make_one_shot_iterator().get_next()

    model = import_module('networks.' + args.model_type)

    # Feed the image through a model.
    # NOTE(review): the nesting of the name scopes around here was
    # reconstructed; confirm it against the project's original layout.
    with tf.name_scope('model'):
        net = model.network(image_batch, is_training=True, **args.model_params)
        logits = slim.conv2d(
            net, len(dataset_config['class_names']), [3, 3],
            scope='output_conv', activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer(),
            biases_initializer=tf.zeros_initializer())

    # Create the loss; the loss function is looked up by name.
    with tf.name_scope('loss'):
        loss_function = getattr(output_losses, args.loss_type)
        losses = loss_function(logits, label_batch,
                               void=dataset_config['void_label'])

        # Count the total batch loss.
        loss_mean = tf.reduce_mean(losses)

    # Some logging for tensorboard.
    tf.summary.histogram('loss_distribution', losses)
    tf.summary.scalar('loss', loss_mean)

    # Define the optimizer and the learning-rate schedule.
    # Unfortunately, we get NaNs if we don't handle no-decay separately.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    if 0 <= args.decay_start_iteration < args.train_iterations:
        learning_rate = tf.train.exponential_decay(
            args.learning_rate,
            tf.maximum(0, global_step - args.decay_start_iteration),
            args.train_iterations - args.decay_start_iteration,
            args.decay_multiplier)
    else:
        learning_rate = args.learning_rate
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate)

    # Update_ops are used to update batchnorm stats.
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = optimizer.minimize(loss_mean, global_step=global_step)

    # Define a saver for the complete model.
    checkpoint_saver = tf.train.Saver(max_to_keep=0)

    with tf.Session() as sess:
        if args.resume:
            # In case we're resuming, simply load the full checkpoint to init.
            last_checkpoint = tf.train.latest_checkpoint(args.experiment_root)
            log.info('Restoring from checkpoint: {}'.format(last_checkpoint))
            checkpoint_saver.restore(sess, last_checkpoint)
        else:
            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # We also store this initialization as a checkpoint, such that we
            # could run exactly reproduceable experiments.
            checkpoint_saver.save(
                sess, os.path.join(args.experiment_root, 'checkpoint'),
                global_step=0)

        merged_summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(args.experiment_root, sess.graph)

        start_step = sess.run(global_step)
        log.info('Starting training from iteration {}.'.format(start_step))

        # `step` is read after the loop for the final checkpoint. Seed it so
        # that a loop with zero iterations (e.g. resuming an already finished
        # run) does not end in a NameError.
        step = start_step

        # Finally, here comes the main-loop. This `Uninterrupt` is a handy
        # utility such that an iteration still finishes on Ctrl+C and we can
        # stop the training cleanly.
        with utils.Uninterrupt(sigs=[SIGINT, SIGTERM], verbose=True) as u:
            for i in range(start_step, args.train_iterations):
                # Compute gradients, update weights, store logs!
                start_time = time.time()
                _, summary, step = sess.run(
                    [train_op, merged_summary, global_step])
                elapsed_time = time.time() - start_time

                # Compute the iteration speed and add it to the summary.
                # We did observe some weird spikes that we couldn't track down.
                summary2 = tf.Summary()
                summary2.value.add(tag='secs_per_iter',
                                   simple_value=elapsed_time)
                summary_writer.add_summary(summary2, step)
                summary_writer.add_summary(summary, step)

                # Save a checkpoint of training every so often.
                if (args.checkpoint_frequency > 0 and
                        step % args.checkpoint_frequency == 0):
                    checkpoint_saver.save(
                        sess, os.path.join(args.experiment_root, 'checkpoint'),
                        global_step=step)

                # Stop the main-loop at the end of the step, if requested.
                if u.interrupted:
                    log.info('Interrupted on request!')
                    break

        # Store one final checkpoint. This might be redundant, but it is
        # crucial in case intermediate storing was disabled and it saves a
        # checkpoint when the process was interrupted.
        checkpoint_saver.save(
            sess, os.path.join(args.experiment_root, 'checkpoint'),
            global_step=step)
def head(endpoints, embedding_dim, is_training):
    """Attach a multi-branch attention head and embedding layers to `endpoints`.

    One attention mask per branch is produced by `attention_branch` from
    `endpoints['Mixed_7d']`; each mask re-weights that feature map as
    `(1 + mask) * features`. `cosine_similarity` is called once for every
    unordered pair of masks. The re-weighted maps are concatenated along
    axis 3, averaged over axes 1 and 2, and followed by two fully connected
    layers (1024 units, then `embedding_dim` units).

    `head_num`, `attention_branch` and `cosine_similarity` are not defined in
    this function; they must be available from the enclosing module.

    Args:
        endpoints: Dict of tensors; must contain 'Mixed_7d'. Modified in place.
        embedding_dim: Number of units of the final 'emb' layer.
        is_training: Forwarded to batch normalization as `is_training`.

    Returns:
        `endpoints`, extended with 'attention_mask<i>' for every branch,
        'model_output', 'global_pool', 'head_output', 'emb' and 'emb_raw'.
    """
    batch_norm_params = {
        'decay': 0.9,
        'epsilon': 1e-5,
        'scale': True,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
        'fused': None,
        # Fix: without this entry, batch-norm layers created through the arg
        # scopes below use slim's default is_training=True even when the
        # caller builds the graph for inference.
        'is_training': is_training,
    }

    with slim.arg_scope(
            [slim.conv2d],
            weights_regularizer=slim.l2_regularizer(0.0),
            weights_initializer=slim.variance_scaling_initializer(),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            # NOTE(review): presumably attention_branch builds slim.conv2d
            # layers that pick up these scopes - confirm against its
            # definition.
            masks = []
            masked_maps = []
            for i in range(head_num):
                attention_branch_mask = attention_branch(
                    endpoints['Mixed_7d'], i)
                masks.append(attention_branch_mask)
                endpoints['attention_mask{}'.format(i)] = attention_branch_mask
                # (1 + mask) keeps the unmasked features and adds the
                # attended ones on top.
                masked_map = (1 + attention_branch_mask) * endpoints['Mixed_7d']
                masked_maps.append(masked_map)

            # Visit every unordered pair of masks exactly once.
            for i in range(head_num):
                for j in range(i + 1, head_num):
                    cosine_similarity(masks[i], masks[j],
                                      'constraint_{}{}'.format(i, j))

            _masked = tf.concat(masked_maps, axis=3, name='concated_mask')

    endpoints['model_output'] = endpoints['global_pool'] = tf.reduce_mean(
        _masked, [1, 2], name='_pool5', keep_dims=False)

    endpoints['head_output'] = slim.fully_connected(
        endpoints['model_output'], 1024,
        normalizer_fn=slim.batch_norm,
        normalizer_params={
            'decay': 0.9,
            'epsilon': 1e-5,
            'scale': True,
            'is_training': is_training,
            'updates_collections': tf.GraphKeys.UPDATE_OPS,
        })

    # 'emb' and 'emb_raw' refer to the same un-activated embedding tensor.
    endpoints['emb'] = endpoints['emb_raw'] = slim.fully_connected(
        endpoints['head_output'], embedding_dim,
        activation_fn=None,
        weights_initializer=tf.orthogonal_initializer(),
        scope='emb')

    return endpoints
def _recognition_network(self, sampler=None, log_likelihood_func=None): """x values -> samples from Q and return log Q(h|x).""" samples = {} reuse = None if not self.run_recognition_network else True # Set defaults if sampler is None: sampler = self._random_sample if log_likelihood_func is None: log_likelihood_func = lambda sample, log_params: ( U.binary_log_likelihood(sample['activation'], log_params)) logQ = [] if self.hparams.task in ['sbn', 'omni']: # Initialize the edge case samples[-1] = {'activation': self._x} if self.mean_xs is not None: samples[-1]['activation'] -= self.mean_xs # center the input samples[-1]['activation'] = (samples[-1]['activation'] + 1) / 2.0 with slim.arg_scope( [slim.fully_connected], weights_initializer=slim.variance_scaling_initializer(), variables_collections=[Q_COLLECTION]): for i in xrange(self.hparams.n_layer): # Set up the input to the layer input = 2.0 * samples[i - 1]['activation'] - 1.0 # Create the conditional distribution (output is the logits) h = self._create_transformation( input, n_output=self.hparams.n_hidden, reuse=reuse, scope_prefix='q_%d' % i) samples[i] = sampler(h, self.uniform_samples[i], i) logQ.append(log_likelihood_func(samples[i], h)) self.run_recognition_network = True return logQ, samples elif self.hparams.task == 'sp': # Initialize the edge case samples[-1] = { 'activation': tf.split(self._x, num_or_size_splits=2, axis=1)[0] } # top half of digit if self.mean_xs is not None: samples[-1]['activation'] -= np.split(self.mean_xs, 2, 0)[0] # center the input samples[-1]['activation'] = (samples[-1]['activation'] + 1) / 2.0 with slim.arg_scope( [slim.fully_connected], weights_initializer=slim.variance_scaling_initializer(), variables_collections=[Q_COLLECTION]): for i in xrange(self.hparams.n_layer): # Set up the input to the layer input = 2.0 * samples[i - 1]['activation'] - 1.0 # Create the conditional distribution (output is the logits) h = self._create_transformation( input, n_output=self.hparams.n_hidden, 
reuse=reuse, scope_prefix='q_%d' % i) samples[i] = sampler(h, self.uniform_samples[i], i) logQ.append(log_likelihood_func(samples[i], h)) self.run_recognition_network = True return logQ, samples
def head(endpoints, embedding_dim, is_training):
    """Attach a recurrent (GRU) attention head and embedding layers.

    `endpoints['Mixed_7d']` is projected to D channels with a 1x1 convolution
    and reshaped to `[-1, L, D]` (L = M * M locations). A GRU cell is then
    unrolled for `attention_steps` steps; at each step a softmax over the L
    locations is computed from the GRU state and used to re-weight the
    projected features. The projected features and the re-weighted maps of
    all but the last step are concatenated on the channel axis, reshaped to
    `[-1, M, M, attention_steps * D]`, averaged over axes 1 and 2, and fed to
    two fully connected layers (1024 units, then `embedding_dim` units).

    Args:
        endpoints: Dict of tensors; must contain 'Mixed_7d'. Modified in place.
            NOTE(review): the reshape assumes 'Mixed_7d' has M x M = 5 x 5
            spatial positions - confirm against the backbone.
        embedding_dim: Number of units of the final 'emb' layer.
        is_training: Forwarded to batch normalization as `is_training`.

    Returns:
        `endpoints`, extended with 'model_output', 'global_pool',
        'head_output', 'emb' and 'emb_raw'.
    """
    M = 5
    L = M * M
    D = 64
    dim = [L, D]
    attention_steps = 16

    batch_norm_params = {
        'decay': 0.9,
        'epsilon': 1e-5,
        'scale': True,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
        'fused': None,
        # Fix: without this entry, every batch-norm layer created by the
        # conv / fully connected layers below uses slim's default
        # is_training=True, i.e. batch statistics even at inference.
        'is_training': is_training,
    }

    with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            weights_regularizer=slim.l2_regularizer(0.0),
            weights_initializer=slim.variance_scaling_initializer(),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            attention_branch_conv = slim.conv2d(
                endpoints['Mixed_7d'], dim[1], [1, 1],
                scope='attention_branch_conv')

            # One D-dimensional feature vector per spatial location.
            features = tf.reshape(attention_branch_conv,
                                  [-1, dim[0], dim[1]],
                                  name='attention_branch_features')

            # Per-location projection that is added to the projected GRU
            # state at every step.
            a_i = tf.reshape(features, [-1, dim[1]])
            a_i = slim.fully_connected(inputs=a_i,
                                       num_outputs=dim[1],
                                       biases_initializer=None,
                                       scope='a_i')
            a_i = tf.reshape(a_i, [-1, dim[0], dim[1]])

            gru_cell = tf.contrib.rnn.GRUCell(num_units=dim[1])

            # The mean feature over all locations is both the first GRU
            # input and the input of the learned initial state.
            _input = tf.reduce_mean(features, 1)
            state = slim.fully_connected(inputs=_input,
                                         num_outputs=D,
                                         biases_initializer=None,
                                         scope='init_state')

            _masked = [features]
            with tf.variable_scope("GRU_Attention"):
                for i in range(attention_steps):
                    if i > 0:
                        # Share the GRU / projection weights across steps.
                        tf.get_variable_scope().reuse_variables()

                    # state is in shape (?, 64)
                    _, state = gru_cell(_input, state)
                    h = tf.expand_dims(
                        slim.fully_connected(inputs=state,
                                             num_outputs=dim[1],
                                             biases_initializer=None,
                                             scope='hidden2h'), 1)
                    e = tf.reshape(tf.add(a_i, h), [-1, dim[1]])
                    _att = slim.fully_connected(inputs=e,
                                                num_outputs=1,
                                                scope='e2attention')
                    # Attention weights over the L locations.
                    _alpha = tf.nn.softmax(tf.reshape(_att, [-1, dim[0]]))
                    _mask = tf.multiply(features, tf.expand_dims(_alpha, 2))
                    _masked.append(_mask)
                    # The attended feature is the next GRU input.
                    _input = tf.reduce_sum(_mask, 1)

            # `_masked` holds attention_steps + 1 maps; the last step's map
            # is dropped so that the channel count is attention_steps * D.
            _mask_concat = tf.concat(_masked[:-1], 2)
            _masked = tf.reshape(_mask_concat,
                                 [-1, M, M, attention_steps * dim[1]])

    # NOTE(review): the original indentation was lost; the layers below are
    # assumed to sit outside the arg scopes above (otherwise 'emb' would also
    # receive batch normalization) - confirm.
    endpoints['model_output'] = endpoints['global_pool'] = tf.reduce_mean(
        _masked, [1, 2], name='_pool5', keep_dims=False)

    endpoints['head_output'] = slim.fully_connected(
        endpoints['model_output'], 1024,
        normalizer_fn=slim.batch_norm,
        normalizer_params={
            'decay': 0.9,
            'epsilon': 1e-5,
            'scale': True,
            'is_training': is_training,
            'updates_collections': tf.GraphKeys.UPDATE_OPS,
        })

    # 'emb' and 'emb_raw' refer to the same un-activated embedding tensor.
    endpoints['emb'] = endpoints['emb_raw'] = slim.fully_connected(
        endpoints['head_output'], embedding_dim,
        activation_fn=None,
        weights_initializer=tf.orthogonal_initializer(),
        scope='emb')

    return endpoints
def build_bisenet3(inputs, num_classes, preset_model='DepthwiseAAFF',
                   frontend="xception", weight_decay=1e-5, is_training=True,
                   pretrained_dir="models"):
    """Build the BiSeNet-style segmentation graph with two context branches.

    A spatial path (one ConvBlock, two separable convolutions, one 1x1
    ConvBlock) is fused with two structurally identical context branches
    built on the frontend's 'pool3', 'pool4' and 'pool5' endpoints.

    Args:
        inputs: Input tensor fed to both the spatial path and the frontend.
        num_classes: Number of output channels of the final 'logits' conv.
        preset_model: Not used inside this function.
        frontend: Frontend name passed to `frontend_builder.build_frontend`.
        weight_decay: Not used inside this function.
        is_training: Passed to `frontend_builder.build_frontend`.
        pretrained_dir: Passed to `frontend_builder.build_frontend`.

    Returns:
        `(net, init_fn)`: the upsampled per-class output tensor and the
        `init_fn` returned by the frontend builder.
    """
    initializer = slim.variance_scaling_initializer(
        factor=2.0, mode='FAN_IN', uniform=False)

    # Filters of the two separable convolutions in the spatial path.
    # The number of feature maps for each convolution is not specified in the
    # paper; it was chosen to equal the number of feature maps of a
    # classification model at each corresponding stage.
    point_filter1 = tf.get_variable(name="point_filter1",
                                    shape=(1, 1, 64, 128),
                                    initializer=initializer)
    point_filter2 = tf.get_variable(name="point_filter2",
                                    shape=(1, 1, 128, 256),
                                    initializer=initializer)
    filter1 = tf.get_variable(name="filter1",
                              shape=(3, 3, 64, 1),
                              initializer=initializer)
    filter2 = tf.get_variable(name="filter2",
                              shape=(3, 3, 128, 1),
                              initializer=initializer)

    # --- Spatial path -------------------------------------------------------
    spatial_net = ConvBlock(inputs, n_filters=64, kernel_size=[3, 3],
                            strides=2)
    for depthwise, pointwise in ((filter1, point_filter1),
                                 (filter2, point_filter2)):
        spatial_net = tf.nn.separable_conv2d(input=spatial_net,
                                             depthwise_filter=depthwise,
                                             pointwise_filter=pointwise,
                                             strides=[1, 2, 2, 1],
                                             rate=[1, 1],
                                             padding='SAME')
    spatial_net = ConvBlock(spatial_net, n_filters=32, kernel_size=[1, 1])

    # --- Context path -------------------------------------------------------
    _, end_points, _, init_fn = frontend_builder.build_frontend(
        inputs, frontend, pretrained_dir=pretrained_dir,
        is_training=is_training)

    # Not consumed below; retained so the constructed graph is unchanged.
    size = tf.shape(end_points['pool5'])[1:3]

    def context_branch():
        # Fuse pool3/pool4, then pool5, upsample twice and refine.
        shallow = AttentionAndFeatureFussion(end_points['pool3'],
                                             end_points['pool4'], 64)
        deep = AttentionAndFeatureFussion(shallow, end_points['pool5'], 128)
        deep = Upsampling(deep, scale=2)
        merged = tf.concat([shallow, deep], axis=-1)
        merged = Upsampling(merged, scale=2)
        merged = tf.concat([merged, end_points['pool3']], axis=-1)
        merged = ConvBlock(merged, n_filters=128, kernel_size=[1, 1],
                           strides=1)
        return AttentionRefinementModule(merged, n_filters=128)

    context_path_left = context_branch()
    context_path_right = context_branch()

    # --- Combining the paths ------------------------------------------------
    net = FeatureFusionModule(input_1=context_path_left,
                              input_2=context_path_right,
                              input_3=spatial_net,
                              n_filters=256)
    net = ConvBlock(net, n_filters=64, kernel_size=[3, 3])

    # --- Final upscaling: upsampling + dilated conv, then the classifier ----
    net = Upsampling(net, scale=2)
    net = slim.conv2d(net, 64, [3, 3],
                      rate=2,
                      activation_fn=tf.nn.relu,
                      biases_initializer=None,
                      normalizer_fn=slim.batch_norm)
    net = slim.conv2d(net, num_classes, [1, 1],
                      activation_fn=None,
                      scope='logits')
    net = Upsampling(net, 4)

    return net, init_fn
def convolutional_alexnet_arg_scope(embed_config, trainable=True,
                                    is_training=False):
    """Defines the default arg scope.

    Args:
        embed_config: A dictionary which contains configurations for the
            embedding function. The keys read here are 'use_bn', 'bn_scale',
            'bn_momentum', 'bn_epsilon', 'weight_decay' and 'init_method'.
        trainable: If the weights in the embedding function is trainable.
        is_training: If the embedding function is built for training.

    Returns:
        An `arg_scope` to use for the convolutional_alexnet models.
    """
    # Only consider the model to be in training mode if it's trainable.
    # This is vital for batch_norm since moving_mean and moving_variance
    # will get updated even if not trainable.
    is_model_training = trainable and is_training

    normalizer_fn = None
    batch_norm_params = {}
    if get(embed_config, 'use_bn', True):
        bn_scale = get(embed_config, 'bn_scale', True)
        bn_decay = 1 - get(embed_config, 'bn_momentum', 3e-4)
        bn_epsilon = get(embed_config, 'bn_epsilon', 1e-6)
        normalizer_fn = slim.batch_norm
        batch_norm_params = {
            "scale": bn_scale,
            # Decay for the moving averages.
            "decay": bn_decay,
            # Epsilon to prevent 0s in variance.
            "epsilon": bn_epsilon,
            "trainable": trainable,
            "is_training": is_model_training,
            # Collection containing the moving mean and moving variance.
            "variables_collections": {
                "beta": None,
                "gamma": None,
                "moving_mean": ["moving_vars"],
                "moving_variance": ["moving_vars"],
            },
            # Ensure that updates are done within a frame.
            'updates_collections': None,
        }

    weight_decay = get(embed_config, 'weight_decay', 5e-4)
    # Frozen weights are not regularized.
    weights_regularizer = (
        slim.l2_regularizer(weight_decay) if trainable else None)

    init_method = get(embed_config, 'init_method', 'kaiming_normal')
    if is_model_training:
        logging.info('embedding init method -- {}'.format(init_method))
    if init_method != 'kaiming_normal':
        initializer = slim.xavier_initializer()
    else:
        # The same setting as siamese-fc
        initializer = slim.variance_scaling_initializer(
            factor=2.0, mode='FAN_OUT', uniform=False)

    conv_defaults = dict(
        weights_regularizer=weights_regularizer,
        weights_initializer=initializer,
        padding='VALID',
        trainable=trainable,
        activation_fn=tf.nn.relu,
        normalizer_fn=normalizer_fn,
        normalizer_params=batch_norm_params,
    )
    # Only slim.conv2d is covered; no slim.separable_conv2d.
    with slim.arg_scope([slim.conv2d], **conv_defaults):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            with slim.arg_scope([slim.batch_norm],
                                is_training=is_model_training) as arg_sc:
                return arg_sc
def inception(input, is_training):
    """Small inception-style two-class classifier.

    Three 3x3 convolutions and a 2x2 max pool are followed by two inception
    blocks ('Mixed_1', 'Mixed_2'), a 1x1 convolution to 2 channels and an
    average pool over the full spatial extent.

    Args:
        input: Input tensor passed to the first convolution.
        is_training: Forwarded to batch normalization and to the dropout
            arg scope.

    Returns:
        The softmax of the `[-1, 2]` reshaped network output. (The local
        variable is named `logits`, but softmax has already been applied.)
    """
    weight_decay = 0.0005
    keep_prob = 0.5

    # Batch normalization parameter definitions.
    batch_norm_params = {
        'decay': 0.996,
        'epsilon': 1e-5,
        'scale': True,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
        'is_training': is_training,
    }

    def mixed(features, scope, depth_1x1, depths_3x3, depths_5x5, depth_pool):
        # One inception block: 1x1, 1x1->3x3, 1x1->5x5 and pool->1x1
        # branches concatenated on the channel axis.
        with tf.variable_scope(scope):
            with tf.variable_scope('Branch_0'):
                branch_0 = slim.conv2d(features, depth_1x1, [1, 1])
            with tf.variable_scope('Branch_1'):
                branch_1 = slim.conv2d(features, depths_3x3[0], [1, 1])
                branch_1 = slim.conv2d(branch_1, depths_3x3[1], [3, 3])
            with tf.variable_scope('Branch_2'):
                branch_2 = slim.conv2d(features, depths_5x5[0], [1, 1])
                branch_2 = slim.conv2d(branch_2, depths_5x5[1], [5, 5])
            with tf.variable_scope('Branch_3'):
                branch_3 = slim.max_pool2d(features, [2, 2],
                                           stride=[1, 1],
                                           padding='SAME')
                branch_3 = slim.conv2d(branch_3, depth_pool, [1, 1])
            return tf.concat([branch_0, branch_1, branch_2, branch_3],
                             axis=3)

    # CNN architecture.
    # NOTE(review): the original indentation was lost. The Mixed blocks are
    # placed outside the max_pool2d kernel_size scope because their
    # max_pool2d calls pass the kernel size positionally, which would clash
    # with a scoped `kernel_size` keyword - confirm the nesting below.
    with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            activation_fn=tf.nn.relu,
            weights_regularizer=slim.l2_regularizer(weight_decay),
            weights_initializer=slim.variance_scaling_initializer(),
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.dropout],
                            keep_prob=keep_prob,
                            is_training=is_training):
            with slim.arg_scope([slim.max_pool2d],
                                kernel_size=[2, 2],
                                stride=[2, 2]):
                with slim.arg_scope([slim.conv2d], padding='SAME'):
                    net = input
                    for depth in (4, 8, 16):
                        net = slim.conv2d(net, depth, [3, 3])
                    net = slim.max_pool2d(net)

            with slim.arg_scope(
                    [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                    stride=1,
                    padding='SAME'):
                net = mixed(net, 'Mixed_1', 8, (16, 32), (8, 16), 16)
                net = mixed(net, 'Mixed_2', 16, (32, 64), (32, 64), 32)

            # Project to the two output channels, then pool the whole map.
            net = slim.conv2d(net, 2, [1, 1], activation_fn=None)
            net = slim.avg_pool2d(net,
                                  kernel_size=[net.shape[1], net.shape[2]],
                                  stride=[1, 1],
                                  padding='VALID')

    net = tf.reshape(net, [-1, 2])
    logits = tf.nn.softmax(net)
    return logits
def network(input, is_training, base_channel_count=48, bottleneck_blocks=False,
            separable_conv=False, gn_groups=None, gn_channels=None):
    '''ResNet v2 style semantic segmentation network with long range skips.

    The encoder runs four residual stages, storing each stage output as a
    skip connection and halving the resolution with max pooling after each.
    Two further residual blocks follow. The decoder then, four times, doubles
    the resolution with nearest-neighbour resizing, crops to the matching
    skip's spatial size, concatenates the skip and applies two residual
    blocks.

    Args:
        input: Input tensor fed to the first 3x3 convolution.
        is_training: Passed to batch normalization when batch norm is used.
        base_channel_count: Output channels of the first convolution; also
            passed to every `res_block_v2`.
        bottleneck_blocks: Passed to `res_block_v2` as `bottleneck`.
        separable_conv: If True, `slim.separable_conv2d` is handed to the
            residual blocks instead of `slim.conv2d`.
        gn_groups: Passed as `group_count` to `tf_utils.group_normalization`.
        gn_channels: Passed as `channel_count` to
            `tf_utils.group_normalization`. If either `gn_groups` or
            `gn_channels` is not None, group normalization replaces batch
            normalization as the `norm_op` of the residual blocks.

    Returns:
        The feature tensor after the final normalization and ReLU.
    '''
    conv2d_params = {
        'padding': 'SAME',
        'weights_initializer': slim.variance_scaling_initializer(),
        'biases_initializer': None,
        'activation_fn': None,
        'normalizer_fn': None
    }

    use_group_norm = gn_groups is not None or gn_channels is not None
    if use_group_norm:
        norm_op = tf_utils.group_normalization
        norm_params = {
            'group_count': gn_groups,
            'channel_count': gn_channels
        }
    else:
        norm_op = slim.batch_norm
        norm_params = {
            'center': True,
            'scale': True,
            'decay': 0.9,
            'epsilon': 1e-5,
            'is_training': is_training
        }

    if separable_conv:
        conv_op = slim.separable_conv2d
        separable_conv2d_params = dict(conv2d_params)
        separable_conv2d_params['depth_multiplier'] = 1
    else:
        conv_op = slim.conv2d
        separable_conv2d_params = {}

    def res_stage(features, first_index, block_count, multiplier):
        # Apply `block_count` residual blocks with consecutive scope numbers.
        for index in range(first_index, first_index + block_count):
            features = res_block_v2(features, base_channel_count, conv_op,
                                    norm_op,
                                    channel_multiplier=multiplier,
                                    scope='resblock_v2_%d' % index,
                                    bottleneck=bottleneck_blocks)
        return features

    # (first scope index, number of blocks, channel multiplier)
    encoder_stages = ((1, 2, 1), (3, 3, 2), (6, 4, 4), (10, 2, 8))
    # (first scope index, channel multiplier); always two blocks per stage.
    decoder_stages = ((14, 4), (16, 4), (18, 2), (20, 1))

    with slim.arg_scope([slim.conv2d], **conv2d_params):
        with slim.arg_scope([slim.separable_conv2d],
                            **separable_conv2d_params):
            with slim.arg_scope([norm_op], **norm_params):
                # First convolution to increase the channel count.
                net = slim.conv2d(input, base_channel_count, [3, 3],
                                  scope='input_conv')

                # Encoder: each stage output is stored for the skip
                # connection, then pooled to 1/2, 1/4, 1/8 and 1/16 res.
                skips = []
                for first_index, block_count, multiplier in encoder_stages:
                    net = res_stage(net, first_index, block_count, multiplier)
                    skips.append(net)
                    net = slim.max_pool2d(net, [2, 2], padding='SAME')

                # 2 ResBlocks at the lowest resolution.
                net = res_stage(net, 12, 2, 8)

                # Decoder: unpool, crop and concatenate the skip connection
                # (deepest skip first), followed by 2 ResBlocks.
                for (first_index, multiplier), skip in zip(decoder_stages,
                                                           reversed(skips)):
                    net = tf.image.resize_nearest_neighbor(
                        net, [tf.shape(net)[1] * 2, tf.shape(net)[2] * 2])
                    # 'SAME' pooling may have rounded the size up, so the
                    # upsampled map is cropped to the skip's size.
                    net = net[:, :tf.shape(skip)[1], :tf.shape(skip)[2], :]
                    net = tf.concat([net, skip], axis=-1)
                    net = res_stage(net, first_index, 2, multiplier)

                # Final batchnorm and relu before the prediction.
                # NOTE(review): this is slim.batch_norm even when group
                # normalization was selected; in that case the arg scope
                # above targets norm_op, so no scoped parameters (including
                # is_training) reach this call - confirm this is intended.
                net = slim.batch_norm(net)
                net = tf.nn.relu(net)

    return net
def build_graph(reader, model, eval_data_pattern, label_loss_fn,
                batch_size=1024, num_readers=1):
    """Creates the Tensorflow graph for evaluation.

    The same rescaled input is fed to one model copy per tower; the tower
    logits are combined with softmax mixing weights predicted from the
    per-example feature mean, and the sigmoid of the mixed logits is used as
    the prediction. Results are exposed through graph collections
    ("global_step", "loss", "predictions", "input_batch", "input_batch_raw",
    "video_id_batch", "num_frames", "labels", optionally "label_weights",
    and "summary_op").

    Args:
        reader: The data file reader. It should inherit from BaseReader.
        model: The core model (e.g. logistic or neural net). It should
            inherit from BaseModel.
        eval_data_pattern: glob path to the evaluation data files.
        label_loss_fn: What kind of loss to apply to the model. It should
            inherit from BaseLoss.
        batch_size: How many examples to process at a time.
        num_readers: How many threads to use for I/O operations.
    """
    global_step = tf.Variable(0, trainable=False, name="global_step")

    input_data_dict = get_input_evaluation_tensors(reader,
                                                   eval_data_pattern,
                                                   batch_size=batch_size,
                                                   num_readers=num_readers)
    video_id_batch = input_data_dict["video_ids"]
    model_input_raw = input_data_dict["video_matrix"]
    labels_batch = input_data_dict["labels"]
    num_frames = input_data_dict["num_frames"]
    tf.summary.histogram("model_input_raw", model_input_raw)

    # Pick the devices: at most FLAGS.num_gpu local GPUs, else one CPU tower.
    gpus = [device.name
            for device in device_lib.list_local_devices()
            if device.device_type == "GPU"][:FLAGS.num_gpu]
    if len(gpus) > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = len(gpus)
        device_string = "/gpu:%d"
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = "/cpu:%d"
    print("flags!!!", device_string)

    label_weights = (input_data_dict["label_weights"]
                     if FLAGS.segment_labels else None)

    # Rescale the raw input: the first 1024 feature dimensions are shifted
    # by 4 / 512 and multiplied by sqrt(eigenvalue) + 1e-4; the remaining
    # 128 dimensions are passed through unchanged (offset 0, scale 1).
    scaled_dims = 1024
    passthrough_dims = 128
    offset = tf.constant(
        np.array([4. / 512] * scaled_dims + [0] * passthrough_dims),
        dtype=tf.float32)
    eigen_val = tf.constant(
        np.sqrt(np.load("yt8m_pca/eigenvals.npy")[:scaled_dims, 0]),
        dtype=tf.float32)
    scale = tf.pad(eigen_val + 1e-4, [[0, passthrough_dims]],
                   constant_values=1.)
    model_input = tf.multiply(model_input_raw - offset, scale)

    # One model copy per tower, each in its own variable scope.
    tower_logits = []
    for tower in range(num_towers):
        with tf.device(device_string % tower), \
                tf.variable_scope("tower_%d" % tower, reuse=False):
            result = model.create_model(model_input,
                                        num_frames=num_frames,
                                        vocab_size=reader.num_classes,
                                        labels=labels_batch,
                                        is_training=False)
            tower_logits.append(result["logits"])

    # NOTE(review): the original indentation was lost; the split between the
    # "ensemble" variable scope, the device block and function level below
    # is a best guess - confirm (it affects op/summary name prefixes only,
    # since no variables are created after the mixing layer).
    with tf.device(device_string % 0):
        with tf.variable_scope("ensemble"):
            ftr_mean = tf.reduce_mean(model_input, axis=1)
            print("ftr mean shape: ", ftr_mean.get_shape().as_list())
            ftr_mean = slim.batch_norm(ftr_mean,
                                       center=True,
                                       scale=True,
                                       fused=False,
                                       is_training=False,
                                       scope="mix_weights_bn")
            mix_weights = slim.fully_connected(
                ftr_mean,
                num_towers,
                activation_fn=None,
                weights_initializer=slim.variance_scaling_initializer(),
                scope="mix_weights")
            mix_weights = tf.nn.softmax(mix_weights, axis=-1)
            tf.summary.histogram("mix_weights", mix_weights)

        # Weighted sum of the tower logits along the tower axis.
        stacked_logits = tf.stack(tower_logits, axis=1)
        weighted_logits = tf.multiply(stacked_logits,
                                      tf.expand_dims(mix_weights, axis=-1))
        final_logit = tf.reduce_sum(weighted_logits, axis=1, keepdims=False)
        final_predictions = tf.nn.sigmoid(final_logit)
        final_label_loss = label_loss_fn.calculate_loss(
            final_predictions, labels_batch, label_weights=label_weights)
        tf.summary.scalar("label_loss", final_label_loss)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", final_label_loss)
    tf.add_to_collection("predictions", final_predictions)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("video_id_batch", video_id_batch)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    if FLAGS.segment_labels:
        tf.add_to_collection("label_weights",
                             input_data_dict["label_weights"])
    tf.add_to_collection("summary_op", tf.summary.merge_all())
def net_structure(img1, img2):
    """Build the correlation-based flow network for an image pair.

    Both images pass through the same three strided convolutions (weights
    shared via `reuse=True`), the resulting feature maps are
    cross-correlated, and a contracting part followed by a refinement part
    predicts flow at five scales.

    `weight_decay`, `LeakyReLU`, `pad`, `antipad`, `correlation`,
    `img_height` and `img_width` are not defined in this function; they must
    be available from the enclosing scope.

    Args:
        img1: First image tensor.
        img2: Second image tensor.

    Returns:
        Dict with 'predict_flow6' ... 'predict_flow2' (the per-scale
        predictions) and 'flow' (`predict_flow2 * 20.0`, bilinearly resized
        to `[img_height, img_width]`).
    """
    weights_regularizer = slim.l2_regularizer(weight_decay)

    def siamese_tower(image, shared):
        # Three stride-2 convolutions; the second call reuses the variables.
        # `reuse` is only passed when sharing, exactly as in the two
        # hand-written towers this replaces.
        extra = {'reuse': True} if shared else {}
        conv1 = slim.conv2d(pad(image, 3), 64, 7, scope='conv1', **extra)
        conv2 = slim.conv2d(pad(conv1, 2), 128, 5, scope='conv2', **extra)
        conv3 = slim.conv2d(pad(conv2, 2), 256, 5, scope='conv3', **extra)
        return conv2, conv3

    def predict(features, level):
        # Linear 3x3 convolution producing a 2-channel flow estimate.
        return slim.conv2d(pad(features), 2, 3,
                           scope='predict_flow%d' % level,
                           activation_fn=None)

    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                        # He (aka MSRA) weight initialization
                        weights_initializer=slim.variance_scaling_initializer(),
                        activation_fn=LeakyReLU,
                        # We will do our own padding to match the original
                        # Caffe code
                        padding='VALID'):
        with slim.arg_scope([slim.conv2d],
                            weights_regularizer=weights_regularizer):
            with slim.arg_scope([slim.conv2d], stride=2):
                conv_a_2, conv_a_3 = siamese_tower(img1, shared=False)
                _, conv_b_3 = siamese_tower(img2, shared=True)

                # Compute cross correlation with leaky relu activation
                cc = correlation.correlation(conv_a_3, conv_b_3,
                                             1, 20, 1, 2, 20)
                cc_relu = LeakyReLU(cc)

            # Combine cross correlation results with convolution of
            # feature map A, concatenated along the channels axis.
            netA_conv = slim.conv2d(conv_a_3, 32, 1, scope='conv_redir')
            net = tf.concat([netA_conv, cc_relu], axis=3)

            conv3_1 = slim.conv2d(pad(net), 256, 3, scope='conv3_1')
            with slim.arg_scope([slim.conv2d],
                                num_outputs=512, kernel_size=3):
                conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4')
                conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1')
                conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5')
                conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1')
            conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2,
                                scope='conv6')
            conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1')

            # START: Refinement Network
            with slim.arg_scope([slim.conv2d_transpose],
                                biases_initializer=None):
                flows = {6: predict(conv6_1, 6)}
                features = conv6_1
                # (target level, skip connection, deconv output channels)
                stages = ((5, conv5_1, 512),
                          (4, conv4_1, 256),
                          (3, conv3_1, 128),
                          (2, conv_a_2, 64))
                for level, skip, depth in stages:
                    deconv = antipad(slim.conv2d_transpose(
                        features, depth, 4,
                        stride=2,
                        scope='deconv%d' % level))
                    upsampled_flow = antipad(slim.conv2d_transpose(
                        flows[level + 1], 2, 4,
                        stride=2,
                        scope='upsample_flow%dto%d' % (level + 1, level),
                        activation_fn=None))
                    features = tf.concat([skip, deconv, upsampled_flow],
                                         axis=3)
                    flows[level] = predict(features, level)
            # END: Refinement Network

            flow = flows[2] * 20.0
            # TODO: Look at Accum (train) or Resample (deploy) to see if we
            # need to do something different
            flow = tf.image.resize_bilinear(flow,
                                            tf.stack([img_height, img_width]),
                                            align_corners=True)

            return {
                'predict_flow6': flows[6],
                'predict_flow5': flows[5],
                'predict_flow4': flows[4],
                'predict_flow3': flows[3],
                'predict_flow2': flows[2],
                'flow': flow,
            }
def netbody(img, reuse=False):
    """Build the prunable residual network and return its logits.

    For every block name in `self.block` a non-trainable 'prune' variable of
    shape `(n, 1, 1, chnl)` (initialised to ones) is created together with a
    placeholder and an assign op, all stored in `self.prune`. Each residual
    unit multiplies its input and its output by the unit's slice of that
    variable.

    `self`, `is_training` and `logger` are taken from the enclosing scope.

    Args:
        img: Input tensor.
        reuse: Passed to the 'ResNet' and per-unit variable scopes and to
            the normalizer parameters.

    Returns:
        The output of the final 'fc' layer (`self.num_classes` units, no
        activation).
    """
    if self.bn:
        normalizer_fn = self.BN
        normalizer_params = {
            'is_training': is_training,
            'decay': self.bn_decay,
            'reuse': reuse
        }
    else:
        normalizer_fn = None
        normalizer_params = None

    with tf.variable_scope('ResNet', reuse=reuse):
        with slim.arg_scope(
                [slim.conv2d],
                padding='SAME',
                kernel_size=[3, 3],
                activation_fn=tf.nn.relu,
                weights_initializer=slim.variance_scaling_initializer(),
                normalizer_fn=normalizer_fn,
                normalizer_params=normalizer_params):
            net = slim.conv2d(img, self.chnl['block1'], scope='conv1')
            shortcut = net

            for blk, name in enumerate(self.block):
                n = self.n[name]
                chnl = self.chnl[name]
                with tf.variable_scope(name):
                    # Pruning mask plus the placeholder/assign pair used to
                    # overwrite it from outside the graph.
                    self.prune[name] = tf.Variable(
                        np.ones((n, 1, 1, chnl), dtype=np.float32),
                        trainable=False,
                        name='prune')
                    self.prune['ph' + name] = tf.placeholder(
                        tf.float32, shape=[n, 1, 1, chnl])
                    self.prune['asn' + name] = tf.assign(
                        self.prune[name], self.prune['ph' + name])
                    # One mask slice per residual unit.
                    unit_masks = tf.split(self.prune[self.block[blk]], n)
                    logger.info(name)

                    for i in range(n):
                        unit_mask = unit_masks[i]
                        with tf.variable_scope('unit' + str(i), reuse=reuse):
                            if blk != 0 and i == 0:
                                # Shortcut without additional parameters or
                                # computation: halve the resolution and
                                # double the channels by zero padding.
                                shortcut = tf.nn.avg_pool(
                                    shortcut, [1, 2, 2, 1], [1, 2, 2, 1],
                                    'SAME')
                                shortcut = tf.concat(
                                    [shortcut, shortcut * 0.], 3)
                                net = shortcut
                            net = net * unit_mask
                            net = slim.conv2d(net, int(chnl / self.rate))
                            net = slim.conv2d(net, chnl, activation_fn=None)
                            net = net * unit_mask
                            shortcut = tf.nn.relu(shortcut + net)
                            net = shortcut

            # Global average pooling over the spatial axes.
            net = tf.reduce_mean(shortcut, [1, 2],
                                 keep_dims=False,
                                 name='pool')
            logit = slim.fully_connected(net,
                                         self.num_classes,
                                         activation_fn=None,
                                         normalizer_fn=None,
                                         scope='fc')
    return logit
def main():
    """Evaluate a trained segmentation network and store the scores.

    Steps performed:
      1. Parse the command line and fill in any argument that is missing from
         it with the value stored in ``<experiment_root>/args.json``.
      2. Build a ``tf.data`` pipeline over the evaluation set that yields
         ``((image, label), label_filename)`` batches.
      3. Restore the requested checkpoint, run the network on every batch and
         accumulate a confusion matrix on the full resolution labels.
      4. Optionally write prediction images below
         ``<experiment_root>/results-<iteration>``.
      5. Print the confusion matrix and merge the scores into
         ``<experiment_root>/results.json`` under the iteration number.

    Raises:
        IOError: if ``args.json`` does not exist in the experiment root.
    """
    args = parser.parse_args()

    # Parse original info from the experiment root and add new ones.
    args_file = os.path.join(args.experiment_root, 'args.json')
    if not os.path.isfile(args_file):
        raise IOError('`args.json` not found in {}'.format(args_file))

    print('Loading args from {}.'.format(args_file))
    with open(args_file, 'r') as f:
        args_resumed = json.load(f)

    # Command line values win; stored values only fill the gaps.
    for key, value in args_resumed.items():
        if key not in args.__dict__:
            args.__dict__[key] = value

    # Load the config for the dataset.
    with open(args.dataset_config, 'r') as f:
        dataset_config = json.load(f)

    # Compute the label to color map. An extra black entry is appended and
    # the channel order is reversed because the images are written with cv2.
    id_to_rgb = np.asarray(
        dataset_config['rgb_colors'] + [(0, 0, 0)], dtype=np.uint8)[:, ::-1]

    # If we map from original labels to train labels we have to invert this.
    # Without a mapping the identity is used.
    original_to_train_mapping = dataset_config.get(
        'original_to_train_mapping', None)
    train_to_label_id = np.arange(len(id_to_rgb) - 1, dtype=np.uint8)
    if original_to_train_mapping is not None:
        for label_id, label_train in enumerate(original_to_train_mapping):
            if label_train != -1:
                train_to_label_id[label_train] = label_id

    # Setup the input data.
    image_files, label_files = utils.load_dataset(
        args.eval_set, args.rgb_input_root, args.full_res_label_root)

    images = tf.data.Dataset.from_tensor_slices(image_files)
    labels = tf.data.Dataset.from_tensor_slices(label_files)
    dataset = tf.data.Dataset.zip((images, labels))

    dataset = dataset.map(
        lambda x, y: tf_utils.string_tuple_to_image_pair(
            x, y, original_to_train_mapping),
        num_parallel_calls=args.loading_threads)

    # Keep the label file name next to the decoded pair so that predictions
    # can be stored under a matching path.
    dataset = tf.data.Dataset.zip((dataset, labels))

    # Scale the input images
    dataset = dataset.map(lambda x, y: (((x[0] - 128.0) / 128.0, x[1]), y))

    dataset = dataset.batch(args.batch_size)

    # Overlap producing and consuming for parallelism.
    dataset = dataset.prefetch(1)

    # The dataset is not repeated: a one-shot iterator visits every sample
    # once and then raises OutOfRangeError, which ends the loop below.
    (image_batch, label_batch), label_name_batch = (
        dataset.make_one_shot_iterator().get_next())

    # Setup the network.
    model = import_module('networks.' + args.model_type)
    with tf.name_scope('model'):
        net = model.network(image_batch, is_training=False,
                            **args.model_params)

    logits = slim.conv2d(
        net, len(dataset_config['class_names']), [3, 3],
        scope='output_conv', activation_fn=None,
        weights_initializer=slim.variance_scaling_initializer(),
        biases_initializer=tf.zeros_initializer())
    predictions = tf.nn.softmax(logits)

    with tf.Session() as sess:
        # Determine the checkpoint location.
        checkpoint_loader = tf.train.Saver()
        if args.checkpoint_iteration == -1:
            # The default TF way to do this fails when moving folders.
            checkpoint = os.path.join(
                args.experiment_root,
                'checkpoint-{}'.format(args.train_iterations))
        else:
            checkpoint = os.path.join(
                args.experiment_root,
                'checkpoint-{}'.format(args.checkpoint_iteration))
        iteration = int(checkpoint.split('-')[-1])
        print('Restoring from checkpoint: {}'.format(checkpoint))
        checkpoint_loader.restore(sess, checkpoint)

        # Setup storage if needed. Strings are compared by value here; the
        # former identity test (`is not 'none'`) only worked by accident of
        # string interning.
        result_directory = os.path.join(
            args.experiment_root, 'results-{}'.format(iteration))
        if args.save_predictions != 'none':
            os.makedirs(result_directory, exist_ok=True)

        # Initialize the evaluation.
        evaluation = confusion.Confusion(dataset_config['class_names'])

        # Loop over image batches.
        for start_idx in count(step=args.batch_size):
            print('\rEvaluating batch {}-{}/{}'.format(
                start_idx, start_idx + args.batch_size, len(image_files)),
                flush=True, end='')
            try:
                preds_batch, gt_batch, gt_fn_batch = sess.run(
                    [predictions, label_batch, label_name_batch])
            except tf.errors.OutOfRangeError:
                print()  # Done!
                break

            for pred, gt, gt_fn in zip(preds_batch, gt_batch, gt_fn_batch):
                # Compute the scores on the label resolution.
                pred_full = np.argmax(
                    cv2.resize(pred, gt.shape[:2][::-1]), -1)
                evaluation.incremental_update(gt.squeeze(), pred_full)

                # Possibly save result images.
                if args.save_predictions == 'none':
                    continue
                if args.save_predictions == 'full':
                    pred_out = id_to_rgb[pred_full]
                elif args.save_predictions == 'out':
                    pred_out = id_to_rgb[np.argmax(pred, -1)]
                elif args.save_predictions == 'full_id':
                    pred_out = train_to_label_id[pred_full]
                elif args.save_predictions == 'out_id':
                    pred_out = train_to_label_id[np.argmax(pred, -1)]

                out_filename = gt_fn.decode("utf-8").replace(
                    args.full_res_label_root, result_directory)
                os.makedirs(os.path.dirname(out_filename), exist_ok=True)
                cv2.imwrite(out_filename, pred_out)

    # Print the evaluation.
    evaluation.print_confusion_matrix()

    # Save the results, keeping the entries of earlier evaluations.
    result_file = os.path.join(args.experiment_root, 'results.json')
    try:
        with open(result_file, 'r') as f:
            result_log = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        result_log = {}

    result_log[str(iteration)] = {  # json keys cannot be integers.
        'confusion matrix': evaluation.confusion_normalized_row.tolist(),
        'iou scores': evaluation.iou_score.tolist(),
        'class scores': evaluation.class_score.tolist(),
        'global score': evaluation.global_score,
        'mean iou score': evaluation.avg_iou_score,
        'mean class score': evaluation.avg_score,
    }
    with open(result_file, 'w') as f:
        json.dump(result_log, f, ensure_ascii=False, indent=2,
                  sort_keys=True)
def _recognition_network(self, sampler=None, log_likelihood_func=None):
    """Run the recognition network Q on the input.

    Args:
        sampler: callable ``(logits, uniform_samples, layer_index)`` that
            returns the sample dictionary of a layer. Defaults to
            ``self._random_sample``.
        log_likelihood_func: callable ``(sample, logits)`` giving the log
            probability of a sample. Defaults to the binary log likelihood of
            ``sample['activation']``.

    Returns:
        ``(logQ, samples)``: the list of per-layer log likelihood terms and
        the dictionary of samples keyed by layer index (key ``-1`` holds the
        preprocessed input). ``None`` is returned for a task other than
        ``'sbn'``, ``'omni'`` or ``'sp'``.
    """
    # Variables are created on the first call and reused afterwards.
    reuse = True if self.run_recognition_network else None

    if sampler is None:
        sampler = self._random_sample
    if log_likelihood_func is None:
        def log_likelihood_func(sample, log_params):
            return U.binary_log_likelihood(sample['activation'], log_params)

    task = self.hparams.task
    if task in ['sbn', 'omni']:
        bottom = self._x
        if self.mean_xs is not None:
            bottom -= self.mean_xs  # center the input
    elif task == 'sp':
        # Only the top half of the digit is observed.
        bottom = tf.split(self._x, num_or_size_splits=2, axis=1)[0]
        if self.mean_xs is not None:
            bottom -= np.split(self.mean_xs, 2, 0)[0]  # center the input
    else:
        return None

    # Stored in (x + 1) / 2 form because every layer maps its input through
    # 2 * a - 1 before using it.
    samples = {-1: {'activation': (bottom + 1)/2.0}}
    logQ = []

    with slim.arg_scope([slim.fully_connected],
                        weights_initializer=slim.variance_scaling_initializer(),
                        variables_collections=[Q_COLLECTION]):
        for layer in xrange(self.hparams.n_layer):
            layer_input = 2.0*samples[layer - 1]['activation'] - 1.0
            # Conditional distribution of this layer (output is the logits).
            logits = self._create_transformation(
                layer_input,
                n_output=self.hparams.n_hidden,
                reuse=reuse,
                scope_prefix='q_%d' % layer)
            samples[layer] = sampler(logits, self.uniform_samples[layer], layer)
            logQ.append(log_likelihood_func(samples[layer], logits))

    self.run_recognition_network = True
    return logQ, samples
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 iterations=None,
                 add_batch_norm=None,
                 sample_random_frames=None,
                 cluster_size=None,
                 hidden_size=None,
                 is_training=True,
                 expansion=2,
                 groups=None,
                 drop_rate=0.5,
                 gating_reduction=None,
                 **unused_params):
    """Build a NeXtVLAD-style frame aggregation model with a gated embedding.

    Frames are sampled, linearly expanded, softly assigned to clusters per
    group (weighted by a sigmoid attention that is zeroed on padded frames),
    aggregated into a VLAD descriptor, projected to ``hidden_size`` and gated.
    The gated embedding is passed to the video level classifier selected by
    ``FLAGS.video_level_classifier_model``.

    Args:
        model_input: frame features; axis 1 is read as the frame axis and
            axis 2 as the feature axis.
        vocab_size: forwarded to the video level classifier.
        num_frames: number of valid frames per example.
        iterations: number of frames to sample; falls back to
            ``FLAGS.iterations``.
        add_batch_norm: falls back to ``FLAGS.dbof_add_batch_norm``; not used
            by the graph built here.
        sample_random_frames: sample individual frames (True) or a contiguous
            sequence (False); falls back to ``FLAGS.sample_random_frames``
            when None.
        cluster_size: falls back to ``FLAGS.nextvlad_cluster_size``.
        hidden_size: falls back to ``FLAGS.nextvlad_hidden_size``.
        is_training: forwarded to batch norm, dropout and the classifier.
        expansion: width multiplier of the first linear layer.
        groups: number of groups; falls back to ``FLAGS.groups``.
        drop_rate: dropout rate on the VLAD descriptor; no dropout when 0.
        gating_reduction: bottleneck factor of the gate; falls back to
            ``FLAGS.gating_reduction``.
        **unused_params: forwarded to the video level classifier.

    Returns:
        Whatever the selected video level classifier's ``create_model``
        returns.
    """
    iterations = iterations or FLAGS.iterations
    # Kept for parity with the flag handling of sibling models; unused below.
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = (FLAGS.sample_random_frames
                     if sample_random_frames is None else sample_random_frames)
    cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
    hidden_size = hidden_size or FLAGS.nextvlad_hidden_size
    groups = groups or FLAGS.groups
    gating_reduction = gating_reduction or FLAGS.gating_reduction

    num_frames_exp = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames_exp,
                                               iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames_exp,
                                                 iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]

    # 1.0 for frame positions below num_frames, 0.0 for the rest.
    mask = tf.sequence_mask(num_frames, max_frames, dtype=tf.float32)

    expanded_input = slim.fully_connected(
        model_input,
        expansion * feature_size,
        activation_fn=None,
        weights_initializer=slim.variance_scaling_initializer())

    attention = slim.fully_connected(
        model_input,
        groups,
        activation_fn=tf.nn.sigmoid,
        weights_initializer=slim.variance_scaling_initializer())
    attention = tf.multiply(attention, tf.expand_dims(mask, -1))
    attention = tf.reshape(attention, [-1, max_frames * groups, 1])
    tf.summary.histogram("sigmoid_attention", attention)

    reduce_size = expansion * feature_size // groups

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [expansion * feature_size, groups * cluster_size],
        initializer=slim.variance_scaling_initializer())

    # Soft cluster assignment of every (frame, group) pair.
    reshaped_input = tf.reshape(expanded_input,
                                [-1, expansion * feature_size])
    activation = tf.matmul(reshaped_input, cluster_weights)
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn",
                                 fused=False)
    activation = tf.reshape(activation,
                            [-1, max_frames * groups, cluster_size])
    activation = tf.nn.softmax(activation, axis=-1)
    activation = tf.multiply(activation, attention)

    # `keepdims` replaces the deprecated `keep_dims` spelling.
    a_sum = tf.reduce_sum(activation, -2, keepdims=True)

    cluster_weights2 = tf.get_variable(
        "cluster_weights2", [1, reduce_size, cluster_size],
        initializer=slim.variance_scaling_initializer())
    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])

    reshaped_input = tf.reshape(expanded_input,
                                [-1, max_frames * groups, reduce_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, a)

    vlad = tf.nn.l2_normalize(vlad, 1)

    vlad = tf.reshape(vlad, [-1, cluster_size * reduce_size])
    vlad = slim.batch_norm(vlad,
                           center=True,
                           scale=True,
                           is_training=is_training,
                           scope="vlad_bn",
                           fused=False)

    if drop_rate > 0.:
        vlad = slim.dropout(vlad,
                            keep_prob=1. - drop_rate,
                            is_training=is_training,
                            scope="vlad_dropout")

    vlad_dim = vlad.get_shape().as_list()[1]
    print("VLAD dimension", vlad_dim)
    hidden_weights = tf.get_variable(
        "hidden_weights", [vlad_dim, hidden_size],
        initializer=slim.variance_scaling_initializer())

    activation = tf.matmul(vlad, hidden_weights)
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden_bn",
                                 fused=False)
    activation = tf.nn.relu(activation, name='embedding1')

    # Gate: bottleneck -> batch norm + ReLU -> expand -> sigmoid, multiplied
    # onto the embedding.
    gating_weights_1 = tf.get_variable(
        "gating_weights_1", [hidden_size, hidden_size // gating_reduction],
        initializer=slim.variance_scaling_initializer())

    gates = tf.matmul(activation, gating_weights_1)
    gates = slim.batch_norm(gates,
                            center=True,
                            scale=True,
                            is_training=is_training,
                            activation_fn=slim.nn.relu,
                            scope="gating_bn")

    gating_weights_2 = tf.get_variable(
        "gating_weights_2", [hidden_size // gating_reduction, hidden_size],
        initializer=slim.variance_scaling_initializer())
    gates = tf.matmul(gates, gating_weights_2)
    gates = tf.sigmoid(gates)
    tf.summary.histogram("final_gates", gates)

    activation = tf.multiply(activation, gates, name="embedding2")

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
def layer(val,
          num_outputs,
          name,
          act_fun=None,
          kernel_initializer=slim.variance_scaling_initializer(
              factor=1.0 / 3.0, mode='FAN_IN', uniform=True),
          layer_norm=False,
          batch_norm=False,
          phase=None,
          dropout=False,
          rate=None):
    """Create a fully-connected layer with optional post-processing.

    The dense projection is followed, in this order and each only when
    enabled, by layer normalization, batch normalization, the activation
    function and dropout.

    Parameters
    ----------
    val : Any
        the input to the layer
    num_outputs : int
        number of outputs from the layer
    name : str
        the name of the dense layer; the batch normalization scope is
        ``'bn_' + name``
    act_fun : callable or None
        the activation function; skipped when None
    kernel_initializer : Any
        the initializer for the weights of the dense layer
    layer_norm : bool
        whether to enable layer normalization
    batch_norm : bool
        whether to enable batch normalization
    phase : Any
        passed to batch normalization as ``is_training``
    dropout : bool
        whether to enable dropout
    rate : Any
        passed to ``tf.nn.dropout`` as ``rate``

    Returns
    -------
    Any
        the output from the layer
    """
    out = tf.layers.dense(
        inputs=val,
        units=num_outputs,
        name=name,
        kernel_initializer=kernel_initializer,
    )

    if layer_norm:
        out = tf.contrib.layers.layer_norm(out, center=True, scale=True)

    if batch_norm:
        bn_scope = 'bn_{}'.format(name)
        out = tf.contrib.layers.batch_norm(
            out,
            center=True,
            scale=True,
            is_training=phase,
            scope=bn_scope,
        )

    if act_fun is not None:
        out = act_fun(out)

    if not dropout:
        return out
    return tf.nn.dropout(out, rate=rate)
def _generator_network(self, samples, logQ, log_likelihood_func=None):
    """Build the generative model terms of the ELBO for SBNs.

    Args:
      samples: dictionary of sampled latent variables, keyed by layer index
      logQ: list of log q(h_i) terms
      log_likelihood_func: function used to compute log probs for the latent
        variables; defaults to the binary log likelihood of
        ``sample['activation']`` (only used for the 'sbn'/'omni' tasks)

    Returns:
      learning_signal: the "reward" function
      function_term: part of the function that depends on the parameters
        and needs to have the gradient taken through
      ``None`` is returned for a task other than 'sbn', 'omni' or 'sp'.
    """
    # Variables are created on the first call and reused afterwards.
    reuse = True if self.run_generator_network else None
    hparams = self.hparams

    if hparams.task == 'sp':
        with slim.arg_scope(
                [slim.fully_connected],
                weights_initializer=slim.variance_scaling_initializer(),
                variables_collections=[P_COLLECTION]):
            n_output = int(hparams.n_input / 2)
            top = hparams.n_layer - 1  # use the last layer

            layer_input = 2.0 * samples[top]['activation'] - 1.0
            logits = self._create_transformation(layer_input,
                                                 n_output,
                                                 reuse=reuse,
                                                 scope_prefix='p_%d' % top)

            # Predict on the lower half of the image
            lower_half = tf.split(self._x, num_or_size_splits=2, axis=1)[1]
            lower_bias = np.split(self.train_bias, 2, 0)[1]
            logP = U.binary_log_likelihood(lower_half, logits + lower_bias)

        self.run_generator_network = True
        return logP, logP

    if hparams.task not in ['sbn', 'omni']:
        return None

    if log_likelihood_func is None:
        def log_likelihood_func(sample, log_params):
            return U.binary_log_likelihood(sample['activation'], log_params)

    # Prior over the top layer.
    logPPrior = log_likelihood_func(samples[hparams.n_layer - 1],
                                    tf.expand_dims(self.prior, 0))

    with slim.arg_scope(
            [slim.fully_connected],
            weights_initializer=slim.variance_scaling_initializer(),
            variables_collections=[P_COLLECTION]):
        for layer in reversed(xrange(hparams.n_layer)):
            is_bottom = layer == 0
            n_output = hparams.n_input if is_bottom else hparams.n_hidden

            layer_input = 2.0 * samples[layer]['activation'] - 1.0
            logits = self._create_transformation(layer_input,
                                                 n_output,
                                                 reuse=reuse,
                                                 scope_prefix='p_%d' % layer)

            if is_bottom:
                # Assume output is binary
                logP = U.binary_log_likelihood(self._x,
                                               logits + self.train_bias)
            else:
                logPPrior += log_likelihood_func(samples[layer - 1], logits)

    self.run_generator_network = True
    learning_signal = logP + logPPrior - tf.add_n(logQ)
    function_term = logP + logPPrior
    return learning_signal, function_term
def _build_graph(self):
    """Build the two-level (cate1 / cate2) NeXtVLAD classification graph.

    RGB and audio frame features are concatenated, aggregated by two separate
    NeXtVLAD modules, projected to a gated hidden embedding and classified
    twice: first into the cate1 labels, then, together with an attention
    pooled summary of the cate1 label embeddings, into the cate2 labels.

    Sets the ``cate1_*`` / ``cate2_*`` logits, probabilities, top-k and loss
    tensors, the attention tensors ``U``, ``first_logits``, ``first_scores``
    and ``cate1_embeddings_attention``, and
    ``total_loss = mean_cate1_loss + 2 * mean_cate2_loss``.
    """
    hidden1_size = self.NetVLADHiddenSize
    gating_reduction = 8

    model_input = tf.concat(
        [self.input_video_RGB_feature, self.input_video_Audio_feature],
        -1)  # [batch,max_frame,1024+128]
    max_frames = model_input.get_shape().as_list()[1]
    # The mask length follows the input's frame axis instead of a hard-coded
    # 300, so both always agree when they are multiplied inside NeXtVLAD.
    mask = tf.sequence_mask(self.input_rgb_audio_true_frame,
                            max_frames,
                            dtype=tf.float32)

    video_nextvlad = NeXtVLAD(1024,
                              max_frames,
                              self.cluster_size,
                              self.is_training,
                              groups=self.groups,
                              expansion=self.expansion)
    audio_nextvlad = NeXtVLAD(128,
                              max_frames,
                              self.cluster_size // 2,
                              self.is_training,
                              groups=self.groups // 2,
                              expansion=self.expansion)

    # The first 1024 channels go to the video module, the rest to audio.
    with tf.variable_scope("video_VLAD"):
        vlad_video = video_nextvlad.forward(model_input[:, :, 0:1024],
                                            mask=mask)
    with tf.variable_scope("audio_VLAD"):
        vlad_audio = audio_nextvlad.forward(model_input[:, :, 1024:],
                                            mask=mask)

    vlad = tf.concat([vlad_video, vlad_audio], 1)
    vlad = slim.dropout(vlad,
                        keep_prob=self.dropout_keep_prob,
                        is_training=self.is_training,
                        scope="vlad_dropout")

    vlad_dim = vlad.get_shape().as_list()[1]
    print("VLAD dimension", vlad_dim)
    hidden1_weights = tf.get_variable(
        "hidden1_weights", [vlad_dim, hidden1_size],
        initializer=slim.variance_scaling_initializer())

    activation = tf.matmul(vlad, hidden1_weights)
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=self.is_training,
                                 scope="hidden1_bn",
                                 fused=False)

    # Gate: bottleneck -> batch norm + ReLU -> expand -> sigmoid, multiplied
    # onto the embedding.
    gating_weights_1 = tf.get_variable(
        "gating_weights_1",
        [hidden1_size, hidden1_size // gating_reduction],
        initializer=slim.variance_scaling_initializer())
    gates = tf.matmul(activation, gating_weights_1)
    gates = slim.batch_norm(gates,
                            center=True,
                            scale=True,
                            is_training=self.is_training,
                            activation_fn=slim.nn.relu,
                            scope="gating_bn")

    gating_weights_2 = tf.get_variable(
        "gating_weights_2",
        [hidden1_size // gating_reduction, hidden1_size],
        initializer=slim.variance_scaling_initializer())
    gates = tf.matmul(gates, gating_weights_2)
    gates = tf.sigmoid(gates)
    tf.summary.histogram("final_gates", gates)

    activation = tf.multiply(activation, gates)

    l2_penalty = 1e-8

    with tf.variable_scope("output_cate1"):
        self.cate1_logits = slim.fully_connected(
            activation,
            len(self.youtu_8m_cate1_dict),
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            biases_regularizer=slim.l2_regularizer(l2_penalty),
            weights_initializer=slim.variance_scaling_initializer())
        self.cate1_probs = tf.nn.sigmoid(self.cate1_logits)
        self.cate1_top5_probs_value, self.cate1_top5_probs_index = tf.nn.top_k(
            self.cate1_probs, 5)
        # The op used to be named "cate2_cross_loss", copied from the cate2
        # head below.
        self.cate1_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=self.input_cate1_multilabel,
            logits=self.cate1_logits,
            name="cate1_cross_loss")
        self.mean_cate1_loss = tf.reduce_mean(self.cate1_loss)

    self.cate1_embeddings = tf.cast(self.youtu_8m_cate1_embedding,
                                    dtype=tf.float32)

    with tf.variable_scope('attention'):
        # Additive attention of the video embedding over the cate1 label
        # embeddings. NOTE(review): presumably cate1_embeddings is
        # [num_cate1, dim] and broadcasts against [batch, 1, 512] - confirm.
        self.U = tf.tanh(
            tc.layers.fully_connected(self.cate1_embeddings,
                                      num_outputs=512,
                                      activation_fn=None,
                                      biases_initializer=None) +
            tc.layers.fully_connected(tf.expand_dims(activation, 1),
                                      num_outputs=512,
                                      activation_fn=None))
        self.first_logits = tc.layers.fully_connected(self.U,
                                                      num_outputs=1,
                                                      activation_fn=None)
        # Normalise the scores along axis 1, then take the score weighted
        # sum of the embeddings along that axis.
        self.first_scores = tf.nn.softmax(self.first_logits, 1)
        self.cate1_embeddings_attention = tf.reduce_sum(
            self.cate1_embeddings * self.first_scores, axis=1)

    with tf.variable_scope("output_cate2"):
        self.cate2_logits = slim.fully_connected(
            tf.concat([activation, self.cate1_embeddings_attention], -1),
            3862,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            biases_regularizer=slim.l2_regularizer(l2_penalty),
            weights_initializer=slim.variance_scaling_initializer())
        self.cate2_probs = tf.nn.sigmoid(self.cate2_logits)
        self.cate2_top20_probs_value, self.cate2_top20_probs_index = tf.nn.top_k(
            self.cate2_probs, 20)
        self.cate2_top40_probs_value, self.cate2_top40_probs_index = tf.nn.top_k(
            self.cate2_probs, 40)
        self.cate2_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=self.input_cate2_multilabel,
            logits=self.cate2_logits,
            name="cate2_cross_loss")
        self.mean_cate2_loss = tf.reduce_mean(self.cate2_loss)

    # The cate2 loss counts twice as much as the cate1 loss.
    self.total_loss = self.mean_cate1_loss + 2 * self.mean_cate2_loss
def build_bisenet(self, reuse=False):
    """Builds the BiSeNet model.

    The spatial path (three stride-2 ConvBlocks and a 1x1 ConvBlock) and the
    context path (the configured frontend, refined by attention modules and
    a global context vector) are fused, upsampled and turned into class
    logits.

    Sets ``self.net`` (upsampled logits) and ``self.init_fn``. For the modes
    'train', 'validation' and 'test' it also sets the two auxiliary outputs
    ``self.sup1`` and ``self.sup2``.

    Arguments:
      reuse: Reuse variable or not
    """
    batch_norm_params = self.model_config['batch_norm_params']
    init_method = self.model_config['conv_config']['init_method']

    if init_method == 'kaiming_normal':
        initializer = slim.variance_scaling_initializer(factor=2.0,
                                                        mode='FAN_IN',
                                                        uniform=False)
    else:
        initializer = slim.xavier_initializer()

    ### The spatial path
    ### The number of feature maps for each convolution is not specified in the paper
    ### It was chosen here to be equal to the number of feature maps of a classification
    ### model at each corresponding stage
    with tf.variable_scope('spatial_net', reuse=reuse):
        with slim.arg_scope([slim.conv2d],
                            biases_initializer=None,
                            weights_initializer=initializer):
            with slim.arg_scope([slim.batch_norm],
                                is_training=self.is_training(),
                                **batch_norm_params):
                spatial_net = ConvBlock(self.images, n_filters=64,
                                        kernel_size=[7, 7], strides=2)
                spatial_net = ConvBlock(spatial_net, n_filters=64,
                                        kernel_size=[3, 3], strides=2)
                spatial_net = ConvBlock(spatial_net, n_filters=64,
                                        kernel_size=[3, 3], strides=2)
                spatial_net = ConvBlock(spatial_net, n_filters=128,
                                        kernel_size=[1, 1])

    frontend_config = self.model_config['frontend_config']

    ### Context path
    logits, end_points, frontend_scope, init_fn = frontend_builder.build_frontend(
        self.images, frontend_config, self.is_training(), reuse)

    ### Combining the paths
    with tf.variable_scope('combine_path', reuse=reuse):
        with slim.arg_scope([slim.conv2d],
                            biases_initializer=None,
                            weights_initializer=initializer):
            with slim.arg_scope([slim.batch_norm],
                                is_training=self.is_training(),
                                **batch_norm_params):
                # tail part: global average of pool5, resized back to the
                # pool5 spatial size. (A leftover debug print + exit() used
                # to terminate the process right here.)
                size = tf.shape(end_points['pool5'])[1:3]
                global_context = tf.reduce_mean(end_points['pool5'], [1, 2],
                                                keep_dims=True)
                global_context = slim.conv2d(global_context, 128, 1, [1, 1],
                                             activation_fn=None)
                global_context = tf.nn.relu(
                    slim.batch_norm(global_context, fused=True))
                global_context = tf.image.resize_bilinear(global_context,
                                                          size=size)

                net_5 = AttentionRefinementModule(end_points['pool5'],
                                                  n_filters=128)
                net_4 = AttentionRefinementModule(end_points['pool4'],
                                                  n_filters=128)

                net_5 = tf.add(net_5, global_context)
                net_5 = Upsampling(net_5, scale=2)
                net_5 = ConvBlock(net_5, n_filters=128, kernel_size=[3, 3])
                net_4 = tf.add(net_4, net_5)
                net_4 = Upsampling(net_4, scale=2)
                net_4 = ConvBlock(net_4, n_filters=128, kernel_size=[3, 3])

                context_net = net_4

                net = FeatureFusionModule(input_1=spatial_net,
                                          input_2=context_net,
                                          n_filters=256)
                # net_5 / net_4 are processed once more; they feed the
                # auxiliary heads below.
                net_5 = ConvBlock(net_5, n_filters=128, kernel_size=[3, 3])
                net_4 = ConvBlock(net_4, n_filters=128, kernel_size=[3, 3])
                net = ConvBlock(net, n_filters=64, kernel_size=[3, 3])

                # Upsampling + dilation or only Upsampling
                net = Upsampling(net, scale=2)
                net = slim.conv2d(net, 64, [3, 3], rate=2,
                                  activation_fn=tf.nn.relu,
                                  biases_initializer=None,
                                  normalizer_fn=slim.batch_norm)

                net = slim.conv2d(net, self.num_classes, [1, 1],
                                  activation_fn=None, scope='logits')
                self.net = Upsampling(net, 4)

                if self.mode in ['train', 'validation', 'test']:
                    sup1 = slim.conv2d(net_5, self.num_classes, [1, 1],
                                       activation_fn=None, scope='supl1')
                    sup2 = slim.conv2d(net_4, self.num_classes, [1, 1],
                                       activation_fn=None, scope='supl2')
                    self.sup1 = Upsampling(sup1, scale=16)
                    self.sup2 = Upsampling(sup2, scale=8)

    self.init_fn = init_fn
def forward(self, input, mask=None):
    """Aggregate frame features into a NeXtVLAD descriptor.

    Args:
        input: frame features; reshaped below on the assumption that it
            holds ``self.max_frames`` frames per example.
        mask: optional per-frame weights multiplied onto the attention
            (after a trailing axis is added); skipped when None.

    Returns:
        The batch-normalised descriptor, reshaped to
        ``[-1, self.cluster_size * (self.expansion * self.feature_size // self.groups)]``.
    """
    expanded_dim = self.expansion * self.feature_size
    group_dim = expanded_dim // self.groups
    frames_x_groups = self.max_frames * self.groups

    # Linear expansion of every frame.
    expanded = slim.fully_connected(
        input,
        expanded_dim,
        activation_fn=None,
        weights_initializer=slim.variance_scaling_initializer())

    # One sigmoid attention weight per frame and group.
    attention = slim.fully_connected(
        expanded,
        self.groups,
        activation_fn=tf.nn.sigmoid,
        weights_initializer=slim.variance_scaling_initializer())
    if mask is not None:
        attention = tf.multiply(attention, tf.expand_dims(mask, -1))
    attention = tf.reshape(attention, [-1, frames_x_groups, 1])
    tf.summary.histogram("sigmoid_attention", attention)

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [expanded_dim, self.groups * self.cluster_size],
        initializer=slim.variance_scaling_initializer())

    # Soft cluster assignment, weighted by the attention.
    flat_frames = tf.reshape(expanded, [-1, expanded_dim])
    assignment = tf.matmul(flat_frames, cluster_weights)
    assignment = slim.batch_norm(assignment,
                                 center=True,
                                 scale=True,
                                 is_training=self.is_training,
                                 scope="cluster_bn",
                                 fused=False)
    assignment = tf.reshape(assignment,
                            [-1, frames_x_groups, self.cluster_size])
    assignment = tf.nn.softmax(assignment, axis=-1)
    assignment = tf.multiply(assignment, attention)

    # Total assignment mass per cluster, scaled by the learned centres.
    assignment_sum = tf.reduce_sum(assignment, -2, keepdims=True)
    cluster_weights2 = tf.get_variable(
        "cluster_weights2", [1, group_dim, self.cluster_size],
        initializer=slim.variance_scaling_initializer())
    centres = tf.multiply(assignment_sum, cluster_weights2)

    assignment = tf.transpose(assignment, perm=[0, 2, 1])

    grouped_frames = tf.reshape(expanded, [-1, frames_x_groups, group_dim])
    vlad = tf.matmul(assignment, grouped_frames)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, centres)

    vlad = tf.nn.l2_normalize(vlad, 1)

    vlad = tf.reshape(vlad, [-1, self.cluster_size * group_dim])
    return slim.batch_norm(vlad,
                           center=True,
                           scale=True,
                           is_training=self.is_training,
                           scope="vlad_bn",
                           fused=False)
def head(endpoints, embedding_dim, is_training):
    """Attach a multi-level attention embedding head to ResNet-v2-50 endpoints.

    A 512-channel 1x1 projection of block4 is multiplied by four sigmoid
    attention masks, one derived from each of ``resnet_v2_50/block1`` to
    ``block4``. The four masked maps are concatenated along the channel axis,
    averaged over the spatial axes, passed through a 1024-unit fully
    connected layer with batch norm and finally projected to
    ``embedding_dim``.

    Args:
        endpoints: dictionary holding the ``resnet_v2_50/block1..4`` tensors;
            it is updated in place.
        embedding_dim: size of the final embedding.
        is_training: forwarded to every batch norm layer of the head.

    Returns:
        The same ``endpoints`` dictionary with ``attention_mask_block1..4``,
        ``model_output``/``global_pool``, ``head_output`` and
        ``emb``/``emb_raw`` added.
    """
    batch_norm_params = {
        'decay': 0.9,
        'epsilon': 1e-5,
        'scale': True,
        # Without this the conv batch norms fall back to slim's default of
        # is_training=True even when the head is built for inference.
        'is_training': is_training,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
        'fused': None,
    }

    with slim.arg_scope(
            [slim.conv2d],
            weights_regularizer=slim.l2_regularizer(0.0),
            weights_initializer=slim.variance_scaling_initializer(),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            masked_maps = []

            projection_conv = slim.conv2d(endpoints['resnet_v2_50/block4'],
                                          512, [1, 1],
                                          scope='projection_conv')

            attention_block4_conv1 = slim.conv2d(
                endpoints['resnet_v2_50/block4'], 64, [1, 1],
                scope='attention_block4_conv1')
            attention_block4_conv2 = slim.conv2d(
                attention_block4_conv1, 1, [1, 1],
                scope='attention_block4_conv2')
            attention_block4_mask = tf.sigmoid(attention_block4_conv2)
            masked_maps.append(projection_conv * attention_block4_mask)

            attention_block3_conv1 = slim.conv2d(
                endpoints['resnet_v2_50/block3'], 64, [1, 1],
                scope='attention_block3_conv1')
            attention_block3_conv2 = slim.conv2d(
                attention_block3_conv1, 1, [1, 1],
                scope='attention_block3_conv2')
            attention_block3_mask = tf.sigmoid(attention_block3_conv2)
            masked_maps.append(projection_conv * attention_block3_mask)

            # block2 is max-pooled once, block1 twice, before the sigmoid.
            attention_block2_conv1 = slim.conv2d(
                endpoints['resnet_v2_50/block2'], 64, [1, 1],
                scope='attention_block2_conv1')
            attention_block2_conv2 = slim.conv2d(
                attention_block2_conv1, 1, [1, 1],
                scope='attention_block2_conv2')
            attention_block2_pool = slim.max_pool2d(
                attention_block2_conv2, [2, 2],
                scope='attention_block2_pool')
            attention_block2_mask = tf.sigmoid(attention_block2_pool)
            masked_maps.append(projection_conv * attention_block2_mask)

            attention_block1_conv1 = slim.conv2d(
                endpoints['resnet_v2_50/block1'], 64, [1, 1],
                scope='attention_block1_conv1')
            attention_block1_pool1 = slim.max_pool2d(
                attention_block1_conv1, [2, 2],
                scope='attention_block1_pool1')
            attention_block1_conv2 = slim.conv2d(
                attention_block1_pool1, 1, [1, 1],
                scope='attention_block1_conv2')
            # Scope used to read 'attention_block2_pool2' (copy-paste slip).
            attention_block1_pool2 = slim.max_pool2d(
                attention_block1_conv2, [2, 2],
                scope='attention_block1_pool2')
            attention_block1_mask = tf.sigmoid(attention_block1_pool2)
            masked_maps.append(projection_conv * attention_block1_mask)

    endpoints['attention_mask_block1'] = attention_block1_mask
    endpoints['attention_mask_block2'] = attention_block2_mask
    endpoints['attention_mask_block3'] = attention_block3_mask
    endpoints['attention_mask_block4'] = attention_block4_mask

    _masked = tf.concat(masked_maps, 3)

    # Spatial average; the reduced axes are dropped (the default).
    endpoints['model_output'] = endpoints['global_pool'] = tf.reduce_mean(
        _masked, [1, 2], name='_pool5')

    endpoints['head_output'] = slim.fully_connected(
        endpoints['model_output'], 1024, normalizer_fn=slim.batch_norm,
        normalizer_params={
            'decay': 0.9,
            'epsilon': 1e-5,
            'scale': True,
            'is_training': is_training,
            'updates_collections': tf.GraphKeys.UPDATE_OPS,
        })

    endpoints['emb'] = endpoints['emb_raw'] = slim.fully_connected(
        endpoints['head_output'], embedding_dim, activation_fn=None,
        weights_initializer=tf.orthogonal_initializer(), scope='emb')

    return endpoints
def conv_layer(val,
               filters,
               kernel_size,
               strides,
               name,
               act_fun=None,
               kernel_initializer=slim.variance_scaling_initializer(
                   factor=1.0 / 3.0, mode='FAN_IN', uniform=True),
               layer_norm=False,
               batch_norm=False,
               phase=None,
               dropout=False,
               rate=None):
    """Create a convolutional layer with optional post-processing.

    The 'same'-padded 2-D convolution is followed, in this order and each
    only when enabled, by layer normalization, batch normalization, the
    activation function and dropout.

    Parameters
    ----------
    val : Any
        the input to the layer
    filters : int
        the number of channels in the convolutional kernel
    kernel_size : int or list of int
        the height and width of the convolutional filter
    strides : int or list of int
        the strides in each direction of convolution
    name : str
        the name of the convolution; the batch normalization scope is
        ``'bn_' + name``
    act_fun : callable or None
        the activation function; skipped when None
    kernel_initializer : Any
        the initializer for the weights of the convolution
    layer_norm : bool
        whether to enable layer normalization
    batch_norm : bool
        whether to enable batch normalization
    phase : Any
        passed to batch normalization as ``is_training``
    dropout : bool
        whether to enable dropout
    rate : Any
        passed to ``tf.nn.dropout`` as ``rate``

    Returns
    -------
    Any
        the output from the layer
    """
    out = tf.layers.conv2d(
        inputs=val,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding='same',
        name=name,
        kernel_initializer=kernel_initializer,
    )

    if layer_norm:
        out = tf.contrib.layers.layer_norm(out, center=True, scale=True)

    if batch_norm:
        bn_scope = 'bn_{}'.format(name)
        out = tf.contrib.layers.batch_norm(
            out,
            center=True,
            scale=True,
            is_training=phase,
            scope=bn_scope,
        )

    if act_fun is not None:
        out = act_fun(out)

    if not dropout:
        return out
    return tf.nn.dropout(out, rate=rate)
def _generator_network(self, samples, logQ, log_likelihood_func=None):
    '''Build the generative model terms and return the learning signal.

    This is the implementation for SBNs for the ELBO.

    Two task families are handled, selected by `self.hparams.task`:
      * 'sbn' / 'omni': the layers are walked from the top
        (`n_layer - 1`) down to 0. Layer 0 yields the data log-likelihood
        of `self._x`; every other layer adds the log-probability of the
        layer below to the prior term.
      * 'sp': only the last layer is transformed, and the likelihood is
        evaluated on the second half of `self._x` (split along axis 1).

    For any other task value the function falls through and returns None.

    Side effect: sets `self.run_generator_network` to True, so later calls
    pass `reuse=True` to `self._create_transformation`.

    Args:
      samples: dictionary of sampled latent variables, indexed by layer;
        each entry is read through its 'activation' key.
      logQ: list of log q(h_i) terms
      log_likelihood_func: function used to compute log probs for the latent
        variables; defaults to `U.binary_log_likelihood` applied to the
        sample's 'activation'. Only used for the 'sbn' / 'omni' tasks.

    Returns:
      learning_signal: the "reward" function
      function_term: part of the function that depends on the parameters
        and needs to have the gradient taken through
    '''
    hparams = self.hparams
    # Variables are created on the first call and reused on later ones.
    reuse = True if self.run_generator_network else None

    # Both branches create their fully connected layers under the same scope
    # arguments.
    fc_scope_kwargs = dict(
        weights_initializer=slim.variance_scaling_initializer(),
        variables_collections=[P_COLLECTION])

    if hparams.task in ('sbn', 'omni'):
        if log_likelihood_func is None:
            def _binary_log_prob(sample, log_params):
                return U.binary_log_likelihood(sample['activation'],
                                               log_params)
            log_likelihood_func = _binary_log_prob

        top_layer = hparams.n_layer - 1
        logPPrior = log_likelihood_func(samples[top_layer],
                                        tf.expand_dims(self.prior, 0))

        with slim.arg_scope([slim.fully_connected], **fc_scope_kwargs):
            for layer in reversed(xrange(hparams.n_layer)):
                is_bottom = (layer == 0)
                n_output = hparams.n_input if is_bottom else hparams.n_hidden

                # Map the activation from [0, 1] to [-1, 1].
                centered = 2.0 * samples[layer]['activation'] - 1.0
                h = self._create_transformation(
                    centered, n_output, reuse=reuse,
                    scope_prefix='p_%d' % layer)

                if is_bottom:
                    # The observed output is assumed to be binary.
                    logP = U.binary_log_likelihood(self._x,
                                                   h + self.train_bias)
                else:
                    logPPrior += log_likelihood_func(samples[layer - 1], h)

        self.run_generator_network = True
        learning_signal = logP + logPPrior - tf.add_n(logQ)
        function_term = logP + logPPrior
        return learning_signal, function_term

    if hparams.task == 'sp':
        with slim.arg_scope([slim.fully_connected], **fc_scope_kwargs):
            n_output = int(hparams.n_input / 2)
            # Only the last layer feeds the prediction.
            layer = hparams.n_layer - 1
            centered = 2.0 * samples[layer]['activation'] - 1.0
            h = self._create_transformation(
                centered, n_output, reuse=reuse, scope_prefix='p_%d' % layer)

            # Score the second half of the input (the lower half of the
            # image) against the matching half of the training bias.
            x_half = tf.split(self._x, num_or_size_splits=2, axis=1)[1]
            logP = U.binary_log_likelihood(
                x_half, h + np.split(self.train_bias, 2, 0)[1])

        self.run_generator_network = True
        return logP, logP
def atari_network(num_actions, num_atoms, support, network_type, state,
                  representation_layer=10):
    """Builds the frozen convolutional network for Q-value distributions.

    Every convolutional and fully connected layer is created with
    `trainable=False`. Besides the usual outputs, the network exposes one
    intermediate tensor as the `representation`, chosen by
    `representation_layer`:
      <= 1: the flattened, rescaled input.
      2:    the flattened output of the first convolution.
      3:    the flattened output of the second convolution.
      4:    the flattened output of the third convolution.
      else: the output of the 512-unit fully connected layer.

    Args:
      num_actions: int, number of actions.
      num_atoms: int, the number of buckets of the value function distribution.
      support: tf.linspace, the support of the Q-value distribution.
      network_type: namedtuple, collection of expected values to return.
      state: `tf.Tensor`, contains the agent's current state.
      representation_layer: int, the layer which will be used as the
        representation for computing the bisimulation distances. The default
        is larger than the number of layers, which selects the penultimate
        layer.

    Returns:
      net: _network_type object containing the tensors output by the network.
    """
    initializer = contrib_slim.variance_scaling_initializer(
        factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True)
    # Keyword arguments shared by every weighted layer below.
    frozen_layer_kwargs = dict(weights_initializer=initializer,
                               trainable=False)

    # Cast the input to float and divide by 255.
    net = tf.div(tf.cast(state, tf.float32), 255.)
    representation = None

    # (output channels, kernel size, stride) for each convolution.
    conv_specs = (
        (32, [8, 8], 4),
        (64, [4, 4], 2),
        (64, [3, 3], 1),
    )
    for layer_idx, (channels, kernel, stride) in enumerate(conv_specs,
                                                           start=1):
        # The representation is taken from the input of the first layer whose
        # index reaches `representation_layer`.
        if representation is None and representation_layer <= layer_idx:
            representation = contrib_slim.flatten(net)
        net = contrib_slim.conv2d(net, channels, kernel, stride=stride,
                                  **frozen_layer_kwargs)

    net = contrib_slim.flatten(net)
    if representation is None and representation_layer <= len(conv_specs) + 1:
        representation = net

    net = contrib_slim.fully_connected(net, 512, **frozen_layer_kwargs)
    if representation is None:
        # Fall back to the penultimate layer.
        representation = net

    net = contrib_slim.fully_connected(net, num_actions * num_atoms,
                                       activation_fn=None,
                                       **frozen_layer_kwargs)

    logits = tf.reshape(net, [-1, num_actions, num_atoms])
    probabilities = contrib_layers.softmax(logits)
    q_values = tf.reduce_sum(support * probabilities, axis=2)
    return network_type(q_values, logits, probabilities, representation)
def construct_network(frame_input, root_tags, reuse, is_training, title_input,
                      desc_input, ocr_input, cate_input):
    """Build the multi-branch video/text tagging graph and its losses.

    Four feature branches are built and concatenated in the 'predict' scope:
    a text CNN over embedded title/desc and cate/ocr tokens, a stack of
    self-attention layers over the embedded title/desc tokens, a NeXtVLAD
    encoder with gating over `frame_input`, and a 1-D CNN over `frame_input`.
    The fused vector feeds a `TAG_NUM`-way classifier trained with sparse
    softmax cross-entropy against `root_tags`.

    The body also reads names defined outside this function: `word_embed`,
    `num_filter`, `text_length`, `ATTENTION_EMBED_DIM`, `batch_size`,
    `FRAME_FEAT_DIM`, `FRAME_FEAT_LEN`, `drop_rate`, `TAG_NUM`, `FLAGS`,
    `attention_layer`, `create_initializer`, `dropout`, `NeXtVLAD`,
    `se_context_gate` and `tl`.

    :param frame_input: frame features; fed to `NeXtVLAD.forward` and to the
        'frame' conv1d stack
    :param root_tags: labels passed to
        `tf.nn.sparse_softmax_cross_entropy_with_logits`
    :param reuse: forwarded as `reuse` to the variable scopes
    :param is_training: forwarded to the batch-norm and dropout layers
    :param title_input: ids looked up in `word_embed`
    :param desc_input: ids looked up in `word_embed`
    :param ocr_input: ids looked up in `word_embed`
    :param cate_input: ids looked up in `word_embed`
    :return: dict with keys 'loss_root', 'cost', 'predict_root',
        'predict_label_root', 'confidence_root', 'L2_frame', 'L2_text' and
        'L2_w2v'
    """
    # NOTE(review): the nesting of the variable scopes below was reconstructed
    # from a flattened listing; confirm scope boundaries against variable
    # names in existing checkpoints before relying on them.
    with tf.variable_scope('text', reuse=reuse) as scope:
        with tf.device("/cpu:0"), tf.variable_scope('dict'):
            # word_embedding = tf.get_variable('initW', [vocab_size, embed_size], trainable=True)
            title_raw = tf.nn.embedding_lookup(word_embed, title_input)
            desc_raw = tf.nn.embedding_lookup(word_embed, desc_input)
            ocr_raw = tf.nn.embedding_lookup(word_embed, ocr_input)
            cate_raw = tf.nn.embedding_lookup(word_embed, cate_input)
        with tf.variable_scope("conv"):
            # Concatenate two embedded sequences along axis 1, convolve with
            # kernel width `conv_w`, batch-normalize, then max-reduce over
            # axis 1.
            def txt_conv(t_input, d_input, conv_w, name):
                text = tf.concat([t_input, d_input], axis=1)
                conv = tf.layers.conv1d(text, filters=num_filter,
                                        kernel_size=conv_w, name=name)
                conv = slim.batch_norm(conv, decay=0.9997, epsilon=0.001,
                                       is_training=is_training)
                conv = tf.reduce_max(conv, reduction_indices=[1],
                                     name='global_pool_title_desc')
                return conv
            rep_2 = txt_conv(title_raw, desc_raw, 2, 'conv2')
            rep_3 = txt_conv(title_raw, desc_raw, 3, 'conv3')
            rep_4 = txt_conv(title_raw, desc_raw, 4, 'conv4')
            rep_5 = txt_conv(title_raw, desc_raw, 5, 'conv5')
            # NOTE(review): all four cate/ocr convolutions use kernel width 2
            # although their names mirror the 2/3/4/5 title/desc ones --
            # confirm whether this is intentional.
            rep_cate_2 = txt_conv(cate_raw, ocr_raw, 2, 'conv2_1')
            rep_cate_3 = txt_conv(cate_raw, ocr_raw, 2, 'conv3_1')
            rep_cate_4 = txt_conv(cate_raw, ocr_raw, 2, 'conv4_1')
            rep_cate_5 = txt_conv(cate_raw, ocr_raw, 2, 'conv5_1')
            rep = tf.concat([rep_2, rep_3, rep_4, rep_5], 1)
            rep_cate = tf.concat(
                [rep_cate_2, rep_cate_3, rep_cate_4, rep_cate_5], 1)
            text_logits_1 = tf.layers.dense(rep, 256)  # 512
            text_logits_2 = tf.layers.dense(rep_cate, 256)
            text_logits = tf.concat([text_logits_1, text_logits_2], 1)
        with tf.variable_scope("transformer"):
            with tf.variable_scope('preprocess', reuse=reuse) as scope:
                # Learned position embedding added to the projected
                # title/desc token features.
                frame_position_embeddings = tf.get_variable(
                    name='frame_position_embedding',
                    shape=[text_length, ATTENTION_EMBED_DIM],
                    initializer=tf.truncated_normal_initializer(stddev=0.02))
                frame_parts = tf.layers.conv1d(tf.concat([title_raw, desc_raw],
                                                         axis=1),
                                               filters=ATTENTION_EMBED_DIM,
                                               kernel_size=1,
                                               name='frame_feat_squeeze')
                frame_parts = slim.batch_norm(frame_parts, decay=0.9997,
                                              epsilon=0.001,
                                              is_training=is_training)
                frame_parts += frame_position_embeddings
            intermediate_size = 512
            hidden_size = ATTENTION_EMBED_DIM
            initializer_range = 0.02
            hidden_dropout_prob = 0.2
            prev_output = frame_parts
            # Each layer: self-attention -> projection + residual ->
            # feed-forward -> projection + residual. Batch norm is applied
            # after each residual sum.
            for layer_idx in range(FLAGS.attention_layer_num):
                with tf.variable_scope("layer_%d" % layer_idx):
                    layer_input = prev_output
                    with tf.variable_scope("attention"):
                        with tf.variable_scope("self"):
                            attention_head = attention_layer(
                                from_tensor=layer_input,
                                to_tensor=layer_input,
                                attention_mask=None,
                                num_attention_heads=4,
                                size_per_head=64,
                                attention_probs_dropout_prob=0.2,
                                initializer_range=0.02,
                                do_return_2d_tensor=False,
                                batch_size=batch_size,
                                from_seq_length=text_length,
                                to_seq_length=text_length)
                            attention_output = attention_head
                        # Run a linear projection of `hidden_size` then add a residual
                        # with `layer_input`.
                        with tf.variable_scope("output"):
                            attention_output = tf.layers.dense(
                                attention_output,
                                hidden_size,
                                kernel_initializer=create_initializer(
                                    initializer_range))
                            attention_output = dropout(attention_output,
                                                       hidden_dropout_prob)
                            attention_output = slim.batch_norm(
                                attention_output + layer_input,
                                decay=0.9997,
                                epsilon=0.001,
                                is_training=is_training)
                    # The activation is only applied to the "intermediate" hidden layer.
                    with tf.variable_scope("intermediate"):
                        intermediate_output = tf.layers.dense(
                            attention_output,
                            intermediate_size,
                            activation=tf.nn.relu,
                            kernel_initializer=create_initializer(
                                initializer_range))
                    # Down-project back to `hidden_size` then add the residual.
                    with tf.variable_scope("output"):
                        layer_output = tf.layers.dense(
                            intermediate_output,
                            hidden_size,
                            kernel_initializer=create_initializer(
                                initializer_range))
                        layer_output = dropout(layer_output,
                                               hidden_dropout_prob)
                        layer_output = slim.batch_norm(
                            layer_output + attention_output,
                            decay=0.9997,
                            epsilon=0.001,
                            is_training=is_training)
                        prev_output = layer_output
            # Max-reduce the attention stack output over axis 1.
            attention_final = tf.reduce_max(prev_output, [1], keep_dims=False,
                                            name='reduce_max')  # 256
    with tf.variable_scope('NeXtVLAD', reuse=reuse) as scope:
        # re_d = 512
        # frame_input_1 = tf.layers.dense(frame_input, re_d, activation=tf.nn.relu, name='re_d')
        video_nextvlad = NeXtVLAD(FRAME_FEAT_DIM, FRAME_FEAT_LEN,
                                  FLAGS.nextvlad_cluster_size, is_training,
                                  groups=FLAGS.groups,
                                  expansion=FLAGS.expansion)
        vlad = video_nextvlad.forward(frame_input, mask=None)
        vlad = slim.dropout(vlad, keep_prob=1. - FLAGS.vlad_drop_rate,
                            is_training=is_training, scope="vlad_dropout")
        # SE context gating
        vlad_dim = vlad.get_shape().as_list()[1]
        # print("VLAD dimension", vlad_dim)
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [vlad_dim, FLAGS.nextvlad_hidden_size],
            initializer=slim.variance_scaling_initializer())
        activation = tf.matmul(vlad, hidden1_weights)
        activation = slim.batch_norm(activation, center=True, scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn", fused=False)
        # activation = tf.nn.relu(activation)
        # Bottleneck (hidden_size // gating_reduction) followed by a sigmoid
        # produces per-unit gates that multiply `activation`.
        gating_weights_1 = tf.get_variable(
            "gating_weights_1", [
                FLAGS.nextvlad_hidden_size,
                FLAGS.nextvlad_hidden_size // FLAGS.gating_reduction
            ],
            initializer=slim.variance_scaling_initializer())
        gates = tf.matmul(activation, gating_weights_1)
        gates = slim.batch_norm(gates, center=True, scale=True,
                                is_training=is_training,
                                activation_fn=slim.nn.relu,
                                scope="gating_bn")
        gating_weights_2 = tf.get_variable(
            "gating_weights_2", [
                FLAGS.nextvlad_hidden_size // FLAGS.gating_reduction,
                FLAGS.nextvlad_hidden_size
            ],
            initializer=slim.variance_scaling_initializer())
        gates = tf.matmul(gates, gating_weights_2)
        gates = tf.sigmoid(gates)
        vlad_activation = tf.multiply(activation, gates)
        # vlad_activation = vlad
    with tf.variable_scope('frame', reuse=reuse) as scope:
        # layer 1 (batch * 200 * 1024)
        nets_frame = tf.layers.conv1d(frame_input, filters=1024, kernel_size=3,
                                      name='conv1d_1')
        nets_frame = slim.batch_norm(nets_frame, decay=0.9997, epsilon=0.001,
                                     is_training=is_training)
        nets_frame = tf.nn.relu(nets_frame)
        nets_frame = tf.layers.max_pooling1d(nets_frame, pool_size=2,
                                             strides=2, name='pool1d_1')
        # layer 2
        nets_frame = tf.layers.conv1d(nets_frame, filters=256, kernel_size=5,
                                      name='conv1d_2')
        nets_frame = slim.batch_norm(nets_frame, decay=0.9997, epsilon=0.001,
                                     is_training=is_training)
        nets_frame = tf.nn.relu(nets_frame)
        # layer 3
        nets_frame = tf.layers.conv1d(nets_frame, filters=256, kernel_size=5,
                                      name='conv1d_3')
        nets_frame = slim.batch_norm(nets_frame, decay=0.9997, epsilon=0.001,
                                     is_training=is_training)
        nets_frame = tf.nn.relu(nets_frame)  # 91 * 256
        # max pooling layer
        nets_frame = tf.layers.max_pooling1d(nets_frame, pool_size=4,
                                             strides=4, name='pool1d_2')
        # test flat
        nets_frame = tf.layers.flatten(nets_frame)  # 5632 = 22 * 256
        # nets_frame = tf.reduce_max(nets_frame, reduction_indices=[1], name='max_pool')
        fc_frame = tf.layers.dense(nets_frame, 512, name='fc1')  # 512
        # fc_frame = tf.nn.l2_normalize(fc_frame, dim=1)
    with tf.variable_scope('predict', reuse=reuse) as scope:
        # Fuse the four branch outputs along axis 1.
        video_vector = tf.concat(
            [fc_frame, text_logits, attention_final, vlad_activation],
            axis=1)  # 1280
        video_vector = tf.layers.dropout(video_vector, drop_rate,
                                         training=is_training)
        video_vector = tf.nn.relu(video_vector)
        video_vector = tf.layers.dense(video_vector, 512,
                                       name='dense_layer_3')
        total_vector = slim.batch_norm(video_vector, decay=0.9997,
                                       epsilon=0.001, is_training=is_training)
        # NOTE(review): the tensor returned by tf.check_numerics is discarded
        # and nothing downstream depends on it -- confirm whether this check
        # ever executes in graph mode.
        tf.check_numerics(video_vector, 'video_vector is inf or nan')
    # -- root predict
    with tf.variable_scope('root_se_cg', reuse=reuse) as scope:
        root_vector = se_context_gate(total_vector, is_training=is_training,
                                      se_hidden_size=512)
        predict_root = tf.layers.dense(root_vector, TAG_NUM, name='pred_root')
    predict_root_label = tf.argmax(predict_root, dimension=-1)
    predict_root_confidence = tf.nn.softmax(predict_root, name='conf_root')
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=predict_root, labels=root_tags)
    loss_root = tf.reduce_mean(cross_entropy)
    # L2 accumulators start as non-trainable zeros; each `+=` adds the
    # l2_regularizer(1.0) penalty of one matching variable.
    L2_frame = tf.Variable(initial_value=0., trainable=False, dtype=tf.float32)
    L2_text = tf.Variable(initial_value=0., trainable=False, dtype=tf.float32)
    L2_w2v = tf.Variable(initial_value=0., trainable=False, dtype=tf.float32)
    # The 'frame', 'predict' and 'NeXtVLAD' variables all accumulate into
    # L2_frame and therefore share FLAGS.frame_weight.
    for w in tl.layers.get_variables_with_name('frame', True, True):
        L2_frame += tf.contrib.layers.l2_regularizer(1.0)(w)
    for w in tl.layers.get_variables_with_name('predict', True, True):
        L2_frame += tf.contrib.layers.l2_regularizer(1.0)(w)
    for w in tl.layers.get_variables_with_name('NeXtVLAD', True, True):
        L2_frame += tf.contrib.layers.l2_regularizer(1.0)(w)
    for w in tl.layers.get_variables_with_name('text', True, True):
        L2_text += tf.contrib.layers.l2_regularizer(1.0)(w)
    if FLAGS.train_w2v:
        for w in tl.layers.get_variables_with_name('initW', True, True):
            L2_w2v += tf.contrib.layers.l2_regularizer(1.0)(w)
    # Total objective: weighted classification loss plus weighted L2 terms.
    cost = FLAGS.root_weight * loss_root + FLAGS.frame_weight * L2_frame + \
        FLAGS.text_weight * L2_text + FLAGS.w2v_weight * L2_w2v
    result = dict()
    result['loss_root'] = loss_root
    result['cost'] = cost
    result['predict_root'] = predict_root
    result['predict_label_root'] = predict_root_label
    result['confidence_root'] = predict_root_confidence
    result['L2_frame'] = L2_frame
    result['L2_text'] = L2_text
    result['L2_w2v'] = L2_w2v
    return result
def head(endpoints, embedding_dim, is_training):
    """Attach the multi-branch attention head and the embedding layers.

    One attention mask is produced per branch (`head_num` branches, each via
    `attention_branch`) from `endpoints['resnet_v2_50/block4']`. Each mask
    re-weights that feature map as `(1 + mask) * features`. The re-weighted
    maps are concatenated along axis 3, averaged over axes 1 and 2, and passed
    through a 1024-unit batch-normalized fully connected layer followed by a
    linear embedding layer.

    Entries written to `endpoints`: 'attention_map<i>', 'attention_masks',
    'MBD_Constraint' (sum of `js_divergence` over every unordered pair of
    masks), 'masked', 'model_output', 'global_pool', 'head_output', 'emb'
    and 'emb_raw'.

    Args:
        endpoints: dict of named tensors; must contain 'resnet_v2_50/block4'.
            It is updated in place.
        embedding_dim: number of units of the final embedding layer.
        is_training: forwarded as `is_training` to the batch norm of the
            1024-unit fully connected layer.

    Returns:
        The same `endpoints` dict, with the entries listed above added.
    """
    batch_norm_params = {
        'decay': 0.9,
        'epsilon': 1e-5,
        'scale': True,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
        'fused': None,
    }

    conv_scope = slim.arg_scope(
        [slim.conv2d],
        weights_regularizer=slim.l2_regularizer(0.0),
        weights_initializer=slim.variance_scaling_initializer(),
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        normalizer_params=batch_norm_params)

    masks = []
    masked_maps = []
    with conv_scope, slim.arg_scope([slim.batch_norm], **batch_norm_params):
        for branch in range(head_num):
            features = endpoints['resnet_v2_50/block4']
            branch_mask = attention_branch(features, branch)
            masks.append(branch_mask)

            # Keep the original response and add the attended one on top.
            reweighted = (1 + branch_mask) * features
            endpoints['attention_map{}'.format(branch)] = reweighted
            masked_maps.append(reweighted)
        endpoints['attention_masks'] = masks

    # One divergence term for every unordered pair of branch masks.
    mbd_collect = [
        js_divergence(masks[i], masks[j], 'constraint_{}{}'.format(i, j))
        for i in range(head_num)
        for j in range(i + 1, head_num)
    ]
    endpoints['MBD_Constraint'] = tf.add_n(mbd_collect, name='MBD_Constraint')

    _masked = tf.concat(masked_maps, axis=3, name='concat_mask')
    endpoints['masked'] = _masked

    pooled = tf.reduce_mean(_masked, [1, 2], name='_pool5', keep_dims=False)
    endpoints['model_output'] = pooled
    endpoints['global_pool'] = pooled

    fc_norm_params = {
        'decay': 0.9,
        'epsilon': 1e-5,
        'scale': True,
        'is_training': is_training,
        'updates_collections': tf.GraphKeys.UPDATE_OPS,
    }
    endpoints['head_output'] = slim.fully_connected(
        endpoints['model_output'],
        1024,
        normalizer_fn=slim.batch_norm,
        normalizer_params=fc_norm_params)

    embedding = slim.fully_connected(
        endpoints['head_output'],
        embedding_dim,
        activation_fn=None,
        weights_initializer=tf.orthogonal_initializer(),
        scope='emb')
    endpoints['emb'] = embedding
    endpoints['emb_raw'] = embedding

    return endpoints