def _conv_block(self, inputs, numOut, name='conv_block'):
    """ Convolutional Block
    Args:
        inputs : Input Tensor
        numOut : Desired output number of channels
        name   : Name of the block
    Returns:
        conv_3 : Output Tensor
    """
    if self.tiny:
        with tf.name_scope(name):
            norm = tf.contrib.layers.batch_norm(inputs, 0.9, epsilon=1e-5,
                                                activation_fn=tf.nn.relu,
                                                is_training=self.training)
            pad = tf.pad(norm, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]), name='pad')
            conv = self._conv(pad, int(numOut), kernel_size=3, strides=1,
                              pad='VALID', name='conv')
            return conv
    else:
        with tf.name_scope(name):
            with tf.name_scope('norm_1'):
                norm_1 = tf.contrib.layers.batch_norm(inputs, 0.9, epsilon=1e-5,
                                                      activation_fn=tf.nn.relu,
                                                      is_training=self.training)
                conv_1 = self._conv(norm_1, int(numOut / 2), kernel_size=1,
                                    strides=1, pad='VALID', name='conv')
            with tf.name_scope('norm_2'):
                norm_2 = tf.contrib.layers.batch_norm(conv_1, 0.9, epsilon=1e-5,
                                                      activation_fn=tf.nn.relu,
                                                      is_training=self.training)
                pad = tf.pad(norm_2, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]), name='pad')
                conv_2 = self._conv(pad, int(numOut / 2), kernel_size=3,
                                    strides=1, pad='VALID', name='conv')
            with tf.name_scope('norm_3'):
                norm_3 = tf.contrib.layers.batch_norm(conv_2, 0.9, epsilon=1e-5,
                                                      activation_fn=tf.nn.relu,
                                                      is_training=self.training)
                conv_3 = self._conv(norm_3, int(numOut), kernel_size=1,
                                    strides=1, pad='VALID', name='conv')
            return conv_3
def __build(self):
    self.__init_global_epoch()
    self.__init_global_step()
    self.__init_input()

    with tf.name_scope('Preprocessing'):
        red, green, blue = tf.split(self.X, num_or_size_splits=3, axis=3)
        preprocessed_input = tf.concat([
            tf.subtract(blue, ShuffleNet.MEAN[0]) * ShuffleNet.NORMALIZER,
            tf.subtract(green, ShuffleNet.MEAN[1]) * ShuffleNet.NORMALIZER,
            tf.subtract(red, ShuffleNet.MEAN[2]) * ShuffleNet.NORMALIZER,
        ], 3)

    x_padded = tf.pad(preprocessed_input, [[0, 0], [1, 1], [1, 1], [0, 0]], "CONSTANT")
    conv1 = conv2d('conv1', x=x_padded, w=None, num_filters=self.output_channels['conv1'],
                   kernel_size=(3, 3), stride=(2, 2), l2_strength=self.args.l2_strength,
                   bias=self.args.bias, batchnorm_enabled=self.args.batchnorm_enabled,
                   is_training=self.is_training, activation=tf.nn.relu, padding='VALID')
    padded = tf.pad(conv1, [[0, 0], [0, 1], [0, 1], [0, 0]], "CONSTANT")
    max_pool = max_pool_2d(padded, size=(3, 3), stride=(2, 2), name='max_pool')
    stage2 = self.__stage(max_pool, stage=2, repeat=3)
    stage3 = self.__stage(stage2, stage=3, repeat=7)
    stage4 = self.__stage(stage3, stage=4, repeat=3)
    global_pool = avg_pool_2d(stage4, size=(7, 7), stride=(1, 1),
                              name='global_pool', padding='VALID')

    logits_unflattened = conv2d('fc', global_pool, w=None,
                                num_filters=self.args.num_classes, kernel_size=(1, 1),
                                l2_strength=self.args.l2_strength,
                                bias=self.args.bias,
                                is_training=self.is_training)
    self.logits = flatten(logits_unflattened)

    self.__init_output()
def _tf_pad(x, szs, padding='SYMMETRIC'):
    """
    Tensorflow can't handle padding by more than the dimension of the image.
    This wrapper allows us to build padding up successively.
    """
    def get_size(x):
        # Often the batch will be None. Convert these to 0s
        x_szs = x.get_shape().as_list()
        x_szs = [0 if val is None else val for val in x_szs]
        return x_szs

    x_szs = get_size(x)
    gt = [[sz[0] > x_sz, sz[1] > x_sz] for sz, x_sz in zip(szs, x_szs)]
    while np.any(gt):
        # This creates an intermediate padding amount that will bring in
        # dimensions that are too big by the size of x.
        szs_step = np.int32(gt) * np.stack([x_szs, x_szs], axis=-1)
        x = tf.pad(x, szs_step, padding)
        szs = szs - szs_step
        x_szs = get_size(x)
        gt = [[sz[0] > x_sz, sz[1] > x_sz] for sz, x_sz in zip(szs, x_szs)]

    # Pad by the remaining amount
    x = tf.pad(x, szs, 'SYMMETRIC')
    return x
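# A small usage sketch of _tf_pad (the shapes and padding amounts below are
# assumptions for illustration, not from the source): SYMMETRIC mode in tf.pad
# cannot pad a dimension by more than its current size, so padding a 4-pixel-wide
# map by 6 on each side has to be built up in steps, which is what the loop does.
import numpy as np
import tensorflow as tf

x = tf.zeros([1, 4, 4, 1])
# A direct tf.pad(..., 'SYMMETRIC') with 6 would fail because 6 > 4; the wrapper
# first pads by 4 (the current size), then by the remaining 2.
y = _tf_pad(x, np.array([[0, 0], [6, 6], [6, 6], [0, 0]]), 'SYMMETRIC')
print(y.shape)  # (1, 16, 16, 1)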
def res_block(x, a=None, filter_size=16, nonlinearity=concat_elu,
              keep_p=1.0, stride=1, gated=False, name="resnet"):
    orig_x = x
    print(orig_x.get_shape())
    x_1 = conv_layer(nonlinearity(x), 3, stride, filter_size, name + '_conv_1')
    if a is not None:
        shape_a = int_shape(a)
        shape_x_1 = int_shape(x_1)
        a = tf.pad(
            a, [[0, 0], [0, shape_x_1[1] - shape_a[1]],
                [0, shape_x_1[2] - shape_a[2]], [0, 0]])
        x_1 += nin(nonlinearity(a), filter_size, name + '_nin')
    x_1 = nonlinearity(x_1)
    if keep_p < 1.0:
        x_1 = tf.nn.dropout(x_1, keep_prob=keep_p)
    if not gated:
        x_2 = conv_layer(x_1, 3, 1, filter_size, name + '_conv_2')
    else:
        x_2 = conv_layer(x_1, 3, 1, filter_size * 2, name + '_conv_2')
        x_2_1, x_2_2 = tf.split(3, 2, x_2)
        x_2 = x_2_1 * tf.nn.sigmoid(x_2_2)
    if int(orig_x.get_shape()[2]) > int(x_2.get_shape()[2]):
        assert int(orig_x.get_shape()[2]) == 2 * int(x_2.get_shape()[2]), \
            "res net block only supports stride 2"
        orig_x = tf.nn.avg_pool(orig_x, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')

    # pad it
    out_filter = filter_size
    in_filter = int(orig_x.get_shape()[3])
    if out_filter != in_filter:
        orig_x = tf.pad(
            orig_x, [[0, 0], [0, 0], [0, 0], [(out_filter - in_filter), 0]])

    return orig_x + x_2
def generator(img, scope, gf_dim=64, reuse=False, train=True):
    bn = functools.partial(slim.batch_norm, scale=True, is_training=train,
                           decay=0.9, epsilon=1e-5, updates_collections=None)

    def residule_block(x, dim, scope='res'):
        y = tf.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]], "REFLECT")
        y = relu(instance_norm(conv(y, dim, 3, 1, padding='VALID', scope=scope + '_conv1'),
                               scope=scope + '_instance_norm1'))
        y = tf.pad(y, [[0, 0], [1, 1], [1, 1], [0, 0]], "REFLECT")
        y = instance_norm(conv(y, dim, 3, 1, padding='VALID', scope=scope + '_conv2'),
                          scope=scope + '_instance_norm2')
        return y + x

    with tf.variable_scope(scope + '_generator', reuse=reuse):
        c0 = tf.pad(img, [[0, 0], [3, 3], [3, 3], [0, 0]], "REFLECT")
        c1 = relu(instance_norm(conv(c0, gf_dim, 7, 1, padding='VALID', scope='c1_conv'),
                                scope='c1_instance_norm'))
        c2 = relu(instance_norm(conv(c1, gf_dim * 2, 3, 2, scope='c2_conv'),
                                scope='c2_instance_norm'))
        c3 = relu(instance_norm(conv(c2, gf_dim * 4, 3, 2, scope='c3_conv'),
                                scope='c3_instance_norm'))
        r1 = residule_block(c3, gf_dim * 4, scope='r1')
        r2 = residule_block(r1, gf_dim * 4, scope='r2')
        r3 = residule_block(r2, gf_dim * 4, scope='r3')
        r4 = residule_block(r3, gf_dim * 4, scope='r4')
        r5 = residule_block(r4, gf_dim * 4, scope='r5')
        r6 = residule_block(r5, gf_dim * 4, scope='r6')
        r7 = residule_block(r6, gf_dim * 4, scope='r7')
        r8 = residule_block(r7, gf_dim * 4, scope='r8')
        r9 = residule_block(r8, gf_dim * 4, scope='r9')
        d1 = relu(instance_norm(deconv(r9, gf_dim * 2, 3, 2, scope='d1_dconv'),
                                scope='d1_instance_norm'))
        d2 = relu(instance_norm(deconv(d1, gf_dim, 3, 2, scope='d2_dconv'),
                                scope='d2_instance_norm'))
        d2 = tf.pad(d2, [[0, 0], [3, 3], [3, 3], [0, 0]], "REFLECT")
        pred = conv(d2, 3, 7, 1, padding='VALID', scope='pred_conv')
        pred = tf.nn.tanh(pred)

        return pred
def build_graph(self, image, label):
    xys = np.array([(y, x, 1) for y in range(WARP_TARGET_SIZE)
                    for x in range(WARP_TARGET_SIZE)], dtype='float32')
    xys = tf.constant(xys, dtype=tf.float32, name='xys')    # p x 3

    image = image / 255.0 - 0.5  # bhw2

    def get_stn(image):
        stn = (LinearWrap(image)
               .AvgPooling('downsample', 2)
               .Conv2D('conv0', 20, 5, padding='VALID')
               .MaxPooling('pool0', 2)
               .Conv2D('conv1', 20, 5, padding='VALID')
               .FullyConnected('fc1', 32)
               .FullyConnected('fct', 6, activation=tf.identity,
                               kernel_initializer=tf.constant_initializer(),
                               bias_initializer=tf.constant_initializer(
                                   [1, 0, HALF_DIFF, 0, 1, HALF_DIFF]))())
        # output 6 parameters for affine transformation
        stn = tf.reshape(stn, [-1, 2, 3], name='affine')  # bx2x3
        stn = tf.reshape(tf.transpose(stn, [2, 0, 1]), [3, -1])  # 3 x (bx2)
        coor = tf.reshape(tf.matmul(xys, stn),
                          [WARP_TARGET_SIZE, WARP_TARGET_SIZE, -1, 2])
        coor = tf.transpose(coor, [2, 0, 1, 3], 'sampled_coords')  # b h w 2
        sampled = BilinearSample('warp', [image, coor], borderMode='constant')
        return sampled

    with argscope([Conv2D, FullyConnected], activation=tf.nn.relu):
        with tf.variable_scope('STN1'):
            sampled1 = get_stn(image)
        with tf.variable_scope('STN2'):
            sampled2 = get_stn(image)

    # For visualization in tensorboard
    with tf.name_scope('visualization'):
        padded1 = tf.pad(sampled1, [[0, 0], [HALF_DIFF, HALF_DIFF], [HALF_DIFF, HALF_DIFF], [0, 0]])
        padded2 = tf.pad(sampled2, [[0, 0], [HALF_DIFF, HALF_DIFF], [HALF_DIFF, HALF_DIFF], [0, 0]])
        img_orig = tf.concat([image[:, :, :, 0], image[:, :, :, 1]], 1)  # b x 2h x w
        transform1 = tf.concat([padded1[:, :, :, 0], padded1[:, :, :, 1]], 1)
        transform2 = tf.concat([padded2[:, :, :, 0], padded2[:, :, :, 1]], 1)
        stacked = tf.concat([img_orig, transform1, transform2], 2, 'viz')
        tf.summary.image('visualize',
                         tf.expand_dims(stacked, -1), max_outputs=30)

    sampled = tf.concat([sampled1, sampled2], 3, 'sampled_concat')
    logits = (LinearWrap(sampled)
              .FullyConnected('fc1', 256, activation=tf.nn.relu)
              .FullyConnected('fc2', 128, activation=tf.nn.relu)
              .FullyConnected('fct', 19, activation=tf.identity)())
    tf.nn.softmax(logits, name='prob')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    wrong = tf.to_float(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), name='incorrect_vector')
    summary.add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                          name='regularize_loss')
    summary.add_moving_summary(cost, wd_cost)
    return tf.add_n([wd_cost, cost], name='cost')
def build_generator_resnet_6blocks(inputgen, name="generator"):
    with tf.variable_scope(name):
        f = 7
        ks = 3

        pad_input = tf.pad(inputgen, [[0, 0], [ks, ks], [ks, ks], [0, 0]], "REFLECT")
        o_c1 = general_conv2d(pad_input, ngf, f, f, 1, 1, 0.02, name="c1")
        o_c2 = general_conv2d(o_c1, ngf * 2, ks, ks, 2, 2, 0.02, "SAME", "c2")
        o_c3 = general_conv2d(o_c2, ngf * 4, ks, ks, 2, 2, 0.02, "SAME", "c3")

        o_r1 = build_resnet_block(o_c3, ngf * 4, "r1")
        o_r2 = build_resnet_block(o_r1, ngf * 4, "r2")
        o_r3 = build_resnet_block(o_r2, ngf * 4, "r3")
        o_r4 = build_resnet_block(o_r3, ngf * 4, "r4")
        o_r5 = build_resnet_block(o_r4, ngf * 4, "r5")
        o_r6 = build_resnet_block(o_r5, ngf * 4, "r6")

        o_c4 = general_deconv2d(o_r6, [batch_size, 64, 64, ngf * 2], ngf * 2,
                                ks, ks, 2, 2, 0.02, "SAME", "c4")
        o_c5 = general_deconv2d(o_c4, [batch_size, 128, 128, ngf], ngf,
                                ks, ks, 2, 2, 0.02, "SAME", "c5")
        o_c5_pad = tf.pad(o_c5, [[0, 0], [ks, ks], [ks, ks], [0, 0]], "REFLECT")
        o_c6 = general_conv2d(o_c5_pad, img_layer, f, f, 1, 1, 0.02,
                              "VALID", "c6", do_relu=False)

        # Adding the tanh layer
        out_gen = tf.nn.tanh(o_c6, "t1")

        return out_gen
def Rk(input, k, reuse=False, norm='instance', is_training=True, name=None):
    """ A residual block that contains two 3x3 convolutional layers
        with the same number of filters on both layers
    Args:
        input: 4D Tensor
        k: integer, number of filters (output depth)
        reuse: boolean
        name: string
    Returns:
        4D tensor (same shape as input)
    """
    with tf.variable_scope(name, reuse=reuse):
        with tf.variable_scope('layer1', reuse=reuse):
            weights1 = _weights("weights1", shape=[3, 3, input.get_shape()[3], k])
            padded1 = tf.pad(input, [[0, 0], [1, 1], [1, 1], [0, 0]], 'REFLECT')
            conv1 = tf.nn.conv2d(padded1, weights1,
                                 strides=[1, 1, 1, 1], padding='VALID')
            normalized1 = _norm(conv1, is_training, norm)
            relu1 = tf.nn.relu(normalized1)

        with tf.variable_scope('layer2', reuse=reuse):
            weights2 = _weights("weights2", shape=[3, 3, relu1.get_shape()[3], k])
            padded2 = tf.pad(relu1, [[0, 0], [1, 1], [1, 1], [0, 0]], 'REFLECT')
            conv2 = tf.nn.conv2d(padded2, weights2,
                                 strides=[1, 1, 1, 1], padding='VALID')
            normalized2 = _norm(conv2, is_training, norm)
        output = input + normalized2
        return output
def resnet_fpn_backbone(image, num_blocks, freeze_c2=True):
    shape2d = tf.shape(image)[2:]
    mult = float(cfg.FPN.RESOLUTION_REQUIREMENT)
    new_shape2d = tf.to_int32(tf.ceil(tf.to_float(shape2d) / mult) * mult)
    pad_shape2d = new_shape2d - shape2d
    assert len(num_blocks) == 4, num_blocks
    with resnet_argscope():
        chan = image.shape[1]
        pad_base = maybe_reverse_pad(2, 3)
        l = tf.pad(image, tf.stack(
            [[0, 0], [0, 0],
             [pad_base[0], pad_base[1] + pad_shape2d[0]],
             [pad_base[0], pad_base[1] + pad_shape2d[1]]]))
        l.set_shape([None, chan, None, None])
        l = Conv2D('conv0', l, 64, 7, strides=2, activation=BNReLU, padding='VALID')
        l = tf.pad(l, [[0, 0], [0, 0], maybe_reverse_pad(0, 1), maybe_reverse_pad(0, 1)])
        l = MaxPooling('pool0', l, 3, strides=2, padding='VALID')
        c2 = resnet_group('group0', l, resnet_bottleneck, 64, num_blocks[0], 1)
        if freeze_c2:
            c2 = tf.stop_gradient(c2)
        c3 = resnet_group('group1', c2, resnet_bottleneck, 128, num_blocks[1], 2)
        c4 = resnet_group('group2', c3, resnet_bottleneck, 256, num_blocks[2], 2)
        c5 = resnet_group('group3', c4, resnet_bottleneck, 512, num_blocks[3], 2)
    # 32x downsampling up to now
    # size of c5: ceil(input/32)
    return c2, c3, c4, c5
def _residual_v1(self, x, kernel_size, in_filter, out_filter,
                 stride, activate_before_residual=False):
    """Residual unit with 2 sub layers, using Plan A for shortcut connection."""
    del activate_before_residual
    with tf.name_scope('residual_v1') as name_scope:
        orig_x = x

        x = self._conv(x, kernel_size, out_filter, stride)
        x = self._batch_norm(x)
        x = self._relu(x)

        x = self._conv(x, kernel_size, out_filter, 1)
        x = self._batch_norm(x)

        if in_filter != out_filter:
            orig_x = self._avg_pool(orig_x, stride, stride)
            pad = (out_filter - in_filter) // 2
            if self._data_format == 'channels_first':
                orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
            else:
                orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])

        x = self._relu(tf.add(x, orig_x))

        tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
        return x
def fixed_padding(inputs, kernel_size, data_format="channels_first"):
    """Pads the input along the spatial dimensions independently of input size.

    Args:
        inputs: `Tensor` of size `[batch, channels, height, width]` or
            `[batch, height, width, channels]` depending on `data_format`.
        kernel_size: `int` kernel size to be used for `conv2d` or `max_pool2d`
            operations. Should be a positive integer.
        data_format: `str` either "channels_first" for `[batch, channels, height,
            width]` or "channels_last" for `[batch, height, width, channels]`.

    Returns:
        A padded `Tensor` of the same `data_format` with size either intact
        (if `kernel_size == 1`) or padded (if `kernel_size > 1`).
    """
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    if data_format == "channels_first":
        padded_inputs = tf.pad(
            inputs, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
    else:
        padded_inputs = tf.pad(
            inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
    return padded_inputs
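# A minimal sketch of how fixed_padding is typically paired with a 'VALID'
# convolution (the shapes below are assumptions for illustration, not from the
# source): because the padding depends only on kernel_size, a stride-2 VALID conv
# on the padded tensor matches the output size of a stride-2 SAME conv on the
# original, independent of the input size.
import tensorflow as tf

inputs = tf.ones([1, 224, 224, 3])
padded = fixed_padding(inputs, kernel_size=3, data_format="channels_last")
print(padded.shape)  # (1, 226, 226, 3)
conv = tf.layers.conv2d(padded, filters=64, kernel_size=3, strides=2,
                        padding="VALID")  # -> (1, 112, 112, 64)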
def add_edge_padding(x, filter_size):
    assert filter_size[0] % 2 == 1
    if filter_size[0] == 1 and filter_size[1] == 1:
        return x
    a = (filter_size[0] - 1) // 2  # vertical padding size
    b = (filter_size[1] - 1) // 2  # horizontal padding size
    if True:
        x = tf.pad(x, [[0, 0], [a, a], [b, b], [0, 0]])
        name = "_".join([str(dim) for dim in [a, b, *int_shape(x)[1:3]]])
        pads = tf.get_collection(name)
        if not pads:
            if hvd.rank() == 0:
                print("Creating pad", name)
            pad = np.zeros([1] + int_shape(x)[1:3] + [1], dtype='float32')
            pad[:, :a, :, 0] = 1.
            pad[:, -a:, :, 0] = 1.
            pad[:, :, :b, 0] = 1.
            pad[:, :, -b:, 0] = 1.
            pad = tf.convert_to_tensor(pad)
            tf.add_to_collection(name, pad)
        else:
            pad = pads[0]
        pad = tf.tile(pad, [tf.shape(x)[0], 1, 1, 1])
        x = tf.concat([x, pad], axis=3)
    else:
        pad = tf.pad(tf.zeros_like(x[:, :, :, :1]) - 1,
                     [[0, 0], [a, a], [b, b], [0, 0]]) + 1
        x = tf.pad(x, [[0, 0], [a, a], [b, b], [0, 0]])
        x = tf.concat([x, pad], axis=3)
    return x
def _conv(self, x, kernel_size, filters, strides, is_atrous=False):
    """Convolution."""
    padding = 'SAME'
    if not is_atrous and strides > 1:
        pad = kernel_size - 1
        pad_beg = pad // 2
        pad_end = pad - pad_beg
        if self._data_format == 'channels_first':
            x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
        else:
            x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
        padding = 'VALID'
    return tf.layers.conv2d(
        inputs=x,
        kernel_size=kernel_size,
        filters=filters,
        strides=strides,
        padding=padding,
        use_bias=False,
        data_format=self._data_format)
def pad_to_same_length(x, y, final_length_divisible_by=1, axis=1):
    """Pad tensors x and y on the given axis so that they have the same length."""
    if axis not in [1, 2]:
        raise ValueError("Only axis=1 and axis=2 supported for now.")
    with tf.name_scope("pad_to_same_length", [x, y]):
        x_length = tf.shape(x)[axis]
        y_length = tf.shape(y)[axis]
        max_length = tf.maximum(x_length, y_length)
        if final_length_divisible_by > 1:
            # Find the nearest larger-or-equal integer divisible by given number.
            max_length += final_length_divisible_by - 1
            max_length //= final_length_divisible_by
            max_length *= final_length_divisible_by
        length_diff1 = max_length - x_length
        length_diff2 = max_length - y_length

        def padding_list(length_diff, arg):
            if axis == 1:
                return [[[0, 0], [0, length_diff]],
                        tf.zeros([tf.rank(arg) - 2, 2], dtype=tf.int32)]
            return [[[0, 0], [0, 0], [0, length_diff]],
                    tf.zeros([tf.rank(arg) - 3, 2], dtype=tf.int32)]

        paddings1 = tf.concat(padding_list(length_diff1, x), axis=0)
        paddings2 = tf.concat(padding_list(length_diff2, y), axis=0)
        res_x = tf.pad(x, paddings1)
        res_y = tf.pad(y, paddings2)
        # Static shapes are the same except for the padded axis.
        x_shape = x.shape.as_list()
        x_shape[axis] = None
        res_x.set_shape(x_shape)
        y_shape = y.shape.as_list()
        y_shape[axis] = None
        res_y.set_shape(y_shape)
        return res_x, res_y
def fixed_padding(inputs, kernel_size, data_format, conv_time_dim):
    """Pads the input along the spatial dimensions independently of input size.

    Args:
        inputs: A tensor of size [batch, channels, height_in, width_in] or
            [batch, height_in, width_in, channels] depending on data_format.
        kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
            Should be a positive integer.
        data_format: The input format ('channels_last' or 'channels_first').
        conv_time_dim: If True, the convolution itself spans the time dimension,
            so no padding is added along it.

    Returns:
        A tensor with the same format as the input with the data either intact
        (if kernel_size == 1) or padded (if kernel_size > 1).
    """
    pad_total = kernel_size - 1
    feature_pad_beg = pad_total // 2
    feature_pad_end = pad_total - feature_pad_beg

    if conv_time_dim:
        time_pad_beg = 0
        time_pad_end = 0
    else:
        time_pad_beg = feature_pad_beg
        time_pad_end = feature_pad_end

    if data_format == 'channels_first':
        padded_inputs = tf.pad(inputs, [[0, 0], [0, 0],
                                        [time_pad_beg, time_pad_end],
                                        [feature_pad_beg, feature_pad_end]])
    else:
        padded_inputs = tf.pad(inputs, [[0, 0],
                                        [time_pad_beg, time_pad_end],
                                        [feature_pad_beg, feature_pad_end],
                                        [0, 0]])
    return padded_inputs
def random_transformation2(x, y, padding, phase_train,
                           rnd_vflip=True, rnd_hflip=True, rnd_transpose=True,
                           rnd_colour=False):
    """Perform random crop, flip, transpose, hue, saturation, brightness, contrast.

    Args:
        x: [B, H, W, 3]
        y: [B, T, H, W]
        padding: int
        phase_train: bool
    """
    # Random image transformation layers.
    phase_train_f = tf.to_float(phase_train)
    x_shape = tf.shape(x)
    y_shape = tf.shape(y)
    num_ex = x_shape[0]
    inp_height = x_shape[1]
    inp_width = x_shape[2]
    inp_depth_x = x_shape[3]
    inp_depth_y = y_shape[3]

    # Add padding
    x_pad = tf.pad(x, [[0, 0], [padding, padding], [padding, padding], [0, 0]])
    y_pad = tf.pad(y, [[0, 0], [padding, padding], [padding, padding], [0, 0]])

    # Random crop
    offset = tf.random_uniform([2], dtype='int32', maxval=padding * 2)
    x_rand = tf.slice(x_pad, tf.pack([0, offset[0], offset[1], 0]),
                      tf.pack([-1, inp_height, inp_width, inp_depth_x]))
    y_rand = tf.slice(y_pad, tf.pack([0, offset[0], offset[1], 0]),
                      tf.pack([-1, inp_height, inp_width, inp_depth_y]))

    # Center slices (for inference)
    x_ctr = tf.slice(x_pad, [0, padding, padding, 0],
                     tf.pack([-1, inp_height, inp_width, -1]))
    y_ctr = tf.slice(y_pad, [0, padding, padding, 0],
                     tf.pack([-1, inp_height, inp_width, -1]))

    # Random horizontal & vertical flip & transpose
    rand_h = tf.random_uniform([1], 1.0 - float(rnd_hflip), 1.0)
    rand_v = tf.random_uniform([1], 1.0 - float(rnd_vflip), 1.0)
    mirror = tf.pack([1.0, rand_v[0], rand_h[0], 1.0]) < 0.5
    x_rand = tf.reverse(x_rand, mirror)
    y_rand = tf.reverse(y_rand, mirror)
    rand_t = tf.random_uniform([1], 1.0 - float(rnd_transpose), 1.0)
    do_tr = tf.cast(rand_t[0] < 0.5, 'int32')
    x_rand = tf.transpose(x_rand, tf.pack([0, 1 + do_tr, 2 - do_tr, 3]))
    y_rand = tf.transpose(y_rand, tf.pack([0, 1 + do_tr, 2 - do_tr, 3]))

    # Random hue, saturation, brightness, contrast
    if rnd_colour:
        x_rand = random_hue(x_rand, 0.1)
        x_rand = random_saturation(x_rand, 0.9, 1.1)
        x_rand = tf.image.random_brightness(x_rand, 0.1)
        x_rand = tf.image.random_contrast(x_rand, 0.9, 1.1)

    x = (1.0 - phase_train_f) * x_ctr + phase_train_f * x_rand
    y = (1.0 - phase_train_f) * y_ctr + phase_train_f * y_rand

    return x, y
def build_network(self, images, num_outputs, alpha, keep_prob=0.5,
                  is_training=True, scope='yolo'):
    with tf.variable_scope(scope):
        with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            activation_fn=leaky_relu(alpha),
            weights_regularizer=slim.l2_regularizer(0.0005),
            weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
        ):
            net = tf.pad(images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]),
                         name='pad_1')
            net = slim.conv2d(net, 64, 7, 2, padding='VALID', scope='conv_2')
            net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')
            net = slim.conv2d(net, 192, 3, scope='conv_4')
            net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')
            net = slim.conv2d(net, 128, 1, scope='conv_6')
            net = slim.conv2d(net, 256, 3, scope='conv_7')
            net = slim.conv2d(net, 256, 1, scope='conv_8')
            net = slim.conv2d(net, 512, 3, scope='conv_9')
            net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')
            net = slim.conv2d(net, 256, 1, scope='conv_11')
            net = slim.conv2d(net, 512, 3, scope='conv_12')
            net = slim.conv2d(net, 256, 1, scope='conv_13')
            net = slim.conv2d(net, 512, 3, scope='conv_14')
            net = slim.conv2d(net, 256, 1, scope='conv_15')
            net = slim.conv2d(net, 512, 3, scope='conv_16')
            net = slim.conv2d(net, 256, 1, scope='conv_17')
            net = slim.conv2d(net, 512, 3, scope='conv_18')
            net = slim.conv2d(net, 512, 1, scope='conv_19')
            net = slim.conv2d(net, 1024, 3, scope='conv_20')
            net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21')
            net = slim.conv2d(net, 512, 1, scope='conv_22')
            net = slim.conv2d(net, 1024, 3, scope='conv_23')
            net = slim.conv2d(net, 512, 1, scope='conv_24')
            net = slim.conv2d(net, 1024, 3, scope='conv_25')
            net = slim.conv2d(net, 1024, 3, scope='conv_26')
            net = tf.pad(net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]),
                         name='pad_27')
            net = slim.conv2d(net, 1024, 3, 2, padding='VALID', scope='conv_28')
            net = slim.conv2d(net, 1024, 3, scope='conv_29')
            net = slim.conv2d(net, 1024, 3, scope='conv_30')
            net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
            net = slim.flatten(net, scope='flat_32')
            net = slim.fully_connected(net, 512, scope='fc_33')
            net = slim.fully_connected(net, 4096, scope='fc_34')
            net = slim.dropout(net, keep_prob=keep_prob, is_training=is_training,
                               scope='dropout_35')
            net = slim.fully_connected(net, num_outputs, activation_fn=None,
                                       scope='fc_36')
    return net
def prepare_decoder(targets, hparams):
    """Prepare decoder for images."""
    targets_shape = common_layers.shape_list(targets)
    channels = hparams.num_channels
    curr_infer_length = None

    # during training, images are [batch, IMG_LEN, IMG_LEN, 3].
    # At inference, they are [batch, curr_infer_length, 1, 1]
    if hparams.mode == tf.contrib.learn.ModeKeys.INFER:
        curr_infer_length = targets_shape[1]
        if hparams.block_raster_scan:
            assert hparams.img_len * channels % hparams.query_shape[1] == 0
            assert hparams.img_len % hparams.query_shape[0] == 0
            total_block_width = hparams.img_len * channels
            # Decoding is in block raster scan order. We divide the image into
            # hparams.query_shape blocks and then decode each block in raster scan.
            # To make that compatible with our inference pipeline, pad the target so
            # that rows is a multiple of query_shape and columns is a multiple of
            # hparams.img_len*channels
            curr_infer_length = targets_shape[1]
            block_padding_factor = total_block_width * hparams.query_shape[0]
            targets = tf.pad(targets, [
                [0, 0], [0, -curr_infer_length % block_padding_factor],
                [0, 0], [0, 0]])

            num_blocks = total_block_width // hparams.query_shape[1]
            # Reshape the image to represent blocks
            target_blocks = tf.reshape(
                targets, [targets_shape[0], -1, num_blocks,
                          hparams.query_shape[0], hparams.query_shape[1]])
            # Transpose to read the image in 2D fashion.
            targets = tf.transpose(target_blocks, [0, 1, 3, 2, 4])
        else:
            # add padding to make sure the size of targets is a multiple of img_height
            # times number of channels. This is needed for positional encodings and
            # for doing the RGB lookup.
            padding_factor = channels * hparams.img_len
            targets = tf.pad(targets, [
                [0, 0], [0, -curr_infer_length % padding_factor], [0, 0], [0, 0]])
        targets = tf.reshape(targets,
                             [targets_shape[0], -1, hparams.img_len, channels])
    # Preprocess image
    x = prepare_image(targets, hparams, name="dec_channels")
    x_shape = common_layers.shape_list(x)
    if (hparams.dec_attention_type == AttentionType.LOCAL_2D or
            hparams.dec_attention_type == AttentionType.LOCAL_BLOCK):
        x = common_attention.right_shift_blockwise(x, hparams.query_shape)
        x = add_pos_signals(x, hparams, "dec_pos")
    else:
        # Add position signals
        x = tf.reshape(x, [targets_shape[0],
                           x_shape[1] * x_shape[2], hparams.hidden_size])
        x = common_layers.shift_right_3d(x)
        x = tf.reshape(x, [targets_shape[0],
                           x_shape[1], x_shape[2], hparams.hidden_size])
        x = add_pos_signals(x, hparams, "dec_pos")
    x = common_layers.cast_like(x, targets)
    return x, x_shape[1], x_shape[2]
def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose',
                      pad_mode='REFLECT', align_corners=False):
    """Upsamples the given inputs.

    Args:
        net: A Tensor of size [batch_size, height, width, filters].
        num_outputs: The number of output filters.
        stride: A list of 2 scalars or a 1x2 Tensor indicating the scale,
            relative to the inputs, of the output dimensions. For example, if
            stride is [2, 3], then the output height and width will be twice and
            three times the input size.
        method: The upsampling method: 'nn_upsample_conv',
            'bilinear_upsample_conv', or 'conv2d_transpose'.
        pad_mode: mode for tf.pad, one of "CONSTANT", "REFLECT", or "SYMMETRIC".
        align_corners: option for method 'bilinear_upsample_conv'. If true, the
            centers of the 4 corner pixels of the input and output tensors are
            aligned, preserving the values at the corner pixels.

    Returns:
        A Tensor which was upsampled using the specified method.

    Raises:
        ValueError: if `method` is not recognized.
    """
    with tf.variable_scope('upconv'):
        net_shape = tf.shape(net)
        height = net_shape[1]
        width = net_shape[2]

        # Reflection pad by 1 in spatial dimensions (axes 1, 2 = h, w) to make a
        # 3x3 'valid' convolution produce an output with the same dimension as
        # the input.
        spatial_pad_1 = np.array([[0, 0], [1, 1], [1, 1], [0, 0]])

        if method == 'nn_upsample_conv':
            net = tf.image.resize_nearest_neighbor(
                net, [stride[0] * height, stride[1] * width])
            net = tf.pad(net, spatial_pad_1, pad_mode)
            net = layers.conv2d(net, num_outputs, kernel_size=[3, 3], padding='valid')
        elif method == 'bilinear_upsample_conv':
            net = tf.image.resize_bilinear(
                net, [stride[0] * height, stride[1] * width],
                align_corners=align_corners)
            net = tf.pad(net, spatial_pad_1, pad_mode)
            net = layers.conv2d(net, num_outputs, kernel_size=[3, 3], padding='valid')
        elif method == 'conv2d_transpose':
            # This corrects 1 pixel offset for images with even width and height.
            # conv2d is left aligned and conv2d_transpose is right aligned for even
            # sized images (while doing 'SAME' padding).
            # Note: This doesn't reflect actual model in paper.
            net = layers.conv2d_transpose(
                net, num_outputs, kernel_size=[3, 3], stride=stride, padding='valid')
            net = net[:, 1:, 1:, :]
        else:
            raise ValueError('Unknown method: [%s]' % method)

        return net
def setup_actor_update(actor):
    with tf.variable_scope("rl"):
        actor.critic_output = tf.placeholder(tf.float32, [None, None, actor.vocab_size],
                                             name='critic_output')
        # action_gradients is passed in by Q_network...
        # and in DDPG, it's the gradients of Q w.r.t. policy's chosen actions
        # but in AC, it's the output of Q network w.r.t. all actions
        opt = nlc_model.get_optimizer(FLAGS.optimizer)(actor.learning_rate)

        # update
        params = tf.trainable_variables()

        # TODO: hope this would work
        with tf.variable_scope("Loss"):
            doshape = tf.shape(actor.decoder_output)
            T, batch_size = doshape[0], doshape[1]
            do2d = tf.reshape(actor.decoder_output, [-1, actor.size])
            logits2d = rnn_cell._linear(do2d, actor.vocab_size, True, 1.0)
            # outputs2d = tf.nn.log_softmax(logits2d)

            # apply Q-network's score here (similar to advantage function)
            # 1. reshape critic_output like decoder_output (same shape anyway)
            # TODO: hope this is correct
            critic_do2d = tf.reshape(actor.critic_output, [-1, actor.vocab_size])  # should reshape according to critic
            # 2. multiply this with actor's logits
            rl_logits2d = logits2d * critic_do2d

            # actor.outputs = tf.reshape(outputs2d, tf.pack([T, batch_size, actor.vocab_size]))

            targets_no_GO = tf.slice(actor.target_tokens, [1, 0], [-1, -1])
            masks_no_GO = tf.slice(actor.target_mask, [1, 0], [-1, -1])
            # easier to pad target/mask than to split decoder input since
            # tensorflow does not support negative indexing
            labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
            mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
            losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(rl_logits2d, labels1d) \
                * tf.to_float(mask1d)
            losses2d = tf.reshape(losses1d, tf.pack([T, batch_size]))
            actor.rl_losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)

        # http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html (DDPG update)
        gradients = tf.gradients(actor.rl_losses, params)  # step 7: update
        # Not sure if I understood this part lol

        clipped_gradients, _ = tf.clip_by_global_norm(gradients, FLAGS.max_gradient_norm)
        # clip, then multiply, otherwise we are not learning the signals from critic
        # clipped_gradients: [T, batch_size, vocab_size]
        # updated_gradients = clipped_gradients * actor.critic_output  # pass in as input

        actor.rl_gradient_norm = tf.global_norm(clipped_gradients)
        actor.rl_param_norm = tf.global_norm(params)

        actor.rl_updates = opt.apply_gradients(
            zip(clipped_gradients, params), global_step=actor.global_step)
def convolutional(inputs, output_channels, filter_size, stride, padding, conv_type, scope,
                  init='xavier', regularizer=None, data_format='NHWC', output_shape=None,
                  spectral=False, power_iterations=1, display=True):
    with tf.variable_scope('conv_layer_%s' % scope):
        # Weight initializer.
        if init == 'normal':
            weight_init = tf.initializers.random_normal(stddev=0.02)
        elif init == 'orthogonal':
            weight_init = tf.initializers.orthogonal()
        else:
            weight_init = tf.contrib.layers.xavier_initializer_conv2d()

        # Shapes.
        current_shape = inputs.get_shape()
        input_channels = current_shape[3]
        if 'transpose' in conv_type or 'upscale' in conv_type:
            filter_shape = (filter_size, filter_size, output_channels, input_channels)
        else:
            filter_shape = (filter_size, filter_size, input_channels, output_channels)

        # Weight and bias initialization.
        bias = tf.get_variable(name='bias', shape=[output_channels],
                               initializer=tf.constant_initializer(0.0),
                               trainable=True, dtype=tf.float32)
        filter = tf.get_variable(name='filter_conv', shape=filter_shape,
                                 initializer=weight_init, trainable=True,
                                 dtype=tf.float32, regularizer=regularizer)

        # Type of convolutional operation.
        if conv_type == 'upscale':
            output_shape = [tf.shape(inputs)[0], current_shape[1] * 2,
                            current_shape[2] * 2, output_channels]
            # Weight filter initializer.
            filter = tf.pad(filter, ([1, 1], [1, 1], [0, 0], [0, 0]), mode='CONSTANT')
            filter = tf.add_n([filter[1:, 1:], filter[:-1, 1:],
                               filter[1:, :-1], filter[:-1, :-1]])
            if spectral:
                filter = spectral_normalization(filter, power_iterations)
            strides = [1, 2, 2, 1]
            output = tf.nn.conv2d_transpose(value=inputs, filter=filter,
                                            output_shape=tf.stack(output_shape),
                                            strides=strides, padding=padding,
                                            data_format=data_format)

        elif conv_type == 'downscale':
            # Weight filter initializer.
            filter = tf.pad(filter, ([1, 1], [1, 1], [0, 0], [0, 0]), mode='CONSTANT')
            filter = tf.add_n([filter[1:, 1:], filter[:-1, 1:],
                               filter[1:, :-1], filter[:-1, :-1]])
            if spectral:
                filter = spectral_normalization(filter, power_iterations)
            strides = [1, 2, 2, 1]
            output = tf.nn.conv2d(input=inputs, filter=filter, strides=strides,
                                  padding=padding, data_format=data_format)

        elif conv_type == 'transpose':
            output_shape = [tf.shape(inputs)[0], current_shape[1] * stride,
                            current_shape[2] * stride, output_channels]
            strides = [1, stride, stride, 1]
            if spectral:
                filter = spectral_normalization(filter, power_iterations)
            output = tf.nn.conv2d_transpose(value=inputs, filter=filter,
                                            output_shape=tf.stack(output_shape),
                                            strides=strides, padding=padding,
                                            data_format=data_format)

        elif conv_type == 'convolutional':
            strides = [1, stride, stride, 1]
            if spectral:
                filter = spectral_normalization(filter, power_iterations)
            output = tf.nn.conv2d(input=inputs, filter=filter, strides=strides,
                                  padding=padding, data_format=data_format)

        output = tf.nn.bias_add(output, bias, data_format=data_format)

        if display:
            print('Conv Layer: Scope=%15s Channels %5s Filter_size=%2s Stride=%2s '
                  'Padding=%6s Conv_type=%15s Output Shape: %s' %
                  (str(scope)[:14], output_channels, filter_size, stride, padding,
                   conv_type, output.shape))
        return output
def style_loss(CNN_structure, const_layers, var_layers, content_segs, style_segs, weight):
    loss_styles = []
    layer_count = float(len(const_layers))
    layer_index = 0

    _, content_seg_height, content_seg_width, _ = content_segs[0].get_shape().as_list()
    _, style_seg_height, style_seg_width, _ = style_segs[0].get_shape().as_list()
    for layer_name in CNN_structure:
        layer_name = layer_name[layer_name.find("/") + 1:]

        # downsampling segmentation
        if "pool" in layer_name:
            content_seg_width, content_seg_height = \
                int(math.ceil(content_seg_width / 2)), int(math.ceil(content_seg_height / 2))
            style_seg_width, style_seg_height = \
                int(math.ceil(style_seg_width / 2)), int(math.ceil(style_seg_height / 2))

            for i in xrange(len(content_segs)):
                content_segs[i] = tf.image.resize_bilinear(
                    content_segs[i], tf.constant((content_seg_height, content_seg_width)))
                style_segs[i] = tf.image.resize_bilinear(
                    style_segs[i], tf.constant((style_seg_height, style_seg_width)))

        elif "conv" in layer_name:
            for i in xrange(len(content_segs)):
                # have some differences on border with torch
                content_segs[i] = tf.nn.avg_pool(
                    tf.pad(content_segs[i], [[0, 0], [1, 1], [1, 1], [0, 0]], "CONSTANT"),
                    ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1], padding='VALID')
                style_segs[i] = tf.nn.avg_pool(
                    tf.pad(style_segs[i], [[0, 0], [1, 1], [1, 1], [0, 0]], "CONSTANT"),
                    ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1], padding='VALID')

        if layer_name == var_layers[layer_index].name[var_layers[layer_index].name.find("/") + 1:]:
            print("Setting up style layer: <{}>".format(layer_name))
            const_layer = const_layers[layer_index]
            var_layer = var_layers[layer_index]

            layer_index = layer_index + 1

            layer_style_loss = 0.0
            for content_seg, style_seg in zip(content_segs, style_segs):
                gram_matrix_const = gram_matrix(tf.multiply(const_layer, style_seg))
                style_mask_mean = tf.reduce_mean(style_seg)
                gram_matrix_const = tf.cond(
                    tf.greater(style_mask_mean, 0.),
                    lambda: gram_matrix_const / (tf.to_float(tf.size(const_layer)) * style_mask_mean),
                    lambda: gram_matrix_const)

                gram_matrix_var = gram_matrix(tf.multiply(var_layer, content_seg))
                content_mask_mean = tf.reduce_mean(content_seg)
                gram_matrix_var = tf.cond(
                    tf.greater(content_mask_mean, 0.),
                    lambda: gram_matrix_var / (tf.to_float(tf.size(var_layer)) * content_mask_mean),
                    lambda: gram_matrix_var)

                diff_style_sum = tf.reduce_mean(
                    tf.squared_difference(gram_matrix_const, gram_matrix_var)) * content_mask_mean
                layer_style_loss += diff_style_sum

            loss_styles.append(layer_style_loss * weight)
    return loss_styles
def _pre_padding(self, x, kernel_size):
    """Padding based on kernel_size."""
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    if self.data_format == 'NCHW':
        x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
    else:
        x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
    return x
def build_resnet_block(inputres, dim, name="resnet"):
    with tf.variable_scope(name):
        out_res = tf.pad(inputres, [[0, 0], [1, 1], [1, 1], [0, 0]], "REFLECT")
        out_res = general_conv2d(out_res, dim, 3, 3, 1, 1, 0.02, "VALID", "c1")
        out_res = tf.pad(out_res, [[0, 0], [1, 1], [1, 1], [0, 0]], "REFLECT")
        out_res = general_conv2d(out_res, dim, 3, 3, 1, 1, 0.02, "VALID", "c2", do_relu=False)
        return tf.nn.relu(out_res + inputres)
def _provide_data(input_tensors, truncated_length, hparams):
    """Returns tensors for reading batches from provider."""
    (spec, labels, label_weights, length, onsets, filename,
     note_sequence) = input_tensors

    length = tf.to_int32(length)
    labels = tf.reshape(labels, (-1, constants.MIDI_PITCHES))
    label_weights = tf.reshape(label_weights, (-1, constants.MIDI_PITCHES))
    onsets = tf.reshape(onsets, (-1, constants.MIDI_PITCHES))
    spec = tf.reshape(spec, (-1, hparams_frame_size(hparams)))

    truncated_length = (tf.reduce_min([truncated_length, length])
                        if truncated_length else length)

    # Pad or slice specs and labels tensors to have the same lengths,
    # truncating after truncated_length.
    spec_delta = tf.shape(spec)[0] - truncated_length
    spec = tf.case(
        [(spec_delta < 0,
          lambda: tf.pad(spec, tf.stack([(0, -spec_delta), (0, 0)]))),
         (spec_delta > 0, lambda: spec[0:-spec_delta])],
        default=lambda: spec)
    labels_delta = tf.shape(labels)[0] - truncated_length
    labels = tf.case(
        [(labels_delta < 0,
          lambda: tf.pad(labels, tf.stack([(0, -labels_delta), (0, 0)]))),
         (labels_delta > 0, lambda: labels[0:-labels_delta])],
        default=lambda: labels)
    label_weights = tf.case(
        [(labels_delta < 0,
          lambda: tf.pad(label_weights, tf.stack([(0, -labels_delta), (0, 0)]))),
         (labels_delta > 0, lambda: label_weights[0:-labels_delta])],
        default=lambda: label_weights)
    onsets = tf.case(
        [(labels_delta < 0,
          lambda: tf.pad(onsets, tf.stack([(0, -labels_delta), (0, 0)]))),
         (labels_delta > 0, lambda: onsets[0:-labels_delta])],
        default=lambda: onsets)

    truncated_note_sequence = truncate_note_sequence_op(
        note_sequence, truncated_length, hparams)

    batch_tensors = {
        'spec': tf.reshape(
            spec, (truncated_length, hparams_frame_size(hparams), 1)),
        'labels': tf.reshape(labels, (truncated_length, constants.MIDI_PITCHES)),
        'label_weights': tf.reshape(
            label_weights, (truncated_length, constants.MIDI_PITCHES)),
        'lengths': truncated_length,
        'onsets': tf.reshape(onsets, (truncated_length, constants.MIDI_PITCHES)),
        'filenames': filename,
        'note_sequences': truncated_note_sequence,
    }

    return batch_tensors
def _pool_layer(self, inputs, numOut, name='pool_layer'):
    with tf.name_scope(name):
        bnr_1 = self._bn_relu(inputs)
        pool = tf.contrib.layers.max_pool2d(bnr_1, [2, 2], [2, 2], padding='VALID')
        pad_1 = tf.pad(pool, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]))
        conv_1 = self._conv(pad_1, numOut, kernel_size=3, strides=1, name='conv')
        bnr_2 = self._bn_relu(conv_1)
        pad_2 = tf.pad(bnr_2, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]))
        conv_2 = self._conv(pad_2, numOut, kernel_size=3, strides=1, name='conv')
        upsample = tf.image.resize_nearest_neighbor(conv_2, tf.shape(conv_2)[1:3] * 2,
                                                    name='upsampling')
    return upsample
def kernels_on_grid_summary(kernel, name):
    """Returns the Summary with kernel filters displayed in a single grid.
    Visualize conv. features as an image (mostly for the 1st layer).

    Args:
        kernel: tensor of shape [Y, X, NumChannels, NumKernels]
        name: the name displayed in tensorboard
    """
    # TODO: fixme
    pad = 1
    kernel_height = kernel.get_shape()[0].value + pad
    kernel_width = kernel.get_shape()[1].value + pad
    depth = kernel.get_shape()[2].value
    num_kernels = kernel.get_shape()[3].value
    num_filters = int(num_kernels / depth)

    square_side = math.ceil(math.sqrt(num_kernels))
    grid_height = square_side * kernel_height + 1
    grid_width = square_side * kernel_width + 1

    # split kernel in num_filters filters and put them into the grid
    # pad the extracted filter
    filters = tf.split(3, num_filters, kernel)
    y_pos, x_pos = 0, 0

    # list of tensors
    cells = []
    for inner_filter in filters:
        filter_3d = tf.squeeze(inner_filter, [3])
        # add padding
        padding = tf.constant([[pad, 0], [pad, 0], [0, 0]])
        filter_3d = tf.pad(filter_3d, padding)

        before_padding = tf.constant([[y_pos, 0], [x_pos, 0], [0, 0]])
        bottom_padding = grid_width - y_pos - kernel_width - 1
        right_padding = grid_height - x_pos - kernel_height - 1
        after_padding = tf.constant([[bottom_padding, 1], [right_padding, 1], [0, 0]])

        cell = tf.pad(filter_3d, before_padding)
        cells.append(tf.pad(cell, after_padding))

        if right_padding == 0:
            # move along y
            y_pos += kernel_height
            # reset x position
            x_pos = 0
        else:
            # move along x
            x_pos += kernel_height

    grid = tf.reshape(tf.add_n(cells), [1, grid_width, grid_height, depth])
    return tf.image_summary(name, grid, max_images=1)
def _pad_tensors_to_same_length(x, y):
    """Pad x and y so that the results have the same length (second dimension)."""
    with tf.name_scope("pad_to_same_length"):
        x_length = tf.shape(x)[1]
        y_length = tf.shape(y)[1]

        max_length = tf.maximum(x_length, y_length)

        x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]])
        y = tf.pad(y, [[0, 0], [0, max_length - y_length]])
        return x, y
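# A short usage sketch under assumed shapes (not from the source): x is typically
# a [batch, length, vocab] logits tensor and y a [batch, length] label tensor, so
# only the second dimension of whichever tensor is shorter gets zero-padded.
import tensorflow as tf

logits = tf.zeros([2, 5, 100])   # hypothetical logits, length 5
labels = tf.zeros([2, 7])        # hypothetical labels, length 7
padded_logits, padded_labels = _pad_tensors_to_same_length(logits, labels)
with tf.Session() as sess:
    a, b = sess.run([padded_logits, padded_labels])
print(a.shape, b.shape)  # (2, 7, 100) (2, 7)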
def testPaddingsMaximum(self):
    with self.test_session(use_gpu=True):
        with self.assertRaises(Exception):
            tf.pad(tf.constant([1], shape=[2]),
                   tf.constant([2, 0], shape=[1, 2]),
                   mode="REFLECT").eval()
        with self.assertRaises(Exception):
            tf.pad(tf.constant([1], shape=[2]),
                   tf.constant([0, 3], shape=[1, 2]),
                   mode="SYMMETRIC").eval()
def pad_pred_label(predictions, labels):
    num_digit_predictions = tf.shape(predictions)[-1]
    num_digit_labels = tf.shape(labels)[-1]

    paddings_mask = tf.constant([[0, 0], [0, 1]], dtype=labels.dtype)
    paddings = tf.cast(
        tf.fill([2, 2], tf.abs(num_digit_predictions - num_digit_labels)),
        labels.dtype)
    paddings = paddings * paddings_mask
    # paddings = tf.constant([[0, 0], [0, tf.abs(num_digit_predictions - num_digit_labels)]])

    predictions = tf.cond(num_digit_predictions < num_digit_labels,
                          lambda: tf.pad(predictions, paddings, constant_values=-1),
                          lambda: tf.identity(predictions))
    labels = tf.cond(num_digit_labels < num_digit_predictions,
                     lambda: tf.pad(labels, paddings, constant_values=-1),
                     lambda: tf.identity(labels))
    return predictions, labels
def ReflectPadding2D(x, pad=1):
    x = Lambda(lambda x: tf.pad(x, [[0, 0], [pad, pad], [pad, pad], [0, 0]],
                                mode='REFLECT'))(x)
    return x
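# A hedged usage sketch (the surrounding layer names and Keras imports are
# assumptions, not from the source): reflection-padding by 2 before a 5x5 'valid'
# convolution keeps the spatial size unchanged while avoiding the border
# artifacts of zero padding.
from keras.layers import Input, Conv2D

inp = Input(shape=(64, 64, 3))
x = ReflectPadding2D(inp, pad=2)        # -> (None, 68, 68, 3)
x = Conv2D(32, 5, padding='valid')(x)   # -> (None, 64, 64, 32)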
def shuffle_block(inputs, stride, scope=None):
    with tf.variable_scope(scope, default_name='shuffle_block'):
        if stride > 1:  # when stride == 2
            left_inputs, right_inputs = inputs, inputs
        else:
            left_inputs, right_inputs = tf.split(inputs, 2, axis=3)

        # right branch
        right_outputs = slim.conv2d(right_inputs, right_inputs.shape[3], 1, stride=1,
                                    data_format='NHWC', scope='point_conv1')
        kernel_size = 3
        if stride > 1:
            pad_wh = math.floor(kernel_size / 2)
            right_outputs = tf.pad(
                right_outputs,
                [[0, 0], [pad_wh, pad_wh], [pad_wh, pad_wh], [0, 0]])
            right_outputs = slim.separable_conv2d(right_outputs, None, kernel_size, 1,
                                                  activation_fn=None, stride=stride,
                                                  padding='VALID', scope='depth_conv')
        else:
            right_outputs = slim.separable_conv2d(right_outputs, None, kernel_size, 1,
                                                  activation_fn=None, stride=stride,
                                                  padding='SAME', scope='depth_conv')
        right_outputs = slim.conv2d(right_outputs, right_outputs.shape[3], 1, stride=1,
                                    data_format='NHWC', scope='point_conv2')

        # left branch
        if stride > 1:
            pad_wh = math.floor(kernel_size / 2)
            left_outputs = tf.pad(
                left_inputs,
                [[0, 0], [pad_wh, pad_wh], [pad_wh, pad_wh], [0, 0]])
            left_outputs = slim.separable_conv2d(left_outputs, None, kernel_size, 1,
                                                 activation_fn=None, stride=stride,
                                                 padding='VALID', scope='depth_conv_left')
            left_outputs = slim.conv2d(left_outputs, right_outputs.shape[3], 1, stride=1,
                                       data_format='NHWC', scope='point_conv_left')
        else:
            left_outputs = left_inputs

        # shuffle
        outputs = tf.stack([left_outputs, right_outputs], axis=4, name='output')
        output_shape = outputs.shape
        outputs = tf.reshape(
            outputs,
            [-1, output_shape[1], output_shape[2], 2 * output_shape[3].value])
        return outputs
def _build_graph(self, inputs):
    xys = np.array([(y, x, 1) for y in range(WARP_TARGET_SIZE)
                    for x in range(WARP_TARGET_SIZE)], dtype='float32')
    xys = tf.constant(xys, dtype=tf.float32, name='xys')    # p x 3

    image, label = inputs

    image = image / 255.0 - 0.5  # bhw2

    def get_stn(image):
        stn = (LinearWrap(image)
               .AvgPooling('downsample', 2)
               .Conv2D('conv0', 20, 5, padding='VALID')
               .MaxPooling('pool0', 2)
               .Conv2D('conv1', 20, 5, padding='VALID')
               .FullyConnected('fc1', 32)
               .FullyConnected('fct', 6, activation=tf.identity,
                               kernel_initializer=tf.constant_initializer(),
                               bias_initializer=tf.constant_initializer(
                                   [1, 0, HALF_DIFF, 0, 1, HALF_DIFF]))())
        # output 6 parameters for affine transformation
        stn = tf.reshape(stn, [-1, 2, 3], name='affine')  # bx2x3
        stn = tf.reshape(tf.transpose(stn, [2, 0, 1]), [3, -1])  # 3 x (bx2)
        coor = tf.reshape(tf.matmul(xys, stn),
                          [WARP_TARGET_SIZE, WARP_TARGET_SIZE, -1, 2])
        coor = tf.transpose(coor, [2, 0, 1, 3], 'sampled_coords')  # b h w 2
        sampled = ImageSample('warp', [image, coor], borderMode='constant')
        return sampled

    with argscope([Conv2D, FullyConnected], activation=tf.nn.relu):
        with tf.variable_scope('STN1'):
            sampled1 = get_stn(image)
        with tf.variable_scope('STN2'):
            sampled2 = get_stn(image)

    # For visualization in tensorboard
    with tf.name_scope('visualization'):
        padded1 = tf.pad(sampled1, [[0, 0], [HALF_DIFF, HALF_DIFF], [HALF_DIFF, HALF_DIFF], [0, 0]])
        padded2 = tf.pad(sampled2, [[0, 0], [HALF_DIFF, HALF_DIFF], [HALF_DIFF, HALF_DIFF], [0, 0]])
        img_orig = tf.concat([image[:, :, :, 0], image[:, :, :, 1]], 1)  # b x 2h x w
        transform1 = tf.concat([padded1[:, :, :, 0], padded1[:, :, :, 1]], 1)
        transform2 = tf.concat([padded2[:, :, :, 0], padded2[:, :, :, 1]], 1)
        stacked = tf.concat([img_orig, transform1, transform2], 2, 'viz')
        tf.summary.image('visualize',
                         tf.expand_dims(stacked, -1), max_outputs=30)

    sampled = tf.concat([sampled1, sampled2], 3, 'sampled_concat')
    logits = (LinearWrap(sampled)
              .FullyConnected('fc1', 256, activation=tf.nn.relu)
              .FullyConnected('fc2', 128, activation=tf.nn.relu)
              .FullyConnected('fct', 19, activation=tf.identity)())
    tf.nn.softmax(logits, name='prob')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    wrong = tf.to_float(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), name='incorrect_vector')
    summary.add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                          name='regularize_loss')
    summary.add_moving_summary(cost, wd_cost)
    self.cost = tf.add_n([wd_cost, cost], name='cost')
def load(self, ckpt_path, hparams, master='local',
         batch_timeout_micros=80 * 1000, buckets=None):
    self.hparams = hparams
    self.buckets = buckets
    self.tpu_graph = tf.Graph()
    tpu_config = tf.ConfigProto(
        operation_timeout_in_ms=600 * 1000,
        allow_soft_placement=True,
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)),
        isolate_session_state=True)
    # Find tpu master.
    print('master value set to:', master)
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        master, zone=None, project=None)
    master = tpu_cluster_resolver.get_master()
    self.sess = tf.Session(master, graph=self.tpu_graph, config=tpu_config)
    with self.tpu_graph.as_default():
        self.vocab_table = tf.contrib.lookup.index_to_string_table_from_file(
            self.vocab_prefix, default_value=vocab_utils.UNK)

    if self.scenario == 'Offline':
        with self.tpu_graph.as_default():
            self.source = tf.placeholder(
                shape=(hparams.infer_batch_size, hparams.src_max_len_infer),
                dtype=tf.int32)
            self.source_sequence_length = tf.placeholder(
                shape=(hparams.infer_batch_size), dtype=tf.int32)

            inputs = [[self.source, self.source_sequence_length]]
            self.predict_ops.append(self.offline_op(inputs))
    else:
        with self.tpu_graph.as_default():
            self.source = tf.placeholder(
                shape=[None, hparams.src_max_len_infer], dtype=tf.int32)
            self.source_sequence_length = tf.placeholder(shape=[None], dtype=tf.int32)
            inputs = [self.source, self.source_sequence_length]
            for _ in buckets:
                self.predict_ops.append(
                    self.server_op(
                        inputs,
                        num_batch_threads=16,
                        max_batch_size=hparams.infer_batch_size,
                        batch_timeout_micros=batch_timeout_micros,
                        allowed_batch_sizes=[hparams.infer_batch_size],
                        max_enqueued_batches=10000))
            # Add longest sequence predict op.
            self.predict_ops.append(
                self.server_op(
                    inputs,
                    num_batch_threads=16,
                    max_batch_size=hparams.infer_batch_size,
                    batch_timeout_micros=5000 * 1000,
                    allowed_batch_sizes=[hparams.infer_batch_size],
                    max_enqueued_batches=10000))

    with self.tpu_graph.as_default():
        vs = tf.global_variables()

        assign_ops = []
        var_map = {}
        with tf.variable_scope('f32', dtype=tf.float32):
            for i in vs:
                if 'output_projection' in i.name:
                    new_var = tf.get_variable(
                        i.name[:-2], [i.shape[0], hparams.tgt_vocab_size])
                    assign_ops.append(
                        tf.assign(
                            i,
                            tf.pad(
                                tf.cast(new_var, i.dtype),
                                [[0, 0],
                                 [0, 128 * (hparams.tgt_vocab_size // 128 + 1)
                                  - hparams.tgt_vocab_size]])))
                else:
                    new_var = tf.get_variable(i.name[:-2], i.shape)
                    assign_ops.append(tf.assign(i, tf.cast(new_var, i.dtype)))
                var_map[i.name[:-2]] = new_var.name[:-2]

        self.sess.run(tpu.initialize_system())
        tf.train.init_from_checkpoint(ckpt_path, var_map)
        self.sess.run(tf.initializers.global_variables())
        self.sess.run(tf.tables_initializer())
        self.sess.run(assign_ops)

    return self
def encoder(self, inputs, n_layers=3):
    """Convnet that encodes inputs into mean and std of a gaussian.

    Args:
        inputs: 5-D Tensor, shape (batch_size, num_frames, width, height, channels)
        n_layers: Number of layers.

    Returns:
        z_mu: Mean of the latent gaussians.
        z_log_var: log(var) of the latent gaussians.

    Raises:
        ValueError: If inputs is not a 5-D tensor or not float32.
    """
    latent_dims = self.hparams.z_dim

    shape_as_list = inputs.shape.as_list()
    if len(shape_as_list) != 5:
        raise ValueError("Expected inputs to be a 5-D, got %d" %
                         len(shape_as_list))
    if inputs.dtype != tf.float32:
        raise ValueError("Expected dtype tf.float32, got %s" % inputs.dtype)

    # Flatten (N,T,W,H,C) into (NT,W,H,C)
    batch_size, _ = shape_as_list[:2]
    inputs = tf.reshape(inputs, [-1] + list(inputs.shape)[2:])

    n_filters = 64
    rectified = None

    # Applies 3 layer conv-net with padding, instance normalization
    # and leaky relu as per the encoder in
    # https://github.com/alexlee-gk/video_prediction
    padding = [[0, 0], [1, 1], [1, 1], [0, 0]]
    for i in range(n_layers):
        with tf.variable_scope("layer_%d" % (i + 1)):
            n_filters *= 2**i
            if i:
                padded = tf.pad(rectified, padding)
            else:
                padded = tf.pad(inputs, padding)
            convolved = tf.layers.conv2d(padded, filters=n_filters, kernel_size=4,
                                         strides=2, padding="VALID")
            normalized = tf.contrib.layers.instance_norm(convolved)
            rectified = tf.nn.leaky_relu(normalized, alpha=0.2)

    # Mean pooling across all spatial dimensions.
    pooled = tf.nn.avg_pool(
        rectified, [1] + rectified.shape[1:3].as_list() + [1],
        strides=[1, 1, 1, 1], padding="VALID")
    squeezed = tf.squeeze(pooled, [1, 2])

    # Down-project and output the mean and log of the standard deviation of
    # the latents.
    with tf.variable_scope("z_mu"):
        z_mu = tf.layers.dense(squeezed, latent_dims)
    with tf.variable_scope("z_log_sigma_sq"):
        z_log_var = tf.layers.dense(squeezed, latent_dims)
        z_log_var = tf.clip_by_value(z_log_var, -10, 10)

    # Reshape to (batch_size X num_frames X latent_dims)
    z_mu = tf.reshape(z_mu, (batch_size, -1, latent_dims))
    z_log_var = tf.reshape(z_log_var, (batch_size, -1, latent_dims))
    return z_mu, z_log_var
def discriminator(inputdisc, name="discriminator"):
    with tf.variable_scope(name):
        f = 4
        padw = 2

        pad_input = tf.pad(inputdisc, [[0, 0], [padw, padw], [padw, padw], [0, 0]], "CONSTANT")
        o_c1 = layers.general_conv2d(pad_input, ndf, f, f, 2, 2, 0.02, "VALID", "c1",
                                     do_norm=False, relufactor=0.2)

        pad_o_c1 = tf.pad(o_c1, [[0, 0], [padw, padw], [padw, padw], [0, 0]], "CONSTANT")
        o_c2 = layers.general_conv2d(pad_o_c1, ndf * 2, f, f, 2, 2, 0.02, "VALID", "c2",
                                     relufactor=0.2)

        pad_o_c2 = tf.pad(o_c2, [[0, 0], [padw, padw], [padw, padw], [0, 0]], "CONSTANT")
        o_c3 = layers.general_conv2d(pad_o_c2, ndf * 4, f, f, 2, 2, 0.02, "VALID", "c3",
                                     relufactor=0.2)

        pad_o_c3 = tf.pad(o_c3, [[0, 0], [padw, padw], [padw, padw], [0, 0]], "CONSTANT")
        o_c4 = layers.general_conv2d(pad_o_c3, ndf * 8, f, f, 1, 1, 0.02, "VALID", "c4",
                                     relufactor=0.2)

        pad_o_c4 = tf.pad(o_c4, [[0, 0], [padw, padw], [padw, padw], [0, 0]], "CONSTANT")
        o_c5 = layers.general_conv2d(pad_o_c4, 1, f, f, 1, 1, 0.02, "VALID", "c5",
                                     do_norm=False, do_relu=False)

        return o_c5
def pad_fn():
    return tf.pad(
        tensor=serialized_list,
        paddings=[[0, 0], [0, list_size - cur_list_size]],
        constant_values="")
def call(self, input_tensor, mask=None):
    padding_width, padding_height = self.padding
    return tf.pad(input_tensor,
                  [[0, 0], [padding_height, padding_height],
                   [padding_width, padding_width], [0, 0]],
                  'REFLECT')
def __init__(self, m, n, dim, n_iterations=100, alpha=None, sigma=None,
             gamma=None, sparsity=None):
    """
    Initializes all necessary components of the TensorFlow Graph.

    m X n are the dimensions of the SOM. 'n_iterations' should be an
    integer denoting the number of iterations undergone while training.
    'dim' is the dimensionality of the training inputs.
    'gamma' is the edge length of the hypercube defining each neuron's
    receptive field. A neuron's centroid determines the centroid of its
    receptive field.
    'alpha' is a number denoting the initial time (iteration number)-based
    learning rate. Default value is 0.3.
    'sigma' is the initial neighbourhood value, denoting the radius of
    influence of the BMU while training. By default, it is taken to be half
    of max(m, n).
    """

    # Assign required variables first
    self._m = m
    self._n = n
    if sparsity is None:
        sparsity = 0.02
    else:
        sparsity = float(sparsity)
    if alpha is None:
        alpha = 0.3
    else:
        alpha = float(alpha)
    if sigma is None:
        sigma = max(m, n) / 2.0
    else:
        sigma = float(sigma)
    if gamma is None:
        gamma = 5
    else:
        gamma = float(gamma)
    self._n_iterations = abs(int(n_iterations))

    ##INITIALIZE GRAPH
    self._graph = tf.Graph()

    ##POPULATE GRAPH WITH NECESSARY COMPONENTS
    with self._graph.as_default():
        ##VARIABLES AND CONSTANT OPS FOR DATA STORAGE

        # Randomly initialized weightage vectors for all neurons,
        # stored together as a matrix Variable of size [m*n, dim]
        self._weightage_vects = tf.Variable(tf.random_normal([m * n, dim]))

        # List of distances from each centroid to the input vector
        # self._input_neuron_dist = tf.Variable(m*n)

        # Matrix of size [m*n, 2] for SOM grid locations of neurons
        self._location_vects = tf.constant(np.array(
            list(self._neuron_locations(m, n))))

        # Constant list containing gamma/2 used to mask a tensor later on.
        self._gamma_mask = tf.constant(gamma / 2, shape=[m * n])

        ##PLACEHOLDERS FOR TRAINING INPUTS
        # We need to assign them as attributes to self, since they
        # will be fed in during training

        # The training vector
        self._vect_input = tf.placeholder("float", [dim])
        # Iteration number
        self._iter_input = tf.placeholder("float")

        ##CONSTRUCT TRAINING OP PIECE BY PIECE
        # Only the final, 'root' training op needs to be assigned as
        # an attribute to self, since all the rest will be executed
        # automatically during training

        # To compute the Best Matching Units given a vector, find all neurons
        # that have the input in their receptive fields, then calculate the
        # distance to the input from those neurons. The number of neurons made
        # active is such that 2% of all neurons are active for each input;
        # the ops below return the indices of those neurons.
        vector_differences = tf.norm(tf.subtract(
            self._weightage_vects,
            tf.stack([self._vect_input for i in range(m * n)])), axis=1)
        # tf.print(vector_differences, [vector_differences], "Distances from"
        #          "centroid to input: ")

        # Used to filter out neurons that do not have
        # the input vector in their receptive field
        mask = tf.less(vector_differences, self._gamma_mask / 2)
        # Contains the neurons that have the input in their receptive fields
        containing_input = tf.boolean_mask(vector_differences, mask)
        # Top sparsity*100% of candidates
        activation_distance = tf.contrib.distributions.percentile(containing_input, sparsity)
        # Broadcast the activation distance so it can be compared with every neuron
        winners_mask = tf.fill([m * n], activation_distance)
        # Boolean list of all neurons with Euclidean distance
        # less than activation_distance
        global_mask = tf.less_equal(vector_differences, winners_mask)
        bmu_indeces = tf.where(global_mask)

        # This will extract the locations of the BMUs based on the BMUs indeces
        slice_input = tf.pad(tf.reshape(bmu_indeces, [1]), np.array([[0, 1]]))
        bmu_loc = tf.reshape(tf.slice(self._location_vects, slice_input,
                                      tf.constant(np.array([1, 2]))), [2])

        # To compute the alpha and sigma values based on iteration number
        learning_rate_op = tf.subtract(1.0, tf.div(self._iter_input,
                                                   self._n_iterations))
        _alpha_op = tf.multiply(alpha, learning_rate_op)
        _sigma_op = tf.multiply(sigma, learning_rate_op)

        # Construct the op that will generate a vector with learning
        # rates for all neurons, based on iteration number and location
        # wrt BMU.
        bmu_distance_squares = tf.reduce_sum(tf.pow(tf.subtract(
            self._location_vects,
            tf.stack([bmu_loc for i in range(m * n)])), 2), 1)
        neighbourhood_func = tf.exp(tf.negative(tf.div(tf.cast(
            bmu_distance_squares, "float32"), tf.pow(_sigma_op, 2))))
        learning_rate_op = tf.multiply(_alpha_op, neighbourhood_func)

        # Finally, the op that will use learning_rate_op to update
        # the weightage vectors of all neurons based on a particular input
        learning_rate_multiplier = tf.stack(
            [tf.tile(tf.slice(learning_rate_op, np.array([i]), np.array([1])),
                     [dim]) for i in range(m * n)])
        weightage_delta = tf.multiply(
            learning_rate_multiplier,
            tf.subtract(tf.stack([self._vect_input for i in range(m * n)]),
                        self._weightage_vects))
        new_weightages_op = tf.add(self._weightage_vects, weightage_delta)
        self._training_op = tf.assign(self._weightage_vects, new_weightages_op)

        ##INITIALIZE SESSION
        self._sess = tf.Session()

        ##INITIALIZE VARIABLES
        init_op = tf.global_variables_initializer()
        self._sess.run(init_op)
def deconv(self, x, num_out_layers, kernel_size, scale): p_x = tf.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]]) conv = slim.conv2d_transpose(p_x, num_out_layers, kernel_size, scale, 'SAME') return conv[:, 3:-1, 3:-1, :]
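# Shape sketch for the explicit-pad deconv above, assuming scale=2 and the 'SAME'
# transposed conv shown: H -> H+2 after padding, -> 2*(H+2) after upsampling, and
# the [3:-1] crop trims the 4 surplus rows/cols back to exactly 2*H. `net` is a
# hypothetical instance of the class that owns this method.
x = tf.placeholder(tf.float32, [1, 64, 64, 32])
up = net.deconv(x, num_out_layers=16, kernel_size=3, scale=2)
print(up.get_shape().as_list())   # expected: [1, 128, 128, 16]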
def maxpool(self, x, kernel_size): p = np.floor((kernel_size - 1) / 2).astype(np.int32) p_x = tf.pad(x, [[0, 0], [p, p], [p, p], [0, 0]]) return slim.max_pool2d(p_x, kernel_size)
def cnnmodel(frame1_xyz, frame1_rgb, frame2_xyz, frame2_rgb): frame1_feat_rgb, _ = get_network('resnet50', frame1_rgb, weight_decay=1e-5, is_training=True) frame2_feat_rgb, _ = get_network('resnet50', frame2_rgb, weight_decay=1e-5, is_training=True, reuse=True) frame1_feat = encoder(frame1_xyz) frame2_feat = encoder(frame2_xyz, reuse=True) cc_o = correlation(frame2_feat_rgb, frame1_feat_rgb, 1, rad, 1, 1, rad) cc = tf.reshape(cc_o, [-1, 30 * 40, dia * dia, 1]) cc_weight = tf.nn.relu(cc) frame1_feat_o = frame1_feat frame1_feat = tf.transpose(frame1_feat, [0, 3, 1, 2]) frame1_feat_padded = tf.pad(frame1_feat, paddings=[[0, 0], [0, 0], [rad, rad], [rad, rad]]) frame1_list = [] for i in xrange(30): for j in xrange(40): tmp = frame1_feat_padded[:, :, 0 + i:2 * rad + 1 + i, 0 + j:2 * rad + 1 + j] tmp = tf.reshape(tmp, [-1, 64, dia * dia]) frame1_list.append(tmp) frame1_list = tf.stack(frame1_list, axis=2) frame1_list = tf.transpose(frame1_list, [0, 2, 3, 1]) frame1_list = frame1_list * cc_weight frame1_list = tf.nn.max_pool(frame1_list, ksize=[1, 1, dia * dia, 1], strides=[1, 1, dia * dia, 1], padding='VALID') frame1_list = tf.reshape(frame1_list, (-1, 30, 40, 64)) x = tf.concat([frame2_feat, frame1_feat_o, frame1_list], 3) x_s = decoder(x) x_transl = tflearn.layers.conv.conv_2d(x_s, 3, (3, 3), strides=1, activation='linear', weight_decay=1e-3, regularizer='L2') rot_quaternion = tflearn.layers.conv.conv_2d(x_s, 4, (3, 3), strides=1, activation='linear', weight_decay=1e-3, regularizer='L2') ### quaternion normalize quaternion_norm = tf.norm(rot_quaternion, axis=3) * tf.sign( rot_quaternion[:, :, :, 0]) quaternion_norm = tf.expand_dims(quaternion_norm, -1) x_quaternion = rot_quaternion / (quaternion_norm + 0.000001) w1, x1, y1, z1 = tf.unstack(x_quaternion, axis=-1) x2, y2, z2 = tf.unstack(frame2_xyz, axis=-1) wm = -x1 * x2 - y1 * y2 - z1 * z2 xm = w1 * x2 + y1 * z2 - z1 * y2 ym = w1 * y2 + z1 * x2 - x1 * z2 zm = w1 * z2 + x1 * y2 - y1 * x2 x = -wm * x1 + xm * w1 - ym * z1 + zm * y1 y = -wm * y1 + ym * w1 - zm * x1 + xm * z1 z = -wm * z1 + zm * w1 - xm * y1 + ym * x1 x_flow = tf.stack((x, y, z), axis=-1) x_flow = x_flow + x_transl - frame2_xyz x_center = tflearn.layers.conv.conv_2d(x_s, 3, (3, 3), strides=1, activation='linear', weight_decay=1e-3, regularizer='L2') x_score = tflearn.layers.conv.conv_2d(x_s, 2, (3, 3), strides=1, activation='linear', weight_decay=1e-3, regularizer='L2') x_mask = tflearn.layers.conv.conv_2d(x_s, 2, (3, 3), strides=1, activation='linear', weight_decay=1e-3, regularizer='L2') x_boundary = tflearn.layers.conv.conv_2d(x_s, 2, (3, 3), strides=1, activation='linear', weight_decay=1e-3, regularizer='L2') x_center = tf.add(x_center, frame2_xyz) xc, yc, zc = tf.unstack(x_center, axis=-1) wmc = -x1 * xc - y1 * yc - z1 * zc xmc = w1 * xc + y1 * zc - z1 * yc ymc = w1 * yc + z1 * xc - x1 * zc zmc = w1 * zc + x1 * yc - y1 * xc xc = -wmc * x1 + xmc * w1 - ymc * z1 + zmc * y1 yc = -wmc * y1 + ymc * w1 - zmc * x1 + xmc * z1 zc = -wmc * z1 + zmc * w1 - xmc * y1 + ymc * x1 x_center_p = tf.stack((xc, yc, zc), axis=-1) + x_transl x_traj = tf.concat([x_center, x_center_p], 3) return x_quaternion, x_transl, x_traj, x_flow, x_center, x_mask, x_score, x_boundary
def __call__(self, image, reuse=None): with tf.variable_scope(self.name, reuse=reuse): act = tf.nn.relu kwargs_downsample = { "kernel_size": (4, 4), "strides": (4, 4), "padding": "valid" } # image is 256x256x3 image = tf.layers.conv2d(image, filters=128, **kwargs_downsample, activation=act) # image is 64x64x128 image = tf.layers.conv2d(image, filters=256, **kwargs_downsample, activation=act) # image is 16x16x256 image = tf.layers.conv2d(image, filters=512, **kwargs_downsample, activation=act) # -------------- image is 4x4x512 pad = [[0, 0], [2, 2], [2, 2], [0, 0]] kwargs_upsample = { "kernel_size": (5, 5), "strides": (1, 1), "padding": "valid" } res_met = tf.image.ResizeMethod.NEAREST_NEIGHBOR image = tf.pad(image, pad, mode="SYMMETRIC") image = tf.layers.conv2d(image, filters=512, **kwargs_upsample, activation=act) image = tf.image.resize_images(image, (16, 16), method=res_met) # image is 16x16x512 image = tf.pad(image, pad, mode="SYMMETRIC") image = tf.layers.conv2d(image, filters=256, **kwargs_upsample, activation=act) image = tf.image.resize_images(image, (64, 64), method=res_met) # image is 64x64x256 image = tf.pad(image, pad, mode="SYMMETRIC") image = tf.layers.conv2d(image, filters=128, **kwargs_upsample, activation=act) image = tf.image.resize_images(image, (256, 256), method=res_met) # image is 256x256x128 image = tf.pad(image, pad, mode="SYMMETRIC") image = tf.layers.conv2d(image, filters=3, activation=tf.nn.sigmoid, **kwargs_upsample) # image is 256x256x3 return image
def __init__(self, data, training=False): self.data = data self.initializer = tf.orthogonal_initializer() q_mask = tf.sequence_mask(self.data.ql, maxlen=25) # (1, L_q) s_mask = tf.sequence_mask(self.data.sl, maxlen=29) # (N, L_s) a_mask = tf.sequence_mask(self.data.al, maxlen=34) # (5, L_a) with tf.variable_scope('Embedding'): self.embedding = tf.get_variable('embedding_matrix', initializer=np.load( _mp.embedding_file), trainable=False) self.ques = tf.nn.embedding_lookup(self.embedding, self.data.ques) # (1, L_q, E) self.ans = tf.nn.embedding_lookup(self.embedding, self.data.ans) # (5, L_a, E) self.subt = tf.nn.embedding_lookup(self.embedding, self.data.subt) # (N, L_s, E) # self.ques = tf.layers.dropout(self.ques, hp['dropout_rate'], training=training) # (1, L_q, E) # self.ans = tf.layers.dropout(self.ans, hp['dropout_rate'], training=training) # (5, L_a, E) # self.subt = tf.layers.dropout(self.subt, hp['dropout_rate'], training=training) # (N, L_s, E) with tf.variable_scope('Embedding_Linear'): self.ques_embedding = tf.layers.dense( self.ques, hp['emb_dim'], use_bias=False, kernel_initializer=self.initializer) # (1, L_q, E_t) self.ans_embedding = tf.layers.dense(self.ans, hp['emb_dim'], use_bias=False, reuse=True) # (5, L_a, E_t) self.subt_embedding = tf.layers.dense( self.subt, hp['emb_dim'], use_bias=False, reuse=True, ) # (N, L_s, E_t) with tf.variable_scope('Language_Encode'): position_attn = tf.get_variable( 'position_attention', shape=[hp['pos_len'], hp['emb_dim']], initializer=self.initializer, trainable=False) ques_pos, _ = tf.split(position_attn, [25, hp['pos_len'] - 25]) ans_pos, _ = tf.split(position_attn, [34, hp['pos_len'] - 34]) subt_pos, _ = tf.split(position_attn, [29, hp['pos_len'] - 29]) q_qa_enc, a_qa_enc = language_encode(self.ques, self.ans, self.data.ql, self.data.al, ques_pos, ans_pos) q_qs_enc, s_qs_enc = language_encode(self.ques, self.subt, self.data.ql, self.data.sl, ques_pos, subt_pos) a_as_enc, s_as_enc = language_encode(self.ans, self.subt, self.data.al, self.data.sl, ans_pos, subt_pos) self.ques_enc = tf.layers.dense( tf.concat([q_qa_enc, q_qs_enc], axis=-1), hp['feat_dim'], kernel_initializer=self.initializer, activation=tf.nn.tanh) # (1, L_q, 2 * E_t) self.ans_enc = tf.layers.dense( tf.concat([a_qa_enc, a_as_enc], axis=-1), hp['feat_dim'], kernel_initializer=self.initializer, activation=tf.nn.tanh) # (5, L_a, 2 * E_t) self.subt_enc = tf.layers.dense( tf.concat([s_qs_enc, s_as_enc], axis=-1), hp['feat_dim'], kernel_initializer=self.initializer, activation=tf.nn.tanh) # (N, L_s, 2 * E_t) # # self.ques_enc = tf.layers.dense(self.ques_enc, hp['feat_dim']) # (1, L_q, 2 * E_t) # self.ans_enc = tf.layers.dense(self.ques_enc, hp['feat_dim']) # (5, L_a, 2 * E_t) # self.subt_enc = tf.layers.dense(self.ques_enc, hp['feat_dim']) # (N, L_s, 2 * E_t) # # self.m_subt = tf.layers.dense( # self.subt_enc, hp['feat_dim'], use_bias=False, name='encode_transform') # (N, F_t) # self.m_ques = tf.layers.dense( # self.ques_enc, hp['feat_dim'], use_bias=False, reuse=True, name='encode_transform') # (1, F_t) # self.m_ans = tf.layers.dense( # self.ans_enc, hp['feat_dim'], use_bias=False, reuse=True, name='encode_transform') # (5, F_t) # # self.m_subt = tf.layers.dropout(self.m_subt, hp['dropout_rate'], training=training) # self.m_ques = tf.layers.dropout(self.m_ques, hp['dropout_rate'], training=training) # self.m_ans = tf.layers.dropout(self.m_ans, hp['dropout_rate'], training=training) # t_shape = tf.shape(self.subt_enc) split_num = tf.cast(tf.ceil(t_shape[0] / 5), dtype=tf.int32) 
pad_num = split_num * 5 - t_shape[0] paddings = tf.convert_to_tensor([[0, pad_num], [0, 0]]) with tf.variable_scope('Memory_Block'): self.mem_feat = tf.pad(self.subt_enc, paddings) self.mem_block = tf.reshape(self.mem_feat, [split_num, 5, hp['feat_dim']]) self.mem_node = tf.reduce_mean(self.mem_block, axis=1) self.mem_opt = tf.layers.dense(self.mem_node, hp['feat_dim'], activation=tf.nn.tanh, kernel_initializer=self.initializer) self.mem_direct = tf.matmul( self.mem_node, self.mem_opt, transpose_b=True) / (hp['feat_dim']**0.5) self.mem_fw_direct = tf.nn.softmax(self.mem_direct) self.mem_bw_direct = tf.nn.softmax(self.mem_direct, axis=0) self.mem_self = tf.matmul(self.mem_fw_direct, self.mem_node) + tf.matmul( self.mem_bw_direct, self.mem_node) self.mem_attn = tf.nn.softmax( tf.matmul(self.mem_self, self.ques_enc, transpose_b=True)) self.mem_output = tf.reduce_sum(self.mem_self * self.mem_attn, axis=0) self.output = tf.reduce_sum(self.mem_output * self.ans_enc, axis=1)
def project_mesh(self, args): height = args['height'] orientation = args['orientation'] # Adjust our height and orientation if 'mesh' in self.variants: v = self.variants['mesh'] if 'height' in v: height = height + tf.truncated_normal( shape=(), mean=v['height']['mean'], stddev=v['height']['stddev'], ) if 'rotation' in v: # Make 3 random euler angles rotation = tf.truncated_normal( shape=[3], mean=v['rotation']['mean'], stddev=v['rotation']['stddev'], ) # Cos and sin for everyone! ca = tf.cos(rotation[0]) sa = tf.sin(rotation[0]) cb = tf.cos(rotation[1]) sb = tf.sin(rotation[1]) cc = tf.cos(rotation[2]) sc = tf.sin(rotation[2]) # Convert these into a rotation matrix rot = [cc*ca, -cc*sa*cb + sc*sb, cc*sa*sb + sc*cb, sa, ca*cb, -ca * sb, -sc*ca, sc*sa*cb + cc*sb, -sc*sa*sb + cc*cb] # yapf: disable rot = tf.reshape(tf.stack(rot), [3, 3]) # Apply the rotation orientation = tf.matmul(rot, orientation) # Run the visual mesh to get our values pixels, neighbours = VisualMesh( tf.shape(args['image']), args['projection'], args['focal_length'], args['fov'], orientation, height, self.geometry, self.geometry_params, name='ProjectVisualMesh', ) # Round to integer pixels # TODO one day someone could do linear interpolation here, like what happens in the OpenCL version pixels = tf.cast(tf.round(pixels), dtype=tf.int32) # Select the points in the network and discard the old dictionary data # We pad one extra point at the end for the offscreen point return { 'X': tf.pad(tf.gather_nd(args['image'], pixels), [[0, 1], [0, 0]]), 'Y': tf.pad(tf.gather_nd(args['mask'], pixels), [[0, 1], [0, 0]]), 'G': neighbours, 'px': pixels, 'raw': args['raw'], }
def _symmetric_pad(i, x): paddings_i = tf.map_fn(lambda e: tf.where(i < e, 1, 0), paddings) paddings_i = tf.reshape(paddings_i, [num_dim, 2]) x = tf.pad(x, paddings_i, 'SYMMETRIC') return i + 1, x
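# A sketch of driving the loop body above with tf.while_loop, assuming that
# `paddings` (int32, shape [num_dim, 2]) and `num_dim` are visible to the body
# from the enclosing scope, that the TF version in use lets tf.where broadcast
# the scalar branch values, and that the per-side amounts may exceed the
# tensor's size (hence padding one element at a time). All names and values
# below are illustrative assumptions, not part of the original code.
num_dim = 4
paddings = tf.constant([[0, 0], [3, 3], [3, 3], [0, 0]], dtype=tf.int32)
x = tf.random_uniform([1, 2, 2, 1])
i0 = tf.constant(0)

_, x_padded = tf.while_loop(
    cond=lambda i, _: i < tf.reduce_max(paddings),
    body=_symmetric_pad,
    loop_vars=(i0, x),
    shape_invariants=(i0.get_shape(),
                      tf.TensorShape([None, None, None, None])))
# With the settings above, x_padded has spatial size 2 + 2*3 = 8 in dims 1 and 2.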
def __init__( self, sequence_length, vocab_size, embedding_type, embedding_size, filter_sizes, num_filters, fc_hidden_size, num_classes, l2_reg_lambda=0.0, pretrained_embedding=None): # Placeholders for input, output, dropout_prob and training_tag self.input_x_front = tf.placeholder(tf.int32, [None, sequence_length], name="input_x_front") self.input_x_behind = tf.placeholder(tf.int32, [None, sequence_length], name="input_x_behind") self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") self.is_training = tf.placeholder(tf.bool, name="is_training") self.global_step = tf.Variable(0, trainable=False, name="Global_Step") def _cos_sim(input_x1, input_x2): norm1 = tf.sqrt(tf.reduce_sum(tf.square(input_x1), axis=1)) norm2 = tf.sqrt(tf.reduce_sum(tf.square(input_x2), axis=1)) dot_products = tf.reduce_sum(input_x1 * input_x2, axis=1, name="cos_sim") return dot_products / (norm1 * norm2) def _make_attention_mat(input_x1, input_x2): # shape of `input_x1` and `input_x2`: [batch_size, embedding_size, sequence_length, 1] # input_x2 needs to be transposed to [batch_size, embedding_size, 1, sequence_length] # shape of output: [batch_size, sequence_length, sequence_length] dist = tf.reduce_sum(tf.square(input_x1 - tf.matrix_transpose(input_x2)), axis=1) euclidean = tf.sqrt(tf.maximum(dist, 1e-10)) return 1.0 / (1.0 + euclidean) def _w_pool(input_x, attention, filter_size, scope): # input_x: [batch_size, num_filters, sequence_length + filter_size - 1, 1] # attention: [batch_size, sequence_length + filter_size - 1] pools = [] # [batch_size, 1, sequence_length + filter_size - 1, 1] attention = tf.transpose(tf.expand_dims(tf.expand_dims(attention, axis=-1), axis=-1), perm=[0, 2, 1, 3]) for i in range(sequence_length): # [batch_size, num_filters, filter_size, 1] # reduce_sum => [batch_size, num_filters, 1, 1] pools.append( tf.reduce_sum(input_x[:, :, i:i + filter_size, :] * attention[:, :, i:i + filter_size, :], axis=2, keepdims=True)) # [batch_size, num_filters, sequence_length, 1] w_ap = tf.concat(pools, axis=2, name="w_ap_" + scope) return w_ap def _all_pool(input_x, filter_size, scope): # input_x: [batch_size, num_filters, sequence_length + filter_size -1, 1] all_ap = tf.nn.avg_pool( input_x, ksize=[1, 1, sequence_length + filter_size - 1, 1], strides=[1, 1, 1, 1], padding="VALID", name="all_pool_" + scope ) all_ap_reshaped = tf.reshape(all_ap, shape=[-1, num_filters]) return all_ap_reshaped def _linear(input_, output_size, scope="SimpleLinear"): """ Linear map: output[k] = sum_i(Matrix[k, i] * args[i] ) + Bias[k] Args: input_: a tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. scope: VariableScope for the created subgraph; defaults to "SimpleLinear". Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments have an unspecified or wrong shape. """ shape = input_.get_shape().as_list() if len(shape) != 2: raise ValueError("Linear is expecting 2D arguments: {0}".format(str(shape))) if not shape[1]: raise ValueError("Linear expects shape[1] of arguments: {0}".format(str(shape))) input_size = shape[1] # Now the computation.
with tf.variable_scope(scope): W = tf.get_variable("W", [input_size, output_size], dtype=input_.dtype) b = tf.get_variable("b", [output_size], dtype=input_.dtype) return tf.nn.xw_plus_b(input_, W, b) def _highway_layer(input_, size, num_layers=1, bias=-2.0, f=tf.nn.relu): """ Highway Network (cf. http://arxiv.org/abs/1505.00387). t = sigmoid(Wy + b) z = t * g(Wy + b) + (1 - t) * y where g is nonlinearity, t is transform gate, and (1 - t) is carry gate. """ for idx in range(num_layers): g = f(_linear(input_, size, scope=("highway_lin_{0}".format(idx)))) t = tf.sigmoid(_linear(input_, size, scope=("highway_gate_{0}".format(idx))) + bias) output = t * g + (1. - t) * input_ input_ = output return output # Embedding Layer with tf.device("/cpu:0"), tf.name_scope("embedding"): # Use random generated the word vector by default # Can also be obtained through our own word vectors trained by our corpus if pretrained_embedding is None: self.embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], minval=-1.0, maxval=1.0, dtype=tf.float32), trainable=True, name="embedding") else: if embedding_type == 0: self.embedding = tf.constant(pretrained_embedding, dtype=tf.float32, name="embedding") if embedding_type == 1: self.embedding = tf.Variable(pretrained_embedding, trainable=True, dtype=tf.float32, name="embedding") embedded_sentence_front = tf.nn.embedding_lookup(self.embedding, self.input_x_front) embedded_sentence_behind = tf.nn.embedding_lookup(self.embedding, self.input_x_behind) # transpose the embedding sentence: [batch_size, embedding_size, sequence_length] embedded_sentence_front_trans = tf.transpose(embedded_sentence_front, perm=[0, 2, 1]) embedded_sentence_behind_trans = tf.transpose(embedded_sentence_behind, perm=[0, 2, 1]) # [batch_size, embedding_size, sequence_length, 1] embedded_sentence_expanded_front_trans = tf.expand_dims(embedded_sentence_front_trans, axis=-1) embedded_sentence_expanded_behind_trans = tf.expand_dims(embedded_sentence_behind_trans, axis=-1) # shape of `L0_0` and `R0_0`: [batch_size, embedding_size] self.F0_0 = tf.reshape(tf.reduce_mean(embedded_sentence_front, axis=1), shape=[-1, embedding_size]) self.B0_0 = tf.reshape(tf.reduce_mean(embedded_sentence_behind, axis=1), shape=[-1, embedding_size]) # Attention Layer with tf.name_scope("attention_matrix"): W_a = tf.Variable(tf.truncated_normal(shape=[sequence_length, embedding_size], stddev=0.1, dtype=tf.float32), name="W_a") # shape of `attention_matrix`: [batch_size, sequence_length, sequence_length] attention_matrix = _make_attention_mat(embedded_sentence_expanded_front_trans, embedded_sentence_expanded_behind_trans) # [batch_size, sequence_length, sequence_length] * [sequence_length, embedding_size] # einsum => [batch_size, sequence_length, embedding_size] # matrix transpose => [batch_size, embedding_size, sequence_length] # expand dims => [batch_size, embedding_size, sequence_length, 1] front_attention = tf.expand_dims(tf.matrix_transpose( tf.einsum("ijk,kl->ijl", attention_matrix, W_a)), axis=-1) behind_attention = tf.expand_dims(tf.matrix_transpose( tf.einsum("ijk,kl->ijl", tf.matrix_transpose(attention_matrix), W_a)), axis=-1) # shape of new `embedded_sentence_expanded_trans`: [batch_size, embedding_size, sequence_length, 2] embedded_sentence_expanded_front_trans = tf.concat([embedded_sentence_expanded_front_trans, front_attention], axis=3) embedded_sentence_expanded_behind_trans = tf.concat([embedded_sentence_expanded_behind_trans, behind_attention], axis=3) # Convolution layer 
pooled_outputs_wp_front = [] pooled_outputs_wp_behind = [] pooled_outputs_ap_front = [] pooled_outputs_ap_behind = [] for filter_size in filter_sizes: with tf.name_scope("conv-filter{0}".format(filter_size)): in_channels = 2 # The in_channels of filter_shape is 2 (two channels, origin + attention) # shape of new `embedded_sentence_expanded` # [batch_size, embedding_size, sequence_length + filter_size - 1, 2] input_x1 = tf.pad(embedded_sentence_expanded_front_trans, np.array( [[0, 0], [0, 0], [filter_size - 1, filter_size - 1], [0, 0]]), mode="CONSTANT") input_x2 = tf.pad(embedded_sentence_expanded_behind_trans, np.array( [[0, 0], [0, 0], [filter_size - 1, filter_size - 1], [0, 0]]), mode="CONSTANT") filter_shape = [embedding_size, filter_size, in_channels, num_filters] W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1, dtype=tf.float32), name="W") b = tf.Variable(tf.constant(value=0.1, shape=[num_filters], dtype=tf.float32), name="b") conv_front = tf.nn.conv2d( input_x1, W, strides=[1, 1, 1, 1], padding="VALID", name="conv_front") conv_behind = tf.nn.conv2d( input_x2, W, strides=[1, 1, 1, 1], padding="VALID", name="conv_behind") # Apply nonlinearity # [batch_size, 1, sequence_length + filter_size - 1, num_filters] conv_out_front = tf.nn.relu(tf.nn.bias_add(conv_front, b), name="relu_front") conv_out_behind = tf.nn.relu(tf.nn.bias_add(conv_behind, b), name="relu_behind") # [batch_size, num_filters, sequence_length + filter_size - 1, 1] conv_out_front_trans = tf.transpose(conv_out_front, perm=[0, 3, 2, 1]) conv_out_behind_trans = tf.transpose(conv_out_behind, perm=[0, 3, 2, 1]) with tf.name_scope("attention-filter{0}".format(filter_size)): # [batch_size, sequence_length + filter_size - 1, sequence_length + filter_size - 1] attention_matrix_v2 = _make_attention_mat(conv_out_front_trans, conv_out_behind_trans) # [batch_size, sequence_length + filter_size - 1] front_attention_v2 = tf.reduce_sum(attention_matrix_v2, axis=2) behind_attention_v2 = tf.reduce_sum(attention_matrix_v2, axis=1) with tf.name_scope("pool-filter{0}".format(filter_size)): # shape of `front_wp`: [batch_size, num_filters, sequence_length, 1] front_wp = _w_pool(input_x=conv_out_front_trans, attention=front_attention_v2, filter_size=filter_size, scope="front") behind_wp = _w_pool(input_x=conv_out_behind_trans, attention=behind_attention_v2, filter_size=filter_size, scope="behind") # shape of `front_ap`: [batch_size, num_filters] front_ap = _all_pool(input_x=conv_out_front_trans, filter_size=filter_size, scope="front") behind_ap = _all_pool(input_x=conv_out_behind_trans, filter_size=filter_size, scope="behind") pooled_outputs_wp_front.append(front_wp) pooled_outputs_wp_behind.append(behind_wp) pooled_outputs_ap_front.append(front_ap) pooled_outputs_ap_behind.append(behind_ap) # shape of `FI_1` & `BI_1`: [batch_size, num_filters_total, sequence_length, 1] self.FI_1 = tf.concat(pooled_outputs_wp_front, axis=1) self.BI_1 = tf.concat(pooled_outputs_wp_behind, axis=1) # shape of `F0_1` & `B0_1`: [batch_size, num_filters_total] self.F0_1 = tf.concat(pooled_outputs_ap_front, axis=1) self.B0_1 = tf.concat(pooled_outputs_ap_behind, axis=1) # Concat Layer num_filters_total = num_filters * len(filter_sizes) # shape of `conv_front` & `conv_behind`: [batch_size, embedding_size + num_filters_total] self.conv_front = tf.concat([self.F0_0, self.F0_1], axis=1) self.conv_behind = tf.concat([self.B0_0, self.B0_1], axis=1) self.sims = tf.stack([_cos_sim(self.F0_0, self.B0_0), _cos_sim(self.F0_1, self.B0_1)], axis=1) # shape of 
`conv_combine`: [batch_size, 2 * (embedding_size + num_filters_total)] self.conv_combine = tf.concat([self.conv_front, self.conv_behind], axis=1) # Fully Connected Layer with tf.name_scope("fc"): W = tf.Variable(tf.truncated_normal(shape=[2 * (embedding_size + num_filters_total), fc_hidden_size], stddev=0.1, dtype=tf.float32), name="W") b = tf.Variable(tf.constant(value=0.1, shape=[fc_hidden_size], dtype=tf.float32), name="b") self.fc = tf.nn.xw_plus_b(self.conv_combine, W, b) # Apply nonlinearity self.fc_out = tf.nn.relu(self.fc, name="relu") # Highway Layer with tf.name_scope("highway"): self.highway = _highway_layer(self.fc_out, self.fc_out.get_shape()[1], num_layers=1, bias=0) # Add dropout with tf.name_scope("dropout"): self.h_drop = tf.nn.dropout(self.highway, self.dropout_keep_prob) # Final scores and predictions with tf.name_scope("output"): W = tf.Variable(tf.truncated_normal(shape=[fc_hidden_size, num_classes], stddev=0.1, dtype=tf.float32), name="W") b = tf.Variable(tf.constant(value=0.1, shape=[num_classes], dtype=tf.float32), name="b") self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name="logits") self.softmax_scores = tf.nn.softmax(self.logits, name="softmax_scores") self.predictions = tf.argmax(self.logits, 1, name="predictions") self.topKPreds = tf.nn.top_k(self.softmax_scores, k=1, sorted=True, name="topKPreds") # Calculate mean cross-entropy loss, L2 loss with tf.name_scope("loss"): losses = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.input_y, logits=self.logits) losses = tf.reduce_mean(losses, name="softmax_losses") l2_losses = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()], name="l2_losses") * l2_reg_lambda self.loss = tf.add(losses, l2_losses, name="loss")
def nms(normalized_boxes, scores): idxs_ = tf.image.non_max_suppression(normalized_boxes, scores, self.proposal_count, self.nms_thresh) box = tf.gather(normalized_boxes, idxs_) pad_num = tf.maximum(self.proposal_count - tf.shape(normalized_boxes)[0],0) box = tf.pad(box, [(0, pad_num), (0,0)]) return box
def cryptonets_train(x): """Builds the graph for classifying digits based on Cryptonets Args: x: an input tensor with the dimensions (N_examples, 784), where 784 is the number of pixels in a standard MNIST image. Returns: y_conv: a tensor of shape (N_examples, 10), with values equal to the logits of classifying the digit into one of 10 classes (the digits 0-9). """ # Reshape to use within a conv neural net. # Last dimension is for "features" - there is only one here, since images # are grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc. with tf.name_scope('reshape'): x_image = tf.reshape(x, [-1, 28, 28, 1]) paddings = tf.constant([[0, 0], [0, 1], [0, 1], [0, 0]], name='pad_const') x_image = tf.pad(x_image, paddings) # First conv layer # CryptoNets' first conv layer has feature map size 13 x 13, # therefore, we manually add paddings. # Input: N x 28 x 28 x 1 # Filter: 5 x 5 x 1 x 5 # Output: N x 12 x 12 x 5 # Output after padding: N x 13 x 13 x 5 with tf.name_scope('conv1'): W_conv1 = tf.get_variable("W_conv1", [5, 5, 1, 5]) h_conv1_no_pad = tf.square( common.conv2d_stride_2_valid(x_image, W_conv1)) paddings = tf.constant([[0, 0], [0, 1], [0, 1], [0, 0]], name='pad_const') h_conv1 = tf.pad(h_conv1_no_pad, paddings) # Pooling layer # Input: N x 13 x 13 x 5 # Output: N x 13 x 13 x 5 with tf.name_scope('pool1'): h_pool1 = common.avg_pool_3x3_same_size(h_conv1) # Second convolution # Input: N x 13 x 13 x 5 # Filter: 5 x 5 x 5 x 50 # Output: N x 5 x 5 x 50 with tf.name_scope('conv2'): W_conv2 = tf.get_variable("W_conv2", [5, 5, 5, 50]) h_conv2 = common.conv2d_stride_2_valid(h_pool1, W_conv2) # Second pooling layer # Input: N x 5 x 5 x 50 # Output: N x 5 x 5 x 50 with tf.name_scope('pool2'): h_pool2 = common.avg_pool_3x3_same_size(h_conv2) # Fully connected layer 1 # Input: N x 5 x 5 x 50 # Input flattened: N x 1250 # Weight: 1250 x 100 # Output: N x 100 with tf.name_scope('fc1'): h_pool2_flat = tf.reshape(h_pool2, [-1, 5 * 5 * 50]) W_fc1 = tf.get_variable("W_fc1", [5 * 5 * 50, 100]) h_fc1 = tf.square(tf.matmul(h_pool2_flat, W_fc1)) # Map the 100 features to 10 classes, one for each digit # Input: N x 100 # Weight: 100 x 10 # Output: N x 10 with tf.name_scope('fc2'): W_fc2 = tf.get_variable("W_fc2", [100, 10]) y_conv = tf.matmul(h_fc1, W_fc2) return y_conv
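# A minimal training-graph sketch around the network above; the label
# placeholder, loss, and optimizer choice are assumptions for illustration and
# are not part of the original source.
x = tf.placeholder(tf.float32, [None, 784])
y_ = tf.placeholder(tf.float32, [None, 10])
y_conv = cryptonets_train(x)
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(1e-3).minimize(cross_entropy)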
def decoding_graph(features, state, mode, params): if mode != "train": params.residual_dropout = 0.0 params.attention_dropout = 0.0 params.relu_dropout = 0.0 params.label_smoothing = 0.0 tgt_seq = features["target"] src_len = features["source_length"] mem_len = features["memory_length"] tgt_len = features["target_length"] src_mask = tf.sequence_mask(src_len, maxlen=tf.shape(features["source"])[1], dtype=tf.float32) mem_mask = tf.sequence_mask(mem_len, maxlen=tf.shape(features["memory"])[1], dtype=tf.float32) tgt_mask = tf.sequence_mask(tgt_len, maxlen=tf.shape(features["target"])[1], dtype=tf.float32) hidden_size = params.hidden_size tvocab = params.vocabulary["target"] tgt_vocab_size = len(tvocab) initializer = tf.random_normal_initializer(0.0, params.hidden_size**-0.5) # if params.use_pretrained_embedding: # trg_emb_initializer = tf.constant_initializer(features['trg_embs']) # if params.shared_source_target_embedding: # with tf.variable_scope(tf.get_variable_scope(), reuse=True): # tgt_embedding = tf.get_variable("shared_embedding", # [tgt_vocab_size, hidden_size], # initializer=trg_emb_initializer) # else: # tgt_embedding = tf.get_variable("target_embedding", # [tgt_vocab_size, hidden_size], # initializer=trg_emb_initializer) # else: if params.shared_source_target_embedding: with tf.variable_scope(tf.get_variable_scope(), reuse=True): tgt_embedding = tf.get_variable("shared_embedding", [tgt_vocab_size, hidden_size], initializer=initializer) else: tgt_embedding = tf.get_variable("target_embedding", [tgt_vocab_size, hidden_size], initializer=initializer) if params.shared_embedding_and_softmax_weights: with tf.variable_scope(tf.get_variable_scope(), reuse=True): if params.shared_source_target_embedding: weights = tf.get_variable("shared_embedding", [tgt_vocab_size, hidden_size], initializer=initializer) else: weights = tf.get_variable("target_embedding", [tgt_vocab_size, hidden_size], initializer=initializer) else: weights = tf.get_variable("softmax_weights", [tgt_vocab_size, hidden_size], initializer=initializer) # id => embedding # tgt_seq: [batch, max_tgt_length] targets = tf.gather(tgt_embedding, tgt_seq) * (hidden_size**0.5) targets = targets * tf.expand_dims(tgt_mask, -1) # Preparing encoder and decoder input #enc_attn_bias = attention_bias(src_mask, "masking") enc_attn_bias = attention_bias(tf.concat([src_mask, mem_mask], 1), "masking") dec_attn_bias = attention_bias(tf.shape(targets)[1], "causal") # Shift left decoder_input = tf.pad(targets, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] decoder_input = add_timing_signal(decoder_input) if params.residual_dropout is not None and params.residual_dropout > 0: decoder_input = tf.nn.dropout(decoder_input, 1.0 - params.residual_dropout) encoder_output = state["encoder"] if mode != "infer": decoder_output = transformer_decoder(decoder_input, encoder_output, dec_attn_bias, enc_attn_bias, params) else: decoder_input = decoder_input[:, -1:, :] dec_attn_bias = dec_attn_bias[:, :, -1:, :] decoder_outputs = transformer_decoder(decoder_input, encoder_output, dec_attn_bias, enc_attn_bias, params, state=state["decoder"]) decoder_output, decoder_state = decoder_outputs decoder_output = decoder_output[:, -1, :] logits = tf.matmul(decoder_output, weights, False, True) log_prob = tf.nn.log_softmax(logits) return log_prob, {"encoder": encoder_output, "decoder": decoder_state} # [batch, length, hidden] => [batch * length, vocab_size] decoder_output = tf.reshape(decoder_output, [-1, hidden_size]) logits = tf.matmul(decoder_output, weights, False, True) labels = 
features["target"] # label smoothing ce = smoothed_softmax_cross_entropy(logits, labels, params.label_smoothing, True) ce = tf.reshape(ce, tf.shape(tgt_seq)) if mode == "eval": return -tf.reduce_sum(ce * tgt_mask, axis=1) loss = tf.reduce_sum(ce * tgt_mask) / tf.reduce_sum(tgt_mask) return loss
def conv0_space_to_depth(inputs, use_fused_bn=False, data_format='channels_last'): """Strided 2-D convolution with explicit padding. The padding is consistent and is based only on `kernel_size`, not on the dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone). Args: inputs: `Tensor` of size `[batch, height_in, width_in, channels]`. use_fused_bn: 'bool' whether use fused batch norm variables. data_format: `str` either "channels_first" for `[batch, channels, height, width]` or "channels_last for `[batch, height, width, channels]`. Returns: A `Tensor` with the same type as `inputs`. """ # Create the conv0 kernel w.r.t. the original image size. (no space-to-depth). filters = 64 kernel_size = 7 space_to_depth_block_size = ssd_constants.SPACE_TO_DEPTH_BLOCK_SIZE strides = 2 conv0 = tf.layers.Conv2D( filters=filters, kernel_size=kernel_size, strides=2, padding=('SAME' if strides == 1 else 'VALID'), use_bias=True if use_fused_bn else False, kernel_initializer=tf.variance_scaling_initializer(), data_format=data_format, name='conv1_1') # Use the image size without space-to-depth transform as the input of conv0. batch_size, h, w, channel = inputs.get_shape().as_list() conv0.build([ batch_size, h * space_to_depth_block_size, w * space_to_depth_block_size, channel / (space_to_depth_block_size**2) ]) kernel = conv0.weights[0] # [7, 7, 3, 64] --> [8, 8, 3, 64] kernel = tf.pad(kernel, paddings=tf.constant([[1, 0], [1, 0], [0, 0], [0, 0]]), mode='CONSTANT', constant_values=0.) # Transform kernel follows the space-to-depth logic: http://shortn/_9YvHW96xPJ kernel = tf.reshape(kernel, [ 4, space_to_depth_block_size, 4, space_to_depth_block_size, 3, filters ]) kernel = tf.transpose(kernel, [0, 2, 1, 3, 4, 5]) kernel = tf.reshape(kernel, [4, 4, int(channel), filters]) kernel = tf.cast(kernel, inputs.dtype) inputs = space_to_depth_fixed_padding(inputs, kernel_size, data_format, space_to_depth_block_size) outputs = tf.nn.conv2d( input=inputs, filter=kernel, strides=[1, 1, 1, 1], padding='VALID', data_format='NHWC' if data_format == 'channels_last' else 'NCHW', name='conv1_1') if use_fused_bn: # The additional bias is used as the batch norm is fused into the conv # layer. return tf.nn.bias_add(outputs, tf.cast(conv0.weights[1], outputs.dtype), data_format='NHWC') else: return outputs
def zero_padding(self, input, paddings, name): pad_mat = np.array([[0,0], [paddings, paddings], [paddings, paddings], [0, 0]]) return tf.pad(input, paddings=pad_mat, name=name)
def CNN(x): stride = 2 # dropout_rate dropout_rate = 1.0 if a.load_model is not True: dropout_rate = 0.5 with tf.variable_scope('CNN', reuse=tf.AUTO_REUSE): with tf.variable_scope('layer1'): # [n,28,28,1] -> [n,14,14,64] padded = tf.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]], mode="CONSTANT") w = tf.get_variable(name='w1', shape=[4, 4, 1, 64], dtype=tf.float32, initializer=tf.random_normal_initializer( 0, 0.02)) b = tf.get_variable(name='b1', shape=[64], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) out = tf.nn.leaky_relu( batchnorm( tf.nn.conv2d( padded, w, [1, stride, stride, 1], padding='VALID') + b), 0.2) with tf.variable_scope('layer2'): # [n,14,14,64] -> [n,7,7,128] ([n,7x7x128]) padded = tf.pad(out, [[0, 0], [1, 1], [1, 1], [0, 0]], mode="CONSTANT") w = tf.get_variable(name='w2', shape=[4, 4, 64, 128], dtype=tf.float32, initializer=tf.random_normal_initializer( 0, 0.02)) b = tf.get_variable(name='b2', shape=[128], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) out = tf.nn.leaky_relu( batchnorm( tf.nn.conv2d( padded, w, [1, stride, stride, 1], padding='VALID') + b), 0.2) out = tf.reshape(out, [-1, 7 * 7 * 128]) with tf.variable_scope('layer3'): # [n,7*7*128] -> [n,1024] w = tf.get_variable(name='w3', shape=[7 * 7 * 128, 1024], dtype=tf.float32, initializer=tf.random_normal_initializer( 0, 0.02)) b = tf.get_variable(name='b3', shape=[1024], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) out = tf.nn.leaky_relu(batchnorm(tf.matmul(out, w) + b), 0.2) # dropout out = tf.nn.dropout(out, dropout_rate) with tf.variable_scope('layer4'): # [n,1024] -> [n,10] w = tf.get_variable(name='w4', shape=[1024, 10], dtype=tf.float32, initializer=tf.random_normal_initializer( 0, 0.02)) b = tf.get_variable(name='b4', shape=[10], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) out = tf.nn.softmax(batchnorm(tf.matmul(out, w) + b)) return out
def pad(tensor, num=1): return tf.pad(tensor, [[0, 0], [num, num], [num, num], [0, 0]], "CONSTANT")
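# Quick shape check for the helper above (illustrative values): num=2 grows each
# spatial dimension by 4 while the batch and channel dims are untouched.
t = tf.zeros([1, 5, 5, 3])
print(pad(t, num=2).get_shape().as_list())   # [1, 9, 9, 3]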
def encoder_bottleneck_regular(self, x, output_depth, scope, keep_prob, proj_ratio=4, downsampling=False): input_shape = x.get_shape().as_list() input_depth = input_shape[3] internal_depth = int(output_depth / proj_ratio) # convolution branch: conv_branch = x # # 1x1 projection: if downsampling: W_conv = self.get_variable_weight_decay( scope + "/W_proj", shape=[2, 2, input_depth, internal_depth], # ([filter_height, filter_width, in_depth, out_depth]) initializer=tf.contrib.layers.xavier_initializer(), loss_category="encoder_wd_losses") conv_branch = tf.nn.conv2d(conv_branch, W_conv, strides=[1, 2, 2, 1], padding="VALID") # NOTE! no bias terms else: W_proj = self.get_variable_weight_decay( scope + "/W_proj", shape=[1, 1, input_depth, internal_depth], # ([filter_height, filter_width, in_depth, out_depth]) initializer=tf.contrib.layers.xavier_initializer(), loss_category="encoder_wd_losses") conv_branch = tf.nn.conv2d(conv_branch, W_proj, strides=[1, 1, 1, 1], padding="VALID") # NOTE! no bias terms # # # batch norm and PReLU: conv_branch = tf.contrib.slim.batch_norm(conv_branch) conv_branch = PReLU(conv_branch, scope=scope + "/proj") # # conv: W_conv = self.get_variable_weight_decay( scope + "/W_conv", shape=[3, 3, internal_depth, internal_depth], # ([filter_height, filter_width, in_depth, out_depth]) initializer=tf.contrib.layers.xavier_initializer(), loss_category="encoder_wd_losses") b_conv = self.get_variable_weight_decay( scope + "/b_conv", shape=[internal_depth], # ([out_depth]) initializer=tf.constant_initializer(0), loss_category="encoder_wd_losses") conv_branch = tf.nn.conv2d( conv_branch, W_conv, strides=[1, 1, 1, 1], padding="SAME") + b_conv # # # batch norm and PReLU: conv_branch = tf.contrib.slim.batch_norm(conv_branch) conv_branch = PReLU(conv_branch, scope=scope + "/conv") # # 1x1 expansion: shape = [1, 1, internal_depth, output_depth] W_exp = self.get_variable_weight_decay( scope + "/W_exp", shape=shape, # ([filter_height, filter_width, in_depth, out_depth]) initializer=tf.contrib.layers.xavier_initializer(), loss_category="encoder_wd_losses") W_exp = tf.reshape(drop_connect(W_exp, self.keep_prob_pl), shape=shape) conv_branch = tf.nn.conv2d(conv_branch, W_exp, strides=[1, 1, 1, 1], padding="VALID") # NOTE! no bias terms # # # batch norm: conv_branch = tf.contrib.slim.batch_norm(conv_branch) # NOTE! no PReLU here # # regularizer: # conv_branch = dropout(conv_branch, self.keep_prob_pl) # main branch: main_branch = x if downsampling: # max pooling with argmax (for use in max_unpool in the decoder): main_branch, pooling_indices = tf.nn.max_pool_with_argmax( main_branch, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME") # (everytime we downsample, we also increase the feature block depth) # pad with zeros so that the feature block depth matches: depth_to_pad = output_depth - input_depth paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, depth_to_pad]]) # (paddings is an integer tensor of shape [4, 2] where 4 is the rank # of main_branch. For each dimension D (D = 0, 1, 2, 3) of main_branch, # paddings[D, 0] is the no of values to add before the contents of # main_branch in that dimension, and paddings[D, 0] is the no of # values to add after the contents of main_branch in that dimension) main_branch = tf.pad(main_branch, paddings=paddings, mode="CONSTANT") # add the branches: merged = conv_branch + main_branch # apply PReLU: output = PReLU(merged, scope=scope + "/output") if downsampling: return output, pooling_indices else: return output
def prepare_processing_graph(self, model_settings, summaries_dir): """ Builds a TensorFlow graph to apply the input distortions. Creates a graph that loads a WAVE file, decodes it, scales the volume, shifts it in time, adds background noise, calculates a spectrogram, and then builds MFCC features from that. It must be called while a TensorFlow session is running; it creates several placeholder inputs and one output: - wav_filename_placeholder_: the name of the audio file - foreground_volume_placeholder_: how loud the main clip should be - time_shift_padding_placeholder_: where to pad the clip - time_shift_offset_placeholder_: how much to shift the clip - background_data_placeholder_: PCM sample data of the background noise - background_volume_placeholder_: loudness of the mixed-in background - output_: the processed 2-D output Args: model_settings: information about the current model being trained summaries_dir: path where training summary information is saved """ with tf.get_default_graph().name_scope('data'): desired_samples = model_settings['desired_samples'] self.wav_filename_placeholder_ = tf.placeholder( tf.string, [], name='wav_filename') wav_loader = io_ops.read_file(self.wav_filename_placeholder_) wav_decoder = contrib_audio.decode_wav( wav_loader, desired_channels=1, desired_samples=desired_samples) # Allow the volume of the audio sample to be adjusted self.foreground_volume_placeholder_ = tf.placeholder( tf.float32, [], name='foreground_volume') scaled_foreground = tf.multiply( wav_decoder.audio, self.foreground_volume_placeholder_) # Shift the sample's start position, and pad any gaps with zeros self.time_shift_padding_placeholder_ = tf.placeholder( tf.int32, [2, 2], name='time_shift_padding') self.time_shift_offset_placeholder_ = tf.placeholder( tf.int32, [2], name='time_shift_offset') padded_foreground = tf.pad(scaled_foreground, self.time_shift_padding_placeholder_, mode='CONSTANT') sliced_foreground = tf.slice(padded_foreground, self.time_shift_offset_placeholder_, [desired_samples, -1]) # Mix in background noise self.background_data_placeholder_ = tf.placeholder( tf.float32, [desired_samples, 1], name='background_data') self.background_volume_placeholder_ = tf.placeholder( tf.float32, [], name='background_volume') background_mul = tf.multiply(self.background_data_placeholder_, self.background_volume_placeholder_) background_add = tf.add(background_mul, sliced_foreground) background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) # Run the spectrogram and MFCC nodes to get the 2-D features of the audio spectrogram = contrib_audio.audio_spectrogram( background_clamp, window_size=model_settings['window_size_samples'], stride=model_settings['window_stride_samples'], magnitude_squared=True) tf.summary.image('spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1) # The number of buckets in each FFT row of the spectrogram depends on how many input samples are in each window. # We don't need that level of detail for classification, so we shrink them down to produce a smaller result. # One way is to average adjacent buckets; a more sophisticated approach is to apply the MFCC algorithm to shrink the representation. if model_settings['preprocess'] == 'average': self.output_ = tf.nn.pool( tf.expand_dims(spectrogram, -1), window_shape=[1, model_settings['average_window_width']], strides=[1, model_settings['average_window_width']], pooling_type='AVG', padding='SAME') tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1) elif model_settings['preprocess'] == 'mfcc': self.output_ = contrib_audio.mfcc( spectrogram, wav_decoder.sample_rate, dct_coefficient_count=model_settings['fingerprint_width']) tf.summary.image('mfcc', tf.expand_dims(self.output_, -1), max_outputs=1) else: raise ValueError( 'Unknown preprocess mode "%s" (should be "mfcc" or' ' "average")' % (model_settings['preprocess'])) # Merge all the summaries and write them out to /tmp/retrain_logs self.merged_summaries_ = tf.summary.merge_all(scope='data') self.summary_writer_ = tf.summary.FileWriter( summaries_dir + '/data', tf.get_default_graph())
def _k_grads(self, X1, X2): r""" Vectorized kernel calc and kernel grad calc. Following notation from Beck (2017), i.e have tensors S,D,Kpp,Kp Input is two tensors of shape (# strings , # characters) and we calc the pair-wise kernel calcs between the elements (i.e n kern calcs for two lists of length n) D is the tensor than unrolls the recursion and allows vecotrizaiton """ # turn into one-hot i.e. shape (# strings, #characters+1, alphabet size) X1 = tf.one_hot(X1, len(self.alphabet) + 1, dtype=tf.float64) X2 = tf.one_hot(X2, len(self.alphabet) + 1, dtype=tf.float64) # remove the ones in the first column that encode the padding (i.e we dont want them to count as a match) paddings = tf.constant([[0, 0], [0, 0], [0, len(self.alphabet)]]) X1 = X1 - tf.pad(tf.expand_dims(X1[:, :, 0], 2), paddings, "CONSTANT") X2 = X2 - tf.pad(tf.expand_dims(X2[:, :, 0], 2), paddings, "CONSTANT") # store squared match coef match_sq = tf.square(self.match_decay) # Make S: the similarity tensor of shape (# strings, #characters, # characters) S = tf.matmul(X1, tf.transpose(X2, perm=(0, 2, 1))) # Main loop, where Kp, Kpp values and gradients are calculated. Kp = tf.TensorArray(tf.float64, size=0, dynamic_size=True, clear_after_read=False) dKp_dgap = tf.TensorArray(tf.float64, size=0, dynamic_size=True, clear_after_read=False) dKp_dmatch = tf.TensorArray(tf.float64, size=0, dynamic_size=True, clear_after_read=False) Kp = Kp.write( Kp.size(), tf.ones(shape=tf.stack([tf.shape(X1)[0], self.maxlen, self.maxlen]), dtype=tf.float64)) dKp_dgap = dKp_dgap.write( dKp_dgap.size(), tf.zeros(shape=tf.stack( [tf.shape(X1)[0], self.maxlen, self.maxlen]), dtype=tf.float64)) dKp_dmatch = dKp_dmatch.write( dKp_dmatch.size(), tf.zeros(shape=tf.stack( [tf.shape(X1)[0], self.maxlen, self.maxlen]), dtype=tf.float64)) # calc subkernels for each subsequence length for i in tf.range(0, self.max_subsequence_length - 1): Kp_temp = tf.multiply(S, Kp.read(i)) Kp_temp0 = match_sq * Kp_temp Kp_temp1 = tf.matmul(Kp_temp0, self.D) Kp_temp2 = tf.matmul(self.D, Kp_temp1, transpose_a=True) Kp = Kp.write(Kp.size(), Kp_temp2) dKp_dgap_temp_1 = tf.matmul(self.dD_dgap, Kp_temp1, transpose_a=True) dKp_dgap_temp_2 = tf.multiply(S, dKp_dgap.read(i)) dKp_dgap_temp_2 = dKp_dgap_temp_2 * match_sq dKp_dgap_temp_2 = tf.matmul(dKp_dgap_temp_2, self.D) dKp_dgap_temp_2 = dKp_dgap_temp_2 + tf.matmul( Kp_temp0, self.dD_dgap) dKp_dgap_temp_2 = tf.matmul(self.D, dKp_dgap_temp_2, transpose_a=True) dKp_dgap = dKp_dgap.write(dKp_dgap.size(), dKp_dgap_temp_1 + dKp_dgap_temp_2) dKp_dmatch_temp_1 = 2 * tf.divide(Kp_temp2, self.match_decay) dKp_dmatch_temp_2 = tf.multiply(S, dKp_dmatch.read(i)) dKp_dmatch_temp_2 = dKp_dmatch_temp_2 * match_sq dKp_dmatch_temp_2 = tf.matmul(dKp_dmatch_temp_2, self.D) dKp_dmatch_temp_2 = tf.matmul(self.D, dKp_dmatch_temp_2, transpose_a=True) dKp_dmatch = dKp_dmatch.write( dKp_dmatch.size(), dKp_dmatch_temp_1 + dKp_dmatch_temp_2) # Final calculation. 
We gather all Kps Kp_stacked = Kp.stack() Kp.close() dKp_dgap_stacked = dKp_dgap.stack() dKp_dgap.close() dKp_dmatch_stacked = dKp_dmatch.stack() dKp_dmatch.close() # get k temp = tf.multiply(S, Kp_stacked) temp = tf.reduce_sum(temp, -1) sum2 = tf.reduce_sum(temp, -1) Ki = sum2 * match_sq k = tf.linalg.matvec(tf.transpose(Ki), self.order_coefs) k = tf.expand_dims(k, 1) # get gap decay grads temp = tf.multiply(S, dKp_dgap_stacked) temp = tf.reduce_sum(temp, -1) temp = tf.reduce_sum(temp, -1) temp = temp * match_sq dk_dgap = tf.linalg.matvec(tf.transpose(temp), self.order_coefs) dk_dgap = tf.expand_dims(dk_dgap, 1) # get match decay grads temp = tf.multiply(S, dKp_dmatch_stacked) temp = tf.reduce_sum(temp, -1) temp = tf.reduce_sum(temp, -1) temp = temp * match_sq temp = temp + 2 * self.match_decay * sum2 dk_dmatch = tf.linalg.matvec(tf.transpose(temp), self.order_coefs) dk_dmatch = tf.expand_dims(dk_dmatch, 1) dk_dcoefs = tf.transpose(Ki) return k, dk_dgap, dk_dmatch, dk_dcoefs
def sorted_non_max_suppression_padded(scores, boxes, classes, max_output_size, iou_threshold): """A wrapper that handles non-maximum suppression. Assumption: * The boxes are sorted by scores unless the box is a dot (all coordinates are zero). * Boxes with higher scores can be used to suppress boxes with lower scores. The overall design of the algorithm is to handle boxes tile-by-tile: boxes = boxes.pad_to_multiple_of(tile_size) num_tiles = len(boxes) // tile_size output_boxes = [] for i in range(num_tiles): box_tile = boxes[i*tile_size : (i+1)*tile_size] for j in range(i - 1): suppressing_tile = boxes[j*tile_size : (j+1)*tile_size] iou = bbox_overlap(box_tile, suppressing_tile) # if the box is suppressed in iou, clear it to a dot box_tile *= _update_boxes(iou) # Iteratively handle the diagonal tile. iou = _box_overlap(box_tile, box_tile) iou_changed = True while iou_changed: # boxes that are not suppressed by anything else suppressing_boxes = _get_suppressing_boxes(iou) # boxes that are suppressed by suppressing_boxes suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes) # clear iou to 0 for boxes that are suppressed, as they cannot be used # to suppress other boxes any more new_iou = _clear_iou(iou, suppressed_boxes) iou_changed = (new_iou != iou) iou = new_iou # remaining boxes that can still suppress others, are selected boxes. output_boxes.append(_get_suppressing_boxes(iou)) if len(output_boxes) >= max_output_size: break Args: scores: a tensor with a shape of [batch_size, anchors]. boxes: a tensor with a shape of [batch_size, anchors, 4]. classes: a tensor with a shape of [batch_size, anchors] representing the class of each box. max_output_size: a scalar integer `Tensor` representing the maximum number of boxes to be selected by non max suppression. iou_threshold: a float representing the threshold for deciding whether boxes overlap too much with respect to IOU. Returns: nms_scores: a tensor with a shape of [batch_size, anchors]. It has same dtype as input scores. nms_proposals: a tensor with a shape of [batch_size, anchors, 4]. It has same dtype as input boxes. nms_classes: a tensor with a shape of [batch_size, anchors]. It has same dtype as input classes. 
""" batch_size = tf.shape(boxes)[0] num_boxes = tf.shape(boxes)[1] pad = tf.cast(tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE), tf.int32) * NMS_TILE_SIZE - num_boxes boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]]) scores = tf.pad(tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1) classes = tf.pad(tf.cast(classes, tf.float32), [[0, 0], [0, pad]], constant_values=-1) num_boxes += pad def _loop_cond(unused_boxes, unused_threshold, output_size, idx): return tf.logical_and( tf.reduce_min(output_size) < max_output_size, idx < num_boxes // NMS_TILE_SIZE) selected_boxes, _, output_size, _ = tf.while_loop( _loop_cond, _suppression_loop_body, [ boxes, iou_threshold, tf.zeros([batch_size], tf.int32), tf.constant(0) ]) idx = num_boxes - tf.cast( tf.nn.top_k( tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) * tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0], tf.int32) idx = tf.minimum(idx, num_boxes - 1) idx = tf.reshape( idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]), [-1]) boxes = tf.reshape(tf.gather(tf.reshape(boxes, [-1, 4]), idx), [batch_size, max_output_size, 4]) boxes = boxes * tf.cast( tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape( output_size, [-1, 1, 1]), boxes.dtype) scores = tf.reshape(tf.gather(tf.reshape(scores, [-1, 1]), idx), [batch_size, max_output_size]) scores = scores * tf.cast( tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape( output_size, [-1, 1]), scores.dtype) classes = tf.reshape(tf.gather(tf.reshape(classes, [-1, 1]), idx), [batch_size, max_output_size]) classes = classes * tf.cast( tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape( output_size, [-1, 1]), classes.dtype) return scores, boxes, classes
def conv2d(input_tensor, kernel, bias): kernel = np.transpose(kernel, [2, 3, 1, 0]) x = tf.pad(input_tensor, [[0,0], [1,1], [1,1], [0,0]]) x = tf.nn.conv2d(x, tf.constant(kernel), (1,1,1,1), 'VALID') x = tf.nn.bias_add(x, tf.constant(bias)) return tf.nn.relu(x)
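# Example use of the wrapper above (a sketch): the transpose inside conv2d maps a
# PyTorch/Caffe-style (out_channels, in_channels, kH, kW) kernel to TensorFlow's
# (kH, kW, in_channels, out_channels) layout; the values below are illustrative
# assumptions.
kernel = np.random.randn(64, 3, 3, 3).astype(np.float32)   # (out, in, kH, kW)
bias = np.zeros(64, dtype=np.float32)
images = tf.placeholder(tf.float32, [None, 224, 224, 3])
features = conv2d(images, kernel, bias)                    # -> [None, 224, 224, 64]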
with tf.variable_scope("block_embedding"): block_embedding = tf.get_variable("block_embedding_b", [NUM_BLOCKS, block_emb_size], trainable=True) set_block_embedding = tf_util.create_row_setter(block_embedding, "set_block_embedding") # %% with tf.variable_scope("conv1_1"): filter_bank = tf.get_variable("filter_bank", [2, 2, 2, block_emb_size, layer1_size], trainable=True) voxels_padded = tf.pad(voxels % NUM_BLOCKS, [[0, 0], [2, 2], [2, 2], [2, 2]]) voxel_emb_1 = tf.nn.conv3d(tf.nn.embedding_lookup(block_embedding, voxels_padded), filter_bank, strides=[1, 1, 1, 1, 1], padding="VALID") voxel_emb_1s = tf.nn.sigmoid(voxel_emb_1) # %% with tf.variable_scope("conv2_1"): filter_bank_2 = tf.get_variable("filter_bank", [4, 4, 4, layer1_size, layer2_size], trainable=True)