def routing(input, b_IJ):
    W = tf.get_variable('Weight', shape=(1, 2592, 320, 16, 1), dtype=tf.float32,
                        initializer=tf.random_normal_initializer(stddev=cfg.stddev))
    biases = tf.get_variable('bias', shape=(1, 1, 10, 32, 1))

    input = tf.tile(input, [1, 1, 320, 1, 1])
    u_hat = reduce_sum(W * input, axis=3, keepdims=True)
    u_hat = tf.reshape(u_hat, shape=[-1, 2592, 10, 32, 1])

    u_hat_stopped = tf.stop_gradient(u_hat, name='stop_gradient')

    for r_iter in range(cfg.iter_routing):
        with tf.variable_scope('iter_' + str(r_iter)):
            c_IJ = softmax(b_IJ, axis=2)
            if r_iter == cfg.iter_routing - 1:
                s_J = tf.multiply(c_IJ, u_hat)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)
            elif r_iter < cfg.iter_routing - 1:  # Inner iterations, do not apply backpropagation
                s_J = tf.multiply(c_IJ, u_hat_stopped)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)
                v_J_tiled = tf.tile(v_J, [1, 2592, 1, 1, 1])
                u_produce_v = reduce_sum(u_hat_stopped * v_J_tiled, axis=3, keepdims=True)
                b_IJ += u_produce_v
    return v_J
def dynamic_routing(shape, input, num_outputs=10, num_dims=16):
    """The Dynamic Routing Algorithm proposed by Sabour et al."""
    input_shape = shape
    W = tf.get_variable('Weight',
                        shape=[1, input_shape[1], num_dims * num_outputs] + input_shape[-2:],
                        dtype=tf.float32,
                        initializer=tf.random_normal_initializer(stddev=stddev))
    biases = tf.get_variable('bias', shape=(1, 1, num_outputs, num_dims, 1))
    delta_IJ = tf.zeros([input_shape[0], input_shape[1], num_outputs, 1, 1],
                        dtype=tf.dtypes.float32)

    input = tf.tile(input, [1, 1, num_dims * num_outputs, 1, 1])
    u_hat = reduce_sum(W * input, axis=3, keepdims=True)
    u_hat = tf.reshape(u_hat, shape=[-1, input_shape[1], num_outputs, num_dims, 1])
    u_hat_stopped = tf.stop_gradient(u_hat, name='stop_gradient')

    for r_iter in range(iter_routing):
        with tf.variable_scope('iter_' + str(r_iter)):
            gamma_IJ = softmax(delta_IJ, axis=2)
            if r_iter == iter_routing - 1:
                s_J = tf.multiply(gamma_IJ, u_hat)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)
            elif r_iter < iter_routing - 1:  # Inner iterations, do not apply backpropagation
                s_J = tf.multiply(gamma_IJ, u_hat_stopped)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)
                v_J_tiled = tf.tile(v_J, [1, input_shape[1], 1, 1, 1])
                u_produce_v = reduce_sum(u_hat_stopped * v_J_tiled, axis=3, keepdims=True)
                delta_IJ += u_produce_v
    return v_J
def routing(input, b_IJ):
    # W - start of routing algorithm, initialization
    W = tf.get_variable('Weight', shape=(1, 1152, 160, 8, 1), dtype=tf.float32,
                        initializer=tf.random_normal_initializer(stddev=cfg.stddev))
    biases = tf.get_variable('bias', shape=(1, 1, 10, 16, 1))

    # u_hat.png
    input = tf.tile(input, [1, 1, 160, 1, 1])
    u_hat = reduce_sum(W * input, axis=3, keepdims=True)
    u_hat = tf.reshape(u_hat, shape=[-1, 1152, 10, 16, 1])

    # In forward, u_hat_no_back_propagation = u_hat; in backward, no gradient is
    # passed back from u_hat_no_back_propagation to u_hat
    u_hat_no_back_propagation = tf.stop_gradient(u_hat, name='stop_gradient')

    # routing.png, the routing loop
    for r_iter in range(cfg.iter_routing):
        with tf.variable_scope('iter_' + str(r_iter)):
            c_IJ = softmax(b_IJ, axis=2)
            # Last iteration: use u_hat so gradients can flow back
            if r_iter == cfg.iter_routing - 1:
                s_J = tf.multiply(c_IJ, u_hat)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)
            elif r_iter < cfg.iter_routing - 1:
                # Inner routing iterations: no backpropagation, so use
                # u_hat_no_back_propagation
                s_J = tf.multiply(c_IJ, u_hat_no_back_propagation)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)
                v_J_tiled = tf.tile(v_J, [1, 1152, 1, 1, 1])
                u_produce_v = reduce_sum(u_hat_no_back_propagation * v_J_tiled,
                                         axis=3, keepdims=True)
                b_IJ += u_produce_v
    return v_J
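# A minimal, self-contained TF1-style sketch (not from the original code) of why
# the inner routing iterations above multiply by the stop_gradient copy: in the
# forward pass the stopped tensor equals the original, but backprop treats it as
# a constant, so only the final iteration contributes gradients to u_hat.
import tensorflow as tf

x = tf.constant(3.0)
y = tf.stop_gradient(x) * x        # forward value: 9.0
grad = tf.gradients(y, x)[0]       # d/dx (const * x) = const = 3.0, not 6.0
with tf.Session() as sess:
    print(sess.run([y, grad]))     # [9.0, 3.0]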
def build_arch(self):
    with tf.variable_scope('Conv1_layer'):
        # Conv1, return tensor with shape [batch_size, 20, 20, 256]
        conv1 = tf.contrib.layers.conv2d(self.X, num_outputs=256, kernel_size=9, stride=1, padding='VALID')

    # Primary Capsules layer, return tensor with shape [batch_size, 1152, 8, 1]
    with tf.variable_scope('PrimaryCaps_layer'):
        primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV')
        caps1 = primaryCaps(conv1, kernel_size=9, stride=2)

    # DigitCaps layer, return shape [batch_size, 10, 16, 1]
    with tf.variable_scope('DigitCaps_layer'):
        digitCaps = CapsLayer(num_outputs=self.num_label, vec_len=16, with_routing=True, layer_type='FC')
        self.caps2 = digitCaps(caps1)

    # Decoder structure in Fig. 2
    # 1. Do masking:
    with tf.variable_scope('Masking'):
        # a). calc ||v_c||, then do softmax(||v_c||)
        # [batch_size, 10, 16, 1] => [batch_size, 10, 1, 1]
        self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)
        self.softmax_v = softmax(self.v_length, axis=1)
        # assert self.softmax_v.get_shape() == [cfg.batch_size, self.num_label, 1, 1]

        # b). pick out the index of the max softmax value of the 10 capsules
        # [batch_size, 10, 1, 1] => [batch_size] (index)
        self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
        # assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1]
        self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size, ))

        # Method 1. mask with the predicted label
        if not cfg.mask_with_y:
            # c). indexing: gather the winning capsule per example, since
            # argmax_idx cannot index the batch dimension directly
            masked_v = []
            for batch_size in range(cfg.batch_size):
                v = self.caps2[batch_size][self.argmax_idx[batch_size], :]
                masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))
            self.masked_v = tf.concat(masked_v, axis=0)
            assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1]
        # Method 2. masking with the true label, default mode
        else:
            self.masked_v = tf.multiply(tf.squeeze(self.caps2),
                                        tf.reshape(self.Y, (-1, self.num_label, 1)))
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)

    # 2. Reconstruct the MNIST images with 3 FC layers
    # [batch_size, 1, 16, 1] => [batch_size, 16] => [batch_size, 512]
    with tf.variable_scope('Decoder'):
        vector_j = tf.reshape(self.masked_v, shape=(cfg.batch_size, -1))
        fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512)
        fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=1024)
        self.decoded = tf.contrib.layers.fully_connected(
            fc2, num_outputs=self.height * self.width * self.channels,
            activation_fn=tf.sigmoid)
def build_net(self):
    # graph = tf.Graph()
    # with graph.as_default():
    # Set the top conv
    top_con = CNNs(self.x, 128, [9, 1], 2, "SAME", self.is_train)
    self.primary_cap = layers_vector(top_con, 32, 4, [9, 1], 2, self.is_train,
                                     shapes=[-1, self.next_length * 8, 16, 1])  # [-1, 88*16, 8, 1]
    # with tf.variable_scope("capsules_layers"):
    fc_function = tf.reshape(self.primary_cap,
                             shape=(-1, self.primary_cap.shape[1].value, 1,
                                    self.primary_cap.shape[-2].value, 1))
    # with tf.variable_scope("routing"):  # [-1, 88*16, 1, 8, 1]
    blu = tf.constant(np.zeros([self.batch_size, self.primary_cap.shape[1].value,
                                self.num_label, 1, 1]), dtype=tf.float32)
    caps = routing(fc_function, blu, num_outputs=self.num_label, num_dims=32)

    # [120, 37, 8, 1]
    top_conv_1 = CNNs(self.x, 128, [7, 1], 2, "SAME", self.is_train)
    self.primary_cap_1 = layers_vector(top_conv_1, 32, 4, [7, 1], 2, self.is_train,
                                       shapes=[-1, self.next_length * 16, 8, 1])
    fc_function_1 = tf.reshape(self.primary_cap_1,
                               shape=(-1, self.primary_cap_1.shape[1].value, 1,
                                      self.primary_cap_1.shape[-2].value, 1))
    blu_1 = tf.constant(np.zeros([self.batch_size, self.primary_cap_1.shape[1].value,
                                  self.num_label, 1, 1]), dtype=tf.float32)
    with tf.variable_scope("routing_1"):
        caps_1 = routing(fc_function_1, blu_1, self.num_label, 16)

    top_con_2 = CNNs(self.x, 128, [5, 1], 2, 'SAME', self.is_train)
    self.primary_cap_2 = layers_vector(top_con_2, 32, 4, [5, 1], 2, self.is_train,
                                       shapes=[-1, self.next_length * 32, 4, 1])
    fc_function_2 = tf.reshape(self.primary_cap_2,
                               shape=(-1, self.primary_cap_2.shape[1].value, 1,
                                      self.primary_cap_2.shape[-2].value, 1))
    blu_2 = tf.constant(np.zeros([self.batch_size, self.primary_cap_2.shape[1].value,
                                  self.num_label, 1, 1]), dtype=tf.float32)
    with tf.variable_scope("routing_2"):
        caps_2 = routing(fc_function_2, blu_2, self.num_label, 8)

    a = 3.0
    b = 1.0
    c = 1.0
    # Weighted concatenation of the three capsule streams; these weights gave
    # the best performance in our experiments.
    caps = tf.concat([a * caps, b * caps_1, c * caps_2], axis=3)
    self.caps = tf.squeeze(caps, axis=1)

    v_length = tf.sqrt(reduce_sum(tf.square(self.caps), axis=2, keepdims=True) + epsilon)
    softmax_v = softmax(v_length, axis=1)  # [batch_size, num_label, 1, 1]
    argmax_idx = tf.to_int32(tf.argmax(softmax_v, axis=1))
    self.argmax_idx = tf.reshape(argmax_idx, shape=(self.batch_size,))

    self.masked_v = tf.multiply(tf.squeeze(self.caps), tf.reshape(self.y, (-1, self.num_label, 1)))
    self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps), axis=2, keepdims=True) + epsilon)

    # Decoder
    vector_j = tf.reshape(self.masked_v, shape=(self.batch_size, -1))
    fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=256)
    fc1 = tf.contrib.layers.fully_connected(fc1, num_outputs=512)
    self.decode = tf.contrib.layers.fully_connected(fc1, num_outputs=self.length,
                                                    activation_fn=tf.sigmoid)
def gradient_penalty(x, y, mask=None, norm=1., f_obj_to_img=None):
    """
    x: interpolated real and fake images
    y: scores from the critic
    """
    grad_outputs = torch.ones(y.size()).cuda() if torch.cuda.is_available() else torch.ones(y.size())
    gradients = torch.autograd.grad(outputs=y, inputs=x, grad_outputs=grad_outputs,
                                    create_graph=True, retain_graph=True,
                                    only_inputs=True)[0]
    if mask is None:
        mask = torch.ones(gradients.shape, device=x.device)
    gp_grads = (gradients ** 2) * mask
    if f_obj_to_img is not None:
        # take the average over all patches for every image
        avg_grads = []
        for i in range(max(f_obj_to_img) + 1):
            inds = (f_obj_to_img == i).nonzero()
            avg_grad = torch.mean(gp_grads[inds], dim=0)
            avg_grads.append(avg_grad)
        gp_grads = torch.cat(avg_grads)
    slopes = torch.sqrt(utils.reduce_sum(gp_grads, axis=[2, 3, 1])).view(gp_grads.shape[0], -1)
    gp_loss = torch.mean((slopes - norm) ** 2)
    return gp_loss
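# Hypothetical usage sketch for gradient_penalty (the names `critic`, `real` and
# `fake` are assumptions, not part of the original code): interpolate real and
# fake samples, score the interpolates with the critic, then penalize gradient
# norms that deviate from `norm`, as in WGAN-GP.
import torch

def wgan_gp_term(critic, real, fake, norm=1.):
    eps = torch.rand(real.size(0), 1, 1, 1, device=real.device)  # assumes NCHW images
    interp = (eps * real + (1. - eps) * fake).requires_grad_(True)
    scores = critic(interp)
    return gradient_penalty(interp, scores, norm=norm)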
def init_net_treecaps(feature_size, label_size):
    """Initialize an empty TreeCaps network."""
    top_a = 20
    top_b = 25
    num_conv = 8
    output_size = 128
    caps1_num_dims = 8
    caps1_num_caps = int(num_conv * output_size / caps1_num_dims) * top_a
    caps1_out_caps = label_size
    caps1_out_dims = 8

    with tf.name_scope('inputs'):
        nodes = tf.placeholder(tf.float32, shape=(None, None, feature_size), name='tree')
        children = tf.placeholder(tf.int32, shape=(None, None, None), name='children')

    with tf.name_scope('network'):
        """The Primary Variable Capsule Layer."""
        primary_variable_caps = primary_variable_capsule_layer(num_conv, output_size, nodes,
                                                               children, feature_size, caps1_num_dims)
        """The Primary Static Capsule Layer."""
        primary_static_caps = vts_routing(primary_variable_caps, top_a, top_b,
                                          caps1_num_caps, caps1_num_dims)
        primary_static_caps = tf.reshape(primary_static_caps,
                                         shape=(batch_size, -1, 1, caps1_num_dims, 1))

        """The Code Capsule Layer."""
        # Get the input shape to the dynamic routing algorithm
        dr_shape = [batch_size, caps1_num_caps, 1, caps1_num_dims, 1]
        codeCaps = dynamic_routing(dr_shape, primary_static_caps,
                                   num_outputs=caps1_out_caps, num_dims=caps1_out_dims)
        codeCaps = tf.squeeze(codeCaps, axis=1)

        """Obtaining the classification output."""
        v_length = tf.sqrt(reduce_sum(tf.square(codeCaps), axis=2, keepdims=True) + 1e-9)
        out = tf.reshape(v_length, (-1, label_size))

    return nodes, children, out
def squash(vector):
    '''Squashing function corresponding to Eq. 1
    Args:
        vector: A tensor with shape [batch_size, 1, num_caps, vec_len, 1]
            or [batch_size, num_caps, vec_len, 1].
    Returns:
        A tensor with the same shape as vector but squashed in the 'vec_len' dimension.
    '''
    vec_squared_norm = reduce_sum(tf.square(vector), -2, keepdims=True)
    scalar_factor = vec_squared_norm / (1 + vec_squared_norm) / tf.sqrt(vec_squared_norm + epsilon)
    vec_squashed = scalar_factor * vector  # element-wise
    return vec_squashed
def squash(vector):
    '''Squashing function
    Args:
        vector: A tensor with shape [batch_size, 1, num_caps, vec_len, 1]
            or [batch_size, num_caps, vec_len, 1].
    Returns:
        A tensor with the same shape as vector but squashed in the 'vec_len' dimension.
    '''
    squared_norm = reduce_sum(tf.square(vector), axis=-2, keepdims=True)
    scalar_factor = squared_norm / (1 + squared_norm) / tf.sqrt(squared_norm + epsilon)
    return scalar_factor * vector
def squash(vector):
    '''
    Input: tensor with shape [batch_size, 1, num_caps, vec_len, 1]
    Return: a tensor of the same shape, squashed in the vec_len dimension
    '''
    vec_squared_norm = reduce_sum(tf.square(vector), -2, keepdims=True)
    scalar_factor = vec_squared_norm / (1 + vec_squared_norm) / tf.sqrt(vec_squared_norm + epsilon)
    vec_squashed = scalar_factor * vector
    return vec_squashed
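# A small numpy check (illustrative only, with epsilon assumed to be 1e-9) of the
# squash nonlinearity defined above: the output keeps each vector's direction
# while its length is mapped to ||v||^2 / (1 + ||v||^2), which always lies in [0, 1).
import numpy as np

v = np.random.randn(1, 1, 10, 16, 1).astype(np.float32)
sq_norm = np.sum(v ** 2, axis=-2, keepdims=True)
squashed = sq_norm / (1. + sq_norm) / np.sqrt(sq_norm + 1e-9) * v
out_norm = np.sqrt(np.sum(squashed ** 2, axis=-2))
assert np.all(out_norm < 1.0)  # squashed lengths never reach 1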
def evaluator_model(input):
    epsilon = 1e-9
    with tf.variable_scope('CapsuleNet', reuse=tf.AUTO_REUSE):
        with tf.variable_scope('Conv1_layer'):
            # conv1 = tf.contrib.layers.conv2d(input, num_outputs=128, kernel_size=9, stride=1, padding='VALID')
            # conv1 = tf.contrib.layers.conv2d(conv1, num_outputs=256, kernel_size=5, stride=1, padding='VALID')
            conv1 = tf.contrib.layers.conv2d(input, num_outputs=512, kernel_size=9, stride=1, padding='VALID')

        with tf.variable_scope('PrimaryCaps_layer'):
            primaryCaps = CapsLayer(num_outputs=32, vec_len=16, with_routing=False, layer_type='CONV')
            caps1 = primaryCaps(conv1, kernel_size=8, stride=2)

        with tf.variable_scope('SecondaryCaps_Layer'):
            DigitCaps = CapsLayer(num_outputs=10, vec_len=32, with_routing=True, layer_type='FC')
            Caps2 = DigitCaps(caps1)

        v_length = tf.sqrt(reduce_sum(tf.square(Caps2), axis=2, keepdims=True) + epsilon,
                           name='v_length')
        print(v_length)

        # with tf.variable_scope('Masking'):
        #     masked_v = tf.multiply(tf.squeeze(Caps2), tf.reshape(y, (-1, 10, 1)), name='masked_v')
        #     print('Masked_V: ', masked_v)
        #
        # with tf.variable_scope('Decoder'):
        #     vector_j = tf.reshape(masked_v, shape=(cfg.batch_size, -1))
        #     fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512)
        #     assert fc1.get_shape() == [cfg.batch_size, 512]
        #     fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=1024)
        #     assert fc2.get_shape() == [cfg.batch_size, 1024]
        #     decoded = tf.contrib.layers.fully_connected(fc2, num_outputs=784, activation_fn=tf.sigmoid)

    return v_length, Caps2
def discriminator(input, isTrain=True, reuse=False):
    epsilon = 1e-9
    with tf.variable_scope('discriminator') as scope:
        # first call (reuse=False) is labelled 1, reused call is labelled 0
        if reuse:
            labels = tf.constant(0, shape=[cfg.batch_size, ])
        else:
            labels = tf.constant(1, shape=[cfg.batch_size, ])
        Y = tf.one_hot(labels, depth=2, axis=1, dtype=tf.float32)
        if reuse:
            scope.reuse_variables()

        with tf.variable_scope('Conv1_layer'):
            conv1 = tf.contrib.layers.conv2d(input, num_outputs=256, kernel_size=9, stride=1, padding='VALID')

        with tf.variable_scope('PrimaryCaps_layer'):
            primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV')
            caps1 = primaryCaps(conv1, kernel_size=9, stride=2)

        with tf.variable_scope('DigitCaps_layer'):
            digitCaps = CapsLayer(num_outputs=2, vec_len=16, with_routing=True, layer_type='FC')
            caps2 = digitCaps(caps1)  # [batch_size, 2, 16, 1]

        v_length = tf.sqrt(reduce_sum(tf.square(caps2), axis=2, keepdims=True) + epsilon)

        # margin loss
        max_l = tf.square(tf.maximum(0., cfg.m_plus - v_length))
        max_r = tf.square(tf.maximum(0., v_length - cfg.m_minus))
        max_l = tf.reshape(max_l, shape=(cfg.batch_size, -1))
        max_r = tf.reshape(max_r, shape=(cfg.batch_size, -1))
        T_c = Y
        L_c = T_c * max_l + cfg.lambda_val * (1 - T_c) * max_r
        margin_loss = tf.reduce_mean(tf.reduce_sum(L_c, axis=1))

    return margin_loss
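# A worked toy example (values assumed; m_plus=0.9, m_minus=0.1, lambda=0.5 are
# the usual CapsNet defaults) of the margin loss computed above. For the true
# class (T_c=1) with ||v||=0.8: max(0, 0.9-0.8)^2 = 0.01; for the wrong class
# (T_c=0) with ||v||=0.3: 0.5 * max(0, 0.3-0.1)^2 = 0.02.
import numpy as np

m_plus, m_minus, lam = 0.9, 0.1, 0.5
v_len = np.array([0.8, 0.3])  # capsule lengths for the two classes
T_c = np.array([1.0, 0.0])    # one-hot target
L_c = (T_c * np.maximum(0., m_plus - v_len) ** 2
       + lam * (1 - T_c) * np.maximum(0., v_len - m_minus) ** 2)
print(L_c.sum())              # 0.01 + 0.02 = 0.03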
def evaluate(results, num_batches, evaluate_fn):
    num_batches = tools.reduce_sum(num_batches)
    results = tools.collect_results_gpu(results, num_batches.item())
    rank = tools.get_dist_info()[0]
    if rank == 0:
        all_embeds = list(map(lambda r: r[0], results))
        all_labels = list(map(lambda r: r[1], results))
        all_embeds = {
            k: torch.cat(tuple(map(lambda r: r[k], all_embeds)), dim=0)
            for k in all_embeds[0].keys()
        }
        all_labels = torch.cat(all_labels, dim=0)
        metrics = evaluate_fn(all_embeds, all_labels)
        early_stop_criterion = torch.tensor([metrics['criterion']], device='cuda')
    else:
        metrics = None
        early_stop_criterion = torch.tensor([0.], device='cuda')
    # early_stop_criterion is used by all ranks, so broadcast it
    early_stop_criterion = tools.broadcast(early_stop_criterion, 0)
    return metrics, early_stop_criterion
def forward(self, f, b, mask=None):
    """ Contextual attention layer implementation.
    Contextual attention is first introduced in publication:
        Generative Image Inpainting with Contextual Attention, Yu et al.
    Args:
        f: Input feature to match (foreground).
        b: Input feature for match (background).
        mask: Input mask for b, indicating patches not available.
        ksize: Kernel size for contextual attention.
        stride: Stride for extracting patches from b.
        rate: Dilation for matching.
        softmax_scale: Scaled softmax for attention.
    Returns:
        torch.tensor: output
    """
    # get shapes
    raw_int_fs = list(f.size())  # b*c*h*w
    raw_int_bs = list(b.size())  # b*c*h*w

    # extract patches from background with stride and rate
    kernel = 2 * self.rate
    # raw_w is extracted for reconstruction
    raw_w = utils.extract_image_patches(b, ksizes=[kernel, kernel],
                                        strides=[self.rate * self.stride,
                                                 self.rate * self.stride],
                                        rates=[1, 1], padding='same')  # [N, C*k*k, L]
    # raw shape: [N, C, k, k, L], e.g. [4, 192, 4, 4, 1024]
    raw_w = raw_w.view(raw_int_bs[0], raw_int_bs[1], kernel, kernel, -1)
    raw_w = raw_w.permute(0, 4, 1, 2, 3)  # raw shape: [N, L, C, k, k]
    raw_w_groups = torch.split(raw_w, 1, dim=0)

    # downscaling foreground option: downscale both foreground and background
    # for matching, and use the original background for reconstruction.
    f = F.interpolate(f, scale_factor=1. / self.rate, mode='nearest')
    b = F.interpolate(b, scale_factor=1. / self.rate, mode='nearest')
    int_fs = list(f.size())  # b*c*h*w
    int_bs = list(b.size())
    f_groups = torch.split(f, 1, dim=0)  # split tensors along the batch dimension
    # w shape: [N, C*k*k, L]
    w = utils.extract_image_patches(b, ksizes=[self.ksize, self.ksize],
                                    strides=[self.stride, self.stride],
                                    rates=[1, 1], padding='same')
    # w shape: [N, C, k, k, L]
    w = w.view(int_bs[0], int_bs[1], self.ksize, self.ksize, -1)
    w = w.permute(0, 4, 1, 2, 3)  # w shape: [N, L, C, k, k]
    w_groups = torch.split(w, 1, dim=0)

    # process mask
    mask = F.interpolate(mask, scale_factor=1. / self.rate, mode='nearest')
    int_ms = list(mask.size())
    # m shape: [N, C*k*k, L]
    m = utils.extract_image_patches(mask, ksizes=[self.ksize, self.ksize],
                                    strides=[self.stride, self.stride],
                                    rates=[1, 1], padding='same')
    # m shape: [N, C, k, k, L]
    m = m.view(int_ms[0], int_ms[1], self.ksize, self.ksize, -1)
    m = m.permute(0, 4, 1, 2, 3)  # m shape: [N, L, C, k, k]
    m = m[0]  # m shape: [L, C, k, k]
    # mm shape: [L, 1, 1, 1]
    mm = (utils.reduce_mean(m, axis=[1, 2, 3], keepdim=True) == 0.).to(torch.float32)
    mm = mm.permute(1, 0, 2, 3)  # mm shape: [1, L, 1, 1]

    y = []
    offsets = []
    k = self.fuse_k
    scale = self.softmax_scale  # to fit the PyTorch tensor image value range
    fuse_weight = torch.eye(k).view(1, 1, k, k)  # 1*1*k*k
    if self.use_cuda:
        fuse_weight = fuse_weight.cuda()

    for xi, wi, raw_wi in zip(f_groups, w_groups, raw_w_groups):
        '''
        O => output channel as a conv filter
        I => input channel as a conv filter
        xi : separated tensor along batch dimension of front; (B=1, C=128, H=32, W=32)
        wi : separated patch tensor along batch dimension of back; (B=1, O=32*32, I=128, KH=3, KW=3)
        raw_wi : separated tensor along batch dimension of back; (B=1, I=32*32, O=128, KH=4, KW=4)
        '''
        # conv for comparison
        escape_NaN = torch.FloatTensor([1e-4])
        if self.use_cuda:
            escape_NaN = escape_NaN.cuda()
        wi = wi[0]  # [L, C, k, k]
        max_wi = torch.sqrt(utils.reduce_sum(torch.pow(wi, 2) + escape_NaN,
                                             axis=[1, 2, 3], keepdim=True))
        wi_normed = wi / max_wi
        # xi shape: [1, C, H, W], yi shape: [1, L, H, W]
        xi = utils.same_padding(xi, [self.ksize, self.ksize], [1, 1], [1, 1])  # xi: 1*c*H*W
        yi = F.conv2d(xi, wi_normed, stride=1)  # [1, L, H, W]

        # conv implementation for fusing scores, to encourage large patches
        if self.fuse:
            # make all of depth to spatial resolution
            yi = yi.view(1, 1, int_bs[2] * int_bs[3], int_fs[2] * int_fs[3])  # (B=1, I=1, H=32*32, W=32*32)
            yi = utils.same_padding(yi, [k, k], [1, 1], [1, 1])
            yi = F.conv2d(yi, fuse_weight, stride=1)  # (B=1, C=1, H=32*32, W=32*32)
            yi = yi.contiguous().view(1, int_bs[2], int_bs[3], int_fs[2], int_fs[3])  # (B=1, 32, 32, 32, 32)
            yi = yi.permute(0, 2, 1, 4, 3)
            yi = yi.contiguous().view(1, 1, int_bs[2] * int_bs[3], int_fs[2] * int_fs[3])
            yi = utils.same_padding(yi, [k, k], [1, 1], [1, 1])
            yi = F.conv2d(yi, fuse_weight, stride=1)
            yi = yi.contiguous().view(1, int_bs[3], int_bs[2], int_fs[3], int_fs[2])
            yi = yi.permute(0, 2, 1, 4, 3).contiguous()
        yi = yi.view(1, int_bs[2] * int_bs[3], int_fs[2], int_fs[3])  # (B=1, C=32*32, H=32, W=32)

        # softmax to match
        yi = yi * mm
        yi = F.softmax(yi * scale, dim=1)
        yi = yi * mm  # [1, L, H, W]

        offset = torch.argmax(yi, dim=1, keepdim=True)  # 1*1*H*W
        if int_bs != int_fs:
            # Normalize the offset value to match the foreground dimension
            times = float(int_fs[2] * int_fs[3]) / float(int_bs[2] * int_bs[3])
            offset = ((offset + 1).float() * times - 1).to(torch.int64)
        offset = torch.cat([offset // int_fs[3], offset % int_fs[3]], dim=1)  # 1*2*H*W

        # deconv for patch pasting
        wi_center = raw_wi[0]
        # yi = F.pad(yi, [0, 1, 0, 1])    # conv_transpose may need 'same' padding here
        yi = F.conv_transpose2d(yi, wi_center, stride=self.rate, padding=1) / 4.  # (B=1, C=128, H=64, W=64)
        y.append(yi)
        offsets.append(offset)

    y = torch.cat(y, dim=0)  # back to the mini-batch
    y = y.contiguous().view(raw_int_fs)
    return y
def forward(self, f, b, mask=None, ksize=3, stride=1, rate=1, fuse_k=3,
            softmax_scale=10., training=True, fuse=True):
    """ Contextual attention layer implementation.
    Contextual attention is first introduced in publication:
        Generative Image Inpainting with Contextual Attention, Yu et al.
    Args:
        f: Input feature to match (foreground).
        b: Input feature for match (background).
        mask: Input mask for b, indicating patches not available.
        ksize: Kernel size for contextual attention.
        stride: Stride for extracting patches from b.
        rate: Dilation for matching.
        softmax_scale: Scaled softmax for attention.
        training: Indicating if the current graph is training or inference.
    Returns:
        tf.Tensor: output
    """
    # get shapes of foreground (f) and background (b)
    raw_fs = f.shape
    raw_int_fs = list(f.shape)
    raw_int_bs = list(b.shape)

    # extract patches from background with stride and rate
    kernel = 2 * rate
    raw_w = self.extract_image_patches(b, kernel, rate * stride)
    # Reshape raw_w to match pytorch conv weights shape:
    # b x in_ch (h * w) x out_ch (c) x k x k
    raw_w = torch.reshape(raw_w, [raw_int_bs[0], -1, raw_int_bs[1], kernel, kernel])

    # downscaling foreground option: downscale both foreground and background
    # for matching, and use the original background for reconstruction.
    f = F.interpolate(f, scale_factor=1. / rate, mode='nearest')
    b = F.interpolate(b, size=[int(raw_int_bs[2] / rate), int(raw_int_bs[3] / rate)],
                      mode='nearest')

    # get shape of foreground, then split on the batch dimension
    fs = f.shape
    int_fs = list(f.shape)
    f_groups = torch.split(f, 1, dim=0)
    bs = b.shape
    int_bs = list(b.shape)

    # extract w, then reshape to the weight shape of pytorch's functional conv2d
    w = self.extract_image_patches(b, ksize, stride)
    # reshape to b x in_ch (h * w) x out_ch (c) x k x k
    w = torch.reshape(w, [int_fs[0], -1, int_fs[1], ksize, ksize])

    # process mask
    if mask is None:
        mask = torch.zeros([bs[0], 1, bs[2], bs[3]]).cuda()
    else:
        mask = F.interpolate(mask, scale_factor=1. / rate, mode='nearest')
    m = self.extract_image_patches(mask, ksize, stride)
    # make mask have the shape of (b x c x hw x k x k)
    if mask.shape[0] > 1:
        m = torch.reshape(m, [mask.shape[0], 1, -1, ksize, ksize])
    else:
        m = torch.reshape(m, [1, 1, -1, ksize, ksize])
    # create batch for mm
    mm = []
    for i in range(m.shape[0]):
        mm.append(utils.reduce_mean(m[i], axis=[0, 2, 3], keep_dims=True))
    mm = torch.cat(mm)

    w_groups = torch.split(w, 1, dim=0)
    raw_w_groups = torch.split(raw_w, 1, dim=0)
    y = []
    offsets = []
    k = fuse_k
    scale = softmax_scale
    fuse_weight = utils.to_var(torch.reshape(torch.eye(k), [1, 1, k, k]))

    for xi, wi, raw_wi, mi in zip(f_groups, w_groups, raw_w_groups, mm):
        """
        Conv per batch element.
        Variables:
            xi: input to the conv; tensors from the foreground (f_groups)
            wi: weights for matching; image patches from the background (w_groups)
            raw_wi: patches from the background (raw_w_groups)
        """
        # conv for comparison
        wi = wi[0]
        wi_normed = wi / torch.max(torch.sqrt(utils.reduce_sum(wi ** 2, axis=[0, 2, 3])),
                                   torch.FloatTensor([1e-4]).cuda())  # l2 norm
        yi = F.conv2d(xi, wi_normed, stride=1, padding=1)

        # conv implementation for fusing scores, to encourage large patches
        if fuse:
            # b x c x f(hw) x b(hw)
            yi = torch.reshape(yi, [1, 1, fs[2] * fs[3], bs[2] * bs[3]])
            yi = F.conv2d(yi, fuse_weight, stride=1, padding=1)
            yi = torch.reshape(yi, [1, fs[2], fs[3], bs[2], bs[3]])
            yi = yi.permute(0, 2, 1, 4, 3)
            yi = torch.reshape(yi, [1, 1, fs[2] * fs[3], bs[2] * bs[3]])
            yi = F.conv2d(yi, fuse_weight, stride=1, padding=1)
            yi = torch.reshape(yi, [1, fs[3], fs[2], bs[3], bs[2]])
            yi = yi.permute(0, 2, 1, 4, 3)
        yi = torch.reshape(yi, [1, bs[2] * bs[3], fs[2], fs[3]])

        # softmax to match
        yi = yi * mi
        yi = F.softmax(yi * scale, dim=1)
        yi = yi * mi  # mask

        _, offset = torch.max(yi, dim=1)
        offset = torch.stack([offset // fs[3], offset % fs[3]], dim=-1)

        # deconv for patch pasting; paste the center
        wi_center = raw_wi[0]
        yi = F.conv_transpose2d(yi, wi_center, stride=rate, padding=1) / 4.
        y.append(yi)
        offsets.append(offset)

    y = torch.cat(y, dim=0)
    offsets = torch.cat(offsets, dim=0)
    offsets = torch.reshape(offsets, [int_bs[0]] + [2] + int_bs[2:])  # skip channel

    # case 1: visualize optical flow: subtract the current position
    # height
    h_add = utils.to_var(torch.reshape(torch.arange(bs[2]), [1, 1, bs[2], 1]))
    h_add = h_add.expand([bs[0], 1, bs[2], bs[3]])
    # width
    w_add = utils.to_var(torch.reshape(torch.arange(bs[3]), [1, 1, 1, bs[3]]))
    w_add = w_add.expand([bs[0], 1, bs[2], bs[3]])
    # concat on channel
    offsets = offsets - torch.cat([h_add, w_add], dim=1)
    # to flow image
    flow = helper.flow_to_image(offsets.permute(0, 2, 3, 1).data.cpu().numpy())
    flow = torch.from_numpy(flow).permute(0, 3, 1, 2)
    # case 2: visualize which pixels are attended
    # flow = highlight_flow_tf(offsets * tf.cast(mask, tf.int32))
    if rate != 1:
        flow = F.interpolate(flow, scale_factor=rate, mode='nearest')

    out = self.final_layers(y)
    return out, flow
def routing(input, b_IJ):
    '''The routing algorithm.
    Args:
        input: A Tensor with [batch_size, num_caps_l=1152, 1, len(u_i)=8, 1] shape.
    Returns:
        A Tensor of shape [batch_size, num_caps_l_plus_one, len(v_j)=16, 1].
    Notes:
        u_i represents the vector output of capsule i in the layer l, and
        v_j the vector output of capsule j in the layer l+1.
    '''
    # W: [1, num_caps_i, num_caps_j*len_v_j, len_u_j, 1]
    W = tf.get_variable('Weight', shape=(1, 1152, 160, 8, 1), dtype=tf.float32,
                        initializer=tf.random_normal_initializer(stddev=cfg.stddev))
    biases = tf.get_variable('bias', shape=(1, 1, 10, 16, 1))

    # calc u_hat
    # Since tf.matmul is a time-consuming op, a better solution is to use
    # element-wise multiply, reduce_sum and reshape ops instead.
    # Matmul [a, b] x [b, c] is equal to a series of ops: element-wise multiply
    # [a*c, b] * [a*c, b], reduce_sum at axis=1, and reshape to [a, c].
    #
    # tf.tile creates a new tensor by replicating the input `multiples` times:
    # the output tensor's i-th dimension has input.dims(i) * multiples[i]
    # elements, with the values of the input replicated multiples[i] times
    # along the i-th dimension. Example: [a b c d] tiled by [2] gives
    # [a b c d a b c d].
    #
    # input: [batch_size, num_caps_l=1152, 1, len(u_i)=8, 1]
    input = tf.tile(input, [1, 1, 160, 1, 1])
    # validate the input shape
    assert input.get_shape() == [cfg.batch_size, 1152, 160, 8, 1]

    u_hat = tf.reduce_sum(W * input, axis=3, keepdims=True)  # element-wise sum
    u_hat = tf.reshape(u_hat, shape=[-1, 1152, 10, 16, 1])
    # check the size
    assert u_hat.get_shape() == [cfg.batch_size, 1152, 10, 16, 1]

    # During the forward pass, u_hat_stopped == u_hat;
    # during backprop, no gradient passes through it.
    u_hat_stopped = tf.stop_gradient(u_hat, name='stop_gradient')

    for r_iter in range(cfg.iter_routing):
        with tf.variable_scope('iter_' + str(r_iter)):
            # [batch_size, 1152, 10, 1, 1]
            c_IJ = softmax(b_IJ, axis=2)

            if r_iter == cfg.iter_routing - 1:
                # last iteration: use u_hat so gradients can flow back
                s_J = tf.multiply(c_IJ, u_hat)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                assert s_J.get_shape() == [cfg.batch_size, 1, 10, 16, 1]
                v_J = squash(s_J)
                assert v_J.get_shape() == [cfg.batch_size, 1, 10, 16, 1]
            elif r_iter < cfg.iter_routing - 1:
                s_J = tf.multiply(c_IJ, u_hat_stopped)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)

                # Tile v_J from [batch_size, 1, 10, 16, 1]
                # to match u_hat_stopped: [batch_size, 1152, 10, 16, 1]
                v_J_tiled = tf.tile(v_J, [1, 1152, 1, 1, 1])
                u_produce_v = reduce_sum(u_hat_stopped * v_J_tiled, axis=3, keepdims=True)
                assert u_produce_v.get_shape() == [cfg.batch_size, 1152, 10, 1, 1]
                # b_IJ += u_hat_stopped^T * v_J
                b_IJ += u_produce_v
    return v_J
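# A quick numpy verification (illustrative only) of the matmul trick described
# in the comments above: [a, b] @ [b, c] equals tile + element-wise multiply +
# reduce_sum over axis 1 + reshape to [a, c].
import numpy as np

a, b, c = 4, 8, 3
A = np.random.randn(a, b)
B = np.random.randn(b, c)
tiled = np.repeat(A, c, axis=0) * np.tile(B.T, (a, 1))  # [a*c, b] * [a*c, b]
result = tiled.sum(axis=1).reshape(a, c)
assert np.allclose(result, A @ B)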
def discriminator(input, isTrain=True, reuse=False):
    epsilon = 1e-9
    if isTrain:
        with tf.variable_scope('discriminator') as scope:
            if reuse:
                labels = tf.constant(0, shape=[cfg.batch_size, ])
            else:
                labels = tf.constant(1, shape=[cfg.batch_size, ])
            Y = tf.one_hot(labels, depth=2, axis=1, dtype=tf.float32)
            X = input
            if reuse:
                scope.reuse_variables()

            with tf.variable_scope('Conv1_layer'):
                # Conv1, [batch_size, 20, 20, 256]
                conv1 = tf.contrib.layers.conv2d(X, num_outputs=256, kernel_size=9, stride=1, padding='VALID')
                assert conv1.get_shape() == [cfg.batch_size, 20, 20, 256]

            # Primary Capsules layer, return [batch_size, 1152, 8, 1]
            with tf.variable_scope('PrimaryCaps_layer'):
                primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV')
                caps1 = primaryCaps(conv1, kernel_size=9, stride=2)
                assert caps1.get_shape() == [cfg.batch_size, 1152, 8, 1]

            # DigitCaps layer, return [batch_size, 2, 16, 1]
            with tf.variable_scope('DigitCaps_layer'):
                # num_outputs changed to 2 from 10 (real/fake instead of digits)
                digitCaps = CapsLayer(num_outputs=2, vec_len=16, with_routing=True, layer_type='FC')
                caps2 = digitCaps(caps1)

            v_length = tf.sqrt(reduce_sum(tf.square(caps2), axis=2, keepdims=True) + epsilon)

            # Margin loss
            # max_l = max(0, m_plus - ||v_c||)^2
            max_l = tf.square(tf.maximum(0., cfg.m_plus - v_length))
            # max_r = max(0, ||v_c|| - m_minus)^2
            max_r = tf.square(tf.maximum(0., v_length - cfg.m_minus))
            # assert shape changed to [batch_size, 2, 1, 1] from [batch_size, 10, 1, 1]
            assert max_l.get_shape() == [cfg.batch_size, 2, 1, 1]

            # reshape: [batch_size, 2, 1, 1] => [batch_size, 2]
            max_l = tf.reshape(max_l, shape=(cfg.batch_size, -1))
            max_r = tf.reshape(max_r, shape=(cfg.batch_size, -1))

            # T_c is the one-hot target: T_c = Y
            T_c = Y
            # [batch_size, 2], element-wise multiply
            L_c = T_c * max_l + cfg.lambda_val * (1 - T_c) * max_r
            margin_loss = tf.reduce_mean(tf.reduce_sum(L_c, axis=1))
            return margin_loss
def build_arch(self):
    with tf.variable_scope('Conv1_layer'):
        # Conv1, [batch_size, 20, 20, 256]
        conv1 = tf.contrib.layers.conv2d(self.X, num_outputs=256, kernel_size=9, stride=1, padding='VALID')
        assert conv1.get_shape() == [cfg.batch_size, 20, 20, 256]

    # Primary Capsules layer, return [batch_size, 1152, 8, 1]
    with tf.variable_scope('PrimaryCaps_layer'):
        primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV')
        caps1 = primaryCaps(conv1, kernel_size=9, stride=2)
        assert caps1.get_shape() == [cfg.batch_size, 1152, 8, 1]

    # DigitCaps layer, return [batch_size, 10, 16, 1]
    with tf.variable_scope('DigitCaps_layer'):
        digitCaps = CapsLayer(num_outputs=10, vec_len=16, with_routing=True, layer_type='FC')
        self.caps2 = digitCaps(caps1)

    # Decoder structure in Fig. 2
    # 1. Do masking:
    with tf.variable_scope('Masking'):
        # a). calc ||v_c||, then do softmax(||v_c||)
        # [batch_size, 10, 16, 1] => [batch_size, 10, 1, 1]
        self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)
        self.softmax_v = softmax(self.v_length, axis=1)
        assert self.softmax_v.get_shape() == [cfg.batch_size, 10, 1, 1]

        # b). pick out the index of the max softmax value of the 10 capsules
        # [batch_size, 10, 1, 1] => [batch_size] (index)
        self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
        assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1]
        self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size, ))

        # Method 1. mask with the predicted label
        if not cfg.mask_with_y:
            # c). indexing: gather the winning capsule per example, since
            # argmax_idx cannot index the batch dimension directly
            masked_v = []
            for batch_size in range(cfg.batch_size):
                v = self.caps2[batch_size][self.argmax_idx[batch_size], :]
                masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))
            self.masked_v = tf.concat(masked_v, axis=0)
            assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1]
        # Method 2. masking with the true label, default mode
        else:
            # self.masked_v = tf.matmul(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1)), transpose_a=True)
            self.masked_v = tf.multiply(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1)))
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)

    # 2. Reconstruct the MNIST images with 3 FC layers
    # [batch_size, 1, 16, 1] => [batch_size, 16] => [batch_size, 512]
    with tf.variable_scope('Decoder'):
        vector_j = tf.reshape(self.masked_v, shape=(cfg.batch_size, -1))
        fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512)
        assert fc1.get_shape() == [cfg.batch_size, 512]
        fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=1024)
        assert fc2.get_shape() == [cfg.batch_size, 1024]
        self.decoded = tf.contrib.layers.fully_connected(fc2, num_outputs=784, activation_fn=tf.sigmoid)
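# A small numpy illustration (toy shapes, not from the original code) of the
# label masking used above: multiplying the squeezed [batch, 10, 16] capsule
# tensor by the one-hot labels reshaped to [batch, 10, 1] zeroes out every
# capsule except the one for the target class.
import numpy as np

caps = np.random.randn(2, 10, 16)
Y = np.eye(10)[[3, 7]]               # one-hot labels for classes 3 and 7
masked = caps * Y.reshape(-1, 10, 1)
assert np.allclose(masked[0, 3], caps[0, 3])              # target capsule is kept
assert np.allclose(np.delete(masked[0], 3, axis=0), 0.)   # all others are zero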
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.X, self.labels = get_batch_data(cfg.dataset, cfg.batch_size, cfg.num_threads)
            self.Y = tf.one_hot(self.labels, depth=10, axis=1, dtype=tf.float32)  # depth = 10 for 10 classes
            self.build_arch()
            self.loss()
            self.summary()
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer()
            self.train_op = self.optimizer.minimize(self.total_loss, global_step=self.global_step)
        else:  # either testing or validation
            self.X = tf.placeholder(tf.float32, shape=(cfg.batch_size, 28, 28, 1))  # 28x28 pixels, 1 channel
            self.labels = tf.placeholder(tf.int32, shape=(cfg.batch_size, ))
            self.Y = tf.reshape(self.labels, shape=(cfg.batch_size, 10, 1))
            self.build_arch()
    tf.logging.info('Setting up the main structure')

def build_arch(self):
    with tf.variable_scope('Conv1_layer'):
        # Conv1 layer, output: [batch_size, 20, 20, 256]
        conv1 = tf.contrib.layers.conv2d(self.X, num_outputs=256, kernel_size=9, stride=1, padding='VALID')
        assert conv1.get_shape() == [cfg.batch_size, 20, 20, 256]

    # Primary Caps layer
    # Output: [batch_size, 6, 6, 32] of 8-D capsules, i.e. [cfg.batch_size, 1152, 8, 1]
    with tf.variable_scope('PrimaryCaps_layer'):
        primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV')
        caps1 = primaryCaps(conv1, kernel_size=9, stride=2)
        assert caps1.get_shape() == [cfg.batch_size, 1152, 8, 1]

    with tf.variable_scope('DigitCaps_layer'):
        digitCaps = CapsLayer(num_outputs=10, vec_len=16, with_routing=True, layer_type='FC')
        self.caps2 = digitCaps(caps1)

    # Masking: select the capsule vector to feed to the decoder
    with tf.variable_scope('Masking'):
        # calculate ||v_c||, then softmax(||v_c||)
        # [batch_size, 10, 16, 1] => [batch_size, 10, 1, 1]
        self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)
        self.softmax_v = softmax(self.v_length, axis=1)
        assert self.softmax_v.get_shape() == [cfg.batch_size, 10, 1, 1]

        # Pick the index with the max softmax value of the 10 capsules
        # [batch_size, 10, 1, 1] => [batch_size] (index)
        self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
        assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1]
        self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size, ))

        if not cfg.mask_with_y:
            # mask with the predicted label: index the winning capsule per example
            masked_v = []
            for batch_size in range(cfg.batch_size):
                v = self.caps2[batch_size][self.argmax_idx[batch_size], :]
                masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))
            self.masked_v = tf.concat(masked_v, axis=0)
            assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1]
        else:
            # mask with the true label
            self.masked_v = tf.multiply(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1)))
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)

    with tf.variable_scope('Decoder'):
        # Reconstruction with 3 FC layers, as in the sibling implementations
        vector_j = tf.reshape(self.masked_v, shape=(cfg.batch_size, -1))
        fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512)
        fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=1024)
        self.decoded = tf.contrib.layers.fully_connected(fc2, num_outputs=784, activation_fn=tf.sigmoid)
def build_arch(self):
    with tf.variable_scope('Conv1_layer'):
        conv1 = contrib.layers.conv2d(self.X, num_outputs=256, kernel_size=9, stride=1, padding="VALID")
        assert conv1.get_shape() == [cfg.batch_size, 20, 20, 256]

    # Primary Capsules layer, return [batch_size, 1152, 8, 1]
    with tf.variable_scope('PrimaryCaps_layer'):
        primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV')
        caps1 = primaryCaps(conv1, kernel_size=9, stride=2)
        assert caps1.get_shape() == [cfg.batch_size, 1152, 8, 1]

    # DigitCaps layer, return [batch_size, 10, 16, 1]
    with tf.variable_scope('DigitCaps_layer'):
        digitCaps = CapsLayer(num_outputs=10, vec_len=16, with_routing=True, layer_type='FC')
        self.caps2 = digitCaps(caps1)

    # Decoder structure in Fig. 2
    # 1. Do masking:
    with tf.variable_scope("Masking"):
        self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)
        self.softmax_v = softmax(self.v_length, axis=1)
        assert self.softmax_v.get_shape() == [cfg.batch_size, 10, 1, 1]
        self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
        assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1]
        self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size, ))

        # Method 1. mask with the predicted label
        if not cfg.mask_with_y:
            masked_v = []
            for batch_size in range(cfg.batch_size):
                v = self.caps2[batch_size][self.argmax_idx[batch_size], :]
                masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))
            self.masked_v = tf.concat(masked_v, axis=0)
            assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1]
        # Method 2. masking with the true label, default mode
        else:
            self.masked_v = tf.multiply(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1)))
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)

    # 2. Reconstruct the MNIST images with 3 FC layers
    # [batch_size, 1, 16, 1] => [batch_size, 16] => [batch_size, 512]
    with tf.variable_scope('Decoder'):
        fully_connected = contrib.layers.fully_connected
        vector_j = tf.reshape(self.masked_v, shape=(cfg.batch_size, -1))
        fc1 = fully_connected(vector_j, num_outputs=512)
        assert fc1.get_shape() == [cfg.batch_size, 512]
        fc2 = fully_connected(fc1, num_outputs=1024)
        assert fc2.get_shape() == [cfg.batch_size, 1024]
        self.decoded = fully_connected(fc2, num_outputs=784, activation_fn=tf.sigmoid)
def squash(vector):
    vec_squared_norm = reduce_sum(tf.square(vector), -2, keepdims=True)
    scalar_factor = vec_squared_norm / (1 + vec_squared_norm) / tf.sqrt(vec_squared_norm + epsilon)
    vec_squashed = scalar_factor * vector  # element-wise
    return vec_squashed
def build_arch(self): with tf.variable_scope('Conv1_layer'): # Conv1, [batch_size, 20, 20, 256] conv1 = tf.contrib.layers.conv2d(self.X, num_outputs=256, kernel_size=9, stride=1, padding='VALID') assert conv1.get_shape() == [cfg.batch_size, 20, 20, 256] # Primary Capsules layer, return [batch_size, 1152, 8, 1] with tf.variable_scope('PrimaryCaps_layer'): primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV') caps1 = primaryCaps(conv1, kernel_size=9, stride=2) assert caps1.get_shape() == [cfg.batch_size, 1152, 8, 1] # DigitCaps layer, return [batch_size, 10, 16, 1] with tf.variable_scope('DigitCaps_layer'): """changing the num_outputs to 2 from 10""" digitCaps = CapsLayer(num_outputs=2, vec_len=16, with_routing=True, layer_type='FC') self.caps2 = digitCaps(caps1) # Decoder structure in Fig. 2 # 1. Do masking, how: """since we have only two output capsules, we don't need masking because we are not using any reconstruction thus commenting:""" with tf.variable_scope('Masking'): # a). calc ||v_c||, then do softmax(||v_c||) # [batch_size, 10, 16, 1] => [batch_size, 10, 1, 1] self.v_length = tf.sqrt( reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon) self.softmax_v = softmax(self.v_length, axis=1) """changing assert value to be [batch, 2, 1, 1] from [batch, 10, 1, 1]""" assert self.softmax_v.get_shape() == [cfg.batch_size, 2, 1, 1] # b). pick out the index of max softmax val of the 10 caps # [batch_size, 10, 1, 1] => [batch_size] (index) self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1)) assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1] self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size, )) # Method 1. if not cfg.mask_with_y: # c). indexing # It's not easy to understand the indexing process with argmax_idx # as we are 3-dim animal masked_v = [] for batch_size in range(cfg.batch_size): v = self.caps2[batch_size][self.argmax_idx[batch_size], :] masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1))) self.masked_v = tf.concat(masked_v, axis=0) assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1] # Method 2. masking with true label, default mode else: # self.masked_v = tf.matmul(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1)), transpose_a=True) self.masked_v = tf.multiply(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1))) self.v_length = tf.sqrt( reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon) # 2. Reconstructe the MNIST images with 3 FC layers # [batch_size, 1, 16, 1] => [batch_size, 16] => [batch_size, 512] with tf.variable_scope('Decoder'): vector_j = tf.reshape(self.masked_v, shape=(cfg.batch_size, -1)) fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512) assert fc1.get_shape() == [cfg.batch_size, 512] fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=1024) assert fc2.get_shape() == [cfg.batch_size, 1024] self.decoded = tf.contrib.layers.fully_connected( fc2, num_outputs=784, activation_fn=tf.sigmoid)
def build_arch(self):
    with tf.variable_scope('Conv1_layer'):
        # Conv1, return tensor with shape [batch_size, 20, 20, 256]
        '''
        Layer 1: convolutional layer
        Input: 28x28 image (single channel); output: 20x20x256 tensor; parameters: 20,992.
        The convolutional layer detects basic features of the 2D image. In CapsNet it
        has 256 9x9x1 kernels with stride 1 and ReLU activation.
        '''
        conv1 = tf.contrib.layers.conv2d(self.X, num_outputs=256, kernel_size=9, stride=1, padding='VALID')

    with tf.variable_scope('PrimaryCaps_layer'):
        # Primary Capsules layer, return tensor with shape [batch_size, 1152, 8, 1]
        '''
        Layer 2: PrimaryCaps layer
        Input: 20x20x256 tensor; output: 6x6x8x32 tensor; parameters: 5,308,672.
        This layer contains 32 primary capsules that take the basic features detected
        by the convolutional layer and generate combinations of them. The 32 primary
        capsules are essentially similar to the convolutional layer: each capsule
        applies eight 9x9x256 convolutional kernels to the 20x20x256 input, producing
        a 6x6x8 output tensor. With 32 capsules in total, the output is 6x6x8x32.
        '''
        primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV')
        caps1 = primaryCaps(conv1, kernel_size=9, stride=2)

    with tf.variable_scope('DigitCaps_layer'):
        # DigitCaps layer, return shape [batch_size, 10, 16, 1]
        '''
        Layer 3: DigitCaps layer
        Input: 6x6x8x32 tensor; output: 16x10 matrix; parameters: 1,497,600.
        This layer contains 10 digit capsules, one per digit. Each capsule takes a
        6x6x8x32 tensor as input, which can be viewed as 6x6x32 eight-dimensional
        vectors, i.e. 1152 input vectors. Inside the capsule, each input vector is
        mapped from the 8-D input space to the 16-D capsule output space by an
        8x16 weight matrix.
        '''
        digitCaps = CapsLayer(num_outputs=self.num_label, vec_len=16, with_routing=True, layer_type='FC')
        self.caps2 = digitCaps(caps1)

    # Decoder structure in Fig. 2
    # 1. Do masking:
    with tf.variable_scope('Masking'):
        # a). calc ||v_c||, then do softmax(||v_c||)
        # [batch_size, 10, 16, 1] => [batch_size, 10, 1, 1]
        self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)
        # compute the norm of the v vectors, then softmax; for each lower-level
        # capsule i, the coupling weights c_ij sum to 1
        self.softmax_v = softmax(self.v_length, axis=1)
        # assert self.softmax_v.get_shape() == [cfg.batch_size, self.num_label, 1, 1]

        # b). pick out the index of the max softmax value of the 10 capsules
        # [batch_size, 10, 1, 1] => [batch_size] (index)
        self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))  # index of the best prediction
        # assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1]
        self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size, ))

        # Method 1. mask with the predicted label
        if not cfg.mask_with_y:
            # c). indexing: gather the winning capsule per example, since
            # argmax_idx cannot index the batch dimension directly
            masked_v = []
            for batch_size in range(cfg.batch_size):
                v = self.caps2[batch_size][self.argmax_idx[batch_size], :]
                masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))
            self.masked_v = tf.concat(masked_v, axis=0)
            assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1]
        # Method 2. masking with the true label, default mode
        else:
            self.masked_v = tf.multiply(tf.squeeze(self.caps2),
                                        tf.reshape(self.Y, (-1, self.num_label, 1)))
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)

    # 2. Reconstruct the MNIST images with 3 FC layers
    # [batch_size, 1, 16, 1] => [batch_size, 16] => [batch_size, 512]
    with tf.variable_scope('Decoder'):
        vector_j = tf.reshape(self.masked_v, shape=(cfg.batch_size, -1))
        '''
        Layer 4: first fully connected layer
        Input: 16x10; output: 512; parameters: 82,432.
        Each output of the lower level is weighted and fed as input to every neuron
        of the fully connected layer; each neuron also has a bias term. All 16x10
        inputs are fed to each of this layer's 512 neurons.
        '''
        fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512)
        '''
        Layer 5: second fully connected layer
        Input: 512; output: 1024; parameters: 525,312.
        '''
        fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=1024)
        self.decoded = tf.contrib.layers.fully_connected(
            fc2, num_outputs=self.height * self.width * self.channels,
            activation_fn=tf.sigmoid)
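# Quick arithmetic check (illustrative) of the parameter counts quoted in the
# comments above: Conv1 has 256 kernels of size 9x9x1 plus 256 biases; the first
# FC layer maps the 16x10 = 160 masked inputs to 512 units; the second maps 512
# to 1024.
conv1_params = 256 * 9 * 9 * 1 + 256   # 20992
fc1_params = 16 * 10 * 512 + 512       # 82432
fc2_params = 512 * 1024 + 1024         # 525312
assert (conv1_params, fc1_params, fc2_params) == (20992, 82432, 525312)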
def build_arch(self):
    with tf.variable_scope('Conv1_layer'):
        # Conv1, [batch_size, 20, 20, 256]
        self.W = tf.get_variable('W', shape=[9, 9, 1, 256],
                                 initializer=tf.contrib.layers.xavier_initializer())
        self.W = fix(self.W)
        self.biases = tf.get_variable('biases', shape=[256], initializer=tf.zeros_initializer())
        self.biases = fix(self.biases)
        self.conv1 = tf.nn.relu(
            tf.nn.conv2d(self.X, self.W, strides=[1, 1, 1, 1], padding='VALID') + self.biases)
        self.conv1 = fix(self.conv1)
        assert self.conv1.get_shape() == [cfg.batch_size, 20, 20, 256]

    # Primary Capsules layer, return [batch_size, 1152, 8, 1]
    with tf.variable_scope('PrimaryCaps_layer'):
        self.primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV')
        self.caps1 = self.primaryCaps(self.conv1, kernel_size=9, stride=2)
        assert self.caps1.get_shape() == [cfg.batch_size, 1152, 8, 1]

    # DigitCaps layer, return [batch_size, 10, 16, 1]
    with tf.variable_scope('DigitCaps_layer'):
        self.digitCaps = CapsLayer(num_outputs=10, vec_len=16, with_routing=True, layer_type='FC')
        self.caps2 = self.digitCaps(self.caps1)

    # Decoder structure in Fig. 2
    # 1. Do masking:
    with tf.variable_scope('Masking'):
        # a). calc ||v_c||, then do softmax(||v_c||)
        # [batch_size, 10, 16, 1] => [batch_size, 10, 1, 1]
        self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)
        self.softmax_v = softmax(self.v_length, axis=1)
        assert self.softmax_v.get_shape() == [cfg.batch_size, 10, 1, 1]

        # b). pick out the index of the max softmax value of the 10 capsules
        # [batch_size, 10, 1, 1] => [batch_size] (index)
        self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
        assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1]
        self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size, ))

        # Method 1. mask with the predicted label
        if not cfg.mask_with_y:
            # c). indexing: gather the winning capsule per example, since
            # argmax_idx cannot index the batch dimension directly
            masked_v = []
            for batch_size in range(cfg.batch_size):
                v = self.caps2[batch_size][self.argmax_idx[batch_size], :]
                masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))
            self.masked_v = tf.concat(masked_v, axis=0)
            assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1]
        # Method 2. masking with the true label, default mode
        else:
            # self.masked_v = tf.matmul(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1)), transpose_a=True)
            self.masked_v = tf.multiply(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1)))
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)

    # 2. Reconstruct the MNIST images with 3 FC layers
    # [batch_size, 1, 16, 1] => [batch_size, 16] => [batch_size, 512]
    with tf.variable_scope('Decoder'):
        vector_j = tf.reshape(self.masked_v, shape=(cfg.batch_size, -1))
        self.fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512)
        assert self.fc1.get_shape() == [cfg.batch_size, 512]
        self.fc2 = tf.contrib.layers.fully_connected(self.fc1, num_outputs=1024)
        assert self.fc2.get_shape() == [cfg.batch_size, 1024]
        self.decoded = tf.contrib.layers.fully_connected(
            self.fc2, num_outputs=784, activation_fn=tf.sigmoid)
def routing(vote, activation=None, num_outputs=32, out_caps_shape=[4, 4],
            method='EMRouting', num_iter=3, regularizer=None):
    '''Routing-by-agreement algorithm.
    Args:
        alias H = out_caps_shape[0] * out_caps_shape[1].
        vote: [batch_size, num_inputs, num_outputs, H].
        activation: [batch_size, num_inputs, 1, 1].
        num_outputs: ...
        out_caps_shape: ...
        method: method for updating the coupling coefficients between vote and
            pose ['EMRouting', 'DynamicRouting'].
        num_iter: the number of routing iterations.
        regularizer: A (Tensor -> Tensor or None) function; the result of
            applying it on a newly created variable will be added to the
            collection tf.GraphKeys.REGULARIZATION_LOSSES and can be used for
            regularization.
    Returns:
        pose: [batch_size, 1, 1, num_outputs] + out_caps_shape.
        activation: [batch_size, 1, 1, num_outputs].
    '''
    if num_iter == 0:
        # no dynamic routing
        s = reduce_sum(vote, axis=1, keepdims=True)
        pose = squash(s)
        return pose, activation

    vote_stopped = tf.stop_gradient(vote, name="stop_gradient")
    batch_size = vote.shape[0].value
    if method == 'EMRouting':
        shape = vote.get_shape().as_list()[:3] + [1]
        # R: [batch_size, num_inputs, num_outputs, 1]
        R = tf.constant(np.ones(shape, dtype=np.float32) / num_outputs)
        for t_iter in range(num_iter):
            with tf.variable_scope('M-STEP') as scope:
                if t_iter > 0:
                    scope.reuse_variables()
                # There is no need to run the E-step in the last iteration
                if t_iter == num_iter - 1:
                    pose, stddev, activation_prime = M_step(R, activation, vote)
                    break
                else:
                    pose, stddev, activation_prime = M_step(R, activation, vote_stopped)
            with tf.variable_scope('E-STEP'):
                R = E_step(pose, stddev, activation_prime, vote_stopped)
        pose = tf.reshape(pose, shape=[batch_size, 1, 1, num_outputs] + out_caps_shape)
        activation = tf.reshape(activation_prime, shape=[batch_size, 1, 1, -1])
        return pose, activation
    elif method == 'DynamicRouting':
        B = tf.constant(np.zeros([batch_size, vote.shape[1].value, num_outputs, 1, 1],
                                 dtype=np.float32))
        for r_iter in range(num_iter):
            with tf.variable_scope('iter_' + str(r_iter)):
                coef = softmax(B, axis=2)
                if r_iter == num_iter - 1:
                    s = reduce_sum(tf.multiply(coef, vote), axis=1, keepdims=True)
                    pose = squash(s)
                else:
                    s = reduce_sum(tf.multiply(coef, vote_stopped), axis=1, keepdims=True)
                    pose = squash(s)
                    shape = [batch_size, vote.shape[1].value, num_outputs] + out_caps_shape
                    pose = tf.multiply(pose, tf.constant(1., shape=shape))  # broadcast pose to `shape`
                    B += tf.matmul(vote_stopped, pose, transpose_a=True)
        return pose, activation
    else:
        raise Exception('Invalid routing method!', method)
def build_arch(self):
    with tf.variable_scope('Conv1_layer'):
        # Conv1, [batch_size, 20, 20, 256]
        conv1 = tf.contrib.layers.conv2d(self.X, num_outputs=256, kernel_size=9, stride=1, padding='VALID')
        assert conv1.get_shape() == [cfg.batch_size, 20, 20, 256]

    # Primary Capsules layer, return [batch_size, 1152, 8, 1]
    with tf.variable_scope('PrimaryCaps_layer'):
        primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False, layer_type='CONV')
        caps1 = primaryCaps(conv1, kernel_size=9, stride=2)
        assert caps1.get_shape() == [cfg.batch_size, 1152, 8, 1]

    # DigitCaps layer, return [batch_size, 10, 16, 1]
    with tf.variable_scope('DigitCaps_layer'):
        digitCaps = CapsLayer(num_outputs=10, vec_len=16, with_routing=True, layer_type='FC')
        self.caps2 = digitCaps(caps1)

    with tf.variable_scope('Masking'):
        # a). calc ||v_c||, then do softmax(||v_c||)
        # [batch_size, 10, 16, 1] => [batch_size, 10, 1, 1]
        self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)
        self.softmax_v = softmax(self.v_length, axis=1)
        assert self.softmax_v.get_shape() == [cfg.batch_size, 10, 1, 1]

        # b). pick out the index of the max softmax value of the 10 capsules
        # [batch_size, 10, 1, 1] => [batch_size] (index)
        self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
        assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1]
        self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size, ))

        # Method 1. mask with the predicted label
        if not cfg.mask_with_y:
            # c). indexing: gather the winning capsule per example, since
            # argmax_idx cannot index the batch dimension directly
            masked_v = []
            for batch_size in range(cfg.batch_size):
                v = self.caps2[batch_size][self.argmax_idx[batch_size], :]
                # masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))
            # self.masked_v = tf.concat(masked_v, axis=0)
            # assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1]
        # Method 2. masking with the true label, default mode
        else:
            # self.masked_v = tf.matmul(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1)), transpose_a=True)
            # self.masked_v = tf.multiply(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, 10, 1)))
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, keepdims=True) + epsilon)

    with tf.variable_scope('acc'):
        self.labels = tf.to_int32(tf.argmax(self.Y, axis=1))
        correct_prediction = tf.equal(tf.to_int32(self.labels), self.argmax_idx)
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) * 100
def routing(l_input, b_IJ, num_outputs=10, num_dims=16):
    """
    :param l_input: A Tensor with [batch_size, num_caps_l=1152, 1, length(u_i)=8, 1] shape;
        num_caps_l is the number of capsules output by the previous layer
    :param b_IJ: A Tensor with [batch_size, num_caps_l, num_caps_l_plus_1, 1, 1] shape,
        representing the coupling logits between the capsules of the two layers
    :param num_outputs: the number of capsules output by this layer
    :param num_dims: the dimensionality of each capsule
    :return: A Tensor of shape [batch_size, num_caps_l, num_caps_l_plus_1, length(v_j)=16, 1]
        representing the vector output `v_j` in the layer l+1
    Notes:
        u_i represents the vector output of capsule i in the layer l;
        v_j represents the vector output of capsule j in the layer l+1.
        tf.matmul is relatively time-consuming and can be replaced with a series
        of ops. [a, b] @ [b, c] is equivalent to:
        (1) tiling [a, b] to [a*c, b] with np.tile or tf.tile;
        (2) reshaping [b, c] to [b, c*a] and transposing to [c*a, b];
        (3) element-wise multiplying [a*c, b] * [c*a, b];
        (4) reduce_sum at axis=1;
        (5) reshaping to [a, c].
    """
    input_shape = get_shape(l_input)
    W = tf.get_variable('Weight',
                        shape=[1, input_shape[1], num_dims * num_outputs] + input_shape[-2:],
                        dtype=tf.float32,
                        initializer=tf.random_normal_initializer(stddev=cfg.stddev))
    biases = tf.get_variable('bias', shape=(1, 1, num_outputs, num_dims, 1))
    l_input = tf.tile(l_input, [1, 1, num_dims * num_outputs, 1, 1])

    # W has shape [1, 1152, 160, 8, 1]: for each image it relates the 1152 input
    # capsules to the 160 output capsule dimensions.
    # l_input has shape [128, 1152, 1, 8, 1]: 128 images, 1152 capsules per image,
    # 8 dimensions per capsule; it holds the actual values of the layer-l capsules.
    # u_hat has shape [128, 1152, 160, 1, 1], reshaped to [128, 1152, 10, 16, 1]:
    # for each image, the prediction of every layer-l capsule for every
    # layer-(l+1) capsule.
    u_hat = reduce_sum(W * l_input, axis=3, keepdims=True)
    assert u_hat.get_shape() == [128, 1152, 160, 1, 1]
    u_hat = tf.reshape(u_hat, shape=[-1, input_shape[1], num_outputs, num_dims, 1])
    assert u_hat.get_shape() == [128, 1152, 10, 16, 1]
    # assert u_hat.get_shape() == [cfg.batch_size, 1152, 10, 16, 1]

    # In forward, u_hat_stopped = u_hat; in backward, no gradient is passed back
    # from u_hat_stopped to u_hat
    u_hat_stopped = tf.stop_gradient(u_hat, name='stop_gradient')

    # line 3: for r iterations do
    for r_iter in range(cfg.iter_routing):
        with tf.variable_scope('iter_' + str(r_iter)):
            # line 4: => [batch_size, 1152, 10, 1, 1]
            c_IJ = softmax(b_IJ, axis=2)

            # At the last iteration, use `u_hat` in order to receive gradients
            # from the following graph
            if r_iter == cfg.iter_routing - 1:
                # line 5: weighting u_hat with c_IJ, element-wise in the last two dims
                # => [batch_size, 1152, 10, 16, 1]
                s_J = tf.multiply(c_IJ, u_hat)
                # then sum in the second dim, resulting in [batch_size, 1, 10, 16, 1]
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                # assert s_J.get_shape() == [cfg.batch_size, 1, num_outputs, num_dims, 1]
                # line 6: squash using Eq. 1
                v_J = squash(s_J)
                # assert v_J.get_shape() == [cfg.batch_size, 1, 10, 16, 1]
            elif r_iter < cfg.iter_routing - 1:
                # Inner iterations, do not apply backpropagation
                s_J = tf.multiply(c_IJ, u_hat_stopped)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)

                # line 7: reshape & tile v_J from [batch_size, 1, 10, 16, 1] to
                # [batch_size, 1152, 10, 16, 1], then matmul in the last two dims:
                # [16, 1].T x [16, 1] => [1, 1], resulting in [batch_size, 1152, 10, 1, 1]
                v_J_tiled = tf.tile(v_J, [1, input_shape[1], 1, 1, 1])
                u_produce_v = reduce_sum(u_hat_stopped * v_J_tiled, axis=3, keepdims=True)
                # assert u_produce_v.get_shape() == [cfg.batch_size, 1152, 10, 1, 1]
                # b_IJ += tf.reduce_sum(u_produce_v, axis=0, keep_dims=True)
                b_IJ += u_produce_v
    return v_J
def routing(input, b_IJ):
    '''The routing algorithm.

    Args:
        input: A Tensor with shape [batch_size, num_caps_l=128, 1, length(u_i)=8, 1],
               where num_caps_l is the number of capsules in the layer l.
        b_IJ: the raw coupling logits between layer l and layer l+1.

    Returns:
        A Tensor of shape [batch_size, 1, num_caps_l_plus_1, length(v_j)=16, 1]
        representing the vector output `v_j` in the layer l+1.

    Notes:
        u_i represents the vector output of capsule i in the layer l, and
        v_j the vector output of capsule j in the layer l+1.
    '''
    # W: [1, num_caps_i, num_caps_j * len_v_j, len_u_j, 1]
    W = tf.get_variable('Weight', shape=(1, 128, 160, 8, 1), dtype=tf.float32,
                        initializer=tf.random_normal_initializer(stddev=cfg.stddev))
    biases = tf.get_variable('bias', shape=(1, 1, 10, 16, 1))

    # Eq.2, calc u_hat
    # Since tf.matmul is a time-consuming op, a better solution is to use
    # element-wise multiply, reduce_sum and reshape ops instead.
    # Matmul [a, b] x [b, c] is equal to a series of ops: element-wise
    # multiply [a*c, b] * [a*c, b], reduce_sum at axis=1 and reshape to [a, c]
    input = tf.tile(input, [1, 1, 160, 1, 1])
    # assert input.get_shape() == [cfg.batch_size, 128, 160, 8, 1]

    u_hat = reduce_sum(W * input, axis=3, keepdims=True)
    u_hat = tf.reshape(u_hat, shape=[-1, 128, 10, 16, 1])
    assert u_hat.get_shape() == [cfg.batch_size, 128, 10, 16, 1]

    # In forward, u_hat_stopped = u_hat; in backward, no gradient is passed
    # back from u_hat_stopped to u_hat
    u_hat_stopped = tf.stop_gradient(u_hat, name='stop_gradient')

    # line 3: for r iterations do
    for r_iter in range(cfg.iter_routing):
        with tf.variable_scope('iter_' + str(r_iter)):
            # line 4:
            # => [batch_size, 128, 10, 1, 1]
            c_IJ = softmax(b_IJ, axis=2)

            # At the last iteration, use `u_hat` in order to receive gradients
            # from the following graph
            if r_iter == cfg.iter_routing - 1:
                # line 5:
                # weighting u_hat with c_IJ, element-wise in the last two dims
                # => [batch_size, 128, 10, 16, 1]
                s_J = tf.multiply(c_IJ, u_hat)
                # then sum in the second dim, resulting in [batch_size, 1, 10, 16, 1]
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                assert s_J.get_shape() == [cfg.batch_size, 1, 10, 16, 1]

                # line 6: squash using Eq.1
                v_J = squash(s_J)
                assert v_J.get_shape() == [cfg.batch_size, 1, 10, 16, 1]
            elif r_iter < cfg.iter_routing - 1:  # inner iterations, no backpropagation
                s_J = tf.multiply(c_IJ, u_hat_stopped)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)

                # line 7:
                # tile v_J from [batch_size, 1, 10, 16, 1] to [batch_size, 128, 10, 16, 1],
                # then "matmul" in the last two dims: [16, 1].T x [16, 1] => [1, 1],
                # resulting in [batch_size, 128, 10, 1, 1]
                v_J_tiled = tf.tile(v_J, [1, 128, 1, 1, 1])
                u_produce_v = reduce_sum(u_hat_stopped * v_J_tiled,
                                         axis=3, keepdims=True)
                assert u_produce_v.get_shape() == [cfg.batch_size, 128, 10, 1, 1]

                # b_IJ += tf.reduce_sum(u_produce_v, axis=0, keep_dims=True)
                b_IJ += u_produce_v

    return v_J
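# `squash` (Eq. 1 in Sabour et al.) is called throughout but not defined in
# these snippets. A minimal sketch consistent with the shapes used above,
# where the capsule vector lives on axis -2 of the [..., 16, 1] layout; the
# exact form is an assumption based on the paper, not copied from the repo:
def squash(vector, axis=-2):
    # squared norm per capsule, kept broadcastable
    squared_norm = reduce_sum(tf.square(vector), axis=axis, keepdims=True)
    # v = ||s||^2 / (1 + ||s||^2) * s / ||s||, with epsilon for stability
    scalar_factor = squared_norm / (1. + squared_norm) / tf.sqrt(squared_norm + epsilon)
    return scalar_factor * vector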
def cnn(self):
    """CNN model."""
    embedding_inputs = self.input_embedding()
    filter_sizes = [[1, 300], [2, 300], [3, 300], [5, 300]]
    global all_conv
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("cnn%s" % filter_size[0]):
            # filter_shape = [filter_size[0], cfg.embedding_dim, 1, cfg.num_filters]
            filter_shape = [filter_size[0], cfg.embedding_dim, 1, filter_size[1]]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
            conv = tf.nn.conv2d(embedding_inputs, W, strides=[1, 1, 1, 1],
                                padding="VALID", name="conv")
            conv = tf.reshape(conv, shape=[-1, filter_size[1], conv.shape[1], 1])
            if i == 0:
                all_conv = conv
            else:
                all_conv = tf.concat([all_conv, conv], axis=2)

    digitCaps = CapsLayer(num_outputs=cfg.num_classes, vec_len=cfg.vec_len,
                          with_routing=True, layer_type='FC')
    self.caps2 = digitCaps(all_conv)
    print("self.caps2", self.caps2)
    # self.cap_flatten = tf.reshape(self.caps2, [-1, cfg.num_classes * cfg.vec_len])
    # flatten into a num_filters_total-dim feature vector
    # print("self.cap_flatten", self.cap_flatten.shape)

    with tf.variable_scope('Masking'):
        # a). calc ||v_c||, then do softmax(||v_c||)
        # [batch_size, 10, 16, 1] => [batch_size, 10, 1, 1]
        self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2),
                                           axis=2, keepdims=True) + epsilon)
        # print("self.v_length", self.v_length)  # the norm of each capsule vector v
        self.softmax_v = softmax(self.v_length, axis=1)
        # print("self.softmax_v", self.softmax_v)
        # softmax over the class capsules, so the lengths form a distribution
        # assert self.softmax_v.get_shape() == [cfg.batch_size, self.num_label, 1, 1]

        # b). pick out the index of the max softmax value of the 10 caps
        # [batch_size, 10, 1, 1] => [batch_size] (index)
        self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
        # print("self.argmax_idx", self.argmax_idx)  # index of the best prediction
        # assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1]
        self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size,))
        # print("self.argmax_idx", self.argmax_idx)

        # Method 1. masking with the predicted label
        if not cfg.mask_with_y:
            self.masked_v = tf.reshape(self.caps2, (-1, cfg.num_classes, cfg.vec_len))
            # c). the alternative per-sample indexing with argmax_idx:
            # masked_v = []
            # for batch_size in range(cfg.batch_size):
            #     v = self.caps2[batch_size][self.argmax_idx[batch_size], :]
            #     masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))
            # self.masked_v = tf.concat(masked_v, axis=0)
            # assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1]
        # Method 2. masking with the true label, default mode
        else:
            self.masked_v = tf.multiply(tf.squeeze(self.caps2),
                                        tf.reshape(self.input_y, (-1, cfg.num_classes, 1)))
            # Note: during training this keeps only the correct DigitCap
            # vector and masks out all the incorrect DigitCaps.
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2),
                                               axis=2, keepdims=True) + epsilon)
            print("self.masked_v2", self.masked_v)
            # print("self.v_length2", self.v_length)
    # 2. Classification head (the original CapsNet reconstructs the MNIST
    # images with 3 FC layers here; this model uses a single dense layer)
    # [batch_size, 1, 16, 1] => [batch_size, 16] => [batch_size, 512]
    with tf.name_scope("score"):
        vector_j = tf.reshape(self.masked_v, shape=(cfg.batch_size, -1))
        self.logits = tf.layers.dense(vector_j, cfg.num_classes, name='fc2')
        # self.y_pred = tf.contrib.layers.fully_connected(vector_j,
        #                                                 num_outputs=cfg.num_classes,
        #                                                 activation_fn=tf.sigmoid)
        # output layer, the classifier
        # self.logits = tf.layers.dense(cur_layer, cfg.num_classes, name='fc2')
        self.logits_softmax = tf.nn.softmax(self.logits)
        # self.logits1 = tf.nn.local_response_normalization(self.logits, dim=0)
        # print("self.logits", self.logits.shape)
        self.y_pred = tf.argmax(self.logits_softmax, 1)  # predicted class
        # print("self.y_pred", self.y_pred.shape)

    with tf.name_scope("loss"):
        # loss: margin loss plus a reconstruction-style term
        # (a cross-entropy alternative is kept commented below)
        # 1. The margin loss
        # [batch_size, 10, 1, 1]
        # max_l = max(0, m_plus - ||v_c||)^2
        max_l = tf.square(tf.maximum(0., cfg.m_plus - self.v_length))
        # max_r = max(0, ||v_c|| - m_minus)^2
        max_r = tf.square(tf.maximum(0., self.v_length - cfg.m_minus))
        # The loss for the correct DigitCap is zero when it predicts the true
        # label with probability above 0.9, and non-zero below 0.9.
        assert max_l.get_shape() == [cfg.batch_size, cfg.num_classes, 1, 1]

        # reshape: [batch_size, 10, 1, 1] => [batch_size, 10]
        max_l = tf.reshape(max_l, shape=(cfg.batch_size, -1))
        max_r = tf.reshape(max_r, shape=(cfg.batch_size, -1))

        # calc T_c: [batch_size, 10]
        # T_c = Y, is my understanding correct? Try it.
        T_c = self.input_y
        # [batch_size, 10], element-wise multiply
        L_c = T_c * max_l + cfg.lambda_val * (1 - T_c) * max_r
        self.margin_loss = tf.reduce_mean(tf.reduce_sum(L_c, axis=1))

        # 2. The reconstruction loss
        # print("self.input_y", self.input_y)
        # orgin = tf.reshape(self.input_y, shape=(cfg.batch_size, -1))
        # print("self.y_pred", self.y_pred)
        # print("orgin", orgin)
        squared = tf.square(self.logits_softmax - self.input_y)
        self.reconstruction_err = tf.reduce_mean(squared)

        # 3. Total loss
        # The paper uses the sum of squared errors as the reconstruction
        # error, but we have used reduce_mean in `# 2. The reconstruction
        # loss` to calculate the mean squared error. To keep in line with
        # the paper, the regularization scale should be 0.0005 * 10 = 0.005
        self.loss = self.margin_loss + cfg.regularization_scale * self.reconstruction_err
        # cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
        # self.loss = tf.reduce_mean(cross_entropy)

    with tf.name_scope("optimize"):
        # optimizer
        self.optim = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate).minimize(self.loss)

    with tf.name_scope("accuracy"):
        correct_pred = tf.equal(self.y_pred, tf.argmax(self.input_y, 1))
        self.acc = tf.reduce_mean(tf.cast(correct_pred, "float"), name="accuracy")
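# The margin loss above, condensed into a standalone helper for clarity.
# A sketch only: `margin_loss` is a hypothetical name, and the defaults
# (m_plus=0.9, m_minus=0.1, lambda_val=0.5) are the values used by Sabour
# et al., assumed to match cfg here:
def margin_loss(v_length, T_c, m_plus=0.9, m_minus=0.1, lambda_val=0.5):
    # v_length: [batch_size, num_classes, 1, 1]; T_c: one-hot [batch_size, num_classes]
    max_l = tf.reshape(tf.square(tf.maximum(0., m_plus - v_length)),
                       (cfg.batch_size, -1))
    max_r = tf.reshape(tf.square(tf.maximum(0., v_length - m_minus)),
                       (cfg.batch_size, -1))
    # present classes are pushed above m_plus, absent ones below m_minus
    L_c = T_c * max_l + lambda_val * (1 - T_c) * max_r
    return tf.reduce_mean(tf.reduce_sum(L_c, axis=1))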
def build_arch(self):
    with tf.variable_scope('Test'):
        self.testConst = tf.constant(1.0, name='testConst')

    with tf.variable_scope('Conv1_layer'):
        # Conv1, [batch_size, 20, 20, 256]
        print('shape of self.X:', self.X.shape)
        conv1 = tf.contrib.layers.conv2d(self.X, num_outputs=256,
                                         kernel_size=cfg.image_size - 19,  # = 9 for 28x28 MNIST
                                         stride=1, padding='VALID')
        print('shape of conv1:', conv1.get_shape())
        assert conv1.get_shape() == [cfg.batch_size, 20, 20, 256]

    # Primary Capsules layer, return [batch_size, 1152, 8, 1]
    with tf.variable_scope('PrimaryCaps_layer'):
        primaryCaps = CapsLayer(num_outputs=32, vec_len=8,
                                with_routing=False, layer_type='CONV')
        caps1 = primaryCaps(conv1, kernel_size=9, stride=2)
        assert caps1.get_shape() == [cfg.batch_size, 1152, 8, 1]

    # DigitCaps layer, return [batch_size, 10, 16, 1]
    with tf.variable_scope('DigitCaps_layer'):
        digitCaps = CapsLayer(num_outputs=10, vec_len=16,
                              with_routing=True, layer_type='FC')
        self.caps2 = digitCaps(caps1)

        # Expose routing internals for debugging/visualization
        self.v_J = digitCaps.v_J
        self.W = digitCaps.W
        self.b_IJ = digitCaps.b_IJ
        self.s_J = digitCaps.s_J
        self.c_IJ = digitCaps.c_IJ
        self.u_hat = digitCaps.u_hat
        self.biases = digitCaps.biases

    # Decoder structure in Fig. 2
    # 1. Do masking, how:
    with tf.variable_scope('Masking'):
        # a). calc ||v_c||, then do softmax(||v_c||)
        # [batch_size, 10, 16, 1] => [batch_size, 10, 1, 1]
        self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2),
                                           axis=2, keepdims=True) + epsilon)
        self.softmax_v = softmax(self.v_length, axis=1)
        assert self.softmax_v.get_shape() == [cfg.batch_size, 10, 1, 1]

        # b). pick out the index of the max softmax value of the 10 caps
        # [batch_size, 10, 1, 1] => [batch_size] (index)
        self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
        assert self.argmax_idx.get_shape() == [cfg.batch_size, 1, 1]
        self.argmax_idx = tf.reshape(self.argmax_idx, shape=(cfg.batch_size, ))

        # Method 1. masking with the predicted label
        if not cfg.mask_with_y:
            # c). indexing
            # The indexing with argmax_idx is not easy to follow,
            # as we are 3-dim animals
            masked_v = []
            for batch_size in range(cfg.batch_size):
                v = self.caps2[batch_size][self.argmax_idx[batch_size], :]
                masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))
            self.masked_v = tf.concat(masked_v, axis=0)
            assert self.masked_v.get_shape() == [cfg.batch_size, 1, 16, 1]
        # Method 2. masking with the true label, default mode
        else:
            # self.masked_v = tf.matmul(tf.squeeze(self.caps2),
            #                           tf.reshape(self.Y, (-1, 10, 1)), transpose_a=True)
            self.masked_v = tf.multiply(tf.squeeze(self.caps2),
                                        tf.reshape(self.Y, (-1, 10, 1)))
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2),
                                               axis=2, keepdims=True) + epsilon)

    # 2. Reconstruct the MNIST images with 3 FC layers
    # [batch_size, 1, 16, 1] => [batch_size, 16] => [batch_size, 512]
    with tf.variable_scope('Decoder'):
        vector_j = tf.reshape(self.masked_v, shape=(cfg.batch_size, -1))
        fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512)
        assert fc1.get_shape() == [cfg.batch_size, 512]
        fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=1024)
        assert fc2.get_shape() == [cfg.batch_size, 1024]
        self.decoded = tf.contrib.layers.fully_connected(
            fc2, num_outputs=cfg.image_size_flatten, activation_fn=tf.sigmoid)
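# The decoder above is trained against the input image. A minimal sketch of
# the reconstruction term following the paper, assuming self.X, self.decoded,
# self.margin_loss and cfg.regularization_scale exist as in the snippets
# above (`build_loss` is a hypothetical name, not part of the repo):
def build_loss(self):
    origin = tf.reshape(self.X, shape=(cfg.batch_size, -1))
    squared = tf.square(self.decoded - origin)
    self.reconstruction_err = tf.reduce_mean(squared)
    # scaled down so the reconstruction term does not dominate the margin loss
    self.total_loss = self.margin_loss + cfg.regularization_scale * self.reconstruction_err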
def forward(self, b, f, mask=None):
    """
    :param b: Input feature for match (background) - known region.
    :param f: Input feature to match (foreground) - missing region.
    :param mask: Input mask for b, indicating patches not available.
    :return: (y, flow) - the reconstructed foreground features and a
             visualization of the matching offsets as optical flow.
    """
    # get shapes
    f_shape_raw = list(f.size())  # batch_size * c * h * w
    b_shape_raw = list(b.size())  # batch_size * c * h * w
    kernel_size = 2 * self.rate

    # extract patches from background with stride, padding and dilation;
    # raw_w is extracted for reconstruction
    raw_w = self.extract_patches(b, kernel_size, self.rate * self.stride,
                                 self.dilation, padding='valid')  # [batch_size, C*k*k, L]
    raw_w = raw_w.view(b_shape_raw[0], b_shape_raw[1], kernel_size, kernel_size, -1)
    raw_w = raw_w.permute(0, 4, 1, 2, 3)  # raw_w shape: [batch_size, L, C, k, k]
    # tuple of tensors with size [1, L, C, k, k], len = batch_size
    raw_w_groups = torch.split(raw_w, 1, dim=0)

    # downscaling foreground option: downscale both foreground and background
    # for matching, and use the original background for reconstruction
    f = F.interpolate(f, scale_factor=1. / self.rate, mode='nearest')
    b = F.interpolate(b, scale_factor=1. / self.rate, mode='nearest')
    f_shape = list(f.size())  # b*c*h*w
    b_shape = list(b.size())

    # split tensors along the batch dimension;
    # tuple of tensors with size [1, C, h, w], len = batch_size
    f_groups = torch.split(f, 1, dim=0)

    # w shape: [N, C*k*k, L]
    w = self.extract_patches(b, self.ksize, self.stride, 1, padding='same')
    w = w.view(b_shape[0], b_shape[1], self.ksize, self.ksize, -1)  # [N, C, k, k, L]
    w = w.permute(0, 4, 1, 2, 3)  # w shape: [N, L, C, k, k]
    w_groups = torch.split(w, 1, dim=0)

    if mask is None:
        mask = torch.zeros(f_shape[0], 1, f_shape[2], f_shape[3])
        if self.device is not None:
            mask = mask.to(self.device)
    else:
        mask_scale = mask.size()[3] // f_shape[3]
        # downscale to match the f shape
        mask = F.interpolate(mask, scale_factor=1 / mask_scale, mode='nearest')
        # mask = F.avg_pool2d(mask, kernel_size=4, stride=mask_scale)

    m_shape = list(mask.size())  # c * h * w
    m = self.extract_patches(mask, self.ksize, self.stride, 1,
                             padding='same')  # [batch_size, k*k, L]
    m = m.view(m_shape[0], m_shape[1], self.ksize, self.ksize, -1)  # [batch_size, 1, k, k, L]
    m = m.permute(0, 4, 1, 2, 3)  # m shape: [batch_size, L, C, k, k]
    # m = m[0]  # m shape: [L, C, k, k]

    # 0 for patches where all values are 0,
    # 1 for patches with non-zero mean
    # mm shape: [batch_size, L, 1, 1, 1]
    mm = (reduce_mean(m, axis=[2, 3, 4], keepdim=True) == 1.).to(torch.float32)
    mm = mm.permute(0, 2, 1, 3, 4)  # mm shape: [batch_size, 1, L, 1, 1]

    y = []
    offsets = []
    k = self.fuse_k
    scale = self.softmax_scale  # to fit the PyTorch tensor image value range
    fuse_weight = torch.eye(k).view(1, 1, k, k)  # diagonal matrix, 1*1*k*k
    if self.device:
        fuse_weight = fuse_weight.to(self.device)
    EPS = torch.FloatTensor([1e-4]).to(self.device)

    for xi, wi, raw_wi, mi in zip(f_groups, w_groups, raw_w_groups, mm):
        """
        O => output channel as a conv filter
        I => input channel as a conv filter
        xi     : separated tensor along batch dimension of front; (B=1, C=128, H=32, W=32)
        wi     : separated patch tensor along batch dimension of back; (B=1, O=32*32, I=128, KH=3, KW=3)
        raw_wi : separated tensor along batch dimension of back; (B=1, I=32*32, O=128, KH=4, KW=4)
        """
        # normalize the weight tensor
        wi = wi.squeeze(0)
        wi_norm = torch.sqrt(reduce_sum(torch.pow(wi, 2) + EPS,
                                        axis=[1, 2, 3], keepdim=True))
        wi_normed = wi / wi_norm

        # xi shape: [1, C, H, W], yi shape: [1, L, H, W]
        xi_pad = same_padding(xi.shape[0], xi.shape[1],
                              [self.ksize, self.ksize], [1, 1], [1, 1])
        yi = F.conv2d(xi, wi_normed, stride=1, padding=xi_pad)  # [1, L, H, W]

        # conv implementation for fusing scores, to encourage large patches
        if self.fuse:
            # make all of depth to spatial resolution
            # convolution with a diagonal-shaped kernel, pass 1
            yi = yi.view(1, 1, b_shape[2] * b_shape[3],
                         f_shape[2] * f_shape[3])  # (B=1, I=1, H=32*32, W=32*32)
            yi_pad = same_padding(yi.shape[0], yi.shape[1], [k, k], [1, 1], [1, 1])
            yi = F.conv2d(yi, fuse_weight, stride=1, padding=yi_pad)  # (B=1, C=1, H=32*32, W=32*32)

            # convolution with a diagonal-shaped kernel, pass 2
            yi = yi.contiguous().view(1, b_shape[2], b_shape[3],
                                      f_shape[2], f_shape[3])  # (B=1, 32, 32, 32, 32)
            yi = yi.permute(0, 2, 1, 4, 3)
            yi = yi.contiguous().view(1, 1, b_shape[2] * b_shape[3],
                                      f_shape[2] * f_shape[3])
            yi_pad = same_padding(yi.shape[0], yi.shape[1], [k, k], [1, 1], [1, 1])
            yi = F.conv2d(yi, fuse_weight, stride=1, padding=yi_pad)
            yi = yi.contiguous().view(1, b_shape[3], b_shape[2],
                                      f_shape[3], f_shape[2])
            yi = yi.permute(0, 2, 1, 4, 3).contiguous()

        yi = yi.view(1, b_shape[2] * b_shape[3],
                     f_shape[2], f_shape[3])  # (B=1, C=32*32, H=32, W=32)

        # softmax to match
        yi = yi * mi
        yi = F.softmax(yi * scale, dim=1)
        yi = yi * mi  # [1, L, H, W]

        offset = torch.argmax(yi, dim=1, keepdim=True)  # 1*1*H*W
        if b_shape != f_shape:
            # normalize the offset value to match the foreground dimension
            times = float(f_shape[2] * f_shape[3]) / float(b_shape[2] * b_shape[3])
            offset = ((offset + 1).float() * times - 1).to(torch.int64)
        # decompose the flat index into (row, col); both use f_shape[3] since
        # the offset is in foreground scale at this point
        offset = torch.cat([offset // f_shape[3], offset % f_shape[3]], dim=1)  # 1*2*H*W

        # deconv for patch pasting
        wi_center = raw_wi[0]
        # yi = F.pad(yi, [0, 1, 0, 1])  # conv_transpose may need same padding here
        yi = F.conv_transpose2d(yi, wi_center, stride=self.rate,
                                padding=1) / 4.  # (B=1, C=128, H=64, W=64)
        y.append(yi)
        offsets.append(offset)

    y = torch.cat(y, dim=0)  # back to the mini-batch
    y = y.contiguous().view(f_shape_raw)
    offsets = torch.cat(offsets, dim=0)
    offsets = offsets.view(f_shape[0], 2, *f_shape[2:])

    # case 1: visualize optical flow; subtract the current position
    h_add = torch.arange(f_shape[2]).view([1, 1, f_shape[2], 1]).expand(
        f_shape[0], -1, -1, f_shape[3])
    w_add = torch.arange(f_shape[3]).view([1, 1, 1, f_shape[3]]).expand(
        f_shape[0], -1, f_shape[2], -1)
    ref_coordinate = torch.cat([h_add, w_add], dim=1)
    ref_coordinate = ref_coordinate.to(self.device)

    offsets = offsets - ref_coordinate
    flow = torch.from_numpy(self.flow_to_image(
        offsets.permute(0, 2, 3, 1).cpu().data.numpy())) / 255.
    flow = flow.permute(0, 3, 1, 2)
    flow = flow.to(self.device)

    if self.rate != 1:
        flow = F.interpolate(flow, scale_factor=self.rate * 4, mode='nearest')

    return y, flow
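# `extract_patches` is used above but not defined in this snippet. A minimal
# sketch under the assumption that it mirrors TensorFlow's
# tf.extract_image_patches via torch.nn.functional.unfold; the symmetric
# 'same' padding here is an approximation for stride > 1:
import torch.nn.functional as F

def extract_patches(x, kernel_size, stride, dilation, padding='same'):
    if padding == 'same':
        # pad so output spatial size roughly matches input / stride
        pad = (dilation * (kernel_size - 1)) // 2
        x = F.pad(x, [pad, pad, pad, pad])
    # returns [batch_size, C*k*k, L] columns, matching the .view(...) reshapes above
    return F.unfold(x, kernel_size=kernel_size, stride=stride, dilation=dilation)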