def _shortcut(input, residual):
    """Sum `input` with `residual`, projecting `input` first when shapes differ.

    The static shapes of both tensors are compared along ROW_AXIS, COL_AXIS
    and CHANNEL_AXIS (module-level constants). If `input` is spatially larger
    than `residual`, or the channel counts differ, `input` is passed through a
    bias-free 1x1 convolution whose strides and filter count make it match
    `residual`. Otherwise `input` is used as-is.

    Returns:
        The result of `tf.add(shortcut, residual)`.
    """
    in_shape = K.int_shape(input)
    res_shape = K.int_shape(residual)

    # Spatial down-sampling factors. These should already be whole numbers if
    # the network architecture is configured correctly.
    row_stride = int(round(in_shape[ROW_AXIS] / res_shape[ROW_AXIS]))
    col_stride = int(round(in_shape[COL_AXIS] / res_shape[COL_AXIS]))
    channels_match = in_shape[CHANNEL_AXIS] == res_shape[CHANNEL_AXIS]

    # Identity shortcut: nothing to adapt.
    if row_stride <= 1 and col_stride <= 1 and channels_match:
        return tf.add(input, residual)

    # Projection shortcut: 1x1 convolution to match the residual's shape.
    projected = tf.layers.conv2d(
        input,
        filters=res_shape[CHANNEL_AXIS],
        use_bias=False,
        kernel_size=(1, 1),
        strides=(row_stride, col_stride),
        padding="valid",
        kernel_initializer=kernel_initializer,
        kernel_regularizer=kernel_regularizer,
    )
    return tf.add(projected, residual)
def attention_block(x, gating, inter_shape):
    """Weight the feature map `x` by an attention map computed from `x` and `gating`.

    Steps, in order:
      1. `x` goes through a 2x2, stride-2 convolution with `inter_shape` filters.
      2. `gating` goes through a 1x1 convolution and a transposed convolution
         whose strides are the size ratio between (1) and `gating`.
      3. Both are added, passed through ReLU, a 1-filter 1x1 convolution and a
         sigmoid.
      4. The sigmoid map is up-sampled by the size ratio between `x` and the
         map, expanded with `expend_as` (defined elsewhere; presumably repeats
         the single channel -- TODO confirm) and multiplied with `x`.
      5. A 1x1 convolution followed by batch normalization yields the result.

    Shapes are indexed as [1], [2] for the spatial axes and [3] for channels,
    i.e. the code assumes 4-D channels-last tensors -- TODO confirm with callers.

    :param x: input feature map
    :param gating: gating feature map
    :param inter_shape: number of filters of the intermediate convolutions
    :return: batch-normalized, attention-weighted feature map
    """
    x_dims = K.int_shape(x)
    gate_dims = K.int_shape(gating)

    # Down-sample x.
    theta = layers.Conv2D(inter_shape, (2, 2), strides=(2, 2), padding='same')(x)
    theta_dims = K.int_shape(theta)

    # Bring the gating signal to theta's spatial size.
    phi = layers.Conv2D(inter_shape, (1, 1), padding='same')(gating)
    gate_strides = (theta_dims[1] // gate_dims[1], theta_dims[2] // gate_dims[2])
    gate_resized = layers.Conv2DTranspose(
        inter_shape, (3, 3), strides=gate_strides, padding='same')(phi)

    # Additive attention followed by a single-channel sigmoid map.
    summed = layers.add([gate_resized, theta])
    activated = layers.Activation('relu')(summed)
    logits = layers.Conv2D(1, (1, 1), padding='same')(activated)
    coeffs = layers.Activation('sigmoid')(logits)
    coeff_dims = K.int_shape(coeffs)

    # Resize the map back to x's spatial size and channel count.
    up_factors = (x_dims[1] // coeff_dims[1], x_dims[2] // coeff_dims[2])
    weights = layers.UpSampling2D(size=up_factors)(coeffs)
    weights = expend_as(weights, x_dims[3])

    gated = layers.multiply([weights, x])
    projected = layers.Conv2D(x_dims[3], (1, 1), padding='same')(gated)
    return layers.BatchNormalization()(projected)
def assertOutputShape(self, output):
    """Assert that the static shape of `output` equals `self.batch_output_shape`.

    Per the original note, for a stack whose output size is `o` the tensor is
    expected to have shape (B, o).

    Raises:
        AssertionError: if the shapes differ.
    """
    static_shape = K.int_shape(output)
    assert static_shape == self.batch_output_shape
def sampling(args):
    """Draw `z_mean + exp(0.5 * z_log_var) * epsilon` with random normal `epsilon`.

    Args:
        args: a pair `(z_mean, z_log_var)` of tensors.

    Returns:
        A tensor computed from `z_mean`, `z_log_var` and a noise tensor whose
        shape is (dynamic size of axis 0, static size of axis 1) of `z_mean`.
    """
    mean, log_var = args
    n_samples = K.shape(mean)[0]      # dynamic, so the batch size may be unknown
    latent_size = K.int_shape(mean)[1]
    # K.random_normal defaults to mean=0 and std=1.0.
    noise = K.random_normal(shape=(n_samples, latent_size))
    scale = K.exp(0.5 * log_var)
    return mean + scale * noise
def resnet_18(input_tensor, repetitions=(2, 2, 2, 2)):
    """Run `input_tensor` through a stack of residual stages and flatten the result.

    Args:
        input_tensor: Tensor fed to the first residual stage.
        repetitions: Number of block repetitions for each stage. The first
            stage uses 64 filters and the filter count doubles for every
            following stage.

    Returns:
        The output of `tf.layers.Flatten` applied after an average pooling
        whose window covers the full (ROW_AXIS, COL_AXIS) extent of the last
        feature map.
    """
    net = input_tensor
    for stage_idx, n_reps in enumerate(repetitions):
        stage = _residual_block(
            filters=64 * 2 ** stage_idx,
            repetitions=n_reps,
            is_first_layer=(stage_idx == 0),
        )
        net = stage(net)

    # Final batch-norm + ReLU before pooling.
    net = _bn_relu(net)

    # Pool over the whole spatial extent, then flatten.
    feat_shape = K.int_shape(net)
    window = (feat_shape[ROW_AXIS], feat_shape[COL_AXIS])
    pooled = tf.layers.AveragePooling2D(pool_size=window, strides=(1, 1))(net)
    return tf.layers.Flatten()(pooled)
def __init__(self, input):
    """Build the front-end graph on `input`.

    Stores the raw input in `self.input`, the pre-processed (and optionally
    augmented) tensor in `self.aug_out`, and the final features in
    `self.output`.

    When `config.test_aug_times` is set, the input must have a leading
    dimension of 1: it is pre-processed once without augmentation, replicated
    through `replicate_to_batch`, augmented, and both results are concatenated
    along axis 0 with the un-augmented sample first.
    """
    self.input = input
    net = input

    augmentation = {}
    if config.test_aug_times:
        # Test-time augmentation: keep the first sample un-augmented.
        assert net.shape[0] == 1, 'Test augmentation only with bs=1'
        plain_input = net
        net = replicate_to_batch(net, config.test_aug_times - 1)
        augmentation = {
            'horizontal_flip': config.horizontal_flip,
            'crop_pixels': config.crop_pixels,
        }
        plain_out = self.preprocess_and_augment(plain_input, aug_opts={})

    # With exactly two test-time copies the augmented one is always flipped.
    flip_prob = 1 if config.test_aug_times == 2 else 0.5
    net = self.preprocess_and_augment(net, aug_opts=augmentation,
                                      flip_prob=flip_prob)
    self.aug_out = net

    if config.test_aug_times:
        net = tf.concat([plain_out, self.aug_out], 0)
        self.aug_out = net

    # Spatio-temporal front-end: pad -> 3-D conv -> BN -> ReLU -> pad -> max-pool.
    net = tf.contrib.keras.layers.ZeroPadding3D(padding=(2, 3, 3))(net)
    net = tf.layers.Conv3D(
        filters=64,
        kernel_size=(5, 7, 7),
        strides=[1, 2, 2],
        padding='valid',
        use_bias=False,
    )(net)
    net = batch_normalization_wrapper(net)
    net = tf.nn.relu(net)
    net = tf.contrib.keras.layers.ZeroPadding3D(padding=(0, 1, 1))(net)
    net = tf.layers.MaxPooling3D(pool_size=(1, 3, 3), strides=(1, 2, 2))(net)

    # The resnet is applied on every timestep, so pack time into the batch
    # dimension (batch of size b*t) and unpack again afterwards.
    packed = temporal_batch_pack(net, input_shape=K.int_shape(net)[1:])
    features = resnet_18(packed)
    self.output = temporal_batch_unpack(
        features, shape_list(net)[1], input_shape=K.int_shape(features)[1:])
def attention_block(x, gating, inter_shape, name):
    """Gated attention over the spatial dimensions of `x`.

    `x` is down-sampled by a 2x2, stride-2 convolution and `gating` is resized
    to the same spatial size by a 1x1 convolution plus a transposed
    convolution. Their sum goes through ReLU, a 1-filter 1x1 convolution and a
    sigmoid. The resulting map is up-sampled to `x`'s spatial size by the layer
    named `name + '_weight'`, expanded with `expend_as` (defined elsewhere;
    presumably repeats the single channel -- TODO confirm), multiplied with
    `x`, and finally passed through a 1x1 convolution and batch normalization.

    Shape indices [1], [2], [3] are used as rows, columns and channels, i.e.
    the code assumes 4-D channels-last tensors -- TODO confirm with callers.

    :param x: input feature map
    :param gating: gate signal (feature map from the lower layer)
    :param inter_shape: number of intermediate channels
    :param name: prefix for the name of the attention-weight layer
    :return: attention-weighted, batch-normalized feature map
    """
    dims_x = K.int_shape(x)
    dims_g = K.int_shape(gating)

    # Project x and the gating signal to a common spatial size.
    x_proj = layers.Conv2D(inter_shape, (2, 2), strides=(2, 2), padding='same')(x)
    dims_x_proj = K.int_shape(x_proj)
    g_proj = layers.Conv2D(inter_shape, (1, 1), padding='same')(gating)
    g_strides = (dims_x_proj[1] // dims_g[1], dims_x_proj[2] // dims_g[2])
    g_resized = layers.Conv2DTranspose(
        inter_shape, (3, 3), strides=g_strides, padding='same')(g_proj)

    # Additive attention -> single-channel sigmoid coefficients.
    merged = layers.add([g_resized, x_proj])
    merged = layers.Activation('relu')(merged)
    score = layers.Conv2D(1, (1, 1), padding='same')(merged)
    attn = layers.Activation('sigmoid')(score)
    dims_attn = K.int_shape(attn)

    # Restore x's spatial size; this named layer exposes the attention weights.
    scale = (dims_x[1] // dims_attn[1], dims_x[2] // dims_attn[2])
    attn_full = layers.UpSampling2D(size=scale, name=name + '_weight')(attn)
    attn_full = expend_as(attn_full, dims_x[3])

    weighted = layers.multiply([attn_full, x])
    out = layers.Conv2D(dims_x[3], (1, 1), padding='same')(weighted)
    return layers.BatchNormalization()(out)
def spatial_attention(coder_x, up_x, K):
    """Compute an attention-weighted feature map from `coder_x` and `up_x`.

    `coder_x` is convolved (2x2, same padding) and `up_x` is convolved (1x1)
    then enlarged by a stride-2 transposed convolution; all three layers use
    `K * flt` filters (`flt` is a module-level name). The two branches are
    added and turned into a single-channel sigmoid map, which is up-sampled,
    expanded with `expend_as` (defined elsewhere) and multiplied with the
    convolved `coder_x` branch. A 1x1 convolution back to `coder_x`'s channel
    count and a batch normalization produce the result.

    Intermediate shapes are printed for debugging.

    Note that `K` here is the filter multiplier, not the Keras backend (which
    this function accesses as `kk`).
    """
    coder_dims = kk.int_shape(coder_x)
    print('coder=', coder_dims)
    up_dims = kk.int_shape(up_x)  # not used below; kept from the original

    n_filters = K * flt
    theta = Conv2D(n_filters, (2, 2), padding='same')(coder_x)
    theta_dims = kk.int_shape(theta)
    print('theta_x=shape:', theta_dims)

    phi = Conv2D(n_filters, (1, 1), padding='same')(up_x)
    gate_up = Conv2DTranspose(n_filters, (2, 2), strides=(2, 2))(phi)
    print('upsample_g=shape:', gate_up.shape)

    merged = keras.layers.add([gate_up, theta])
    merged = keras.layers.Activation('relu')(merged)
    score = Conv2D(1, (1, 1), padding='same')(merged)
    attn = keras.layers.Activation('sigmoid')(score)
    attn_dims = kk.int_shape(attn)

    factors = (coder_dims[1] // attn_dims[1], coder_dims[2] // attn_dims[2])
    attn_up = UpSampling2D(size=factors)(attn)
    print('upsample_psi.shape=', attn_up.shape)
    attn_up = expend_as(attn_up, coder_dims[3])

    # The weights are applied to the convolved branch (theta), not to coder_x.
    weighted = multiply([attn_up, theta])
    out = Conv2D(coder_dims[3], (1, 1), padding='same')(weighted)
    out_bn = keras.layers.BatchNormalization()(out)
    print('result_bn.shape:' + str(out_bn.shape))
    return out_bn
def _decoder_lstm(self, Ex_t, z_t, lstm_states_t_1):
    """Invoke the decoder LSTM stack for one step.

    (h_t, lstm_states_t) = *(z_t|Ex_t, lstm_states_t_1)

    The LSTM input is `z_t` alone when both `build_scanning_RNN` and
    `no_clock_to_lstm` are set in the configuration; otherwise it is `Ex_t`
    concatenated with `z_t` along the last axis. Static shapes are asserted
    against the configured sizes.

    Returns:
        Tuple `(htop_t, lstm_states_t)` as returned by `self._LSTM_stack`.
    """
    with tf.variable_scope(self.my_scope) as scope, \
            tf.name_scope(scope.original_name_scope):
        cfg = self.C
        m = cfg.m
        D = cfg.D
        batch = cfg.B * self.BeamWidth

        if cfg.build_scanning_RNN and cfg.no_clock_to_lstm:
            # Clock input is withheld from the LSTM: feed z_t only.
            assert cfg.build_scanning_RNN, 'no_clock_to_lstm can be set only in a scanning RNN '
            lstm_in = z_t
            assert K.int_shape(lstm_in) == (batch, D)
        else:
            lstm_in = tf.concat((Ex_t, z_t), axis=-1, name="Ex_concat_z")
            assert K.int_shape(lstm_in) == (batch, m + D)

        self._LSTM_stack.assertStateShape(lstm_states_t_1)
        htop_t, lstm_states_t = self._LSTM_stack(lstm_in, lstm_states_t_1)
        return htop_t, lstm_states_t
def _get_seed_input(self, seed_input):
    """Return a seed array shaped like one sample of `self.input_tensor`.

    If `seed_input` is None, a random array is generated with
    `utils.random_array`, centred on the mean of `self.input_range` with a
    standard deviation of 5% of the range's width.

    Otherwise `seed_input` is adapted:
      - a leading batch axis is added when its rank differs from the target;
      - if its shape still differs, the last axis is moved to position 1
        (only expected when the channel axis is out of place);
      - it is cast to `K.floatx()`.
    """
    target_shape = (1,) + K.int_shape(self.input_tensor)[1:]

    if seed_input is not None:
        # Add the batch dimension if needed.
        if len(seed_input.shape) != len(target_shape):
            seed_input = np.expand_dims(seed_input, 0)
        # Any remaining mismatch is attributed to a misplaced channel axis.
        if seed_input.shape != target_shape:
            seed_input = np.moveaxis(seed_input, -1, 1)
        return seed_input.astype(K.floatx())

    value_range = self.input_range
    return utils.random_array(
        target_shape,
        mean=np.mean(value_range),
        std=0.05 * (value_range[1] - value_range[0]),
    )
def __init__(self, config, context, beamsearch_width=1, var_scope=None):
    """Set up the cell's scope, parameters, context tensor and LSTM stack.

    Args:
        config: configuration object; `config.B`, `config.L` and `config.D`
            must match the static shape of `context`.
        context: image features from the conv-net, static shape (B, L, D).
        beamsearch_width: beam width handed to the beam-search decoder, which
            tiles a batch of size B to B * beamsearch_width. Use 1 during
            training.
        var_scope: optional variable scope; defaults to 'CALSTM'.
    """
    context_shape = K.int_shape(context)
    assert context_shape == (config.B, config.L, config.D)

    with tf.variable_scope(var_scope or 'CALSTM') as scope, \
            tf.name_scope(scope.original_name_scope):
        super(CALSTM, self).__init__(_scope=scope, name=scope.name)
        self.my_scope = scope
        self.C = CALSTMParams(config)
        self._beamsearch_width = beamsearch_width

        # Image features from the conv-net.
        self._a = context
        expected_dims = [self.C.B, self.C.L, self.C.D]
        assert self._a.get_shape().as_list() == expected_dims

        self._LSTM_stack = tfc.RNNWrapper(
            self.C.decoder_lstm, beamsearch_width=beamsearch_width)
def get_img_shape(img):
    """Return the shape of `img` with the last axis moved to position 1 if needed.

    Args:
        img: A numpy array or a backend tensor.

    Returns:
        When `K.image_data_format()` is 'channels_last', a tuple in which the
        last entry of the shape has been moved to index 1, e.g.
        `(samples, image_dims..., channels)` becomes
        `(samples, channels, image_dims...)`. For any other data format the
        shape is returned unchanged (`img.shape` for numpy arrays,
        `K.int_shape(img)` otherwise).
    """
    dims = img.shape if isinstance(img, np.ndarray) else K.int_shape(img)

    if K.image_data_format() != 'channels_last':
        return dims

    reordered = list(dims)
    reordered[1:1] = [reordered[-1]]  # copy the last entry into position 1 ...
    del reordered[-1]                 # ... then drop the trailing original
    return tuple(reordered)
def shortcut(self, input, residual):
    """Add a shortcut between `input` and `residual` and merge them by summation.

    The shapes are compared along ROW_AXIS, COL_AXIS and CHANNEL_AXIS
    (module-level constants). If `input` is spatially larger than `residual`
    or the channel counts differ, `input` is projected with a 1x1 convolution
    (he_normal initializer, L2 regularization of 1e-4) whose strides and
    filter count make it match `residual`. Otherwise it is used unchanged.

    Args:
        input: tensor entering the residual block.
        residual: output of the residual block; a tensor or an array-like.

    Returns:
        `layers.add([shortcut, residual])`.
    """
    input_shape = K.int_shape(input)

    # np.shape() hands back the object's own `.shape` when it has one. A shape
    # object offering as_list() (e.g. a TensorFlow TensorShape) is converted to
    # a plain list; a tuple has no as_list() (AttributeError), and as_list()
    # may refuse an unknown shape (ValueError). In both cases the raw shape is
    # kept. Only these errors are caught, so unrelated failures and
    # KeyboardInterrupt are no longer swallowed as they were by a bare except.
    residual_shape = np.shape(residual)
    try:
        residual_shape = residual_shape.as_list()
    except (AttributeError, ValueError):
        pass

    # Should be whole numbers if the network architecture is configured correctly.
    stride_width = int(round(input_shape[ROW_AXIS] / residual_shape[ROW_AXIS]))
    stride_height = int(round(input_shape[COL_AXIS] / residual_shape[COL_AXIS]))
    equal_channels = input_shape[CHANNEL_AXIS] == residual_shape[CHANNEL_AXIS]

    shortcut = input
    # 1x1 conv if the shape is different, else identity.
    if stride_width > 1 or stride_height > 1 or not equal_channels:
        shortcut = layers.Conv2D(
            filters=residual_shape[CHANNEL_AXIS],
            kernel_size=(1, 1),
            strides=(stride_width, stride_height),
            padding="valid",
            kernel_initializer="he_normal",
            kernel_regularizer=regularizers.l2(0.0001),
        )(input)

    return layers.add([shortcut, residual])
def call(self, inputs, state):
    """Build the graph for one RNN iteration.

    Takes the previous LSTM states (h and c), the current input and the image
    annotations (`self._a`) and produces the output and states for the
    current timestep. Note that input(t) = Ey(t-1) and input(t=0) = Null. When
    training, the target output is used for Ey, whereas at prediction time
    (e.g. via beam search) the actual output is used.

    Args:
        inputs: current input Ex_t, static shape (B, m).
        state: previous state; its `lstm_state`, `alpha` and `beta` fields are
            read here.

    Returns:
        Tuple `(htop_t, CALSTMState(lstm_states_t, alpha_t, z_t, beta_t))`.

    Raises:
        AssertionError: if any static shape differs from the configured sizes.
    """
    # NOTE(review): the indentation of this method was reconstructed from a
    # whitespace-collapsed source; confirm the extent of the 'Phi' scope below.
    with tf.variable_scope(self.my_scope) as var_scope:
        with tf.name_scope(var_scope.original_name_scope):  ## Ugly but only way to fix TB visuals
            ## Input
            Ex_t = inputs  # shape = (B,m)

            ## State
            # Hidden state of the top LSTM layer: the state is a single entry
            # when there is one layer, otherwise the last entry is used.
            htop_1 = state.lstm_state.h if self._LSTM_stack.num_layers == 1 else state.lstm_state[-1].h
            lstm_states_t_1 = state.lstm_state  # shape = ((B,n), (B,n)) = (c_t_1, h_t_1)
            unused_alpha_t_1 = state.alpha  # shape = (B, L); only shape-checked
            unused_beta_t_1 = state.beta  # (B,1); only shape-checked

            a = self._a
            ## Broadcast context from size B to B*BeamWidth, because that's what
            ## BeamSearchDecoder does to the input batch.
            if self.BeamWidth > 1:
                a = tf.contrib.seq2seq.tile_batch(self._a, self.BeamWidth)

            CONF = self.C
            B = CONF.B * self.BeamWidth  # effective batch size after tiling
            m = CONF.m
            L = CONF.L
            # Kv =CONF.K

            assert K.int_shape(Ex_t) == (B, m)
            assert K.int_shape(unused_alpha_t_1) == (B, L)
            assert K.int_shape(unused_beta_t_1) == (B, 1)
            self._LSTM_stack.assertStateShape(lstm_states_t_1)

            ################ Attention Model ################
            alpha_t, beta_t = self._attention_model(a, htop_1, Ex_t)  # alpha.shape = (B, L), beta.shape = (B, 1)
            assert K.int_shape(alpha_t) == (B, L)
            assert K.int_shape(beta_t) == (B, 1)

            ################ Soft deterministic attention: z = alpha-weighted mean of a ################
            with tf.variable_scope('Phi'):
                ## (B, L) batch_dot (B,L,D) -> (B, D)
                z_t = K.batch_dot(alpha_t, a, axes=[1, 1])  # z_t.shape = (B, D)
                z_t = tf.multiply(beta_t, z_t, name='beta_t.z')  # elementwise multiply (B,1)*(B,D) -> (B,D)
                z_t = tf.identity(z_t, name='z_t')  ## For tensorboard viz.

            ################ Decoder Layer ################
            (htop_t, lstm_states_t) = self._decoder_lstm(Ex_t, z_t, lstm_states_t_1)  # h_t.shape=(B,n)

            # ################ Output Layer ################
            # with tf.variable_scope('Output_Layer'):
            #     yLogits_t = self._output_layer(Ex_t, h_t, z_t) # yProbs_t.shape = (B,K)

            # Re-check all shapes before handing the new state back.
            self._LSTM_stack.assertOutputShape(htop_t)
            self._LSTM_stack.assertStateShape(lstm_states_t)
            ## assert K.int_shape(yProbs_t) == (B, Kv)
            ## assert K.int_shape(yLogits_t) == (B, Kv)
            assert K.int_shape(alpha_t) == (B, L)
            assert K.int_shape(beta_t) == (B, 1)

            return htop_t, CALSTMState(lstm_states_t, alpha_t, z_t, beta_t)
def _attention_model(self, a, h_prev, Ex_t):
    """Compute the attention weights `alpha` and the modulator `beta`.

    The un-normalized scores are produced by one of three models selected by
    `self.C.att_model`:
      - 'MLP_shared': an MLP stack applied to the concatenation of `a`, the
        tiled `h_prev` and (optionally) the tiled `Ex_t`;
      - '1x1_conv': a conv stack applied to the same concatenation;
      - 'MLP_full': an MLP stack applied to the flattened `a` concatenated
        with `h_prev` and (optionally) `Ex_t`.
    The scores are then passed through a softmax.

    Args:
        a: context tensor, static shape (B, L, D).
        h_prev: previous top-layer hidden state, static shape (B, n).
        Ex_t: current input, static shape (B, m).

    Returns:
        Tuple `(alpha, beta)` with static shapes (B, L) and (B, 1).

    Raises:
        AttributeError: if `att_model` is not one of the values above.
        AssertionError: if a static shape differs from the expected one.
    """
    # NOTE(review): the indentation of this method was reconstructed from a
    # whitespace-collapsed source. Which statements sit inside the 'a_h'
    # variable scope affects variable/op names -- confirm against the
    # original file before relying on checkpoint compatibility.
    with tf.variable_scope(self.my_scope) as var_scope:
        with tf.name_scope(var_scope.original_name_scope):
            with tf.variable_scope('AttentionModel'):
                CONF = self.C
                B = self.ActualBatchSize
                L = CONF.L
                D = CONF.D
                h = h_prev
                n = self.output_size
                m = CONF.m

                self.assertOutputShape(h_prev)
                assert K.int_shape(a) == (B, L, D)
                assert K.int_shape(h_prev) == (B, n)
                assert K.int_shape(Ex_t) == (B, m)

                # Input preparation shared by the two weight-sharing models.
                if (CONF.att_model == 'MLP_shared') or (CONF.att_model == '1x1_conv'):
                    """ Here we'll effectively create L MLP stacks all sharing the same weights. Each stack receives a concatenated vector of a(l) and h as input. """
                    # h.shape = (B,n). Expand it to (B,1,n) and then broadcast to (B,L,n) in order
                    # to concatenate with feature vectors of 'a' whose shape=(B,L,D)
                    h = tf.identity(K.tile(K.expand_dims(h, axis=1), (1, L, 1)), name='h_t-1')
                    a = tf.identity(a, name='a')
                    if CONF.feed_clock_to_att:
                        assert CONF.build_scanning_RNN, 'Attention model can take Ex_t only in a scanning-LSTM'
                        # Ex_t.shape = (B,m). Expand it to (B,1,m) and then broadcast to (B,L,m) in order
                        # to concatenate with feature vectors of 'a' whose shape=(B,L,D)
                        x = tf.identity(K.tile(K.expand_dims(Ex_t, axis=1), (1, L, 1)), name='Ex_t')
                        # Concatenate a, h and x. Final shape = (B, L, D+n+m)
                        att_inp = tf.concat([a, h, x], -1, name='ai_h_x')  # (B, L, D+n+m)
                        assert K.int_shape(att_inp) == (B, L, D + n + m)
                    else:
                        # Concatenate a and h. Final shape = (B, L, D+n)
                        att_inp = tf.concat([a, h], -1, name='ai_h')  # (B, L, D+n)
                        assert K.int_shape(att_inp) == (B, L, D + n)

                if CONF.att_model == 'MLP_shared':
                    ## For #layers > 1 this implementation will end up being different from the
                    ## paper's implementation.
                    ## Below is how it is implemented in the code released by the authors of the paper
                    ##     for i in range(1, CONF.att_a_layers+1):
                    ##         if not last_layer:
                    ##             a = Dense(CONF['att_a_%d_n'%(i,)], activation=tanh)(a)
                    ##         else: # last-layer
                    ##             a = AffineTransform(CONF['att_a_%d_n'%(i,)])(a)
                    ##     h = AffineTransform(CONF['att_h_%d_n'%(i,)])(h)
                    ##     ah = a + K.expand_dims(h, axis=1)
                    ##     ah = tanh(ah)
                    ##     alpha = Dense(softmax_layer_params, activation=softmax)(ah)
                    alpha_1_ = tfc.MLPStack(CONF.att_layers)(att_inp)  # (B, L, 1)
                    assert K.int_shape(alpha_1_) == (B, L, 1)  # (B, L, 1)
                    alpha_ = K.squeeze(alpha_1_, axis=2)  # output shape = (B, L)
                    assert K.int_shape(alpha_) == (B, L)

                elif CONF.att_model == '1x1_conv':
                    """ NOTE: The above model ('MLP_shared') tantamounts to a 1x1 convolution on the Lx1 shaped (L=H.W) convnet features with num_channels=D i.e. an input shape of (H,W,C) or (1,L,D). Using 'dimctx' kernels of size (1,1) and stride=1 resulting in an output shape of (1,L,dimctx) [or (B, L, 1, dimctx) with the batch dimension included]. This option provides such a convnet layer implementation (which turns out not to be faster than MLP_shared). """
                    att_inp = tf.expand_dims(att_inp, axis=1)
                    # NOTE(review): the shape passed here uses D + n channels even when
                    # feed_clock_to_att made att_inp D + n + m wide -- confirm intended.
                    alpha_1_ = tfc.ConvStack(CONF.att_layers, (B, 1, L, D + self.output_size))(att_inp)
                    assert K.int_shape(alpha_1_) == (B, 1, L, 1)
                    alpha_ = tf.squeeze(alpha_1_, axis=[1, 3])  # (B, L)
                    assert K.int_shape(alpha_) == (B, L)

                elif CONF.att_model == 'MLP_full':  # MLP: weights not shared across L
                    ## Concatenate a and h_prev and pass them through a MLP. This is different from the theano
                    ## implementation of the paper because we flatten a from (B,L,D) to (B,L*D). Hence each element
                    ## of the L*D vector receives its own weight because the effective weight matrix here would be
                    ## shape (L*D, num_dense_units) as compared to (D, num_dense_units) as in the shared_weights case
                    ## Concatenate a and h. Final shape will be (B, L*D+n)
                    with tf.variable_scope('a_h'):
                        a_ = K.batch_flatten(a)  # (B, L*D)
                        a_.set_shape((B, L * D))  # Flatten loses shape info
                        if CONF.build_scanning_RNN and CONF.feed_clock_to_att:
                            assert CONF.build_scanning_RNN, 'Attention model can take Ex_t only in a scanning-LSTM'
                            att_inp = tf.concat([a_, h, Ex_t], -1, name="a_h_x")  # (B, L*D + n + m)
                            assert K.int_shape(att_inp) == (B, L * D + self.output_size + m), 'shape %s != %s' % (K.int_shape(att_inp), (B, L * D + self.output_size + m))
                        else:
                            att_inp = tf.concat([a_, h], -1, name="a_h")  # (B, L*D + n)
                            assert K.int_shape(att_inp) == (B, L * D + self.output_size), 'shape %s != %s' % (K.int_shape(att_inp), (B, L * D + self.output_size))
                    alpha_ = tfc.MLPStack(CONF.att_layers)(att_inp)  # (B, L)
                    assert K.int_shape(alpha_) == (B, L)

                else:
                    raise AttributeError('Invalid value of att_model param: %s' % CONF.att_model)

                ## Softmax
                alpha = tf.identity(tf.nn.softmax(alpha_), name='alpha')
                assert K.int_shape(alpha) == (B, L)

                ## Attention Modulator: Beta
                # Either produced by an MLP stack applied to h_prev or fixed to 1.
                if CONF.build_att_modulator:
                    beta = tfc.MLPStack(CONF.att_modulator, self.batch_output_shape)(h_prev)
                    beta = tf.identity(beta, name='beta')
                else:
                    beta = tf.constant(1., shape=(B, 1), dtype=CONF.dtype)
                assert K.int_shape(beta) == (B, 1)

                return alpha, beta
# ---- Encoder: three conv + max-pool stages applied to `input_img` ----
# (`input_img` and `latent_dim` are defined outside this excerpt.)
x = tf.keras.layers.Conv2D(filters=128, kernel_size=3, activation='relu', padding='same')(input_img)
x = tf.keras.layers.MaxPooling2D(pool_size=2)(x)
x = tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu', padding='same')(x)
x = tf.keras.layers.MaxPooling2D(pool_size=2)(x)
x = tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu', padding='same')(x)
x = tf.keras.layers.MaxPooling2D(pool_size=2)(x)

# Remember the shape of the last feature map; it is used below to size the
# first dense layer of the decoder.
shape = K.int_shape(x)

x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(16)(x)

# Two parallel dense heads: the latent mean and log-variance.
z_mean = tf.keras.layers.Dense(latent_dim)(x)
z_log_var = tf.keras.layers.Dense(latent_dim)(x)

# Sample z from (z_mean, z_log_var) with the `sampling` function.
z = tf.keras.layers.Lambda(sampling, output_shape=(latent_dim, ), name="z")([z_mean, z_log_var])

# The encoder model exposes the mean, the log-variance and the sample.
encoder = tf.keras.models.Model(input_img, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

# ---- Decoder input: a latent vector, expanded by a dense layer to as many
# units as there are elements in the encoder's last feature map ----
latent_inputs = tf.keras.layers.Input(shape=(latent_dim, ), name='z_sampling')
x = tf.keras.layers.Dense(shape[1] * shape[2] * shape[3], activation='relu')(latent_inputs)