def call(self, x):
    if 0. < self.prob < 1.:
        self.layer.kernel = K.in_train_phase(
            K.dropout(self.layer.kernel, self.prob), self.layer.kernel)
        self.layer.bias = K.in_train_phase(
            K.dropout(self.layer.bias, self.prob), self.layer.bias)
    return self.layer.call(x)

def call(self, x, mask=None):
    # Note: the original dropped `self.kernel` but computed with `self.W`;
    # the weight names are unified here so the dropout actually takes effect.
    if 0. < self.prob < 1.:
        self.W = K.in_train_phase(K.dropout(self.W, self.prob), self.W)
        self.b = K.in_train_phase(K.dropout(self.b, self.prob), self.b)
    # Same as the original Dense forward pass
    output = K.dot(x, self.W)
    if self.bias:
        output += self.b
    return self.activation(output)

def SR_model(num_classes, dropout, mc_dropout, input_dim, training, pooling='avg'):
    inputs = Input(input_dim)
    base_model = EfficientNetB0(include_top=False,
                                weights='imagenet',
                                input_tensor=inputs)
    base_model.trainable = True
    x = base_model.output
    x = Dropout(dropout, name='top_dropout_1')(x, training=training)
    if pooling == 'avg':
        x = GlobalAveragePooling2D(name='avg_pool')(x)
    elif pooling == 'max':
        x = GlobalMaxPooling2D(name='max_pool')(x)
    x = Dropout(dropout, name='top_dropout_2')(x, training=training)
    x = Dense(512, activation='relu', name='dense_512')(x)
    x = BatchNormalization()(x)
    x = Dropout(dropout, name='top_dropout_3')(x, training=training)
    x = Lambda(lambda x: K.dropout(x, level=mc_dropout))(x)
    # classification head (f)
    sr = Dense(num_classes, activation='softmax', name='dense_f')(x)
    return Model(inputs=inputs, outputs=sr)

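# A minimal usage sketch (an assumption, not part of the original source): because
# the Lambda layer above applies K.dropout unconditionally, the head stays
# stochastic at inference, so Monte Carlo dropout estimates can be obtained by
# averaging several forward passes. `model`, `x_batch`, and `n_samples` are
# hypothetical names.
import numpy as np

def mc_predict(model, x_batch, n_samples=20):
    # Each predict() call samples a fresh dropout mask through the Lambda layer.
    preds = np.stack([model.predict(x_batch) for _ in range(n_samples)], axis=0)
    # Predictive mean and a simple per-class uncertainty estimate
    return preds.mean(axis=0), preds.std(axis=0)
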
def _time_distributed_dense(x, w, b=None, dropout=None,
                            input_dim=None, output_dim=None, timesteps=None):
    '''Apply y.w + b for every temporal slice y of x.'''
    if not input_dim:
        # won't work with TensorFlow
        input_dim = K.shape(x)[2]
    if not timesteps:
        # won't work with TensorFlow
        timesteps = K.shape(x)[1]
    if not output_dim:
        # won't work with TensorFlow
        output_dim = K.shape(w)[1]
    if dropout:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x *= expanded_dropout_matrix
    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b is not None:
        x = x + b
    # reshape to 3D tensor
    x = K.reshape(x, (-1, timesteps, output_dim))
    return x

def call(self, x):
    # Input is a 3-D or 4-D Tensor
    ndim = K.ndim(x)
    if ndim == 4:
        dims = K.int_shape(x)
        x = K.reshape(x, (-1, dims[1] * dims[2], 1, self.D))
    elif ndim != 3:
        raise ValueError('Encoding input should have shape BxNxD or BxHxWxD')
    # Residual vectors
    R = x - self.codes
    ''' OLD WAY
    _x_i = K.repeat_elements(x, self.K, 1)
    _c_k = K.tile(self.codes, (n, 1))
    R = K.reshape(_x_i - _c_k, (-1, n, self.K, self.D))
    '''
    # Assignment weights, optional dropout
    if self.dropout_rate is not None:
        W_ik = K.softmax(scaledL2(R, K.dropout(self.scale, self.dropout_rate)))
    else:
        W_ik = K.softmax(scaledL2(R, self.scale))
    # Aggregation
    E = tf.einsum('bik,bikd->bkd', W_ik, R)
    # Normalize encoding vectors
    if self.l2_normalize:
        E = tf.nn.l2_normalize(E, axis=-1)
    E = tf.layers.Flatten()(E)
    return E

def encoder(self, inputs):
    if K.dtype(inputs) != 'int32':
        inputs = K.cast(inputs, 'int32')
    masks = K.equal(inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, inputs)
    embeddings *= self._model_dim**0.5  # Scale
    # Position encodings
    position_encodings = PositionEncoding(self._model_dim)(embeddings)
    # Embeddings + position encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)
    for i in range(self._encoder_stack):
        # Multi-head attention
        attention = MultiHeadAttention(self._n_heads,
                                       self._model_dim // self._n_heads)
        attention_input = [encodings, encodings, encodings, masks]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += encodings
        attention_out = LayerNormalization()(attention_out)
        # Feed-forward
        ff = PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = LayerNormalization()(ff_out)
    return encodings, masks

def build_model(self):
    # Build the VGG-style network for 10 classes with massive dropout and
    # weight decay, as described in the paper.
    weight_decay = self.weight_decay
    basic_dropout_rate = 0.3
    model_input = Input(shape=self.x_shape)
    curr = Dense(512, kernel_regularizer=regularizers.l2(weight_decay))(model_input)
    curr = Activation('relu')(curr)
    curr = BatchNormalization()(curr)
    curr = Dropout(basic_dropout_rate + 0.2)(curr)
    curr = Lambda(lambda x: K.dropout(x, level=self.mc_dropout_rate))(curr)

    # classification head (f)
    curr1 = Dense(self.num_classes, activation='softmax')(curr)

    # selection head (g)
    curr2 = Dense(512, kernel_regularizer=regularizers.l2(weight_decay))(curr)
    curr2 = Activation('relu')(curr2)
    curr2 = BatchNormalization()(curr2)
    # this normalization is identical to initializing the batchnorm gamma to 1/10
    curr2 = Lambda(lambda x: x / 10)(curr2)
    curr2 = Dense(1, activation='sigmoid')(curr2)

    # auxiliary head (h)
    selective_output = Concatenate(axis=1, name="selective_head")([curr1, curr2])
    auxiliary_output = Dense(self.num_classes, activation='softmax',
                             name="classification_head")(curr)
    model = Model(inputs=model_input, outputs=[selective_output, auxiliary_output])
    return model

def call(self, inputs):
    if self._masking:
        assert len(inputs) == 4, "inputs should be set [queries, keys, values, masks]."
        queries, keys, values, masks = inputs
    else:
        assert len(inputs) == 3, "inputs should be set [queries, keys, values]."
        queries, keys, values = inputs
    if K.dtype(queries) != 'float32':
        queries = K.cast(queries, 'float32')
    if K.dtype(keys) != 'float32':
        keys = K.cast(keys, 'float32')
    if K.dtype(values) != 'float32':
        values = K.cast(values, 'float32')
    matmul = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1]))  # MatMul
    scaled_matmul = matmul / int(queries.shape[-1])**0.5  # Scale
    if self._masking:
        scaled_matmul = self.mask(scaled_matmul, masks)  # Mask (opt.)
    if self._future:
        scaled_matmul = self.future_mask(scaled_matmul)
    softmax_out = K.softmax(scaled_matmul)  # SoftMax
    # Dropout
    out = K.dropout(softmax_out, self._dropout_rate)
    outputs = K.batch_dot(out, values)
    return outputs

def call(self, inputs):
    values = inputs
    values_linear = K.dot(values, self._weights_values)
    # Dropout
    out = K.dropout(values_linear, self._dropout_rate)
    return out

def call(self, inputs):
    if random.random() > 0.5:
        kernel = B.dropout(self.kernel, 0.5) * random.uniform(-1, 1)
    else:
        kernel = self.kernel
    outputs = B.dot(inputs, kernel)
    return self.activation(outputs)

def encoder(self, inputs):
    if K.dtype(inputs) != 'int32':
        inputs = K.cast(inputs, 'int32')
    masks = K.equal(inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, inputs)
    embeddings *= self._model_dim ** 0.5  # Scale
    # Position encodings
    position_encodings = self.EncoderPositionEncoding(embeddings)
    # Embeddings + position encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)
    for i in range(self._encoder_stack):
        # Multi-head attention
        attention = self.EncoderMultiHeadAttetions[i]
        attention_input = [encodings, encodings, encodings, masks]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += encodings
        attention_out = self.EncoderLayerNorms0[i](attention_out)
        # Feed-forward
        ff = self.EncoderPositionWiseFeedForwards[i]
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = self.EncoderLayerNorms1[i](ff_out)
    return encodings, masks

def call(self, inputs, **kwargs):
    main_input, embedding_matrix = inputs
    input_shape_tensor = K.shape(main_input)
    last_input_dim = K.int_shape(main_input)[-1]
    emb_input_dim, emb_output_dim = K.int_shape(embedding_matrix)
    projected = K.dot(K.reshape(main_input, (-1, last_input_dim)),
                      self.projection)
    if self.add_biases:
        projected = K.bias_add(projected, self.biases,
                               data_format='channels_last')
    if 0 < self.projection_dropout < 1:
        projected = K.in_train_phase(
            lambda: K.dropout(projected, self.projection_dropout),
            projected,
            training=kwargs.get('training'))
    attention = K.dot(projected, K.transpose(embedding_matrix))
    if self.scaled_attention:
        # scaled dot-product attention, as described in
        # "Attention Is All You Need" (https://arxiv.org/abs/1706.03762)
        sqrt_d = K.constant(math.sqrt(emb_output_dim), dtype=K.floatx())
        attention = attention / sqrt_d
    result = K.reshape(
        self.activation(attention),
        (input_shape_tensor[0], input_shape_tensor[1], emb_input_dim))
    return result

def call(self, inputs):
    outputs = K.concatenate(inputs, axis=1)
    for i in range(self._n_layers):
        outputs = K.dot(outputs, self.weights[i])
        outputs = self._activation(outputs)
        outputs = K.dropout(outputs, self._dropout_rate)
    outputs = K.dot(outputs, self.output_weight)
    return outputs

def call(self, inputs):
    if 0. < self.rate < 1.:
        noise_shape = self._get_noise_shape(inputs)
        outputs = K.dropout(inputs, self.rate, noise_shape, seed=self.seed)
    else:
        outputs = inputs
    return outputs

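# Note that K.dropout is unconditional inverted dropout (kept units are rescaled
# by 1/(1 - rate)), so the call above also drops units at inference time, which
# is Monte Carlo dropout behaviour. A sketch of the conventional train-only
# variant, assuming the same `rate`, `_get_noise_shape`, and `seed` attributes:
def call(self, inputs, training=None):
    if 0. < self.rate < 1.:
        noise_shape = self._get_noise_shape(inputs)

        def dropped_inputs():
            # a fresh mask is sampled only in the training phase
            return K.dropout(inputs, self.rate, noise_shape, seed=self.seed)

        # identity at test time, dropout during training
        return K.in_train_phase(dropped_inputs, inputs, training=training)
    return inputs
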
def __init__(self, dropout=0.2, mc_dropout=0.2, num_classes=1, training=True,
             input_dim=(224, 224, 3), pooling="avg"):
    self.c = 0.75
    self.lamda = 32
    self.alpha = 0.5
    self.dropout = dropout
    self.mc_dropout = mc_dropout
    self.pooling = pooling
    self.input_dim = input_dim
    self.training = training
    self.num_classes = num_classes

    # create model
    inputs = Input(shape=self.input_dim)
    base_model = EfficientNetB0(include_top=False,
                                weights='imagenet',
                                input_tensor=inputs)
    base_model.trainable = True
    x = base_model.output
    x = Dropout(self.dropout, name='top_dropout_1')(x, training=self.training)
    if pooling == 'avg':
        x = GlobalAveragePooling2D(name='avg_pool')(x)
    elif pooling == 'max':
        x = GlobalMaxPooling2D(name='max_pool')(x)
    x = Dropout(self.dropout, name='top_dropout_2')(x, training=self.training)
    x = Dense(512, activation='relu', name='dense_512')(x)
    x = BatchNormalization()(x)
    x = Dropout(self.mc_dropout, name='top_dropout_3')(x, training=self.training)
    x = Lambda(lambda x: K.dropout(x, level=self.mc_dropout))(x)

    # classification head (f)
    f = Dense(self.num_classes, activation='softmax', name='f_head')(x)

    # selection head (g)
    g = Dense(512, activation='relu', name='dense_512_g')(x)
    g = BatchNormalization()(g)
    # this normalization is identical to initializing the batchnorm gamma to 1/10
    g = Lambda(lambda a: a / 10)(g)
    g = Dense(1, activation='sigmoid', name='g_head')(g)

    # auxiliary head (h)
    selective_output = Concatenate(axis=1, name="selective_head")([f, g])
    auxillary_output = Dense(self.num_classes, activation='softmax',
                             name='auxilary_head')(x)
    self.model = Model(inputs=inputs,
                       outputs=[selective_output, auxillary_output])

def decoder(self, inputs):
    decoder_inputs, encoder_encodings, encoder_masks = inputs
    if K.dtype(decoder_inputs) != 'int32':
        decoder_inputs = K.cast(decoder_inputs, 'int32')
    decoder_masks = K.equal(decoder_inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, decoder_inputs)
    embeddings *= self._model_dim**0.5  # Scale
    # Position encodings
    position_encodings = PositionEncoding(self._model_dim)(embeddings)
    # Embeddings + position encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)
    for i in range(self._decoder_stack):
        # Masked multi-head attention
        masked_attention = MultiHeadAttention(self._n_heads,
                                              self._model_dim // self._n_heads,
                                              future=True)
        masked_attention_input = [encodings, encodings, encodings, decoder_masks]
        masked_attention_out = masked_attention(masked_attention_input)
        # Add & Norm
        masked_attention_out += encodings
        masked_attention_out = LayerNormalization()(masked_attention_out)
        # Multi-head attention over the encoder output
        attention = MultiHeadAttention(self._n_heads,
                                       self._model_dim // self._n_heads)
        attention_input = [masked_attention_out, encoder_encodings,
                           encoder_encodings, encoder_masks]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += masked_attention_out
        attention_out = LayerNormalization()(attention_out)
        # Feed-forward
        ff = PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = LayerNormalization()(ff_out)
    # The pre-softmax projection shares its parameters with the embeddings
    linear_projection = K.dot(encodings, K.transpose(self.embeddings))
    outputs = K.softmax(linear_projection)
    return outputs

def call(self, inputs, **kwargs):
    categorical_inputs, numerical_inputs = inputs
    outputs = K.concatenate(categorical_inputs + numerical_inputs, axis=-1)
    for i in range(self._n_layers):
        outputs = K.dot(outputs, self._kernel_weights[i])
        outputs = self._activation(outputs)
        outputs = K.in_train_phase(
            K.dropout(outputs, self._dropout_rate),
            outputs,
        )
    outputs = K.dot(outputs, self._output_weight)
    return outputs

def call(self, inputs):
    # queries: [None, n, k]
    # keys:    [None, n, k]
    # values:  [None, n, k]
    queries, keys, values = inputs
    score = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1]))  # [None, n, n]
    score = score / int(queries.shape[-1])**0.5  # Scale
    score = K.softmax(score)  # SoftMax
    score = K.dropout(score, self._dropout)  # Dropout
    outputs = K.batch_dot(score, values)  # [None, n, k]
    return outputs

def dot_product_attention(self, x, mask=None, dropout=0.1, training=None):
    q, k, v = x
    logits = tf.matmul(q, k, transpose_b=True)  # [bs, 8, len, len]
    if self.bias:
        logits += self.b
    if mask is not None:  # [bs, len]
        mask = tf.expand_dims(mask, axis=1)
        mask = tf.expand_dims(mask, axis=1)  # [bs, 1, 1, len]
        logits = self.mask_logits(logits, mask)
    weights = tf.nn.softmax(logits, name="attention_weights")
    weights = K.in_train_phase(K.dropout(weights, dropout), weights,
                               training=training)
    x = tf.matmul(weights, v)
    return x

def _time_distributed_dense(x, w, b=None, dropout=None,
                            input_dim=None, output_dim=None,
                            timesteps=None, training=None):
    """Apply `y . w + b` for every temporal slice y of x.

    # Arguments
        x: input tensor.
        w: weight matrix.
        b: optional bias vector.
        dropout: whether to apply dropout (same dropout mask
            for every temporal slice of the input).
        input_dim: integer; optional dimensionality of the input.
        output_dim: integer; optional dimensionality of the output.
        timesteps: integer; optional number of timesteps.
        training: training phase tensor or boolean.

    # Returns
        Output tensor.
    """
    if not input_dim:
        input_dim = K.shape(x)[2]
    if not timesteps:
        timesteps = K.shape(x)[1]
    if not output_dim:
        output_dim = K.int_shape(w)[1]
    if dropout is not None and 0. < dropout < 1.:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)
    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b is not None:
        x = K.bias_add(x, b)
    # reshape to 3D tensor
    if K.backend() == 'tensorflow':
        x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
        x.set_shape([None, None, output_dim])
    else:
        x = K.reshape(x, (-1, timesteps, output_dim))
    return x

def decoder(self, inputs):
    decoder_inputs, encoder_encodings, encoder_masks = inputs
    if K.dtype(decoder_inputs) != 'int32':
        decoder_inputs = K.cast(decoder_inputs, 'int32')
    decoder_masks = K.equal(decoder_inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, decoder_inputs)
    embeddings *= self._model_dim**0.5  # Scale
    # Position encodings
    position_encodings = self.DecoderPositionEncoding(embeddings)
    # Embeddings + position encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)
    for i in range(self._decoder_stack):
        # Masked multi-head attention
        masked_attention = self.DecoderMultiHeadAttetions0[i]
        masked_attention_input = [encodings, encodings, encodings, decoder_masks]
        masked_attention_out = masked_attention(masked_attention_input)
        # Add & Norm
        masked_attention_out += encodings
        masked_attention_out = self.DecoderLayerNorms0[i](masked_attention_out)
        # Multi-head attention over the encoder output
        attention = self.DecoderMultiHeadAttetions1[i]
        attention_input = [masked_attention_out, encoder_encodings,
                           encoder_encodings, encoder_masks]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += masked_attention_out
        attention_out = self.DecoderLayerNorms1[i](attention_out)
        # Feed-forward
        ff = self.DecoderPositionWiseFeedForwards[i]
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = self.DecoderLayerNorms2[i](ff_out)
    # The pre-softmax projection shares its parameters with the embeddings
    linear_projection = K.dot(encodings, K.transpose(self.embeddings))
    outputs = K.softmax(linear_projection)
    return outputs

def build(self, input_shape):
    input_dim = input_shape[-1]
    self.kernel = self.add_weight(
        shape=(input_dim, self.units * 4 + self.levels * 2),
        name='kernel',
        initializer='glorot_uniform')
    self.recurrent_kernel = self.add_weight(
        shape=(self.units, self.units * 4 + self.levels * 2),
        name='recurrent_kernel',
        initializer='orthogonal')
    self.bias = self.add_weight(
        shape=(self.units * 4 + self.levels * 2,),
        name='bias',
        initializer='zeros')
    self.built = True
    if self.dropconnect:
        # DropConnect: use the dropped kernels only in the training phase
        self._kernel = K.dropout(self.kernel, self.dropconnect)
        self._kernel = K.in_train_phase(self._kernel, self.kernel)
        self._recurrent_kernel = K.dropout(self.recurrent_kernel,
                                           self.dropconnect)
        self._recurrent_kernel = K.in_train_phase(self._recurrent_kernel,
                                                  self.recurrent_kernel)
    else:
        self._kernel = self.kernel
        self._recurrent_kernel = self.recurrent_kernel

def call(self, x, training=None):
    # the first entry of every input is the selector
    sel = [xx[:, :, 0:1] for xx in x]
    sel_tensor = K.concatenate(sel)
    # dropout on the selector before the softmax
    sel_drop = K.dropout(sel_tensor, self.dropout)
    self.sel_drop_softmax = K.softmax(
        K.in_train_phase(sel_drop, sel_tensor, training=training))
    oo = [
        x[i][:, :, 1:] * self.sel_drop_softmax[:, :, i:i + 1]
        for i in range(len(x))
    ]
    # you don't need to explicitly define the custom gradient
    return [tf.add_n(oo), self.sel_drop_softmax]

def call(self, inputs, **kwargs):
    if self.masking:
        assert len(inputs) == 4, "inputs should be set [queries, keys, values, masks]"
        queries, keys, values, masks = inputs
    else:
        assert len(inputs) == 3, "inputs should be set [queries, keys, values]"
        queries, keys, values = inputs
    if K.dtype(queries) != 'float32':
        queries = K.cast(queries, 'float32')
    if K.dtype(keys) != 'float32':
        keys = K.cast(keys, 'float32')
    if K.dtype(values) != 'float32':
        values = K.cast(values, 'float32')
    # (batch_size*n_heads, max_len, head_dim)
    # (batch_size*n_heads, head_dim, max_len)
    # (batch_size*n_heads, max_len, max_len)
    matmul = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1]))  # MatMul
    scaled_matmul = matmul / int(queries.shape[-1]) ** 0.5  # Scale
    if self.masking:
        scaled_matmul = self.mask(scaled_matmul, masks)
    if self.future:
        scaled_matmul = self.future_mask(scaled_matmul)
    softmax_out = K.softmax(scaled_matmul)  # SoftMax
    # TODO: what is this dropout for?
    # Dropout
    out = K.dropout(softmax_out, self.dropout_rate)
    # TODO: what does batch_dot actually do here?
    outputs = K.batch_dot(out, values)
    return outputs

def call(self, inputs):
    """
    Q: [h * batch, q_size, d_model]
    K: [h * batch, k_size, d_model]
    V: [h * batch, k_size, d_model]
    mask?: [h * batch, q_size, k_size]

    returns:
    - output: [h * batch, q_size, d_model]
    - attention weights: [h * batch, q_size, k_size]
    """
    Q, K, V = inputs[0], inputs[1], inputs[2]
    if self.use_mask:
        mask = inputs[3]
    out = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))  # [h * batch, q_size, k_size]
    out = out / np.sqrt(self.d_k)
    if self.use_mask:
        # wherever mask is zero, replace the value in the tensor by -1e9
        out = tf.multiply(out, mask) + tf.multiply((1.0 - mask), -1e9)
    p_attn = tf.nn.softmax(out, name="attention_weights")
    # https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/layers/core.py#L136
    # TODO: figure out why `tf.cond` isn't used for implementing the `Dropout` layer.
    # NOTE: tf.cond seems to work without any visible difference, see the 2.0 branch.
    out = tf.contrib.framework.smart_cond(
        Backend.learning_phase(),
        lambda: Backend.dropout(p_attn, self.dropout),
        lambda: tf.identity(p_attn))
    # use the (possibly dropped-out) weights; the original multiplied p_attn
    # here, which silently discarded the dropout
    out = tf.matmul(out, V)  # [h * batch, q_size, d_model]
    return [out, p_attn]

def call(self, x, mask=None):
    if 0. < self.rate < 1.:
        noise_shape = self._get_noise_shape(x)
        x = K.dropout(x, self.rate, noise_shape)
    return x

def dropped_inputs():
    return K.dropout(ones, self.recurrent_dropout)

def dropped_inputs():
    return K.dropout(ones, self.dropout)

def x_prime():
    return K.dropout(x, p)

def dropped_inputs():
    return K.dropout(h, self.dropout_rate, K.shape(h))

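# For context, a hedged sketch of how closures like the four above are typically
# consumed (the surrounding code is not shown here; `ones` and `training` are
# hypothetical names): Keras recurrent layers pass the callable to
# K.in_train_phase so a fresh dropout mask is sampled only during training.
dp_mask = K.in_train_phase(dropped_inputs, ones, training=training)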