def test_basic_ops_value(self):
    """Regression-check the backend's basic ops against recorded checksums.

    Four 8x8 variables are created from a fixed NumPy seed: ``x`` and ``y``
    hold standard-normal samples, ``z`` and ``w`` hold random 0/1 values
    stored as booleans. Every op is evaluated with ``K.eval`` and the sum of
    its output (scaled and rounded for the floating-point ops) is compared
    with a hard-coded expected value, so any numerical change in the backend
    shows up as a failed assertion.
    """
    # The seed and the order of the four draws below fix every expected
    # value in this test; do not reorder them.
    np.random.seed(12082518)
    x = K.variable(np.random.randn(8, 8))
    y = K.variable(np.random.randn(8, 8))
    # `np.bool` was a plain alias of the builtin `bool`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so the builtin is used directly.
    z = K.variable(np.random.randint(0, 2, size=(8, 8)), dtype=bool)
    w = K.variable(np.random.randint(0, 2, size=(8, 8)), dtype=bool)

    # ====== activations ====== #
    self.assertEqual(round(np.sum(K.eval(K.relu(x, alpha=0.12))) * 10000), 276733)
    self.assertEqual(round(np.sum(K.eval(K.elu(x, alpha=0.12))) * 10000), 289202)
    # each of the 8 softmax rows sums to 1
    self.assertEqual(np.sum(K.eval(K.softmax(x))), 8.0)
    self.assertEqual(round(np.sum(K.eval(K.softplus(x))) * 10000), 554564)
    self.assertEqual(round(np.sum(K.eval(K.softsign(x))) * 100000), 211582)
    self.assertEqual(round(np.sum(K.eval(K.sigmoid(x))) * 10000), 330427)
    self.assertEqual(round(np.sum(K.eval(K.hard_sigmoid(x))) * 10000), 330836)
    self.assertEqual(round(np.sum(K.eval(K.tanh(x))) * 100000), 290165)

    # ====== element-wise math ====== #
    self.assertEqual(round(np.sum(K.eval(K.square(x))) * 10000), 744492)
    # NOTE(review): x contains negative samples; the recorded value implies
    # K.sqrt does not propagate NaN for them -- confirm against the backend.
    self.assertEqual(round(np.sum(K.eval(K.sqrt(x))) * 10000), 300212)
    self.assertEqual(round(np.sum(K.eval(K.abs(x))) * 10000), 559979)
    self.assertEqual(np.sum(K.eval(K.sign(x))), 6.0)
    self.assertEqual(round(np.sum(K.eval(K.inv(x))) * 1000), 495838)
    self.assertEqual(round(np.sum(K.eval(K.exp(x))) * 1000), 122062)
    self.assertEqual(round(np.sum(K.eval(K.log(K.abs(x)))) * 10000), -344491)
    self.assertEqual(np.sum(K.eval(K.round(x))), 5.0)
    self.assertEqual(round(np.sum(K.eval(K.pow(x, 8))) * 100), 398153)
    self.assertEqual(
        round(np.sum(K.eval(K.clip(x, -0.12, 0.12))) * 1000000), 620529)
    # TODO: pygpu (libgpuarray) still not support diag
    # self.assertEqual(round(np.sum(K.eval(K.diag(x))) * 100000), 325289)
    self.assertEqual(np.sum(K.eval(K.eye(12, 8))), 8.0)

    # ====== comparisons ====== #
    self.assertEqual(np.sum(K.eval(K.eq(z, w))), 38)
    self.assertEqual(np.sum(K.eval(K.neq(z, w))), 26)
    self.assertEqual(np.sum(K.eval(K.gt(x, y))), 33)
    self.assertEqual(np.sum(K.eval(K.ge(x, y))), 33)
    self.assertEqual(np.sum(K.eval(K.lt(x, y))), 31)
    self.assertEqual(np.sum(K.eval(K.le(x, y))), 31)

    # ====== element selection ====== #
    self.assertEqual(round(np.sum(K.eval(K.switch(z, x, y))) * 100000), 139884)
def score(self, query, key=None, scale=1, window_width=None, q_proj=None,
          target_proj=None):
    r""" Compute the raw attention scores of `query` against `key`
    (no normalization is applied here).

    Arguments:
      query: Query (or target sequence) tensor of shape
        `[batch_size, Tq, dim]` or `[num_heads, batch_size, Tq, dim]` in case
        of multi-heads attention.
      key: Key (or source sequence) tensor of shape `[batch_size, Tv, dim]`
        or `[num_heads, batch_size, Tv, dim]` in case of multi-heads
        attention.
      scale: single `Scalar` or `Tensor` of shape `[dim]` for scaling the
        attention scores, suggested `1/sqrt(dim)` in (Vaswani et al. 2017).
      window_width : `None`, `Integer` or `Float` ([0, 1]). The total number
        of frames for a single window in local attention (i.e.
        `left + 1 + right`). Can be given as a fixed number of frames
        (`int`), or percentage of the sequence length (`float`).
        If `None`, use `Tq`
      q_proj : `Dense`, instance of dense or fully connected layer
        - for `ScoreLocation`, the number of hidden unit is `1`
        - for `ScoreGeneral`, the number of hidden unit is `dim`
      target_proj : `Dense`, for predictive local attention, applying
        a fully connected network on target sequence (i.e. the query) to
        predict the position on source sequence (i.e. the key).
        The layer must have output dimension equal to 1 and return logit
        value.

    Returns:
      Tensor of shape `[num_heads, batch_size, Tq, Tv]`, or
        `[num_heads, batch_size, Tq, 1]` if `ScoreLocation`

    Raises:
      NotImplementedError: `ScoreLocation` combined with a local position
        mode, or no supported scoring mode is set.
      ValueError: a key-based scoring mode is used but `key` is `None`.
    """
    # ====== fold the heads axis into the batch axis ====== #
    n_heads = _get_num_heads(query)
    if n_heads > 0:
        query = bk.reshape(query, [-1] + list(query.shape[2:]))
        if key is not None:
            key = bk.reshape(key, [-1] + list(key.shape[2:]))
    len_q = query.shape[1]
    len_k = key.shape[1] if key is not None else len_q
    # scale shape is `[]` or `[dim]`
    scale = bk.array(scale, dtype=query.dtype)

    # ====== resolve the window width to a number of frames ====== #
    if window_width is None:
        width = len_q
    elif window_width < 1:
        # fractional value: proportion of the source length
        width = window_width * len_k
    else:
        width = window_width
    width = int(width)

    # ====== which modes are enabled ====== #
    is_location = AttentionMechanism.ScoreLocation in self
    is_local_m = PosLocalM in self
    is_local_p = PosLocalP in self

    if is_location:
        # ====== location-based attention: only the query is used ====== #
        if is_local_m or is_local_p:
            raise NotImplementedError(
                "ScoreLocation only support Global attention, but given: %s" %
                str(self))
        # query is [batch_size * num_heads, Tq, dim]
        scores = bk.reduce_mean(scale) * q_proj(query)
        assert scores.shape[-1] == 1, \
            " q_proj must have only 1 hidden unit, but given %d" % scores.shape[-1]
    else:
        # ====== every other scoring mode needs the key tensor ====== #
        if key is None:
            raise ValueError(
                "key must be provided for attention type: %s" % str(self))

        # ------ attention position (local or global) ------ #
        if is_local_m:
            # keep only the last `width` source steps
            key = key[:, -width:]
        elif is_local_p:
            center = bk.sigmoid(target_proj(bk.reshape(query, ([0], -1))))
            assert center.shape[-1] == 1, (
                "target_proj must project the query [., Tq * dim] to [., 1], i.e. "
                "predicting the attention position on source sequence using "
                "knowledge from target sequence.")
            # `[batch_size * num_heads, 1]`
            center = len_k * center
            # `[batch_size * num_heads, Tv]`
            # Eq (10) (Luong et al. 2015)
            offset = bk.arange(len_k, dtype=center.dtype) - center
            gauss_weight = bk.exp(-bk.square(offset) /
                                  (2 * bk.square(width / 2)))
            # `[batch_size * num_heads, 1, Tv]`
            gauss_weight = bk.expand_dims(gauss_weight, axis=1)

        # ------ scoring function ------ #
        if AttentionMechanism.ScoreAdditive in self:
            # additive (concat) method
            # [batch_size * num_heads, Tq, 1, dim]
            q_exp = bk.expand_dims(query, axis=2)
            # [batch_size * num_heads, 1, Tv, dim]
            k_exp = bk.expand_dims(key, axis=1)
            # [batch_size * num_heads, Tq, Tv]
            scores = bk.reduce_sum(scale * bk.tanh(q_exp + k_exp), axis=-1)
        elif AttentionMechanism.ScoreDotProd in self:
            # dot product (multiplicative) scoring; scaling the query first
            # is a trick to make attention_scale broadcastable when
            # scale_tied=False
            key_t = bk.swapaxes(key, 1, 2)
            scores = bk.matmul(scale * query, key_t)
        elif AttentionMechanism.ScoreCosine in self:
            # cosine scoring
            # [batch_size * num_heads, Tq, 1, dim]
            q_exp = bk.expand_dims(query, axis=2)
            # [batch_size * num_heads, 1, Tv, dim]
            k_exp = bk.expand_dims(key, axis=1)
            # [batch_size * num_heads, Tq, Tv, dim]
            scores = (q_exp * k_exp) / (bk.norm(q_exp, p=2) * bk.norm(k_exp, p=2))
            scores = bk.reduce_sum(scale * scores, axis=-1, keepdims=False)
        elif AttentionMechanism.ScoreGeneral in self:
            # general method: only the query is projected
            query = q_proj(query)
            assert query.shape[-1] == key.shape[-1], \
                " q_proj must have %d hidden units, but given %d units" % \
                (key.shape[-1], query.shape[-1])
            key_t = bk.swapaxes(key, 1, 2)
            scores = bk.matmul(scale * query, key_t)
        else:
            raise NotImplementedError(
                "No support for attention_type='%s'" % str(self))

        # ------ apply the local-predictive (Gaussian) weighting ------ #
        if is_local_p:
            scores = scores * gauss_weight

    # ====== restore the leading heads axis ====== #
    if n_heads > 0:
        scores = bk.reshape(scores,
                            shape=[n_heads, -1] + list(scores.shape[1:]))
    return scores
def convolutional_vae(X, saved_states, **kwargs):
    """ convolutional_vae

    Build a convolutional variational auto-encoder on top of the input
    placeholder `X` and wire it into an Edward `KLqp` inference.

    Parameters
    ----------
    X : input placeholder
        its first (batch) dimension must be specified; the encoder reshapes
        it to `(-1, 28, 28, 1)`.
    saved_states : None or (f_inference, f_generative)
        previously created encoder/decoder; if None, new networks are built.
    n : int (keyword argument, default: 10)
        number of latent dimensions; the encoder outputs `2 * n` units
        (the first `n` for the mean, the last `n` for the scale).

    Return
    ------
    (samples, z, qz)
        samples: sigmoid of the decoder logits
        z: the prior `Normal(0, 1)` of shape `(batch_size, n)`
        qz: the approximate posterior `Normal(mu, sigma)` from the encoder

    States
    ------
    (f_inference (encoder), f_generative (decoder))

    Raises
    ------
    ValueError
        if the batch dimension of `X` is None.

    """
    n = kwargs.get('n', 10)
    batch_size = K.get_shape(X)[0]
    if batch_size is None:
        raise ValueError(
            "You must specify batch_size dimension for the input placeholder.")
    # ====== init ====== #
    if saved_states is None:
        # Encoder
        f_inference = N.Sequence([
            N.Reshape(shape=(-1, 28, 28, 1)),
            N.Conv(num_filters=32, filter_size=3, strides=1, pad='valid',
                   b_init=init_ops.constant_initializer(0.),
                   activation=K.elu),
            N.Conv(num_filters=64, filter_size=5, strides=2, pad='same',
                   b_init=init_ops.constant_initializer(0.),
                   activation=K.elu),
            N.Dropout(level=0.1),
            N.Flatten(outdim=2),
            N.Dense(num_units=n * 2, b_init=None),
            N.BatchNorm(axes=0)
        ], debug=True, name='Encoder')
        # Decoder
        f_generative = N.Sequence([
            N.Dimshuffle(pattern=(0, 'x', 'x', 1)),
            N.TransposeConv(num_filters=64, filter_size=3, strides=1,
                            pad='valid',
                            b_init=init_ops.constant_initializer(0.),
                            activation=K.elu),
            N.TransposeConv(num_filters=32, filter_size=5, strides=2,
                            pad='same',
                            b_init=init_ops.constant_initializer(0.),
                            activation=K.elu),
            N.TransposeConv(num_filters=1, filter_size=13, strides=3,
                            pad='valid', b_init=None),
            N.BatchNorm(activation=K.linear),
            N.Flatten(outdim=3)
        ], debug=True, name="Decoder")
    else:
        f_inference, f_generative = saved_states
    # ====== Perform ====== #
    # Encoder: first n outputs are the mean, the rest go through softplus
    # so the scale is always positive
    y_encoder = f_inference(K.cast(X, 'float32'))
    mu = y_encoder[:, :n]
    sigma = K.softplus(y_encoder[:, n:])
    qz = Normal(mu=mu, sigma=sigma, name='Normal_qz')
    # Decoder
    z = Normal(mu=K.zeros(shape=(batch_size, n)),
               sigma=K.ones(shape=(batch_size, n)), name="Normal_pz")
    logits = f_generative(z)
    X_reconstruct = Bernoulli(logits=logits)
    # inference
    params = f_inference.parameters + f_generative.parameters
    inference = ed.KLqp(latent_vars={z: qz}, data={X_reconstruct: X})
    # ====== get cost for training ====== #
    # Bind p(x, z) and q(z | x) to the same placeholder for x.
    if K.is_training():
        import tensorflow as tf
        inference.initialize()
        # The optimizer is driven manually (instead of through
        # `inference.update`) so that only `params` are updated.
        optimizer = tf.train.AdamOptimizer(0.01, epsilon=1.0)
        updates = optimizer.apply_gradients(
            optimizer.compute_gradients(inference.loss, var_list=params))
        init = tf.global_variables_initializer()
        init.run()
        # NOTE(review): the training function is built but never returned
        # to the caller -- confirm whether it should be exposed.
        f_train = K.function(X, inference.loss, updates)
    samples = K.sigmoid(logits)
    return (samples, z, qz), (f_inference, f_generative)
def convolutional_vae(X, saved_states, **kwargs):
    """ convolutional_vae

    Build a convolutional variational auto-encoder on top of the input
    placeholder `X` and wire it into an Edward `KLqp` inference.

    Parameters
    ----------
    X : input placeholder
        its first (batch) dimension must be specified; the encoder reshapes
        it to `(-1, 28, 28, 1)`.
    saved_states : None or (f_inference, f_generative)
        previously created encoder/decoder; if None, new networks are built.
    n : int (keyword argument, default: 10)
        number of latent dimensions; the encoder outputs `2 * n` units
        (the first `n` for the mean, the last `n` for the scale).

    Return
    ------
    (samples, z, qz)
        samples: sigmoid of the decoder logits
        z: the prior `Normal(0, 1)` of shape `(batch_size, n)`
        qz: the approximate posterior `Normal(mu, sigma)` from the encoder

    States
    ------
    (f_inference (encoder), f_generative (decoder))

    Raises
    ------
    ValueError
        if the batch dimension of `X` is None.

    """
    n = kwargs.get('n', 10)
    batch_size = K.get_shape(X)[0]
    if batch_size is None:
        raise ValueError(
            "You must specify batch_size dimension for the input placeholder.")
    # ====== init ====== #
    if saved_states is None:
        # Encoder
        f_inference = N.Sequence([
            N.Reshape(shape=(-1, 28, 28, 1)),
            N.Conv(num_filters=32, filter_size=3, strides=1, pad='valid',
                   b_init=init_ops.constant_initializer(0.),
                   activation=K.elu),
            N.Conv(num_filters=64, filter_size=5, strides=2, pad='same',
                   b_init=init_ops.constant_initializer(0.),
                   activation=K.elu),
            N.Dropout(level=0.1),
            N.Flatten(outdim=2),
            N.Dense(num_units=n * 2, b_init=None),
            N.BatchNorm(axes=0)
        ], debug=True, name='Encoder')
        # Decoder
        f_generative = N.Sequence([
            N.Dimshuffle(pattern=(0, 'x', 'x', 1)),
            N.TransposeConv(num_filters=64, filter_size=3, strides=1,
                            pad='valid',
                            b_init=init_ops.constant_initializer(0.),
                            activation=K.elu),
            N.TransposeConv(num_filters=32, filter_size=5, strides=2,
                            pad='same',
                            b_init=init_ops.constant_initializer(0.),
                            activation=K.elu),
            N.TransposeConv(num_filters=1, filter_size=13, strides=3,
                            pad='valid', b_init=None),
            N.BatchNorm(activation=K.linear),
            N.Flatten(outdim=3)
        ], debug=True, name="Decoder")
    else:
        f_inference, f_generative = saved_states
    # ====== Perform ====== #
    # Encoder: first n outputs are the mean, the rest go through softplus
    # so the scale is always positive
    y_encoder = f_inference(K.cast(X, 'float32'))
    mu = y_encoder[:, :n]
    sigma = K.softplus(y_encoder[:, n:])
    qz = Normal(mu=mu, sigma=sigma, name='Normal_qz')
    # Decoder
    z = Normal(mu=K.zeros(shape=(batch_size, n)),
               sigma=K.ones(shape=(batch_size, n)), name="Normal_pz")
    logits = f_generative(z)
    X_reconstruct = Bernoulli(logits=logits)
    # inference
    params = f_inference.parameters + f_generative.parameters
    inference = ed.KLqp(latent_vars={z: qz}, data={X_reconstruct: X})
    # ====== get cost for training ====== #
    # Bind p(x, z) and q(z | x) to the same placeholder for x.
    if K.is_training():
        import tensorflow as tf
        inference.initialize()
        # The optimizer is driven manually (instead of through
        # `inference.update`) so that only `params` are updated.
        optimizer = tf.train.AdamOptimizer(0.01, epsilon=1.0)
        updates = optimizer.apply_gradients(
            optimizer.compute_gradients(inference.loss, var_list=params))
        init = tf.global_variables_initializer()
        init.run()
        # NOTE(review): the training function is built but never returned
        # to the caller -- confirm whether it should be exposed.
        f_train = K.function(X, inference.loss, updates)
    samples = K.sigmoid(logits)
    return (samples, z, qz), (f_inference, f_generative)