def call(self, x, mask=None):
    x = K.permute_dimensions(x, (0, 2, 1))
    x = K.expand_dims(x, -1)
    output = K.square(K.permute_dimensions(K.squeeze(K.conv2d(x, self.kernel), -1), (0, 2, 1)))
    return output
def call(self, x):
    assert K.backend() == 'tensorflow'
    temp = K.permute_dimensions(x, (0, 2, 1))
    for i in range(0, self.attention_depth):
        temp = K.sigmoid(K.dot(temp, self.Ws[i]) + self.bs[i])
    temp = K.permute_dimensions(temp, (0, 2, 1))
    estimated_weight = K.squeeze(K.dot(temp, K.expand_dims(self.Wf, -1)), -1)
    biased_weight = estimated_weight + self.bias
    non_linear_weight = K.tanh(biased_weight)
    # For each hidden state, calculate how much it should contribute
    # to the context vector. This is the main part of attention.
    # To convert the weights to "probabilities", use a softmax:
    # exp(x) / sum(exp(xi)).
    prob = K.exp(non_linear_weight)
    # Compute the total sum for each batch.
    total_sum = K.sum(prob, axis=1, keepdims=True)
    prob /= K.cast(total_sum, K.floatx())
    # Enable this if you want access to internal probabilities.
    # Should only be used for testing that Attention works as expected.
    # return prob
    # Multiply each hidden value by the corresponding probability.
    prob = K.expand_dims(prob, -1)
    new_hidden_values = x * prob
    return K.sum(new_hidden_values, axis=1)
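# The exp/sum normalization in the layer above is just a softmax over the
# time axis. A minimal numpy sketch of that step (illustrative only):
import numpy as np

w = np.tanh(np.random.rand(2, 5))  # (batch, time), like non_linear_weight
p = np.exp(w) / np.exp(w).sum(axis=1, keepdims=True)
assert np.allclose(p.sum(axis=1), 1.0)  # one probability per timestep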
def get_output(self, train=False):
    H = self.get_input(train)
    X = K.permute_dimensions(H, (1, 0, 2))[-1]

    def reshape(x, states):
        h = K.dot(x, self.W_h) + self.b_h
        return h, []

    _, H, _ = K.rnn(reshape, H, [], mask=None)
    if self.stateful or self.state_input or len(self.state_outputs) > 0:
        initial_states = self.states
    else:
        initial_states = self.get_initial_states(X)
    [outputs, hidden_states, cell_states], updates = theano.scan(
        self._step,
        n_steps=self.output_length,
        outputs_info=[X] + initial_states,
        non_sequences=[H, self.U_i, self.U_f, self.U_o, self.U_c,
                       self.W_i, self.W_f, self.W_c, self.W_o,
                       self.W_x, self.W_a, self.V_i, self.V_f, self.V_c, self.V_o,
                       self.b_i, self.b_f, self.b_c, self.b_o, self.b_x, self.b_a])
    states = [hidden_states[-1], cell_states[-1]]
    if self.stateful and not self.state_input:
        self.updates = []
        for i in range(2):
            self.updates.append((self.states[i], states[i]))
    for o in self.state_outputs:
        o.updates = []
        for i in range(2):
            o.updates.append((o.states[i], states[i]))
    return K.permute_dimensions(outputs, (1, 0, 2))
def attend_function(self, inputs, mask=None):
    # b,n,f -> b,f via b,n broadcasted
    inputs = K.permute_dimensions(inputs, (1, 0, 2))  # assuming it comes from an unroller
    if mask is not None:
        mask = K.permute_dimensions(mask, (1, 0, 2))
    output = super(Accumulator, self).call(inputs, mask)
    return output
def call(self, x, mask=None):
    if self.direction == 'Down':
        X = K.permute_dimensions(x, (0, 3, 1, 2))
    elif self.direction == 'Right':
        X = K.permute_dimensions(x, (0, 2, 1, 3))
    else:
        raise Exception('ERROR: Unknown direction')
    if self.stateful:
        super(DiagLSTM, self).call(X, mask)
    else:
        if self.reverse:
            X = X[:, ::-1, :, :]
    X = Utils.Skew(X)
    res = super(DiagLSTM, self).call(X, mask)
    unskew = Utils.Unskew(res)
    if self.reverse:
        unskew = unskew[:, ::-1, :, :]
    if self.direction == 'Down':
        return K.permute_dimensions(unskew, (0, 2, 3, 1))
    elif self.direction == 'Right':
        return K.permute_dimensions(unskew, (0, 2, 1, 3))
    else:
        raise Exception('ERROR: Unknown direction')
def call(self, X, mask=None):
    # 1D -> 2D
    batch = K.shape(X)[0]
    width = deconv_output_length(K.shape(X)[1], self.filter_length,
                                 self.padding, self.strides[2])
    print("Output width: ", width)
    print("Input shape: ", K.shape(X))
    X = K.expand_dims(X, 2)
    print("Input shape after expand: ", K.shape(X))
    # X = K.permute_dimensions(X, (0, 2, 3, 1))
    X = K.permute_dimensions(X, (0, 2, 1, 3))
    print("Input shape after permute: ", K.shape(X))
    deconv_shape = tf.pack([batch, 1, width, self.nb_filter])
    print("Deconv shape: ", deconv_shape)
    conv_out = tf.nn.conv2d_transpose(X, self.W, strides=self.strides,
                                      padding=self.padding.upper(),
                                      output_shape=deconv_shape)
    output = conv_out + K.reshape(self.b, (1, 1, 1, self.W_shape[2]))
    print("Output shape: ", K.shape(output))
    # output = K.permute_dimensions(output, (0, 3, 1, 2))
    output = K.permute_dimensions(output, (0, 2, 1, 3))
    print("Output shape after permute: ", K.shape(output))
    # 2D -> 1D
    output = K.squeeze(output, 2)
    print("Output shape after squeeze: ", K.shape(output))
    return output
def _step(self, x_tm1, h_tm1, c_tm1, H,
          u_i, u_f, u_o, u_c, w_i, w_f, w_c, w_o, w_x, w_a,
          v_i, v_f, v_c, v_o, b_i, b_f, b_c, b_o, b_x, b_a):
    s_tm1 = K.repeat(c_tm1, self.input_length)
    e = H + s_tm1

    def a(x, states):
        output = K.dot(x, w_a) + b_a
        return output, []

    _, energy, _ = K.rnn(a, e, [], mask=None)
    energy = activations.get('linear')(energy)
    energy = K.permute_dimensions(energy, (2, 0, 1))
    energy = energy[0]
    alpha = K.softmax(energy)
    alpha = K.repeat(alpha, self.hidden_dim)
    alpha = K.permute_dimensions(alpha, (0, 2, 1))
    weighted_H = H * alpha
    v = K.sum(weighted_H, axis=1)
    xi_t = K.dot(x_tm1, w_i) + K.dot(v, v_i) + b_i
    xf_t = K.dot(x_tm1, w_f) + K.dot(v, v_f) + b_f
    xc_t = K.dot(x_tm1, w_c) + K.dot(v, v_c) + b_c
    xo_t = K.dot(x_tm1, w_o) + K.dot(v, v_o) + b_o
    i_t = self.inner_activation(xi_t + K.dot(h_tm1, u_i))
    f_t = self.inner_activation(xf_t + K.dot(h_tm1, u_f))
    c_t = f_t * c_tm1 + i_t * self.activation(xc_t + K.dot(h_tm1, u_c))
    o_t = self.inner_activation(xo_t + K.dot(h_tm1, u_o))
    h_t = o_t * self.activation(c_t)
    x_t = K.dot(h_t, w_x) + b_x
    return x_t, h_t, c_t
def call(self, inputs):
    input_shape = K.int_shape(inputs)
    if len(input_shape) != 4:
        raise ValueError('Inputs should have rank 4; received input shape: '
                         + str(input_shape))
    if self.data_format == 'channels_first':
        batch_size, c, h, w = input_shape
        if batch_size is None:
            batch_size = -1
        rh, rw = self.size
        oh, ow = h * rh, w * rw
        oc = c // (rh * rw)
        out = K.reshape(inputs, (batch_size, rh, rw, oc, h, w))
        out = K.permute_dimensions(out, (0, 3, 4, 1, 5, 2))
        out = K.reshape(out, (batch_size, oc, oh, ow))
        return out
    elif self.data_format == 'channels_last':
        batch_size, h, w, c = input_shape
        if batch_size is None:
            batch_size = -1
        rh, rw = self.size
        oh, ow = h * rh, w * rw
        oc = c // (rh * rw)
        out = K.reshape(inputs, (batch_size, h, w, rh, rw, oc))
        out = K.permute_dimensions(out, (0, 1, 3, 2, 4, 5))
        out = K.reshape(out, (batch_size, oh, ow, oc))
        return out
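# A standalone numpy check of the channels_last branch above (an
# illustrative sketch, not part of the layer): with size rh = rw = 2,
# a (1, h, w, c) input is rearranged to (1, 2h, 2w, c // 4).
import numpy as np

def pixel_shuffle_last(x, rh, rw):
    b, h, w, c = x.shape
    oc = c // (rh * rw)
    out = x.reshape(b, h, w, rh, rw, oc)
    out = out.transpose(0, 1, 3, 2, 4, 5)  # interleave the upscaling factors
    return out.reshape(b, h * rh, w * rw, oc)

x = np.arange(16).reshape(1, 2, 2, 4)
print(pixel_shuffle_last(x, 2, 2).shape)  # (1, 4, 4, 1)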
def call(self, x, mask=None):
    x = K.permute_dimensions(x, (0, 2, 1))
    x = K.reshape(x, (-1, self.input_length))
    x = K.expand_dims(x, 1)
    x = K.expand_dims(x, -1)
    if self.real_filts is not None:
        conv_out_r = K.conv2d(x, self.W_r, strides=self.subsample,
                              border_mode=self.border_mode, dim_ordering='th')
    else:
        conv_out_r = x
    if self.complex_filts is not None:
        conv_out_c1 = K.conv2d(x, self.W_c1, strides=self.subsample,
                               border_mode=self.border_mode, dim_ordering='th')
        conv_out_c2 = K.conv2d(x, self.W_c2, strides=self.subsample,
                               border_mode=self.border_mode, dim_ordering='th')
        conv_out_c = K.sqrt(K.square(conv_out_c1) + K.square(conv_out_c2) + K.epsilon())
        output = K.concatenate((conv_out_r, conv_out_c), axis=1)
    else:
        output = conv_out_r
    output_shape = self.get_output_shape_for((None, self.input_length, self.input_dim))
    output = K.squeeze(output, 3)  # remove the dummy 3rd dimension
    output = K.permute_dimensions(output, (2, 1, 0))
    output = K.reshape(output, (-1, output_shape[1], output.shape[1] * output.shape[2]))
    return output
def reverse(x):
    # Reverse a tensor along its time (second) axis. For 2D input, add a
    # dummy feature axis first so the permute pattern stays rank-3, and
    # squeeze it back out at the end (the original squeezed before the
    # final permute, which would fail on a rank-2 tensor).
    if K.ndim(x) == 2:
        x = K.expand_dims(x, -1)
        rev = K.permute_dimensions(x, (1, 0, 2))[::-1]
        rev = K.permute_dimensions(rev, (1, 0, 2))
        return K.squeeze(rev, -1)
    rev = K.permute_dimensions(x, (1, 0, 2))[::-1]
    return K.permute_dimensions(rev, (1, 0, 2))
def semantic_matrix(argv):
    assert len(argv) == 2
    q = argv[0]
    a = argv[1]
    q_sqrt = K.sqrt((q ** 2).sum(axis=2, keepdims=True))
    a_sqrt = K.sqrt((a ** 2).sum(axis=2, keepdims=True))
    denominator = K.batch_dot(q_sqrt, K.permute_dimensions(a_sqrt, [0, 2, 1]))
    return K.batch_dot(q, K.permute_dimensions(a, [0, 2, 1])) / (denominator + SAFE_EPSILON)
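# semantic_matrix computes a batched cosine-similarity matrix: entry
# (b, i, j) is the cosine of q[b, i] and a[b, j]. A numpy sketch of the
# same math (1e-8 stands in for the SAFE_EPSILON constant):
import numpy as np

q = np.random.rand(2, 5, 8)  # (batch, q_len, dim)
a = np.random.rand(2, 7, 8)  # (batch, a_len, dim)
num = np.einsum('bid,bjd->bij', q, a)
den = (np.linalg.norm(q, axis=2, keepdims=True)
       * np.linalg.norm(a, axis=2, keepdims=True).transpose(0, 2, 1))
sim = num / (den + 1e-8)  # (batch, q_len, a_len)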
def call(self, X, mask=None):
    # X = self.get_input(train)
    X = K.permute_dimensions(X, (0, 2, 3, 1))
    conv_out = tf.nn.conv2d_transpose(X, self.W, strides=self.strides,
                                      padding=self.padding.upper(),
                                      output_shape=self.deconv_shape)
    output = conv_out + K.reshape(self.b, (1, 1, 1, self.W_shape[2]))
    return K.permute_dimensions(output, (0, 3, 1, 2))
def call(self, x, mask=None):
    stride_row, stride_col = self.subsample
    _, feature_dim, nb_filter = self.W_shape

    if self.dim_ordering == 'th':
        if K._backend == 'theano':
            output = []
            for i in range(self.output_row):
                for j in range(self.output_col):
                    slice_row = slice(i * stride_row, i * stride_row + self.nb_row)
                    slice_col = slice(j * stride_col, j * stride_col + self.nb_col)
                    x_flatten = K.reshape(x[:, :, slice_row, slice_col], (1, -1, feature_dim))
                    output.append(K.dot(x_flatten, self.W[i * self.output_col + j, :, :]))
            output = K.concatenate(output, axis=0)
        else:
            xs = []
            for i in range(self.output_row):
                for j in range(self.output_col):
                    slice_row = slice(i * stride_row, i * stride_row + self.nb_row)
                    slice_col = slice(j * stride_col, j * stride_col + self.nb_col)
                    xs.append(K.reshape(x[:, :, slice_row, slice_col], (1, -1, feature_dim)))
            x_aggregate = K.concatenate(xs, axis=0)
            output = K.batch_dot(x_aggregate, self.W)
        output = K.reshape(output, (self.output_row, self.output_col, -1, nb_filter))
        output = K.permute_dimensions(output, (2, 3, 0, 1))
    elif self.dim_ordering == 'tf':
        xs = []
        for i in range(self.output_row):
            for j in range(self.output_col):
                slice_row = slice(i * stride_row, i * stride_row + self.nb_row)
                slice_col = slice(j * stride_col, j * stride_col + self.nb_col)
                xs.append(K.reshape(x[:, slice_row, slice_col, :], (1, -1, feature_dim)))
        x_aggregate = K.concatenate(xs, axis=0)
        output = K.batch_dot(x_aggregate, self.W)
        output = K.reshape(output, (self.output_row, self.output_col, -1, nb_filter))
        output = K.permute_dimensions(output, (2, 0, 1, 3))
    else:
        raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

    if self.bias:
        if self.dim_ordering == 'th':
            output += K.reshape(self.b, (1, nb_filter, self.output_row, self.output_col))
        elif self.dim_ordering == 'tf':
            output += K.reshape(self.b, (1, self.output_row, self.output_col, nb_filter))
        else:
            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

    output = self.activation(output)
    return output
def f(X):
    b, ch, r, c = X.shape  # batch, channel, row, column
    half = n // 2
    square = K.square(X)
    extra_channels = K.spatial_2d_padding(
        K.permute_dimensions(square, (0, 2, 3, 1)), (0, half))
    extra_channels = K.permute_dimensions(extra_channels, (0, 3, 1, 2))
    scale = k
    for i in range(n):
        scale += alpha * extra_channels[:, i:i + ch, :, :]
    scale = scale ** beta
    return X / scale
def Skew(inputs):
    inputs_ = K.permute_dimensions(inputs, (3, 0, 1, 2))
    buffer_ = T.zeros((K.shape(inputs)[3], K.shape(inputs)[0],
                       K.shape(inputs)[1] + K.shape(inputs)[3] - 1,
                       K.shape(inputs)[2]))

    def fnc(buf, inp, i):
        return T.set_subtensor(buf[:, i:i + K.shape(inputs)[1], :], inp[:, :, :])

    res, _ = theano.scan(fn=fnc,
                         sequences=[buffer_, inputs_, T.arange(K.shape(inputs)[3])])
    res = K.permute_dimensions(res, (1, 2, 3, 0))
    return res
def call(self, x, mask=None):
    if isinstance(x, list):
        x, _ = x
    if mask is not None and isinstance(mask, list):
        mask, _ = mask
    if 0. < self.dropout < 1.:
        retain_p = 1. - self.dropout
        dims = self.W._keras_shape[:-1]
        B = K.random_binomial(dims, p=retain_p) * (1. / retain_p)
        B = K.expand_dims(B)
        W = K.in_train_phase(self.W * B, self.W)
    else:
        W = self.W
    if self.mode == 'matrix':
        return K.gather(W, x)
    elif self.mode == 'tensor':
        # quick and dirty: only allowing for 3dim inputs when it's tensor mode
        assert K.ndim(x) == 3
        # put sequence on first; gather; take diagonal across shared batch dimension
        # in other words, W is (B, S, F)
        # incoming x is (B, S, A)
        inds = K.arange(self.W._keras_shape[0])
        # out = K.gather(K.permute_dimensions(W, (1, 0, 2)), x).diagonal(axis1=0, axis2=3)
        # return K.permute_dimensions(out, (3, 0, 1, 2))
        # method above doesn't do grads =.=
        # tensor abc goes to bac, indexed onto with xyz, goes to xyzac,
        # x == a, so shape to xayzc == xxyzc
        # take diagonal on first two: xyzc
        # out = K.colgather()
        out = K.gather(K.permute_dimensions(W, (1, 0, 2)), x)
        out = K.permute_dimensions(out, (0, 3, 1, 2, 4))
        out = K.gather(out, (inds, inds))
        return out
    else:
        raise Exception('sanity check. should not be here.')
    # all_dims = T.arange(len(self.W._keras_shape))
    # first_shuffle = [all_dims[self.embed_dim]] + all_dims[:self.embed_dim] + all_dims[self.embed_dim + 1:]
    ## 1. take diagonal from 0th to
    ## change of tactics
    ## embed on time or embed on batch. that's all I'm supporting.
    ## if it's embed on time, then, x.ndim+1 is where batch will be, and is what
    ## i need to take the diagonal over.
    ## now dim shuffle the xdims + 1 to the front.
    # todo: get second shuffle or maybe find diagonal calculations
    # out = K.gather(W, x)
    # return out
    ### reference
    # A = S(np.arange(60).reshape(3, 4, 5))
    # x = S(np.random.randint(0, 4, (3, 4, 10)))
    # x_emb = A.dimshuffle(1, 0, 2)[x].dimshuffle(0, 3, 1, 2, 4)[T.arange(A.shape[0]), T.arange(A.shape[0])]
def call(self, x, mask=None):
    x = K.permute_dimensions(x, (0, 2, 1))
    x = K.expand_dims(x, -1)
    conv_out = K.permute_dimensions(K.squeeze(K.conv2d(x, self.kernel), -1), (0, 2, 1))
    conv_out_s = conv_out[:, :, :self.nb_simple]
    conv_out_c = K.square(conv_out[:, :, self.nb_simple:])
    output = K.concatenate((conv_out_s, conv_out_c), axis=-1)
    return output
def get_initial_states(self, x):
    M = K.zeros_like(x[:, 0, 0])  # (nb_samples,)
    M = K.pack([M] * self.nb_slots)  # (nb_slots, nb_samples)
    M = K.pack([M] * self.memory_size)  # (memory_size, nb_slots, nb_samples)
    M = K.permute_dimensions(M, (2, 1, 0))  # (nb_samples, nb_slots, memory_size)
    h = K.zeros_like(x[:, 0, 0])  # (nb_samples,)
    h = K.pack([h] * self.memory_size)  # (memory_size, nb_samples)
    h = K.permute_dimensions(h, (1, 0))  # (nb_samples, memory_size)
    w = K.zeros_like(x[:, 0, 0])  # (nb_samples,)
    w = K.pack([w] * self.nb_slots)  # (nb_slots, nb_samples)
    w = K.permute_dimensions(w, (1, 0))  # (nb_samples, nb_slots)
    states = [M, h, w]
    return states
def call(self, X, mask=None):
    # 1D -> 2D
    X = K.expand_dims(X, 2)
    X = K.permute_dimensions(X, (0, 2, 3, 1))
    conv_out = tf.nn.conv2d_transpose(X, self.W, strides=self.strides,
                                      padding=self.padding.upper(),
                                      output_shape=self.deconv_shape)
    output = conv_out + K.reshape(self.b, (1, 1, 1, self.W_shape[2]))
    output = K.permute_dimensions(output, (0, 3, 1, 2))
    # 2D -> 1D
    output = K.squeeze(output, 2)
    return output
def get_output(self, train=False):
    X = self.get_input(train)  # the original `X = train` assigned the flag itself
    X = K.expand_dims(X, -1)  # add a dimension on the right
    X = K.permute_dimensions(X, (0, 2, 1, 3))
    conv_out = K.conv2d(X, self.W, strides=self.subsample,
                        border_mode=self.border_mode, dim_ordering='th')
    output = conv_out + K.reshape(self.b, (1, self.nb_filter, 1, 1))
    output = self.activation(output)
    output = K.squeeze(output, 3)  # remove the dummy 3rd dimension
    output = K.permute_dimensions(output, (0, 2, 1))
    return output
def call(self, x, mask=None):
    if self.direction == 'Down':
        X = K.permute_dimensions(x, (0, 2, 1, 3))
    elif self.direction == 'Right':
        X = K.permute_dimensions(x, (0, 3, 1, 2))
    else:
        raise Exception('ERROR: Unknown direction')
    if self.direction == 'Down':
        return K.permute_dimensions(super(PyramidSTM, self).call(X, mask), (0, 2, 1, 3))
    elif self.direction == 'Right':
        return K.permute_dimensions(super(PyramidSTM, self).call(X, mask), (0, 2, 3, 1))
    else:
        raise Exception('ERROR: Unknown direction')
def recurrence(y_i, h):
    h_permute = K.permute_dimensions(h, [0, 2, 1])  # (batch_size, encoding_dim, input_length)
    e = K.l2_normalize(
        K.batch_dot(h_permute, s, axes=1),  # (batch_size, input_length)
        axis=1)  # (batch_size, input_length)
    # eqn 6
    alpha = K.softmax(e)  # (batch_size, input_length)
    # eqn 5
    c = K.batch_dot(h, alpha, axes=1)  # (batch_size, encoding_dim)
    recurrence_result = K.expand_dims(
        K.concatenate([c, y_i], axis=1),
        dim=1)  # (batch_size, 1, 2 * encoding_dim)
    expanded_h = Input(shape=(1, 2 * encoding_dim), name='expanded_h')
    gru = Sequential([
        GRU(output_dim, return_sequences=False, input_shape=(1, 2 * encoding_dim))
    ])
    model = Model(input=[expanded_h], output=[gru(expanded_h)])  # (batch_size, 1, output_dim)
    return model(recurrence_result)
def get_output(self, train=False):
    v = self.get_input(train)
    if self.stateful or self.state_input or len(self.state_outputs) > 0:
        initial_states = self.states
    else:
        initial_states = self.get_initial_states(v)
    [outputs, hidden_states, cell_states], updates = theano.scan(
        self._step,
        n_steps=self.output_length,
        outputs_info=[v] + initial_states,
        non_sequences=[v, self.U_i, self.U_f, self.U_o, self.U_c,
                       self.W_i, self.W_f, self.W_c, self.W_o, self.W_x,
                       self.V_i, self.V_f, self.V_c, self.V_o,
                       self.b_i, self.b_f, self.b_c, self.b_o, self.b_x])
    states = [hidden_states[-1], cell_states[-1]]
    if self.stateful and not self.state_input:
        self.updates = []
        for i in range(2):
            self.updates.append((self.states[i], states[i]))
    for o in self.state_outputs:
        o.updates = []
        for i in range(2):
            o.updates.append((o.states[i], states[i]))
    return K.permute_dimensions(outputs, (1, 0, 2))
def gram_matrix(x):
    # Change (height, width, depth) to (depth, height, width); (2, 1, 0)
    # would work too, but (2, 0, 1) may be more efficient due to the
    # underlying memory layout.
    features = K.permute_dimensions(x, (2, 0, 1))
    # batch_flatten makes features a 2D array.
    features = K.batch_flatten(features)
    return K.dot(features, K.transpose(features)) / x.get_shape().num_elements()
def teacher_forced(h, states):
    # switching from (batch_size, previous_layer_input|true_input, output_dim)
    # to (previous_layer_input|true_input, batch_size, output_dim)
    axes = [1, 0] + list(range(2, K.ndim(h)))
    h = K.permute_dimensions(h, axes)
    prev_layer_input = h[0:1, :, :]
    true_input = h[1:, :, :self.units]  # this should correspond to the true input
    prev_sampled_output = true_input
    if self.implementation == 0:
        x_z = prev_layer_input[0, :, :self.units]
        x_r = prev_layer_input[0, :, self.units: 2 * self.units]
        x_h = prev_layer_input[0, :, 2 * self.units:]
    else:
        raise ValueError('Implementation type ' + str(self.implementation) + ' is invalid')
    z = self.recurrent_activation(x_z + K.dot(h_tm1 * rec_dp_mask[0],
                                              self.recurrent_kernel_z))
    r = self.recurrent_activation(x_r + K.dot(h_tm1 * rec_dp_mask[1],
                                              self.recurrent_kernel_r))
    hh = self.activation(x_h + K.dot(r * h_tm1 * rec_dp_mask[2],
                                     self.recurrent_kernel_h)
                         + K.dot(r * prev_sampled_output, self.recurrent_kernel_y))
    output = z * h_tm1 + (1. - z) * hh
    return K.stack([output, output])
def make_patches_grid(x, patch_size, patch_stride):
    '''Break image `x` up into a grid of patches.

    input shape: (channels, rows, cols)
    output shape: (rows, cols, channels, patch_rows, patch_cols)
    '''
    from theano.tensor.nnet.neighbours import images2neibs  # TODO: all K, no T
    x = K.expand_dims(x, 0)
    xs = K.shape(x)
    num_rows = 1 + (xs[-2] - patch_size) // patch_stride
    num_cols = 1 + (xs[-1] - patch_size) // patch_stride
    num_channels = xs[-3]
    patches = images2neibs(x,
                           (patch_size, patch_size),
                           (patch_stride, patch_stride),
                           mode='valid')
    # neibs are sorted per-channel
    patches = K.reshape(patches,
                        (num_channels, K.shape(patches)[0] // num_channels,
                         patch_size, patch_size))
    patches = K.permute_dimensions(patches, (1, 0, 2, 3))
    # arrange in a 2d-grid (rows, cols, channels, px, py)
    patches = K.reshape(patches,
                        (num_rows, num_cols, num_channels, patch_size, patch_size))
    patches_norm = K.sqrt(K.sum(K.square(patches), axis=(2, 3, 4), keepdims=True))
    return patches, patches_norm
def gram_matrix(x):
    if K.image_dim_ordering() == "th":
        features = K.batch_flatten(x)
    else:
        features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
    gram = K.dot(features, K.transpose(features))
    return gram
def call(self, inputs, **kwargs):
    """Following the routing algorithm from Hinton's paper,
    but replace b = b + <u,v> with b = <u,v>.
    This change can improve the feature representation of the capsule.

    However, you can replace
        b = K.batch_dot(outputs, hat_inputs, [2, 3])
    with
        b += K.batch_dot(outputs, hat_inputs, [2, 3])
    to get standard routing.
    """
    if self.share_weights:
        hat_inputs = K.conv1d(inputs, self.kernel)
    else:
        hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

    batch_size = K.shape(inputs)[0]
    input_num_capsule = K.shape(inputs)[1]
    hat_inputs = K.reshape(hat_inputs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

    b = K.zeros_like(hat_inputs[:, :, :, 0])
    print(self.routings)
    for i in range(self.routings):
        c = K.softmax(b, 1)
        o = self.activation(K.batch_dot(c, hat_inputs, [2, 2]))
        if i < self.routings - 1:
            b = K.batch_dot(o, hat_inputs, [2, 3])
            if K.backend() == 'theano':
                o = K.sum(o, axis=1)
    return o
def call(self, x, mask=None):
    print("AttentionDecoder.call")
    H = x
    x = K.permute_dimensions(H, (1, 0, 2))[-1, :, :]
    if self.stateful or self.state_input or len(self.state_outputs) > 0:
        initial_states = self.states[:]
    else:
        initial_states = self.get_initial_states(H)
    constants = self.get_constants(H) + [H]
    y_0 = x
    x = K.repeat(x, self.output_length)
    initial_states += [y_0]
    last_output, outputs, states = K.rnn(
        self.step, x, initial_states,
        go_backwards=self.go_backwards,
        mask=mask, constants=constants,
        unroll=self.unroll, input_length=self.output_length)
    if self.stateful and not self.state_input:
        self.updates = zip(self.states, states)
    self.states_to_transfer = states
    return outputs
def free_running(h, states):
    prev_generated_output = initial_states[0][1:, :, :]
    prev_sampled_output = prev_generated_output
    # switching from (batch_size, previous_layer_input|true_input, output_dim)
    # to (previous_layer_input|true_input, batch_size, output_dim)
    axes = [1, 0] + list(range(2, K.ndim(h)))
    h = K.permute_dimensions(h, axes)
    prev_layer_input = h[0:1, :, :]
    if self.implementation == 0:
        x_z = prev_layer_input[0, :, :self.units]
        x_r = prev_layer_input[0, :, self.units: 2 * self.units]
        x_h = prev_layer_input[0, :, 2 * self.units:]
    z = self.recurrent_activation(x_z + K.dot(h_tm1 * rec_dp_mask[0],
                                              self.recurrent_kernel_z))
    r = self.recurrent_activation(x_r + K.dot(h_tm1 * rec_dp_mask[1],
                                              self.recurrent_kernel_r))
    hh = self.activation(x_h + K.dot(r * h_tm1 * rec_dp_mask[2],
                                     self.recurrent_kernel_h)
                         + K.dot(r * prev_sampled_output, self.recurrent_kernel_y))
    output = z * h_tm1 + (1. - z) * hh
    final_output = self.output_sampling(output, random_cutoff_vec)
    return K.stack([output, final_output])
def _outer(AB):
    att_ji = K.batch_dot(AB[1], K.permute_dimensions(AB[0], (0, 2, 1)))
    return K.permute_dimensions(att_ji, (0, 2, 1))
def call(self, x, mask=None):
    assert len(x) == 2
    img = x[0]
    rois = x[1]
    input_shape = K.shape(img)
    outputs = []

    for roi_idx in range(self.num_rois):
        x = rois[0, roi_idx, 0]
        y = rois[0, roi_idx, 1]
        w = rois[0, roi_idx, 2]
        h = rois[0, roi_idx, 3]

        row_length = w / float(self.pool_size)
        col_length = h / float(self.pool_size)
        num_pool_regions = self.pool_size

        # NOTE: the RoiPooling implementation differs between theano and
        # tensorflow due to the lack of a resize op in theano. The theano
        # implementation is much less efficient and leads to long compile times.
        if self.dim_ordering == 'channels_first':
            for jy in range(num_pool_regions):
                for ix in range(num_pool_regions):
                    x1 = x + ix * row_length
                    x2 = x1 + row_length
                    y1 = y + jy * col_length
                    y2 = y1 + col_length

                    x1 = K.cast(x1, 'int32')
                    x2 = K.cast(x2, 'int32')
                    y1 = K.cast(y1, 'int32')
                    y2 = K.cast(y2, 'int32')

                    # guarantee each pooling region is at least one pixel wide/tall
                    x2 = x1 + K.maximum(1, x2 - x1)
                    y2 = y1 + K.maximum(1, y2 - y1)

                    new_shape = [input_shape[0], input_shape[1], y2 - y1, x2 - x1]
                    x_crop = img[:, :, y1:y2, x1:x2]
                    xm = K.reshape(x_crop, new_shape)
                    pooled_val = K.max(xm, axis=(2, 3))
                    outputs.append(pooled_val)
        elif self.dim_ordering == 'channels_last':
            x = K.cast(x, 'int32')
            y = K.cast(y, 'int32')
            w = K.cast(w, 'int32')
            h = K.cast(h, 'int32')
            rs = tf.image.resize(img[:, y:y + h, x:x + w, :],
                                 size=(self.pool_size, self.pool_size))
            outputs.append(rs)

    final_output = K.concatenate(outputs, axis=0)
    final_output = K.reshape(final_output,
                             (1, self.num_rois, self.pool_size, self.pool_size, self.nb_channels))
    if self.dim_ordering == 'channels_first':
        final_output = K.permute_dimensions(final_output, (0, 1, 4, 2, 3))
    else:
        # identity permutation: the tensor is already in the desired order
        final_output = K.permute_dimensions(final_output, (0, 1, 2, 3, 4))
    return final_output
def local_conv3d(self, inputs, kernel, kernel_size, strides, output_shape, data_format=None):
    """Apply 3D conv with un-shared weights.

    # Arguments
        inputs: 5D tensor with shape:
            (batch_size, filters, new_rows, new_cols, new_z)
            if data_format='channels_first', or 5D tensor with shape:
            (batch_size, new_rows, new_cols, new_z, filters)
            if data_format='channels_last'.
        kernel: the unshared weight for convolution,
            with shape (output_items, feature_dim, filters).
        kernel_size: a tuple of 3 integers, specifying the
            dimensions of the 3D convolution window.
        strides: a tuple of 3 integers, specifying the strides
            of the convolution along each spatial dimension.
        output_shape: a tuple with (output_row, output_col, output_z).
        data_format: the data format, channels_first or channels_last.

    # Returns
        A 5D tensor with shape:
        (batch_size, filters, new_rows, new_cols, new_z)
        if data_format='channels_first', or 5D tensor with shape:
        (batch_size, new_rows, new_cols, new_z, filters)
        if data_format='channels_last'.

    # Raises
        ValueError: if `data_format` is neither
            `channels_last` nor `channels_first`.
    """
    if data_format is None:
        data_format = K.image_data_format()
    if data_format not in {'channels_first', 'channels_last'}:
        raise ValueError('Unknown data_format: ' + str(data_format))

    stride_row, stride_col, stride_z = strides
    output_row, output_col, output_z = output_shape
    kernel_shape = K.int_shape(kernel)
    _, feature_dim, filters = kernel_shape

    xs = []
    for i in range(output_row):
        for j in range(output_col):
            for k in range(output_z):
                slice_row = slice(i * stride_row, i * stride_row + kernel_size[0])
                slice_col = slice(j * stride_col, j * stride_col + kernel_size[1])
                slice_z = slice(k * stride_z, k * stride_z + kernel_size[2])
                if data_format == 'channels_first':
                    xs.append(K.reshape(inputs[:, :, slice_row, slice_col, slice_z],
                                        (1, -1, feature_dim)))
                else:
                    xs.append(K.reshape(inputs[:, slice_row, slice_col, slice_z, :],
                                        (1, -1, feature_dim)))
    x_aggregate = K.concatenate(xs, axis=0)
    output = K.batch_dot(x_aggregate, kernel)
    output = K.reshape(output, (output_row, output_col, output_z, -1, filters))
    if data_format == 'channels_first':
        output = K.permute_dimensions(output, (3, 4, 0, 1, 2))
    else:
        output = K.permute_dimensions(output, (3, 0, 1, 2, 4))
    return output
def minibatch_discriminator(x):
    """Computes minibatch discrimination features from input tensor x."""
    diffs = K.expand_dims(x, 3) - \
        K.expand_dims(K.permute_dimensions(x, [1, 2, 0]), 0)
    l1_norm = K.sum(K.abs(diffs), axis=2)
    return K.sum(K.exp(-l1_norm), axis=2)
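# Minibatch discrimination (Salimans et al., 2016) lets a GAN
# discriminator compare samples within a batch and so detect mode
# collapse. A numpy sketch of the exact computation above (illustrative):
import numpy as np

x = np.random.rand(4, 3, 5)  # (batch, features, kernels)
diffs = x[:, :, :, None] - x.transpose(1, 2, 0)[None]  # (batch, f, k, batch)
l1 = np.abs(diffs).sum(axis=2)   # L1 distance per feature, per sample pair
feats = np.exp(-l1).sum(axis=2)  # (batch, features)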
def call(self, inputs):
    X = inputs[0]  # Node features (batch x N x F)
    A = inputs[1]  # Adjacency matrix (batch x N x N)
    assert K.ndim(X) == 3
    assert K.ndim(A) == 3

    outputs = []
    for h in range(self.attn_heads):
        kernel = self.kernels[h]
        attn_kernel_self = self.attn_kernels_self[h]
        attn_kernel_neighs = self.attn_kernels_neighs[h]
        if self.use_bias:
            bias = self.biases[h]

        # Compute inputs to attention network
        features = K.dot(X, kernel)  # (batch x N x F')

        # Compute feature combinations
        # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_j]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
        # broadcast the attention kernel across all batches and nodes
        attn_for_self = K.dot(features, attn_kernel_self)
        attn_for_neighs = K.dot(features, attn_kernel_neighs)

        # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
        trans_attn_for_neighs = K.permute_dimensions(attn_for_neighs, (0, 2, 1))
        # add dimensions to compute additive attention with broadcasting
        scores = attn_for_self + trans_attn_for_neighs  # (batch x N x N) via broadcasting

        # Add nonlinearity
        scores = LeakyReLU(alpha=0.2)(scores)

        # Mask values before activation (Vaswani et al., 2017)
        mask = (1.0 - A) * -10e9
        scores = scores + mask

        # Feed masked values to softmax
        attn_weights = K.softmax(scores)  # (batch x N x N), attention coefficients
        dropout_attn_coeffs = Dropout(self.attn_dropout)(attn_weights)  # (batch x N x N)
        dropout_features = Dropout(self.feature_dropout)(features)

        # Linear combination with neighbors' features
        # (batch x N x N) * (batch x N x F') = (batch x N x F')
        node_features = K.batch_dot(dropout_attn_coeffs, dropout_features)

        if self.use_bias:
            node_features = K.bias_add(node_features, bias)
        outputs.append(node_features)

    # Reduce the attention heads output according to the reduction method
    if self.attn_heads_reduction == 'concat':
        output = K.concatenate(outputs, -1)  # (batch x N x KF')
    else:
        output = K.mean(K.stack(outputs, axis=0), axis=0)  # (batch x N x F')
    output = self.activation(output)
    return output
def gram_matrix(x):
    features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
    return K.dot(features, K.transpose(features)) / x.get_shape().num_elements()
def call(self, inputs):
    channel_axis = 1 if self.data_format == 'channels_first' else -1
    input_dim = K.shape(inputs)[channel_axis] // 2
    if self.rank == 1:
        f_real = self.kernel[:, :, :self.filters]
        f_imag = self.kernel[:, :, self.filters:]
    elif self.rank == 2:
        f_real = self.kernel[:, :, :, :self.filters]
        f_imag = self.kernel[:, :, :, self.filters:]
    elif self.rank == 3:
        f_real = self.kernel[:, :, :, :, :self.filters]
        f_imag = self.kernel[:, :, :, :, self.filters:]

    convArgs = {"strides": self.strides[0] if self.rank == 1 else self.strides,
                "padding": self.padding,
                "data_format": self.data_format,
                "dilation_rate": self.dilation_rate[0] if self.rank == 1 else self.dilation_rate}
    if self.transposed:
        convFunc = {1: K.conv1d_transpose,
                    2: K.conv2d_transpose,
                    3: K.conv3d_transpose}[self.rank]
    else:
        convFunc = {1: K.conv1d,
                    2: K.conv2d,
                    3: K.conv3d}[self.rank]

    # processing if the weights are assumed to be represented in the
    # spectral domain
    if self.spectral_parametrization:
        if self.rank == 1:
            f_real = K.permute_dimensions(f_real, (2, 1, 0))
            f_imag = K.permute_dimensions(f_imag, (2, 1, 0))
            f = K.concatenate([f_real, f_imag], axis=0)
            fshape = K.shape(f)
            f = K.reshape(f, (fshape[0] * fshape[1], fshape[2]))
            f = ifft(f)
            f = K.reshape(f, fshape)
            f_real = f[:fshape[0] // 2]
            f_imag = f[fshape[0] // 2:]
            f_real = K.permute_dimensions(f_real, (2, 1, 0))
            f_imag = K.permute_dimensions(f_imag, (2, 1, 0))
        elif self.rank == 2:
            f_real = K.permute_dimensions(f_real, (3, 2, 0, 1))
            f_imag = K.permute_dimensions(f_imag, (3, 2, 0, 1))
            f = K.concatenate([f_real, f_imag], axis=0)
            fshape = K.shape(f)
            f = K.reshape(f, (fshape[0] * fshape[1], fshape[2], fshape[3]))
            f = ifft2(f)
            f = K.reshape(f, fshape)
            f_real = f[:fshape[0] // 2]
            f_imag = f[fshape[0] // 2:]
            f_real = K.permute_dimensions(f_real, (2, 3, 1, 0))
            f_imag = K.permute_dimensions(f_imag, (2, 3, 1, 0))

    # In case of weight normalization, real and imaginary weights are
    # normalized
    if self.normalize_weight:
        ker_shape = self.kernel_shape
        nb_kernels = ker_shape[-2] * ker_shape[-1]
        kernel_shape_4_norm = (np.prod(self.kernel_size), nb_kernels)
        reshaped_f_real = K.reshape(f_real, kernel_shape_4_norm)
        reshaped_f_imag = K.reshape(f_imag, kernel_shape_4_norm)
        reduction_axes = list(range(2))
        del reduction_axes[-1]
        mu_real = K.mean(reshaped_f_real, axis=reduction_axes)
        mu_imag = K.mean(reshaped_f_imag, axis=reduction_axes)
        broadcast_mu_shape = [1] * 2
        broadcast_mu_shape[-1] = nb_kernels
        broadcast_mu_real = K.reshape(mu_real, broadcast_mu_shape)
        broadcast_mu_imag = K.reshape(mu_imag, broadcast_mu_shape)
        reshaped_f_real_centred = reshaped_f_real - broadcast_mu_real
        reshaped_f_imag_centred = reshaped_f_imag - broadcast_mu_imag
        Vrr = K.mean(reshaped_f_real_centred ** 2, axis=reduction_axes) + self.epsilon
        Vii = K.mean(reshaped_f_imag_centred ** 2, axis=reduction_axes) + self.epsilon
        Vri = K.mean(reshaped_f_real_centred * reshaped_f_imag_centred,
                     axis=reduction_axes) + self.epsilon
        normalized_weight = complex_normalization(
            K.concatenate([reshaped_f_real, reshaped_f_imag], axis=-1),
            Vrr, Vii, Vri,
            beta=None,
            gamma_rr=self.gamma_rr,
            gamma_ri=self.gamma_ri,
            gamma_ii=self.gamma_ii,
            scale=True,
            center=False,
            axis=-1)
        normalized_real = normalized_weight[:, :nb_kernels]
        normalized_imag = normalized_weight[:, nb_kernels:]
        f_real = K.reshape(normalized_real, self.kernel_shape)
        f_imag = K.reshape(normalized_imag, self.kernel_shape)

    # Performing complex convolution
    f_real._keras_shape = self.kernel_shape
    f_imag._keras_shape = self.kernel_shape

    cat_kernels_4_real = K.concatenate([f_real, -f_imag], axis=-2)
    cat_kernels_4_imag = K.concatenate([f_imag, f_real], axis=-2)
    cat_kernels_4_complex = K.concatenate([cat_kernels_4_real, cat_kernels_4_imag], axis=-1)
    cat_kernels_4_complex._keras_shape = self.kernel_size + (2 * input_dim, 2 * self.filters)

    output = convFunc(inputs, cat_kernels_4_complex, **convArgs)

    if self.use_bias:
        output = K.bias_add(output, self.bias, data_format=self.data_format)

    if self.activation is not None:
        output = self.activation(output)

    return output
def gram_matrix(x):
    features = backend.batch_flatten(backend.permute_dimensions(x, (2, 0, 1)))
    gram = backend.dot(features, backend.transpose(features))
    return gram
def _interpolate(image, sampled_grids, output_size):
    image = K.permute_dimensions(image, (0, 2, 3, 1))
    batch_size = K.shape(image)[0]
    height = K.shape(image)[1]
    width = K.shape(image)[2]
    num_channels = K.shape(image)[3]

    x = K.cast(K.flatten(sampled_grids[:, 0:1, :]), dtype='float32')
    y = K.cast(K.flatten(sampled_grids[:, 1:2, :]), dtype='float32')
    x = .5 * (x + 1.0) * K.cast(width, dtype='float32')
    y = .5 * (y + 1.0) * K.cast(height, dtype='float32')

    x0 = K.cast(x, 'int32')
    x1 = x0 + 1
    y0 = K.cast(y, 'int32')
    y1 = y0 + 1

    max_x = int(K.int_shape(image)[2] - 1)
    max_y = int(K.int_shape(image)[1] - 1)
    x0 = K.clip(x0, 0, max_x)
    x1 = K.clip(x1, 0, max_x)
    y0 = K.clip(y0, 0, max_y)
    y1 = K.clip(y1, 0, max_y)

    pixels_batch = K.arange(0, batch_size) * (height * width)
    pixels_batch = K.expand_dims(pixels_batch, axis=-1)
    flat_output_size = output_size[0] * output_size[1]
    base = K.repeat_elements(pixels_batch, flat_output_size, axis=1)
    base = K.flatten(base)

    # base_y0 = base + (y0 * width)
    base_y0 = y0 * width
    base_y0 = base + base_y0
    # base_y1 = base + (y1 * width)
    base_y1 = y1 * width
    base_y1 = base_y1 + base

    indices_a = base_y0 + x0
    indices_b = base_y1 + x0
    indices_c = base_y0 + x1
    indices_d = base_y1 + x1

    flat_image = K.reshape(image, shape=(-1, num_channels))
    flat_image = K.cast(flat_image, dtype='float32')
    pixel_values_a = K.gather(flat_image, indices_a)
    pixel_values_b = K.gather(flat_image, indices_b)
    pixel_values_c = K.gather(flat_image, indices_c)
    pixel_values_d = K.gather(flat_image, indices_d)

    x0 = K.cast(x0, 'float32')
    x1 = K.cast(x1, 'float32')
    y0 = K.cast(y0, 'float32')
    y1 = K.cast(y1, 'float32')

    area_a = K.expand_dims(((x1 - x) * (y1 - y)), 1)
    area_b = K.expand_dims(((x1 - x) * (y - y0)), 1)
    area_c = K.expand_dims(((x - x0) * (y1 - y)), 1)
    area_d = K.expand_dims(((x - x0) * (y - y0)), 1)

    values_a = area_a * pixel_values_a
    values_b = area_b * pixel_values_b
    values_c = area_c * pixel_values_c
    values_d = area_d * pixel_values_d
    return values_a + values_b + values_c + values_d
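# Bilinear sampling, as above, blends the four neighboring pixels with
# weights equal to the opposite partial areas, so the weights always sum
# to one. A scalar sketch of the area computation (illustrative only):
x, y = 1.25, 2.5
x0, y0 = int(x), int(y)
x1, y1 = x0 + 1, y0 + 1
area_a = (x1 - x) * (y1 - y)  # weight of pixel (y0, x0)
area_b = (x1 - x) * (y - y0)  # weight of pixel (y1, x0)
area_c = (x - x0) * (y1 - y)  # weight of pixel (y0, x1)
area_d = (x - x0) * (y - y0)  # weight of pixel (y1, x1)
assert abs(area_a + area_b + area_c + area_d - 1.0) < 1e-12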
def stack_and_transpose(x):
    # x is a list of length T, each element is a batch_size x output_vocab_size tensor
    x = K.stack(x)  # is now a T x batch_size x output_vocab_size tensor
    x = K.permute_dimensions(x, pattern=(1, 0, 2))  # is now batch_size x T x output_vocab_size
    return x
def NN_model(args, training=True):
    global N_COL
    global N_ROW
    if args.model == 'densenet121':
        from keras.applications.densenet import DenseNet121
        input_tensor = Input(shape=(N_COL, N_ROW, 3))
        base_model = DenseNet121(input_shape=(N_COL, N_ROW, 3),
                                 include_top=False,
                                 weights='imagenet',
                                 input_tensor=input_tensor,
                                 pooling=None)
    elif args.model == 'resnet18':
        import resnet
        NOT_CARE = 1
        base_model = resnet.ResnetBuilder.build_resnet_18(
            input_shape=(N_COL, N_ROW, 3), num_outputs=NOT_CARE, include_top=False)
    elif args.model == 'resnet18_2222':
        import resnet
        NOT_CARE = 1
        base_model = resnet.ResnetBuilder.build_resnet_18_2222(
            input_shape=(N_COL, N_ROW, 3), num_outputs=NOT_CARE, include_top=False)
    elif args.model == 'resnet34':
        import resnet
        NOT_CARE = 1
        base_model = resnet.ResnetBuilder.build_resnet_34(
            input_shape=(N_COL, N_ROW, 3), num_outputs=NOT_CARE, include_top=False)
    elif args.model == 'resnet50':
        import resnet
        NOT_CARE = 1
        base_model = resnet.ResnetBuilder.build_resnet_50(
            input_shape=(N_COL, N_ROW, 3), num_outputs=NOT_CARE, include_top=False)
    elif args.model == 'resnet101':
        import resnet
        NOT_CARE = 1
        base_model = resnet.ResnetBuilder.build_resnet_101(
            input_shape=(N_COL, N_ROW, 3), num_outputs=NOT_CARE, include_top=False)
    else:
        raise TypeError('model should be one of the supported models!')

    print('Input col: ', N_COL)
    print('Input row: ', N_ROW)
    x = base_model.output

    # CNN to RNN
    x = Lambda(lambda x: K.permute_dimensions(x, (0, 2, 1, 3)))(x)  # switch axes from [b,h,w,c] to [b,w,h,c]
    conv_shape = x.get_shape()  # b, h, w, c; resnet18 -> (?, 16, 32, 256)
    print('conv_shape', conv_shape)
    x = Reshape(target_shape=(int(conv_shape[1]), int(conv_shape[2] * conv_shape[3])),
                name='reshape')(x)
    x = Dense(para.dense_size, activation='relu', kernel_initializer='he_normal',
              name='dense1')(x)
    # x = BatchNormalization()(x)

    # GRU RNN (note: the original passed the Keras 1 `init=` argument;
    # `kernel_initializer=` is the Keras 2 equivalent used elsewhere here)
    gru_1 = GRU(para.rnn_size, return_sequences=True,
                kernel_initializer='he_normal', name='gru1')(x)
    gru_1b = GRU(para.rnn_size, return_sequences=True, go_backwards=True,
                 kernel_initializer='he_normal', name='gru1_b')(x)
    gru1_merged = add([gru_1, gru_1b])
    gru1_merged = BatchNormalization()(gru1_merged)
    gru_2 = GRU(para.rnn_size, return_sequences=True,
                kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(para.rnn_size, return_sequences=True, go_backwards=True,
                 kernel_initializer='he_normal', name='gru2_b')(gru1_merged)
    gru2_merged = concatenate([gru_2, gru_2b])
    gru2_merged = BatchNormalization()(gru2_merged)

    inner = Dense(para.num_classes, kernel_initializer='he_normal',
                  name='dense2')(gru2_merged)
    y_pred = Activation('softmax', name='softmax')(inner)

    labels = Input(name='the_labels', shape=[para.max_text_len], dtype='float32')  # (None, 7)
    input_length = Input(name='input_length', shape=[1], dtype='int64')  # (None, 1)
    label_length = Input(name='label_length', shape=[1], dtype='int64')  # (None, 1)

    # Keras doesn't currently support loss funcs with extra parameters,
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,),
                      name='ctc')([y_pred, labels, input_length, label_length])  # (None, 1)

    if training:
        return Model(inputs=[base_model.input, labels, input_length, label_length],
                     outputs=loss_out), conv_shape[1]
    else:
        return Model(inputs=[base_model.input], outputs=y_pred)
def merge_heads(x):
    new_x = K.permute_dimensions(x, [0, 2, 1, 3])
    x_shape = shape_list(new_x)
    new_x_shape = x_shape[:-2] + [np.prod(x_shape[-2:])]
    return K.reshape(new_x, new_x_shape)
def split_heads(x, n: int, k: bool = False):
    # B, L, C
    x_shape = shape_list(x)
    m = x_shape[-1]
    new_x_shape = x_shape[:-1] + [n, m // n]
    new_x = K.reshape(x, new_x_shape)
    return K.permute_dimensions(new_x, [0, 2, 3, 1] if k else [0, 2, 1, 3])
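# split_heads and merge_heads above are inverses (for k=False). A numpy
# sketch of the round trip under the same shape conventions, with n = 4
# heads (illustrative only):
import numpy as np

x = np.random.rand(2, 10, 64)  # (B, L, C)
n = 4
split = x.reshape(2, 10, n, 64 // n).transpose(0, 2, 1, 3)  # (B, n, L, C//n)
merged = split.transpose(0, 2, 1, 3).reshape(2, 10, 64)     # back to (B, L, C)
assert np.array_equal(x, merged)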
def correlation_layer(x):
    lbranch, rbranch = squeeze(x[0], 1), squeeze(x[1], 1)
    rbranch = permute_dimensions(rbranch, (0, 2, 1))
    out_tensor = squeeze(batch_dot(lbranch, rbranch), 1)
    return out_tensor
def _call(self, features, edges):
    return K.batch_dot(K.permute_dimensions(edges, (0, 2, 1)), features) \
        / (K.sum(edges, axis=2, keepdims=True) + K.epsilon())
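# The layer above averages incoming neighbor features, i.e. E^T F
# normalized by degree. A numpy sketch, assuming `edges` is a
# (batch, N, N) adjacency matrix and 1e-7 plays the role of K.epsilon():
import numpy as np

features = np.random.rand(1, 4, 3)  # (batch, N, F)
edges = np.random.randint(0, 2, (1, 4, 4)).astype(float)  # (batch, N, N)
agg = np.einsum('bki,bkf->bif', edges, features)  # E^T @ F
agg /= edges.sum(axis=2, keepdims=True) + 1e-7    # degree normalization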
def gram_matrix(x):
    features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
    gram = K.dot(features, K.transpose(features))
    return gram
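# The Gram matrix G = F F^T over flattened channels captures channel
# co-activation statistics, the style representation used in neural
# style transfer. A numpy sketch of the channels_last case (illustrative):
import numpy as np

x = np.random.rand(4, 4, 3)  # (rows, cols, channels)
features = x.transpose(2, 0, 1).reshape(3, -1)  # (channels, rows * cols)
gram = features @ features.T  # (channels, channels)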
def attention(self, pre_q, pre_v, pre_k, out_seq_len: int, d_model: int,
              training=None):
    """
    Calculates the output of the attention once the affine transformations
    of the inputs are done. Here are the shapes of the arguments:

    :param pre_q: (batch_size, q_seq_len, num_heads, d_model // num_heads)
    :param pre_v: (batch_size, v_seq_len, num_heads, d_model // num_heads)
    :param pre_k: (batch_size, k_seq_len, num_heads, d_model // num_heads)
    :param out_seq_len: the length of the output sequence
    :param d_model: dimensionality of the model (by the paper)
    :param training: Passed by Keras. Should not be defined manually.
        Optional scalar tensor indicating if we're in training
        or inference phase.
    """
    # shaping Q and V into (batch_size, num_heads, seq_len, d_model//heads)
    q = K.permute_dimensions(pre_q, [0, 2, 1, 3])
    v = K.permute_dimensions(pre_v, [0, 2, 1, 3])

    if self.compression_window_size is None:
        k_transposed = K.permute_dimensions(pre_k, [0, 2, 3, 1])
    else:
        # Memory-compressed attention described in paper
        # "Generating Wikipedia by Summarizing Long Sequences"
        # (https://arxiv.org/pdf/1801.10198.pdf)
        # It compresses keys and values using 1D-convolution which reduces
        # the size of Q * K_transposed from roughly seq_len^2
        # to convoluted_seq_len^2. If we use strided convolution with
        # window size = 3 and stride = 3, memory requirements of such
        # memory-compressed attention will be 9 times smaller than
        # that of the original version.
        if self.use_masking:
            raise NotImplementedError(
                "Masked memory-compressed attention has not "
                "been implemented yet")
        k = K.permute_dimensions(pre_k, [0, 2, 1, 3])
        k, v = [
            K.reshape(
                # Step 3: Return the result to its original dimensions
                # (batch_size, num_heads, seq_len, d_model//heads)
                K.bias_add(
                    # Step 3: ... and add bias
                    K.conv1d(
                        # Step 2: we "compress" K and V using strided conv
                        K.reshape(
                            # Step 1: we reshape K and V to
                            # (batch + num_heads, seq_len, d_model//heads)
                            item,
                            (-1, K.int_shape(item)[-2], d_model // self.num_heads)),
                        kernel,
                        strides=self.compression_window_size,
                        padding='valid',
                        data_format='channels_last'),
                    bias,
                    data_format='channels_last'),
                # new shape
                K.concatenate([K.shape(item)[:2], [-1, d_model // self.num_heads]]))
            for item, kernel, bias in (
                (k, self.k_conv_kernel, self.k_conv_bias),
                (v, self.v_conv_kernel, self.v_conv_bias))
        ]
        k_transposed = K.permute_dimensions(k, [0, 1, 3, 2])

    # shaping K into (batch_size, num_heads, d_model//heads, seq_len)
    # for further matrix multiplication
    sqrt_d = K.constant(np.sqrt(d_model // self.num_heads), dtype=K.floatx())
    q_shape = K.int_shape(q)
    k_t_shape = K.int_shape(k_transposed)
    v_shape = K.int_shape(v)
    # before performing batch_dot all tensors are being converted to 3D
    # shape (batch_size * num_heads, rows, cols) to make sure batch_dot
    # performs identically on all backends
    attention_heads = K.reshape(
        K.batch_dot(
            self.apply_dropout_if_needed(
                K.softmax(
                    self.mask_attention_if_needed(
                        K.batch_dot(
                            K.reshape(q, (-1,) + q_shape[-2:]),
                            K.reshape(k_transposed, (-1,) + k_t_shape[-2:]))
                        / sqrt_d)),
                training=training),
            K.reshape(v, (-1,) + v_shape[-2:])),
        (-1, self.num_heads, q_shape[-2], v_shape[-1]))
    attention_heads_merged = K.reshape(
        K.permute_dimensions(attention_heads, [0, 2, 1, 3]),
        (-1, d_model))
    attention_out = K.reshape(
        K.dot(attention_heads_merged, self.output_weights),
        (-1, out_seq_len, d_model))
    return attention_out
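# The core of the method above is scaled dot-product attention,
# softmax(Q K^T / sqrt(d)) V, applied per head. A self-contained numpy
# sketch (illustrative only; no masking, dropout, or compression):
import numpy as np

def scaled_dot_product_attention(q, k, v):
    d = q.shape[-1]
    scores = q @ k.transpose(0, 1, 3, 2) / np.sqrt(d)  # (B, h, Lq, Lk)
    w = np.exp(scores - scores.max(axis=-1, keepdims=True))
    w /= w.sum(axis=-1, keepdims=True)  # softmax over keys
    return w @ v  # (B, h, Lq, dv)

q = np.random.rand(2, 4, 10, 16)  # (batch, heads, seq_len, head_dim)
out = scaled_dot_product_attention(q, q, q)  # self-attention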
def get_gru_baseline(self):
    lstm_qo = GRU(100, return_sequences=False)
    get_diag = Lambda(
        lambda xin: K.sum(xin * T.eye(self.max_opt_count), axis=2),
        output_shape=(self.max_opt_count,))
    transp_out = Lambda(lambda xin: K.permute_dimensions(xin, (0, 2, 1)),
                        output_shape=(self.max_opt_count, 100))
    apply_weights = Lambda(
        lambda xin: (K.expand_dims(xin[0], axis=-1)
                     * K.expand_dims(xin[1], axis=2)).sum(axis=1),
        output_shape=(100, self.max_opt_count))
    tile_q = Lambda(lambda xin: K.tile(xin, (1, self.max_opt_count, 1, 1)),
                    output_shape=(self.max_opt_count, self.max_q_length, self.word_vec_size))
    exp_dims = Lambda(lambda xin: K.expand_dims(xin, 1),
                      output_shape=(1, self.max_q_length, self.word_vec_size))
    exp_layer = Lambda(lambda xin: K.exp(xin),
                       output_shape=(self.max_sent_para, self.max_opt_count))
    mask_weights = Lambda(lambda xin: T.switch(T.eq(xin, 0), np.NINF, xin),
                          output_shape=(self.max_sent_para, self.max_opt_count))
    final_weights = Lambda(
        lambda xin: xin / K.cast(K.sum(xin, axis=1, keepdims=True), K.floatx()),
        output_shape=(self.max_sent_para, self.max_opt_count))

    q_input = Input(shape=(self.max_q_length, self.word_vec_size), name='question_input')
    q_exp = exp_dims(q_input)
    q_rep = tile_q(q_exp)
    option_input = Input(shape=(self.max_opt_count, self.max_option_length, self.word_vec_size),
                         name='option_input')
    opt_q = Concatenate(axis=2)([q_rep, option_input])

    lstm_input = Input(shape=(None, self.word_vec_size), name='lstm_input')
    lstm_mask = Masking(mask_value=0.)(lstm_input)
    lstm_out = lstm_qo(lstm_mask)
    lstm_model = Model(inputs=lstm_input, outputs=lstm_out)
    lstm_td_opt = TimeDistributed(lstm_model)(opt_q)

    doc_input = Input(shape=(self.max_sent_para, self.max_words_sent, self.word_vec_size),
                      name='doc_input')
    lstm_doc = TimeDistributed(lstm_model)(doc_input)

    att_wts = Dot(axes=2, normalize=True)([lstm_doc, lstm_td_opt])
    att_wts = mask_weights(att_wts)
    att_wts = exp_layer(att_wts)
    att_wts = final_weights(att_wts)
    out = apply_weights([lstm_doc, att_wts])
    out = transp_out(out)

    dp = Dot(axes=2, normalize=True)([out, lstm_td_opt])
    out = get_diag(dp)
    probs = MaskedSoftmax()([out, option_input])

    main_model = Model(inputs=[q_input, doc_input, option_input], outputs=probs)
    sgd = SGD(lr=0.1, decay=0., momentum=0., nesterov=False)
    main_model.compile(loss='categorical_crossentropy', optimizer=sgd,
                       metrics=['accuracy'])
    main_model.summary()
    return main_model
conv_4 = Conv1D(300, 4, padding='same', activation='relu', strides=1)(input)
shared = Model(input, conv_4)

input_1 = Input(shape=(None, 300), dtype='float32')
input_2 = Input(shape=(None, 300), dtype='float32')
out_1 = shared(input_1)
out_2 = shared(input_2)

attention = AttentionLayer()([out_1, out_2])

# out_1 column wise
att_1 = GlobalMaxPooling1D()(attention)
att_1 = Activation('softmax')(att_1)
out_1 = dot([att_1, out_1], axes=1)

# out_2 row wise
attention_transposed = Lambda(lambda x: K.permute_dimensions(x, (0, 2, 1)))(attention)
att_2 = GlobalMaxPooling1D()(attention_transposed)
att_2 = Activation('softmax')(att_2)
out_2 = dot([att_2, out_2], axes=1)

distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)([out_1, out_2])
model = Model(input=[input_1, input_2], output=distance)
def call(self, inputs):
    num_axis = K.ndim(inputs)
    inputs = K.permute_dimensions(inputs, range(num_axis)[::-1])
    x_outs = K.gather(inputs, self.idxs)
    x_outs = K.permute_dimensions(x_outs, range(num_axis)[::-1])
    return x_outs
def _call(self, inputs, **kwargs):
    if self.proto_number == self.capsule_number:
        return inputs
    else:
        signals = inputs[0]
        diss = inputs[1]
        signal_shape = mixed_shape(signals)

        if self.use_for_loop:
            diss_stack = []
            signals_stack = []
            sub_idx = None
            with K.name_scope('for_loop'):
                for p in self._proto_distrib:
                    with K.name_scope('compute_slices'):
                        diss_ = diss[:, p[0]:(p[-1] + 1)]
                        signals_ = K.reshape(signals[:, p[0]:(p[-1] + 1), :],
                                             [signal_shape[0] * len(p)] + list(signal_shape[2:]))
                    with K.name_scope('competition'):
                        if len(p) > 1:
                            with K.name_scope('competition_indices'):
                                argmin_idx = K.argmin(diss_, axis=-1)
                                if sub_idx is None:
                                    sub_idx = K.arange(0, signal_shape[0], dtype=argmin_idx.dtype)
                                argmin_idx = argmin_idx + len(p) * sub_idx
                            with K.name_scope('dissimilarity_competition'):
                                diss_stack.append(K.expand_dims(
                                    K.gather(K.flatten(diss_), argmin_idx), -1))
                            with K.name_scope('signal_competition'):
                                signals_stack.append(K.gather(signals_, argmin_idx))
                        else:
                            diss_stack.append(diss_)
                            signals_stack.append(signals_)
            diss = K.concatenate(diss_stack, 1)
            with K.name_scope('signal_concatenation'):
                signals = K.concatenate(signals_stack, 1)
                signals = K.reshape(signals,
                                    [signal_shape[0], self.capsule_number] + list(signal_shape[2:]))
        else:
            with K.name_scope('dissimilarity_preprocessing'):
                # extend if it is not equally distributed
                if not self._equally_distributed:
                    # permute to first dimension is prototype (protos x batch)
                    diss = K.permute_dimensions(diss, [1, 0])
                    # gather regarding extension (preparing for reshape to block)
                    diss = K.gather(diss, self._proto_extension)
                    # permute back (max_proto_number x (max_proto_number * batch))
                    diss = K.permute_dimensions(diss, [1, 0])
                # reshape to block form
                diss = K.reshape(diss, [signal_shape[0] * self.capsule_number,
                                        self._max_proto_number_in_capsule])
            with K.name_scope('competition_indices'):
                # get minimal idx in each class and batch for element selection in diss and signals
                argmin_idx = K.argmin(diss, axis=-1)
                argmin_idx = argmin_idx + self._max_proto_number_in_capsule * \
                    K.arange(0, signal_shape[0] * self.capsule_number, dtype=argmin_idx.dtype)
            with K.name_scope('dissimilarity_competition'):
                # get minimal values in the form (batch x capsule)
                diss = K.gather(K.flatten(diss), argmin_idx)
                diss = K.reshape(diss, [signal_shape[0], self.capsule_number])
            with K.name_scope('signal_preprocessing'):
                # apply the same steps as above for signals
                # get signals in: (batch x protos x dim1 x ... x dimN)
                # --> out: (batch x capsule x dim1 x ... x dimN)
                # extend if it is not equally distributed
                if not self._equally_distributed:
                    signals = K.permute_dimensions(signals, [1, 0] + list(range(2, len(signal_shape))))
                    signals = K.gather(signals, self._proto_extension)
                    signals = K.permute_dimensions(signals, [1, 0] + list(range(2, len(signal_shape))))
                signals = K.reshape(signals,
                                    [signal_shape[0] * self.capsule_number
                                     * self._max_proto_number_in_capsule] + list(signal_shape[2:]))
            with K.name_scope('signal_competition'):
                signals = K.gather(signals, argmin_idx)
                signals = K.reshape(signals,
                                    [signal_shape[0], self.capsule_number] + list(signal_shape[2:]))
        return {0: signals, 1: diss}
def integrate_vec(vec, time_dep=False, method='ss', **kwargs):
    """
    Integrate (stationary or time-dependent) vector field (N-D Tensor) in tensorflow.

    Aside from directly using tensorflow's numerical integration odeint(), also
    implements "scaling and squaring", and quadrature. Note that the diff. equation
    given to odeint is the one used in quadrature.

    Parameters:
        vec: the Tensor field to integrate.
            If vol_size is the size of the intrinsic volume, and vol_ndim = len(vol_size),
            then vector shape (vec_shape) should be
            [vol_size, vol_ndim] (if stationary)
            [vol_size, vol_ndim, nb_time_steps] (if time dependent)
        time_dep: bool whether vector is time dependent
        method: 'scaling_and_squaring' or 'ss' or 'ode' or 'quadrature'

        if using 'scaling_and_squaring': currently only supports integrating to time point 1.
            nb_steps: int number of steps. Note that this means the vec field gets broken
            down to 2**nb_steps, so nb_steps of 0 means integral = vec.

        if using 'ode':
            out_time_pt (optional): a time point or list of time points at which to evaluate
                Default: 1
            init (optional): the initialization method.
                Currently only supporting 'zero'. Default: 'zero'
            ode_args (optional): dictionary of all other parameters for
                tf.contrib.integrate.odeint()

    Returns:
        int_vec: integral of vector field.
        Same shape as the input if method is 'scaling_and_squaring', 'ss', 'quadrature',
        or 'ode' with out_time_pt not a list. Will have shape [*vec_shape, len(out_time_pt)]
        if method is 'ode' with out_time_pt being a list.

    Todo:
        quadrature for more than just intrinsically out_time_pt = 1
    """
    if method not in ['ss', 'scaling_and_squaring', 'ode', 'quadrature']:
        raise ValueError("method has to be 'scaling_and_squaring' or 'ode'. found: %s" % method)

    if method in ['ss', 'scaling_and_squaring']:
        nb_steps = kwargs['nb_steps']
        assert nb_steps >= 0, 'nb_steps should be >= 0, found: %d' % nb_steps

        if time_dep:
            svec = K.permute_dimensions(vec, [-1, *range(0, vec.shape[-1] - 1)])
            assert 2**nb_steps == svec.shape[0], "2**nb_steps and vector shape don't match"

            svec = svec / (2**nb_steps)
            for _ in range(nb_steps):
                svec = svec[0::2] + tf.map_fn(transform, svec[1::2, :], svec[0::2, :])

            disp = svec[0, :]
        else:
            vec = vec / (2**nb_steps)
            for _ in range(nb_steps):
                vec += transform(vec, vec)
            disp = vec

    elif method == 'quadrature':
        # TODO: could output more than a single timepoint!
        nb_steps = kwargs['nb_steps']
        assert nb_steps >= 1, 'nb_steps should be >= 1, found: %d' % nb_steps

        vec = vec / nb_steps

        if time_dep:
            disp = vec[..., 0]
            for si in range(nb_steps - 1):
                disp += transform(vec[..., si + 1], disp)
        else:
            disp = vec
            for _ in range(nb_steps - 1):
                disp += transform(vec, disp)

    else:
        assert not time_dep, "odeint not implemented with time-dependent vector field"
        fn = lambda disp, _: transform(vec, disp)

        # process time point
        out_time_pt = kwargs['out_time_pt'] if 'out_time_pt' in kwargs.keys() else 1
        single_out_time_pt = not isinstance(out_time_pt, (list, tuple))
        if single_out_time_pt:
            out_time_pt = [out_time_pt]
        K_out_time_pt = K.variable([0, *out_time_pt])

        # process initialization
        if 'init' not in kwargs.keys() or kwargs['init'] == 'zero':
            disp0 = vec * 0
        else:
            raise ValueError('non-zero init for ode method not implemented')

        # compute integration with tf.contrib.integrate.odeint
        if 'ode_args' not in kwargs.keys():
            kwargs['ode_args'] = {}
        disp = tf.contrib.integrate.odeint(fn, disp0, K_out_time_pt, **kwargs['ode_args'])
        disp = K.permute_dimensions(disp[1:len(out_time_pt) + 1, :],
                                    [*range(1, len(disp.shape)), 0])

        # return
        if single_out_time_pt:
            disp = disp[..., 0]

    return disp
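# The scaling-and-squaring trick above has a linear analogue in the
# matrix exponential, where composing the flow with itself reduces to
# squaring a matrix. An illustrative numpy sketch (not part of the
# function above):
import numpy as np

A = np.random.rand(3, 3) * 0.1
nb_steps = 6
M = np.eye(3) + A / (2 ** nb_steps)  # one small step of the flow
for _ in range(nb_steps):
    M = M @ M  # square: compose the flow with itself
# M now approximates the matrix exponential expm(A)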
def get_cnn_model2(self):
    get_diag = Lambda(
        lambda xin: K.sum(xin * T.eye(self.max_opt_count), axis=2),
        output_shape=(self.max_opt_count,))
    transp_out = Lambda(lambda xin: K.permute_dimensions(xin, (0, 2, 1)),
                        output_shape=(self.max_opt_count, self.word_vec_size))
    apply_weights = Lambda(
        lambda xin: (K.expand_dims(xin[0], axis=-1)
                     * K.expand_dims(xin[1], axis=2)).sum(axis=1),
        output_shape=(self.word_vec_size, self.max_opt_count))
    tile_q = Lambda(lambda xin: K.tile(xin, (1, self.max_opt_count, 1, 1)),
                    output_shape=(self.max_opt_count, self.max_q_length, self.word_vec_size))
    exp_dims = Lambda(lambda xin: K.expand_dims(xin, 1),
                      output_shape=(1, self.max_q_length, self.word_vec_size))
    exp_dims2 = Lambda(lambda xin: K.expand_dims(xin, 3),
                       output_shape=(None, self.word_vec_size, 1))
    exp_layer = Lambda(lambda xin: K.exp(xin),
                       output_shape=(self.max_sent_para, self.max_opt_count))
    final_weights = Lambda(
        lambda xin: xin / K.cast(K.sum(xin, axis=1, keepdims=True), K.floatx()),
        output_shape=(self.max_sent_para, self.max_opt_count))
    mask_weights = Lambda(lambda xin: T.switch(T.eq(xin, 0), np.NINF, xin),
                          output_shape=(self.max_sent_para, self.max_opt_count))
    glob_pool = Lambda(lambda xin: K.mean(xin, axis=[1, 2]),
                       output_shape=(100,))

    filter_sizes = [2, 3, 4]
    num_filters = 100

    q_input = Input(shape=(self.max_q_length, self.word_vec_size), name='question_input')
    q_exp = exp_dims(q_input)
    q_rep = tile_q(q_exp)
    option_input = Input(shape=(self.max_opt_count, self.max_option_length, self.word_vec_size),
                         name='option_input')
    opt_q = Concatenate(axis=2)([q_rep, option_input])

    cnn_input = Input(shape=(None, self.word_vec_size), name='cnn_input')
    cnn_reshape = exp_dims2(cnn_input)

    conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], self.word_vec_size),
                    padding='valid', kernel_initializer='normal',
                    activation='linear')(cnn_reshape)
    conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], self.word_vec_size),
                    padding='valid', kernel_initializer='normal',
                    activation='linear')(cnn_reshape)
    conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], self.word_vec_size),
                    padding='valid', kernel_initializer='normal',
                    activation='linear')(cnn_reshape)

    meanpool_0 = glob_pool(conv_0)
    meanpool_1 = glob_pool(conv_1)
    meanpool_2 = glob_pool(conv_2)
    concatenated_tensor = Concatenate(axis=1)([meanpool_0, meanpool_1, meanpool_2])
    cnn_model = Model(inputs=cnn_input, outputs=concatenated_tensor)
    cnn_td_opt = TimeDistributed(cnn_model)(opt_q)

    doc_input = Input(shape=(self.max_sent_para, self.max_words_sent, self.word_vec_size),
                      name='doc_input')
    cnn_doc = TimeDistributed(cnn_model)(doc_input)

    att_wts = Dot(axes=2, normalize=True)([cnn_doc, cnn_td_opt])
    att_wts = mask_weights(att_wts)
    att_wts = exp_layer(att_wts)
    att_wts = final_weights(att_wts)
    out = apply_weights([cnn_doc, att_wts])
    out = transp_out(out)

    dp = Dot(axes=2, normalize=True)([out, cnn_td_opt])
    out = get_diag(dp)
    probs = MaskedSoftmax()([out, option_input])

    main_model = Model(inputs=[q_input, doc_input, option_input], outputs=probs)
    sgd = SGD(lr=0.1, decay=0., momentum=0., nesterov=False)
    main_model.compile(loss='categorical_crossentropy', optimizer=sgd,
                       metrics=['accuracy'])
    main_model.summary()
    return main_model
def ext_start(inputs):
    m = inputs[0]
    s = inputs[1]
    w = K.one_hot(s[:, 0] + l_s * s[:, 1], l_s * l_s)  # (None, l_s * l_s)
    return K.transpose(K.sum(w * K.permute_dimensions(m, (1, 0, 2)), axis=2))
def T(x):
    return K.permute_dimensions(x, [0, 2, 1])
def call(self, x, mask=None):
    mask = K.cast(mask, 'float32')
    mask = K.repeat(mask, self.repeat_dim)
    mask = K.permute_dimensions(mask, (0, 2, 1))
    return x * mask
def _call(self, inputs, **kwargs):
    if self.proto_number == self.capsule_number:
        return inputs
    else:
        signals = inputs[0]
        diss = inputs[1]
        signal_shape = None
        # signal.shape: (batch, proto_num, caps_dim1, ..., caps_dimN)
        if self.input_spec[0].ndim > 3:
            signal_shape = mixed_shape(signals)
            signals = K.reshape(signals, signal_shape[0:2] + (-1,))
        if not self._equally_distributed:
            if self.use_for_loop:
                signals_stack = []
                diss_stack = []
                with K.name_scope('for_loop'):
                    for i, p in enumerate(self._proto_distrib):
                        with K.name_scope('compute_slices'):
                            diss_ = diss[:, p[0]:(p[-1] + 1)]
                            signals_ = signals[:, p[0]:(p[-1] + 1), :]
                        if len(p) > 1:
                            with K.name_scope('competition_probabilities'):
                                coefficients = prob_trans.neg_softmax(
                                    diss_ * self.beta[i], axis=-1,
                                    max_stabilization=True)
                            with K.name_scope('signal_competition'):
                                signals_stack.append(K.expand_dims(
                                    K.batch_dot(coefficients, signals_, [1, 1]), 1))
                            with K.name_scope('dissimilarity_competition'):
                                diss_stack.append(
                                    K.batch_dot(coefficients, diss_, [1, 1]))
                        else:
                            signals_stack.append(signals_)
                            diss_stack.append(diss_)
                signals = K.concatenate(signals_stack, axis=1)
                diss = K.concatenate(diss_stack, axis=-1)
            else:
                # Pad the prototype indices so that every capsule owns the same
                # number of slots and the competition can run as one block.
                extension_idx = []
                for i in self._proto_extension:
                    if i not in extension_idx:
                        extension_idx.append(i)
                    else:
                        extension_idx.append(max(self._proto_extension) + 1)
                batch_size = K.shape(signals)[0] if signal_shape is None \
                    else signal_shape[0]
                # reshape to block
                with K.name_scope('competition_probabilities'):
                    with K.name_scope('neg_softmax'):
                        with K.name_scope('coefficients'):
                            beta = K.gather(self.beta, self._capsule_extension)
                            coefficients = -diss * beta
                        # max stabilization
                        coefficients = coefficients - K.max(
                            coefficients, axis=-1, keepdims=True)
                        coefficients = K.exp(coefficients)
                        coefficients = K.concatenate(
                            [coefficients, K.zeros_like(coefficients[:, 0:1])],
                            axis=-1)
                        coefficients = K.transpose(coefficients)
                        coefficients = K.gather(coefficients, extension_idx)
                        coefficients = K.transpose(coefficients)
                        coefficients = K.reshape(
                            coefficients,
                            [batch_size, self.capsule_number,
                             self._max_proto_number_in_capsule])
                    # division by zero cannot occur here
                    with K.name_scope('normalization_constant'):
                        constant = K.sum(coefficients, axis=-1, keepdims=True)
                        probs = coefficients / constant
                with K.name_scope('dissimilarity_preprocessing'):
                    diss = K.transpose(diss)
                    diss = K.gather(diss, self._proto_extension)
                    diss = K.transpose(diss)
                    diss = K.reshape(
                        diss,
                        [batch_size, self.capsule_number,
                         self._max_proto_number_in_capsule])
                with K.name_scope('dissimilarity_competition'):
                    diss = K.squeeze(
                        K.batch_dot(probs, K.expand_dims(diss), [2, 2]), -1)
                with K.name_scope('signal_preprocessing'):
                    signals = K.permute_dimensions(signals, [1, 0, 2])
                    signals = K.gather(signals, self._proto_extension)
                    signals = K.permute_dimensions(signals, [1, 0, 2])
                    signals = K.reshape(
                        signals,
                        [batch_size, self.capsule_number,
                         self._max_proto_number_in_capsule, -1])
                with K.name_scope('signal_competition'):
                    signals = K.batch_dot(probs, signals, [2, 2])
        else:
            batch_size = K.shape(signals)[0] if signal_shape is None \
                else signal_shape[0]
            diss = K.reshape(
                diss,
                [batch_size, self.capsule_number,
                 self._max_proto_number_in_capsule])
            with K.name_scope('competition_probabilities'):
                coefficients = prob_trans.neg_softmax(
                    diss * K.expand_dims(self.beta, -1), axis=-1,
                    max_stabilization=True)
            with K.name_scope('signal_competition'):
                signals = K.reshape(
                    signals,
                    [batch_size, self.capsule_number,
                     self._max_proto_number_in_capsule, -1])
                signals = K.batch_dot(coefficients, signals, [2, 2])
            with K.name_scope('dissimilarity_competition'):
                diss = K.squeeze(
                    K.batch_dot(coefficients, K.expand_dims(diss), [2, 2]), -1)
        if self.input_spec[0].ndim > 3:
            signals = K.reshape(
                signals,
                [signal_shape[0], self.capsule_number] + list(signal_shape[2:]))
        return {0: signals, 1: diss}
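The `prob_trans.neg_softmax` helper is not defined here, but the inlined branch above spells out its semantics; a sketch under that assumption (not the library's actual implementation) would be:

from keras import backend as K

def neg_softmax(x, axis=-1, max_stabilization=True):
    # Softmax over the negated input, so small dissimilarities
    # receive large competition weights.
    x = -x
    if max_stabilization:
        x = x - K.max(x, axis=axis, keepdims=True)
    e = K.exp(x)
    return e / K.sum(e, axis=axis, keepdims=True)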
def call(self, inputs, mask=None, training=None):
    (inputs, content, memories, segment_mat, segment_embed, relatives,
     bias_context, bias_relative, bias_segment, permutation) = inputs
    full = K.concatenate([memories, content], axis=1)  # (batch, prev_len + seq_len, units)

    kernel_q = self.kernel[:, :self.units]
    kernel_kv = self.kernel[:, self.units:self.units * 3]
    kernel_r = self.kernel[:, self.units * 3:self.units * 4]
    kernel_o = self.kernel[:, self.units * 4:self.units * 5]

    bias_q, bias_kv, bias_r, bias_o = (None,) * 4
    if self.use_bias:
        bias_q = self.bias[:self.units]
        bias_kv = self.bias[self.units:self.units * 3]
        bias_r = self.bias[self.units * 3:self.units * 4]
        bias_o = self.bias[self.units * 4:self.units * 5]

    w_q = K.dot(inputs, kernel_q)     # (batch, seq_len, units)
    w_kv = K.dot(full, kernel_kv)     # (batch, prev_len + seq_len, units * 2)
    w_r = K.dot(relatives, kernel_r)  # (batch, prev_len + seq_len, units)
    if self.use_bias:
        w_q = K.bias_add(w_q, bias_q)
        w_kv = K.bias_add(w_kv, bias_kv)
        w_r = K.bias_add(w_r, bias_r)
    if self.activation is not None:
        w_q = self.activation(w_q)
        w_kv = self.activation(w_kv)
        w_r = self.activation(w_r)

    w_k = w_kv[:, :, :self.units]  # (batch, prev_len + seq_len, units)
    w_v = w_kv[:, :, self.units:]  # (batch, prev_len + seq_len, units)
    batch_size, q_len, k_len = (K.shape(inputs)[0], K.shape(w_q)[1],
                                K.shape(w_k)[1])

    w_qc = K.bias_add(w_q, bias_context)
    w_qc = self._reshape_to_batches(w_qc)  # (batch * n_head, seq_len, units_head)
    w_k = self._reshape_to_batches(w_k)    # (batch * n_head, prev_len + seq_len, units_head)
    a_context = K.batch_dot(w_qc, w_k, axes=2)  # (batch * n_head, seq_len, prev_len + seq_len)

    w_qr = K.bias_add(w_q, bias_relative)
    w_qr = self._reshape_to_batches(w_qr)  # (batch * n_head, seq_len, units_head)
    w_r = self._reshape_to_batches(w_r)    # (batch * n_head, prev_len + seq_len, units_head)
    a_relative = K.batch_dot(w_qr, w_r, axes=2)  # (batch * n_head, seq_len, prev_len + seq_len)
    a_relative = self._relative_shift(           # (batch * n_head, seq_len, prev_len + seq_len)
        a_relative,
        key_len_expected=K.shape(a_context)[-1],
    )

    w_qs = K.bias_add(w_q, bias_segment)
    w_qs = K.reshape(w_qs, (-1, q_len, self.num_head, self.units_head))
    w_qs = K.permute_dimensions(w_qs, (2, 0, 1, 3))  # (n_head, batch, seq_len, units_head)
    segment_embed = K.reshape(K.transpose(segment_embed),
                              (self.num_head, 1, self.units_head, 2))
    segment_embed = K.tile(segment_embed, (1, batch_size, 1, 1))
    w_qs = K.reshape(w_qs, (-1, q_len, self.units_head))
    segment_embed = K.reshape(segment_embed, (-1, self.units_head, 2))
    a_segment = K.batch_dot(w_qs, segment_embed, axes=(2, 1))  # (n_head * batch, seq_len, 2)
    a_segment = K.reshape(a_segment, (self.num_head, batch_size, q_len, 2))
    a_segment = K.permute_dimensions(a_segment, (1, 2, 3, 0))  # (batch, seq_len, 2, n_head)
    segment_mat = K.reshape(segment_mat, (-1, k_len, 2))       # (batch * seq_len, prev_len + seq_len, 2)
    a_segment = K.reshape(a_segment, (-1, 2, self.num_head))   # (batch * seq_len, 2, n_head)
    a_segment = K.batch_dot(segment_mat, a_segment, axes=(2, 1))  # (batch * seq_len, prev_len + seq_len, n_head)
    a_segment = K.reshape(a_segment, (-1, q_len, k_len, self.num_head))
    a_segment = K.reshape(K.permute_dimensions(a_segment, (0, 3, 1, 2)),
                          (-1, q_len, k_len))

    att = (a_context + a_relative + a_segment) / K.sqrt(
        K.constant(self.units_head, dtype=K.floatx()))
    exp = K.exp(att - K.max(att, axis=-1, keepdims=True))

    permutation = K.tile(K.expand_dims(permutation, axis=1),
                         [1, self.num_head, 1, 1])
    permutation = K.reshape(permutation, (-1, q_len, k_len))
    exp *= permutation
    if mask is not None and mask[0] is not None:
        mask = K.cast(mask[0], K.floatx())
        mask = K.concatenate([K.ones_like(memories[:, :, 0]), mask], axis=1)
        exp *= K.expand_dims(self._reshape_mask(mask), axis=1)

    att = exp / (K.sum(exp, axis=-1, keepdims=True) + K.epsilon())
    if self.att_drop_layer is not None:
        att = self.att_drop_layer(att, training=training)
    w_v = self._reshape_to_batches(w_v)    # (batch * n_head, prev_len + seq_len, units_head)
    w_o = K.batch_dot(att, w_v)            # (batch * n_head, seq_len, units_head)
    w_o = self._reshape_from_batches(w_o)  # (batch, seq_len, units)

    w_o = K.dot(w_o, kernel_o)  # (batch, seq_len, units)
    if self.use_bias:
        w_o = K.bias_add(w_o, bias_o)
    if self.activation is not None:
        w_o = self.activation(w_o)

    if TF_KERAS:
        # Add shape information to the tensor when using `tf.keras`
        input_shape = K.int_shape(inputs)
        if input_shape[1] is not None:
            w_o = K.reshape(w_o, (-1,) + input_shape[1:])
    return w_o
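`_relative_shift` is not shown; assuming it implements the standard Transformer-XL shift trick, a sketch would be (hypothetical, not necessarily the author's helper):

import tensorflow as tf
from keras import backend as K

def _relative_shift(x, key_len_expected):
    # Pad one column on the left, reshape so the padding rolls each row's
    # relative scores into alignment, then crop back to the key length.
    batch, q_len, k_len = K.shape(x)[0], K.shape(x)[1], K.shape(x)[2]
    x = tf.pad(x, [[0, 0], [0, 0], [1, 0]])  # (batch, q_len, k_len + 1)
    x = tf.reshape(x, (batch, k_len + 1, q_len))
    x = x[:, 1:, :]                          # drop the shifted-in row
    x = tf.reshape(x, (batch, q_len, k_len))
    return x[:, :, :key_len_expected]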
def transpose_3tensor(t):
    '''
    Lambda helper that swaps the last two dimensions of the input so the
    subsequent dot product is taken over the intended axes.
    '''
    return K.permute_dimensions(t, (0, 2, 1))
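A hypothetical usage of the helper inside a Lambda layer:

from keras.layers import Input, Lambda

inp = Input(shape=(10, 32))               # (batch, steps, features)
swapped = Lambda(transpose_3tensor)(inp)  # (batch, 32, 10)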
from keras.datasets import mnist
from keras.layers import (Input, Bidirectional, LSTM, GlobalMaxPooling1D,
                          Lambda, Concatenate, Dense)
from keras.models import Model
from keras import backend as K

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255., x_test / 255.
print('X_train shape: {}'.format(x_train.shape))
print('X_test shape: {}'.format(x_test.shape))
print('y_train shape: {}'.format(y_train.shape))
print('y_test shape: {}'.format(y_test.shape))

D = 28  # each 28x28 image is treated as a sequence of 28 rows
M = 15  # LSTM units per direction

inputs = Input(shape=(D, D))

# Read the image row by row.
x1 = Bidirectional(LSTM(M, return_sequences=True))(inputs)
x1 = GlobalMaxPooling1D()(x1)

# Swap the axes and read the same image column by column.
permutor = Lambda(lambda t: K.permute_dimensions(t, pattern=(0, 2, 1)))
x2 = permutor(inputs)
x2 = Bidirectional(LSTM(M, return_sequences=True))(x2)
x2 = GlobalMaxPooling1D()(x2)

x = Concatenate(axis=1)([x1, x2])
outputs = Dense(10, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='Adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(x_train, y_train, batch_size=256, epochs=5,
                    # validation split assumed; the original call was truncated here
                    validation_data=(x_test, y_test))