def call(self, inputs, **kwargs):
    """Blend a dense transform of ``inputs`` with ``inputs`` itself via a gate.

    Args:
        inputs: input tensor; it is multiplied by ``self.gate_kernel`` and
            ``self.dense_kernel`` and also added back, so the projections
            presumably keep the last dimension unchanged (TODO confirm).
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        ``gate * transform + (1 - gate) * inputs``.
    """
    def _affine(kernel, bias):
        # Shared "x . W + b" projection used by both the gate and the transform.
        return kb.bias_add(kb.dot(inputs, kernel), bias,
                           data_format="channels_last")

    gate = self.activation(_affine(self.gate_kernel, self.gate_bias))
    candidate = _affine(self.dense_kernel, self.dense_bias)
    return gate * candidate + (1.0 - gate) * inputs
def call(self, inputs, **kwargs):
    """Mix two score tensors with weights predicted from a feature tensor.

    Args:
        inputs: list of exactly three tensors ``[first, second, features]``.
            ``first`` and ``second`` are treated as probabilities unless
            ``self.from_logits`` is set, in which case they are used as is.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The softmax of the mixed scores, or ``[probs, scores]`` when
        ``self.return_logits`` is true.
    """
    assert isinstance(inputs, list) and len(inputs) == 3
    first, second, features = inputs[0], inputs[1], inputs[2]

    if self.from_logits:
        first_, second_ = first, second
    else:
        # Clip away zeros before taking logs.
        first = kb.clip(first, 1e-10, 1.0)
        second = kb.clip(second, 1e-10, 1.0)
        first_, second_ = kb.log(first), kb.log(second)

    # Optional hidden layer on the features.
    if self.use_intermediate_layer:
        hidden = kb.dot(features, self.first_kernel)
        hidden = kb.bias_add(hidden, self.first_bias,
                             data_format="channels_last")
        features = self.intermediate_activation(hidden)

    # embedded_features.shape = (M, T, 1)
    embedded_features = kb.bias_add(
        kb.dot(features, self.features_kernel),
        self.features_bias,
        data_format="channels_last")

    if self.use_dimension_bias:
        # Repeat along the last axis so each output dimension gets its own bias.
        repeats = [1] * (kb.ndim(first) - 1) + [kb.shape(first)[-1]]
        embedded_features = kb.bias_add(
            kb.tile(embedded_features, repeats),
            self.dimensions_bias,
            data_format="channels_last")

    sigma = kb.sigmoid(embedded_features)
    result = weighted_sum(first_, second_, sigma,
                          self.first_threshold, self.second_threshold)
    probs = kb.softmax(result)
    return [probs, result] if self.return_logits else probs
def _time_distributed_dense(w, x, b):
    """Return ``x . w + b`` on the TensorFlow backend.

    On any other backend a message is printed and ``x`` is returned
    unchanged.

    Args:
        w: weight tensor.
        x: input tensor.
        b: bias tensor.
    """
    if K.backend() != 'tensorflow':
        print("time_distributed_dense doesn't backend tensorflow")
        return x
    return K.bias_add(K.dot(x, w), b)
def call(self, inputs):
    """Run a 1D convolution using either a tied or a locally stored kernel.

    Args:
        inputs: input tensor passed to ``K.conv1d``.

    Returns:
        The convolution output, with bias and activation applied when
        configured.
    """
    if self.tied_to is not None:
        kernel = self.tied_to.kernel
    else:
        # Typically reached when a previously trained model is being
        # loaded again.
        kernel = self.learnedKernel

    outputs = K.conv1d(
        inputs,
        kernel,
        strides=self.strides[0],
        padding=self.padding,
        data_format=self.data_format,
        dilation_rate=self.dilation_rate[0])

    if self.use_bias:
        outputs = K.bias_add(outputs, self.bias,
                             data_format=self.data_format)

    if self.activation is None:
        return outputs
    return self.activation(outputs)
def call(self, inputs):
    """Convolve ``inputs`` with ``self.kernel`` using the op for ``self.rank``.

    Ranks 1, 2 and 3 map to ``K.conv1d``, ``K.conv2d`` and ``K.conv3d``.
    The rank-1 case uses only the first stride / dilation entry.

    Args:
        inputs: input tensor.

    Returns:
        The convolution output, with bias and activation applied when
        configured.
    """
    if self.rank == 1:
        outputs = K.conv1d(
            inputs, self.kernel,
            strides=self.strides[0],
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate[0])
    elif self.rank == 2:
        outputs = K.conv2d(
            inputs, self.kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    elif self.rank == 3:
        outputs = K.conv3d(
            inputs, self.kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)

    if self.use_bias:
        outputs = K.bias_add(outputs, self.bias,
                             data_format=self.data_format)

    if self.activation is None:
        return outputs
    return self.activation(outputs)
def call(self, inputs):
    """Compute ``activation(inputs . kernel + bias)``.

    The bias and the activation are each skipped when not configured.

    Args:
        inputs: input tensor.

    Returns:
        The projected tensor.
    """
    projected = K.dot(inputs, self.kernel)
    if self.use_bias:
        projected = K.bias_add(projected, self.bias)
    if self.activation is None:
        return projected
    return self.activation(projected)
def call(self, inputs):
    """Apply a grouped 2D convolution.

    The input channels and the kernel's output filters are each cut into
    ``self.num_group`` contiguous slices. Slice ``i`` of the input is
    convolved with slice ``i`` of the kernel, and the per-group results are
    concatenated along the channel axis.

    Args:
        inputs: 4D input tensor laid out according to ``self.data_format``.

    Returns:
        The concatenated per-group convolution outputs.

    Raises:
        ValueError: if ``self.data_format`` is neither ``'channels_first'``
            nor ``'channels_last'``.
    """
    if self.data_format == 'channels_first':
        channel_axis = 1
    elif self.data_format == 'channels_last':
        channel_axis = -1
    else:
        raise ValueError(
            'Unsupported data_format: ' + str(self.data_format))

    # Floor division: the sizes are used as slice bounds, which must be
    # integers (true division yields floats on Python 3).
    filter_in_group = self.filters // self.num_group
    input_in_group = self.channel_num // self.num_group

    outputs_list = []
    for i in range(self.num_group):
        in_lo, in_hi = i * input_in_group, (i + 1) * input_in_group
        f_lo, f_hi = i * filter_in_group, (i + 1) * filter_in_group

        if channel_axis == 1:
            group_inputs = inputs[:, in_lo:in_hi, :, :]
        else:
            group_inputs = inputs[:, :, :, in_lo:in_hi]

        outputs = K.conv2d(
            group_inputs,
            self.kernel[:, :, :, f_lo:f_hi],
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)

        if self.use_bias:
            outputs = K.bias_add(
                outputs,
                self.bias[f_lo:f_hi],
                data_format=self.data_format)

        outputs_list.append(outputs)

    return concatenate(outputs_list, axis=channel_axis)
def call(self, inputs):
    """Apply the locally connected 3D convolution, then bias and activation.

    Args:
        inputs: input tensor passed to ``self.local_conv3d``.

    Returns:
        ``self.activation`` applied to the (optionally biased) result.
    """
    output_shape = (self.output_row, self.output_col, self.output_z)
    result = self.local_conv3d(
        inputs,
        self.kernel,
        self.kernel_size,
        self.strides,
        output_shape,
        self.data_format)

    if self.use_bias:
        result = K.bias_add(result, self.bias,
                            data_format=self.data_format)

    # The activation is applied unconditionally here.
    return self.activation(result)
def call(self, inputs):
    """Apply the locally connected 2D convolution, then bias and activation.

    Args:
        inputs: input tensor passed to ``K.local_conv2d``.

    Returns:
        ``self.activation`` applied to the (optionally biased) result.
    """
    # Unpacked as in the original; the value itself is not used below.
    _, _, _num_filters = self.kernel_shape

    output_shape = (self.output_row, self.output_col)
    result = K.local_conv2d(
        inputs,
        self.kernel,
        self.kernel_size,
        self.strides,
        output_shape,
        self.data_format)

    known_formats = ('channels_first', 'channels_last')
    if self.use_bias and self.data_format in known_formats:
        result = K.bias_add(result, self.bias,
                            data_format=self.data_format)

    return self.activation(result)
def call(self, x):
    """Dense projection whose weights and bias are perturbed by noise.

    Noise is shaped with ``f(e) = sign(e) * sqrt(|e|)`` (the factorised
    Gaussian noise variant from Section 3 of Fortunato et al.). The
    weights used are ``mu_weight + sigma_weight * eW`` and the bias is
    ``mu_bias + sigma_bias * eB``.

    Args:
        x: input tensor.

    Returns:
        The noisy projection, passed through ``self.activation`` when one
        is configured.
    """
    def _shape_noise(noise):
        # K.sqrt instead of ``** (1 / 2)``: the latter evaluates to ``** 0``
        # under integer division, and this keeps eW and eB consistent.
        return K.sign(noise) * K.sqrt(K.abs(noise))

    # Sample from the noise distribution.
    e_i = K.random_normal((self.input_dim, self.units))
    e_j = K.random_normal((self.units,))

    eB = _shape_noise(e_j)
    eW = _shape_noise(e_i) * eB

    noise_injected_weights = K.dot(
        x, self.mu_weight + (self.sigma_weight * eW))
    noise_injected_bias = self.mu_bias + (self.sigma_bias * eB)
    output = K.bias_add(noise_injected_weights, noise_injected_bias)

    if self.activation is not None:
        output = self.activation(output)
    return output
def call(self, inputs, training=None):
    """Apply a depthwise 2D convolution, then optional bias and activation.

    Args:
        inputs: input tensor passed to ``K.depthwise_conv2d``.
        training: accepted for interface compatibility; not used here.

    Returns:
        The convolution output, with bias and activation applied when
        configured.
    """
    outputs = K.depthwise_conv2d(
        inputs,
        self.depthwise_kernel,
        strides=self.strides,
        padding=self.padding,
        dilation_rate=self.dilation_rate,
        data_format=self.data_format)

    # Test for presence, not truthiness: evaluating a tensor/variable as a
    # Python bool is ambiguous and can raise.
    if self.bias is not None:
        outputs = K.bias_add(
            outputs,
            self.bias,
            data_format=self.data_format)

    if self.activation is not None:
        return self.activation(outputs)
    return outputs
def call(self, x, mask=None):
    """Pool ``x`` over axis 1 using learned attention weights.

    Per the original notes, ``x`` is ``[batch_size, seq_len,
    attention_dim]``.

    Args:
        x: input tensor.
        mask: optional mask multiplied into the unnormalised weights.

    Returns:
        The attention-weighted sum of ``x`` over axis 1.
    """
    # uit = tanh(xW + b)
    hidden = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
    scores = K.squeeze(K.dot(hidden, self.u), -1)
    weights = K.exp(scores)

    if mask is not None:
        # Cast the mask to floatX to avoid float64 upcasting in theano.
        weights *= K.cast(mask, K.floatx())

    # Epsilon guards against division by zero when every weight is masked.
    normaliser = K.cast(
        K.sum(weights, axis=1, keepdims=True) + K.epsilon(), K.floatx())
    weights /= normaliser

    weights = K.expand_dims(weights)
    return K.sum(x * weights, axis=1)
def time_distributed_dense(x, w, b=None, dropout=None, input_dim=None,
                           output_dim=None, timesteps=None, training=None):
    """Apply `y . w + b` for every temporal slice y of x.

    # Arguments
        x: input tensor.
        w: weight matrix.
        b: optional bias vector.
        dropout: dropout rate; applied only when strictly between 0 and 1,
            using one mask shared by every temporal slice.
        input_dim: integer; optional dimensionality of the input.
        output_dim: integer; optional dimensionality of the output.
        timesteps: integer; optional number of timesteps.
        training: training phase tensor or boolean.

    # Returns
        Output tensor.
    """
    # Fall back to symbolic shapes for anything the caller did not supply.
    input_dim = input_dim or K.shape(x)[2]
    timesteps = timesteps or K.shape(x)[1]
    output_dim = output_dim or K.shape(w)[1]

    if dropout is not None and 0. < dropout < 1.:
        # Build one mask from the first timestep and repeat it over time.
        first_step = K.reshape(x[:, 0, :], (-1, input_dim))
        step_mask = K.dropout(K.ones_like(first_step), dropout)
        full_mask = K.repeat(step_mask, timesteps)
        x = K.in_train_phase(x * full_mask, x, training=training)

    # Merge batch and time axes, project, then add the bias.
    flat = K.dot(K.reshape(x, (-1, input_dim)), w)
    if b is not None:
        flat = K.bias_add(flat, b)

    # Restore the 3D layout.
    if K.backend() != 'tensorflow':
        return K.reshape(flat, (-1, timesteps, output_dim))

    restored = K.reshape(flat, K.stack([-1, timesteps, output_dim]))
    restored.set_shape([None, None, output_dim])
    return restored
def call(self, inputs):
    """Apply a 2D convolution and an optional bias; no activation is applied.

    Args:
        inputs: input tensor passed to ``K.conv2d``.

    Returns:
        The convolution output, plus bias when ``self.use_bias`` is set.
    """
    assert self.rank == 2, 'only conv2d supported for now...'

    if self.rank == 2:
        conv_options = dict(
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
        outputs = K.conv2d(inputs, self.kernel, **conv_options)

    if not self.use_bias:
        return outputs

    # Activation functions are deliberately not applied by this layer.
    return K.bias_add(outputs, self.bias, data_format=self.data_format)
def bias_add(self, b):
    """Add bias ``b`` to ``self.c`` in place and return ``self`` for chaining."""
    biased = K.bias_add(self.c, b)
    self.c = biased
    return self
def call(self, inputs, states, training=None):
    """Run one step of a GRU cell with soft attention over ``inputs``.

    The step first builds an attention context vector ``z_hat`` from
    ``inputs`` and the previous hidden state, then evaluates the GRU gates
    with an extra attention term added to each of them.

    Args:
        inputs: input tensor for this step.
        states: list whose first element is the previous hidden state.
        training: training-phase flag forwarded to the dropout mask
            generator; when ``None`` and any dropout is configured, the
            output is flagged as using the learning phase.

    Returns:
        Tuple ``(h, [h])`` with the new hidden state.
    """
    # previous memory state for gru
    h_tm1 = states[0]

    # generate our dropout and recurrent dropout masks (cached on the cell)
    if 0 < self.dropout < 1 and self._dropout_mask is None:
        self._dropout_mask = _generate_dropout_mask(
            K.ones_like(inputs),
            self.dropout,
            training=training,
            count=4)
    if (0 < self.recurrent_dropout < 1 and
            self._recurrent_dropout_mask is None):
        self._recurrent_dropout_mask = _generate_dropout_mask(
            K.ones_like(states[0]),
            self.recurrent_dropout,
            training=training,
            count=4)

    # dropout masks for input units and recurrent units
    dp_mask = self._dropout_mask
    rec_dp_mask = self._recurrent_dropout_mask

    # ---- attention: alignment model -------------------------------------
    h_att = K.repeat(h_tm1, self.timestep_dim)
    att = _time_distributed_dense(inputs,
                                  self.attention_weights,
                                  self.attention_bias,
                                  input_dim=self.input_dim,
                                  output_dim=self.units,
                                  timesteps=self.timestep_dim)

    # attention energy
    en = K.dot(h_att, self.attention_recurrent_weights) + att
    attention_ = self.attention_activation(en)
    attention_ = K.squeeze(
        K.dot(attention_, self.attention_recurrent_bias), 2)
    alpha = K.exp(attention_)

    # apply dropout to the attention layer
    if dp_mask is not None:
        alpha *= dp_mask[0]

    # normalise the weights over axis 1
    alpha /= K.sum(alpha, axis=1, keepdims=True)
    alpha_r = K.repeat(alpha, self.input_dim)
    alpha_r = K.permute_dimensions(alpha_r, (0, 2, 1))

    # make context vector (soft attention after Bahdanau et al.)
    z_hat = K.sum(inputs * alpha_r, axis=1)

    # ---- GRU gates -------------------------------------------------------
    if self.implementation == 1:
        # implementation 1: one matrix product per gate
        if 0 < self.dropout < 1.:
            inputs_z = inputs * dp_mask[0]
            inputs_r = inputs * dp_mask[1]
            inputs_h = inputs * dp_mask[2]
        else:
            inputs_z = inputs
            inputs_r = inputs
            inputs_h = inputs

        # weight the inputs by the kernel weights
        x_z = K.dot(inputs_z, self.kernel_z)
        x_r = K.dot(inputs_r, self.kernel_r)
        x_h = K.dot(inputs_h, self.kernel_h)

        # add biases
        if self.use_bias:
            x_z = K.bias_add(x_z, self.bias_z)
            x_r = K.bias_add(x_r, self.bias_r)
            x_h = K.bias_add(x_h, self.bias_h)

        # apply recurrent dropout
        if 0 < self.recurrent_dropout < 1.:
            h_tm1_z = h_tm1 * rec_dp_mask[0]
            h_tm1_r = h_tm1 * rec_dp_mask[1]
            h_tm1_h = h_tm1 * rec_dp_mask[2]
        else:
            h_tm1_z = h_tm1
            h_tm1_r = h_tm1
            h_tm1_h = h_tm1

        # recurrent parts of the update and reset gates
        recurrent_z = K.dot(h_tm1_z, self.recurrent_kernel_z)
        recurrent_r = K.dot(h_tm1_r, self.recurrent_kernel_r)

        # cudnn form (reset after multiplication): recurrent biases apply
        if self.reset_after and self.use_bias:
            recurrent_z = K.bias_add(recurrent_z, self.recurrent_bias_z)
            recurrent_r = K.bias_add(recurrent_r, self.recurrent_bias_r)

        # update gate, with attention term
        z = x_z + recurrent_z + K.dot(z_hat, self.attention_z)
        z = self.recurrent_activation(z)

        # reset gate, with attention term. This uses the reset-gate input
        # projection x_r (the previous code used x_z here by mistake).
        r = x_r + recurrent_r + K.dot(z_hat, self.attention_r)
        r = self.recurrent_activation(r)

        if self.reset_after:
            # reset gate applied after matrix multiplication
            recurrent_h = K.dot(h_tm1_h, self.recurrent_kernel_h)
            if self.use_bias:
                recurrent_h = K.bias_add(recurrent_h,
                                         self.recurrent_bias_h)
            recurrent_h = r * recurrent_h
        else:
            # reset gate applied before matrix multiplication
            recurrent_h = K.dot(r * h_tm1_h, self.recurrent_kernel_h)

        # apply attention and activation
        hh = self.activation(
            x_h + recurrent_h + K.dot(z_hat, self.attention_h))
    else:
        # implementation 2: batched matrix products, which *might* be more
        # efficient depending on hardware
        if 0. < self.dropout < 1.:
            inputs *= dp_mask[0]

        # weight the inputs by the fused kernel
        matrix_x = K.dot(inputs, self.kernel)
        if self.use_bias:
            matrix_x = K.bias_add(matrix_x, self.bias)

        # extract the z, r, h parts
        x_z = matrix_x[:, :self.units]
        x_r = matrix_x[:, self.units: 2 * self.units]
        x_h = matrix_x[:, 2 * self.units:]

        # apply recurrent dropout
        if 0. < self.recurrent_dropout < 1.:
            h_tm1 *= rec_dp_mask[0]

        if self.reset_after:
            # hidden state projected by all gate matrices at once
            matrix_inner = K.dot(h_tm1, self.recurrent_kernel)
            if self.use_bias:
                matrix_inner = K.bias_add(matrix_inner,
                                          self.recurrent_bias)
        else:
            # hidden state projected separately for update/reset and new
            matrix_inner = K.dot(
                h_tm1, self.recurrent_kernel[:, :2 * self.units])

        recurrent_z = matrix_inner[:, :self.units]
        recurrent_r = matrix_inner[:, self.units: 2 * self.units]

        # apply attention and then the recurrent activation function
        z = self.recurrent_activation(
            x_z + recurrent_z + K.dot(z_hat, self.attention_z))
        r = self.recurrent_activation(
            x_r + recurrent_r + K.dot(z_hat, self.attention_r))

        if self.reset_after:
            # reset gate applied after matrix multiplication
            recurrent_h = r * matrix_inner[:, 2 * self.units:]
        else:
            # reset gate applied before matrix multiplication
            recurrent_h = K.dot(
                r * h_tm1, self.recurrent_kernel[:, 2 * self.units:])

        # apply attention and activation
        hh = self.activation(
            x_h + recurrent_h + K.dot(z_hat, self.attention_h))

    # final hidden state: mix previous and candidate state via update gate
    h = z * h_tm1 + (1 - z) * hh

    # flag the output as depending on the learning phase when dropout is on
    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h._uses_learning_phase = True

    # return the gru states
    return h, [h]
def call(self, inputs, states, training=None):
    """Run one step of the eight-gate recurrent cell.

    Eight input projections and eight recurrent projections are combined
    into a first layer (gate 3 multiplies them, all others add), reduced
    through two further layers, and finally optionally projected.

    Args:
        inputs: input tensor for this step.
        states: list ``[h_tm1, c_tm1]`` (previous memory and carry state).
        training: training-phase flag forwarded to the dropout mask
            generator.

    Returns:
        Tuple ``(h, [h, c])``.
    """
    if 0 < self.dropout < 1 and self._dropout_mask is None:
        self._dropout_mask = _generate_dropout_mask(
            _generate_dropout_ones(inputs, K.shape(inputs)[-1]),
            self.dropout,
            training=training,
            count=8)
    if (0 < self.recurrent_dropout < 1 and
            self._recurrent_dropout_mask is None):
        self._recurrent_dropout_mask = _generate_dropout_mask(
            _generate_dropout_ones(inputs, self.units),
            self.recurrent_dropout,
            training=training,
            count=8)

    dp_mask = self._dropout_mask            # masks for input units
    rec_dp_mask = self._recurrent_dropout_mask  # masks for recurrent units

    h_tm1 = states[0]  # previous memory state
    c_tm1 = states[1]  # previous carry state

    gate_ids = range(8)
    units = self.units

    # Activation applied to each first-layer gate, in gate order.
    first_layer_acts = (
        self.recurrent_activation,
        self.cell_activation,
        self.recurrent_activation,
        self.cell_activation,
        self.activation,
        self.recurrent_activation,
        self.activation,
        self.recurrent_activation,
    )

    def _first_layer(input_parts, recurrent_part):
        # Gate 3 combines its two halves multiplicatively, the rest add.
        gates = []
        for k in gate_ids:
            x_k = input_parts[k]
            r_k = recurrent_part(k)
            pre = x_k * r_k if k == 3 else x_k + r_k
            gates.append(first_layer_acts[k](pre))
        return gates

    def _split_gates(fused):
        # Cut a fused projection into its eight per-gate column blocks.
        parts = [fused[:, :units]]
        parts.extend(
            fused[:, k * units: (k + 1) * units] for k in range(1, 7))
        parts.append(fused[:, 7 * units:])
        return parts

    if self.implementation == 1:
        if 0 < self.dropout < 1.:
            gate_inputs = [inputs * dp_mask[k] for k in gate_ids]
        else:
            gate_inputs = [inputs] * 8

        x = [K.dot(gate_inputs[k], getattr(self, 'kernel_%d' % k))
             for k in gate_ids]
        if self.use_bias:
            x = [K.bias_add(x[k], getattr(self, 'bias_%d' % k))
                 for k in gate_ids]

        if 0 < self.recurrent_dropout < 1.:
            gate_h = [h_tm1 * rec_dp_mask[k] for k in gate_ids]
        else:
            gate_h = [h_tm1] * 8

        # First Layer
        layer1 = _first_layer(
            x,
            lambda k: K.dot(gate_h[k],
                            getattr(self, 'recurrent_kernel_%d' % k)))
    else:
        if 0. < self.dropout < 1.:
            inputs *= dp_mask[0]
        z = K.dot(inputs, self.kernel)

        if 0. < self.recurrent_dropout < 1.:
            h_tm1 *= rec_dp_mask[0]
        zr = K.dot(h_tm1, self.recurrent_kernel)
        if self.use_bias:
            # In this implementation the bias goes on the recurrent part.
            zr = K.bias_add(zr, self.bias)

        z_parts = _split_gates(z)
        zr_parts = _split_gates(zr)

        # First Layer
        layer1 = _first_layer(z_parts, lambda k: zr_parts[k])

    # Second Layer
    layer2_0 = self.activation(layer1[0] * layer1[1])
    layer2_1 = self.activation(layer1[2] + layer1[3])
    layer2_2 = self.activation(layer1[4] * layer1[5])
    layer2_3 = self.recurrent_activation(layer1[6] + layer1[7])

    # Inject the Cell
    layer2_0 = self.activation(layer2_0 + c_tm1)

    # Third Layer; its first node is also the new carry state
    c = layer2_0 * layer2_1
    layer3_1 = self.activation(layer2_2 + layer2_3)

    # Final Layer
    h = self.activation(c * layer3_1)

    if self.projection_units is not None:
        h = self.projection_activation(K.dot(h, self.projection_kernel))

    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h._uses_learning_phase = True
    return h, [h, c]
def step(self, inputs, states):
    """Run one step of an LSTM with soft attention over ``x_input``.

    Args:
        inputs: input tensor for this step. For ``implementation == 0`` it
            is sliced directly into the four gate inputs.
        states: list ``[h_tm1, c_tm1, dp_mask, rec_dp_mask, x_input]``.

    Returns:
        ``(context_sequence, [h, c])`` when ``self.return_attention`` is
        true, otherwise ``(h, [h, c])``.

    Raises:
        ValueError: if ``self.implementation`` is not 0, 1 or 2.
    """
    h_tm1 = states[0]
    c_tm1 = states[1]
    dp_mask = states[2]
    rec_dp_mask = states[3]
    x_input = states[4]

    # alignment model
    h_att = K.repeat(h_tm1, self.timestep_dim)
    att = _time_distributed_dense(
        x_input,
        self.attention_weights,
        self.attention_bias,
        output_dim=K.int_shape(self.attention_weights)[1])

    # energy
    energy = self.attention_activation(
        K.dot(h_att, self.attention_recurrent_weights) + att)
    energy = K.squeeze(K.dot(energy, self.attention_recurrent_bias), 2)

    alpha = K.exp(energy)
    if dp_mask is not None:
        alpha *= dp_mask[0]
    alpha /= K.sum(alpha, axis=1, keepdims=True)
    alpha_r = K.permute_dimensions(
        K.repeat(alpha, self.input_dim), (0, 2, 1))

    # make context vector (soft attention after Bahdanau et al.)
    context_sequence = x_input * alpha_r
    z_hat = K.sum(context_sequence, axis=1)

    units = self.units
    mode = self.implementation

    if mode == 2:
        # Fused projection for all four gates.
        z = K.dot(inputs * dp_mask[0], self.kernel)
        z += K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel)
        z += K.dot(z_hat, self.attention_kernel)
        if self.use_bias:
            z = K.bias_add(z, self.bias)

        z0 = z[:, :units]
        z1 = z[:, units: 2 * units]
        z2 = z[:, 2 * units: 3 * units]
        z3 = z[:, 3 * units:]

        i = self.recurrent_activation(z0)
        f = self.recurrent_activation(z1)
        c = f * c_tm1 + i * self.activation(z2)
        o = self.recurrent_activation(z3)
    elif mode in (0, 1):
        if mode == 0:
            # Inputs already hold the four gate blocks.
            x_i = inputs[:, :units]
            x_f = inputs[:, units: 2 * units]
            x_c = inputs[:, 2 * units: 3 * units]
            x_o = inputs[:, 3 * units:]
        else:
            x_i = K.dot(inputs * dp_mask[0], self.kernel_i) + self.bias_i
            x_f = K.dot(inputs * dp_mask[1], self.kernel_f) + self.bias_f
            x_c = K.dot(inputs * dp_mask[2], self.kernel_c) + self.bias_c
            x_o = K.dot(inputs * dp_mask[3], self.kernel_o) + self.bias_o

        def _pre_activation(x_part, mask_idx, rec_kernel, att_kernel):
            # input part + recurrent part + attention part for one gate
            return (x_part
                    + K.dot(h_tm1 * rec_dp_mask[mask_idx], rec_kernel)
                    + K.dot(z_hat, att_kernel))

        i = self.recurrent_activation(_pre_activation(
            x_i, 0, self.recurrent_kernel_i, self.attention_i))
        f = self.recurrent_activation(_pre_activation(
            x_f, 1, self.recurrent_kernel_f, self.attention_f))
        c = f * c_tm1 + i * self.activation(_pre_activation(
            x_c, 2, self.recurrent_kernel_c, self.attention_c))
        o = self.recurrent_activation(_pre_activation(
            x_o, 3, self.recurrent_kernel_o, self.attention_o))
    else:
        raise ValueError('Unknown `implementation` mode.')

    h = o * self.activation(c)
    if 0 < self.dropout + self.recurrent_dropout:
        h._uses_learning_phase = True

    if self.return_attention:
        return context_sequence, [h, c]
    return h, [h, c]
def call(self, inputs):
    """Combine two input tensors into a sigmoid output.

    Only ``inputs[0]`` and ``inputs[1]`` are read. Per the original note,
    the linear term of the FM component is not included here.

    Args:
        inputs: sequence of tensors.

    Returns:
        ``sigmoid(sum(inputs[0], axis=1) + (inputs[1] . kernel + bias))``.
    """
    summed, projected_in = inputs[0], inputs[1]
    ans1 = K.sum(summed, axis=1, keepdims=True)
    ans2 = K.bias_add(K.dot(projected_in, self.kernel), self.bias)
    return K.sigmoid(ans1 + ans2)
def call(self, inputs):
    """Compute ``activation(batch_dot(inputs[0] . kernel, inputs[1]) + bias)``.

    Args:
        inputs: sequence whose first two elements are the tensors to
            combine.

    Returns:
        The activated result.
    """
    left, right = inputs[0], inputs[1]
    projected = K.dot(left, self.kernel)
    combined = K.bias_add(K.batch_dot(projected, right), self.bias)
    return self.activation(combined)
def call(self, inputs, states, training=None):
    """Run one GRU step and return the output and the new state.

    Args:
        inputs: input tensor for this step.
        states: previous hidden state, either bare or nested in a
            structure whose first element is the state.
        training: forwarded to the dropout-mask helpers.

    Returns:
        Tuple ``(h, new_state)``. ``new_state`` is ``[h]`` when ``states``
        was nested and ``h`` otherwise.
    """
    h_tm1 = (states[0] if tf.nest.is_nested(states) else states
             )  # previous memory

    dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=3)
    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(h_tm1,
                                                           training,
                                                           count=3)

    if self.use_bias:
        if not self.reset_after:
            # Single bias, applied to the input projection only.
            input_bias, recurrent_bias = self.bias, None
        else:
            # self.bias unstacks into separate input and recurrent biases.
            input_bias, recurrent_bias = tf.unstack(self.bias)

    if self.implementation == 1:
        # One matrix product per gate, each with its own dropout mask.
        if 0.0 < self.dropout < 1.0:
            inputs_z = inputs * dp_mask[0]
            inputs_r = inputs * dp_mask[1]
            inputs_h = inputs * dp_mask[2]
        else:
            inputs_z = inputs
            inputs_r = inputs
            inputs_h = inputs

        # Kernel columns are laid out as [z | r | h], `units` wide each.
        x_z = backend.dot(inputs_z, self.kernel[:, :self.units])
        x_r = backend.dot(inputs_r, self.kernel[:, self.units:self.units * 2])
        x_h = backend.dot(inputs_h, self.kernel[:, self.units * 2:])

        if self.use_bias:
            x_z = backend.bias_add(x_z, input_bias[:self.units])
            x_r = backend.bias_add(x_r, input_bias[self.units:self.units * 2])
            x_h = backend.bias_add(x_h, input_bias[self.units * 2:])

        if 0.0 < self.recurrent_dropout < 1.0:
            h_tm1_z = h_tm1 * rec_dp_mask[0]
            h_tm1_r = h_tm1 * rec_dp_mask[1]
            h_tm1_h = h_tm1 * rec_dp_mask[2]
        else:
            h_tm1_z = h_tm1
            h_tm1_r = h_tm1
            h_tm1_h = h_tm1

        recurrent_z = backend.dot(h_tm1_z, self.recurrent_kernel[:, :self.units])
        recurrent_r = backend.dot(
            h_tm1_r, self.recurrent_kernel[:, self.units:self.units * 2])
        if self.reset_after and self.use_bias:
            recurrent_z = backend.bias_add(recurrent_z, recurrent_bias[:self.units])
            recurrent_r = backend.bias_add(
                recurrent_r, recurrent_bias[self.units:self.units * 2])

        z = self.recurrent_activation(x_z + recurrent_z)
        r = self.recurrent_activation(x_r + recurrent_r)

        # reset gate applied after/before matrix multiplication
        if self.reset_after:
            recurrent_h = backend.dot(
                h_tm1_h, self.recurrent_kernel[:, self.units * 2:])
            if self.use_bias:
                recurrent_h = backend.bias_add(
                    recurrent_h, recurrent_bias[self.units * 2:])
            recurrent_h = r * recurrent_h
        else:
            recurrent_h = backend.dot(
                r * h_tm1_h, self.recurrent_kernel[:, self.units * 2:])

        hh = self.activation(x_h + recurrent_h)
    else:
        # Fused products; only the first input dropout mask is used.
        # NOTE(review): rec_dp_mask is never applied in this branch --
        # confirm that skipping recurrent dropout here is intended.
        if 0.0 < self.dropout < 1.0:
            inputs = inputs * dp_mask[0]

        # inputs projected by all gate matrices at once
        matrix_x = backend.dot(inputs, self.kernel)
        if self.use_bias:
            # biases: bias_z_i, bias_r_i, bias_h_i
            matrix_x = backend.bias_add(matrix_x, input_bias)

        x_z, x_r, x_h = tf.split(matrix_x, 3, axis=-1)

        if self.reset_after:
            # hidden state projected by all gate matrices at once
            matrix_inner = backend.dot(h_tm1, self.recurrent_kernel)
            if self.use_bias:
                matrix_inner = backend.bias_add(matrix_inner, recurrent_bias)
        else:
            # hidden state projected separately for update/reset and new
            matrix_inner = backend.dot(
                h_tm1, self.recurrent_kernel[:, :2 * self.units])

        # When reset_after is False the third piece is only a placeholder:
        # recurrent_h is recomputed below.
        recurrent_z, recurrent_r, recurrent_h = tf.split(
            matrix_inner, [self.units, self.units, -1], axis=-1)

        z = self.recurrent_activation(x_z + recurrent_z)
        r = self.recurrent_activation(x_r + recurrent_r)

        if self.reset_after:
            recurrent_h = r * recurrent_h
        else:
            recurrent_h = backend.dot(
                r * h_tm1, self.recurrent_kernel[:, 2 * self.units:])

        hh = self.activation(x_h + recurrent_h)

    # previous and candidate state mixed by update gate
    h = z * h_tm1 + (1 - z) * hh
    # Return the state in the same (nested or bare) form it arrived in.
    new_state = [h] if tf.nest.is_nested(states) else h
    return h, new_state
def call(self, inputs, states, training=None):
    """Run one step of the cell and return the output and four new states.

    Args:
        inputs: pair ``(inputs_, t_)``; ``t_`` multiplies the frequency
            state inside ``cos`` / ``sin`` -- presumably a time index
            (TODO confirm).
        states: four tensors, in the same order as the returned state
            list: hidden state, imaginary part, real part, frequency.
        training: not used in this method.

    Returns:
        Tuple ``(h, [h, img_s, real_s, omega])``.
    """
    # dropout matrices for input units
    dp_mask = self._dropout_mask
    # dropout matrices for recurrent units
    rec_dp_mask = self._recurrent_dropout_mask

    z_ = states[0]
    Im_s_ = states[1]
    Re_s_ = states[2]
    omg_ = states[3]

    # only need one column of each matrix since they're all the same
    # (h and omega are re-stacked to this layout before returning)
    omg_ = omg_[:, :, 1]
    z_ = z_[:, :, 1]

    inputs_ = inputs[0]
    t_ = inputs[1]

    # One input dropout mask per projection (six in total).
    if 0. < self.dropout < 1.:
        inputs_i = inputs_* dp_mask[0]
        inputs_state = inputs_* dp_mask[1]
        inputs_freq = inputs_* dp_mask[2]
        inputs_g = inputs_* dp_mask[3]
        inputs_omg = inputs_* dp_mask[4]
        inputs_o = inputs_* dp_mask[5]
    else:
        inputs_i = inputs_
        inputs_freq = inputs_
        inputs_state = inputs_
        inputs_g = inputs_
        inputs_omg = inputs_
        inputs_o = inputs_

    # Input projections (inputs_o is projected later, inside __freq).
    x_i = K.dot(inputs_i, self.kernel_i)
    x_freq = K.dot(inputs_freq, self.kernel_freq)
    x_state = K.dot(inputs_state, self.kernel_state)
    x_g = K.dot(inputs_g, self.kernel_g)
    x_omg = K.dot(inputs_omg, self.kernel_omg)

    if self.use_bias:
        x_i = K.bias_add(x_i, self.bias_i)
        x_freq = K.bias_add(x_freq, self.bias_f)
        x_state = K.bias_add(x_state, self.bias_s)
        x_g = K.bias_add(x_g, self.bias_g)
        x_omg = K.bias_add(x_omg, self.bias_omg)

    # One recurrent dropout mask per projection.
    if 0. < self.recurrent_dropout < 1.:
        z_i = z_ * rec_dp_mask[0]
        z_freq = z_ * rec_dp_mask[1]
        z_state = z_ * rec_dp_mask[2]
        z_g = z_ * rec_dp_mask[3]
        z_omg = z_ * rec_dp_mask[4]
        z_o = z_ * rec_dp_mask[5]
    else:
        z_i = z_
        z_freq = z_
        z_state = z_
        z_g = z_
        z_omg = z_
        z_o = z_

    # The forget gate is the outer product of a frequency gate and a
    # state gate.
    freq = self.recurrent_activation(x_freq + K.dot(z_freq, self.recurrent_kernel_freq))
    state = self.recurrent_activation(x_state + K.dot(z_state, self.recurrent_kernel_state))
    combined_forget_gate = self.outer_product(freq, state)

    i = self.recurrent_activation(x_i + K.dot(z_i, self.recurrent_kernel_i))
    g = K.tanh(x_g + K.dot(z_g, self.recurrent_kernel_g))
    omega = x_omg + K.dot(z_omg, self.recurrent_kernel_omg)

    # Update real and imaginary parts using the *previous* frequency
    # state omg_, not the freshly computed omega.
    real_s = combined_forget_gate * Re_s_ + self.outer_product(i * g, K.cos(omg_ * t_))
    img_s = combined_forget_gate * Im_s_ + self.outer_product(i * g, K.sin(omg_ * t_))
    amplitude = K.sqrt(K.square(real_s) + K.square(img_s))

    # transpose to dimensions (frequency_components, samples, state) for tf.scan
    amplitude = tf.transpose(amplitude, perm=[1, 0, 2])

    def __freq(z_k, inputs_):
        # One scan step: add this element's gated contribution to the
        # running accumulator z_k. Reads inputs_o and z_o from the
        # enclosing scope.
        U_k, W_k, V_k, b_k, W_z_k, b_z_k, A_k = inputs_
        o = self.recurrent_activation(K.dot(A_k, U_k) + K.dot(inputs_o, W_k) + K.dot(z_o, V_k) + b_k)
        zz = z_k + o * K.tanh(K.dot(A_k, W_z_k) + b_z_k)
        return tf.stack(zz)

    h = tf.scan(__freq,elems=[self.frequency_kernel_U,self.frequency_kernel_W,
                              self.frequency_kernel_V,self.freq_bias_o,
                              self.frequency_kernel_W_z,self.freq_bias_z,
                              amplitude],initializer=tf.zeros(tf.shape(z_)))

    # get last summation state for final sum
    h = h[-1]

    # make new omega and h matrices to fit size of other stacked matrices
    omega = tf.stack([omega for _ in range(self.state_size[0])], axis=1)
    h = tf.stack([h for _ in range(self.state_size[0])], axis=1)

    return h, [h, img_s, real_s, omega]
def call(self, inputs, **kwargs):
    """
    Applies the graph attention layer.

    Args:
        inputs (list): list of inputs with 2 items:
            node features (matrix of size N x F), and
            graph adjacency matrix (size N x N), where N is the number of
            nodes in the graph and F is the dimensionality of node features.
        **kwargs: ``add_self_loops`` (default False) sets the diagonal of
            the adjacency matrix to 1 before attention is computed.

    Returns:
        The activated node features: heads concatenated when
        ``self.attn_heads_reduction == "concat"``, averaged otherwise.
    """
    X = inputs[0]  # Node features (N x F)
    A = inputs[1]  # Adjacency matrix (N x N)

    # The mask below needs a dense adjacency matrix.
    # TODO: replace this dense implementation of GraphAttention layer with a sparse implementation
    if K.is_sparse(A):
        A = tf.sparse_tensor_to_dense(A, validate_indices=False)

    # To match the paper the graph needs self-loops, since the
    # neighbourhood of node i in eq. (4) includes node i itself.
    if kwargs.get("add_self_loops", False):
        # get the number of nodes from inputs[1] directly
        N = K.int_shape(inputs[1])[-1]
        if N is None:
            raise ValueError(
                "{}: need to know number of nodes to add self-loops; obtained None instead"
                .format(type(self).__name__))
        # create self-loops
        A = tf.linalg.set_diag(A, K.cast(np.ones((N, )), dtype="float"))

    def _attend(head):
        # Output of one attention head, (N x F').
        kernel = self.kernels[head]  # W in the paper (F x F')
        attention_kernel = self.attn_kernels[head]  # a in the paper (2F' x 1)

        # Inputs to the attention network
        features = K.dot(X, kernel)  # (N x F')

        # [[a_1], [a_2]]^T [[Wh_i], [Wh_j]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
        attn_for_self = K.dot(features, attention_kernel[0])    # (N x 1)
        attn_for_neighs = K.dot(features, attention_kernel[1])  # (N x 1)

        # a(Wh_i, Wh_j) for every pair, (N x N) via broadcasting
        scores = attn_for_self + K.transpose(attn_for_neighs)
        scores = LeakyReLU(alpha=0.2)(scores)

        # Mask non-edges before the softmax (Vaswani et al., 2017).
        # This only works for a 'binary' A, not a 'weighted' one; without
        # self-loops in A the node itself is masked out.
        mask = -10e9 * (1.0 - A)
        scores += mask

        # Attention coefficients, Eq. 3 of the paper
        coefficients = K.softmax(scores)  # (N x N)

        # Dropout on features and attention coefficients
        dropout_feat = Dropout(self.in_dropout_rate)(features)            # (N x F')
        dropout_attn = Dropout(self.attn_dropout_rate)(coefficients)      # (N x N)

        # Linear combination with neighbours' features, Eq. 4
        node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')
        if self.use_bias:
            node_features = K.bias_add(node_features, self.biases[head])
        return node_features

    outputs = [_attend(head) for head in range(self.attn_heads)]

    # Aggregate the heads' output according to the reduction method
    if self.attn_heads_reduction == "concat":
        merged = K.concatenate(outputs)  # (N x KF')
    else:
        merged = K.mean(K.stack(outputs), axis=0)  # (N x F')

    return self.activation(merged)
def call(self, inputs):
    """Column-normalised dense projection plus a pairwise interaction term.

    The kernel is divided by the norm of each of its columns before the
    matrix product. For every one of the first ``self.batch_size`` rows of
    ``inputs`` an interaction term ``sum_j (S - f_j) * f_j`` is added,
    where ``f = transpose(self.factormachine[i] * inputs[i])`` and ``S`` is
    the sum of ``f`` over its first axis.

    Fixes over the previous version:
      * a stray ``K.batch_dot()`` call without arguments, which raised on
        every invocation, is removed;
      * ``output`` is now defined when ``self.use_bias`` is false;
      * the local that shadowed the builtin ``sum`` is renamed.

    Args:
        inputs: input tensor; it is indexed along its first axis for
            ``self.batch_size`` entries (assumes a static batch size --
            TODO confirm against the caller).

    Returns:
        The (optionally biased and activated) output tensor.
    """
    x = inputs
    W = self.kernel

    weights_norm = tf.norm(W, axis=0, keepdims=True)
    weights = tf.div(W, weights_norm, name="normalize_weights")
    logits = tf.matmul(x, weights)

    feature_machine = []
    for ii in range(self.batch_size):
        feature = K.transpose(tf.multiply(self.factormachine[ii], x[ii]))
        total = K.sum(feature, axis=0, keepdims=False)
        # (total - f_j) * f_j for every index j, summed over j below.
        dots = [
            tf.multiply(tf.subtract(total, feature[jj]), feature[jj])
            for jj in range(self.feature_size)
        ]
        feature_machine.append(K.sum(dots, axis=0, keepdims=False))

    output = logits
    if self.use_bias:
        output = K.bias_add(logits, self.bias, data_format='channels_last')
    # The Python list of per-sample tensors is converted to a tensor by
    # the addition, exactly as before.
    output = output + feature_machine
    if self.activation is not None:
        output = self.activation(output)
    return output
def call(self, inputs, states, training=None):
    """Run one step of the LSTM cell followed by a residual dense read-out.

    The gates are computed LSTM-style (in implementation 1 they also
    receive the previous intermediate state ``m``). The gated cell output
    ``m`` is passed through two hidden layers and a final rectified layer,
    and the result is added to the previous output state.

    Fixes over the previous version:
      * the hidden-layer dropout mask is generated with
        ``self.hidden_dropout`` as its rate (it used ``self.dropout``);
      * ``hidden_dropout`` now also marks the output as depending on the
        learning phase.

    Args:
        inputs: input tensor for the current time step.
        states: list ``[h_tm1, c_tm1, m_tm1]`` with the previous output
            state, carry state and intermediate state.
        training: forwarded to the dropout mask generator; when ``None``
            and dropout is active the output is flagged as
            learning-phase dependent.

    Returns:
        ``(h, [h, c, m])``: the new output and the new state list.
    """
    if 0 < self.dropout < 1 and self._dropout_mask is None:
        self._dropout_mask = _generate_dropout_mask(
            K.ones_like(inputs), self.dropout, training=training, count=4)
    if (0 < self.recurrent_dropout < 1
            and self._recurrent_dropout_mask is None):
        self._recurrent_dropout_mask = _generate_dropout_mask(
            K.ones_like(states[0]),
            self.recurrent_dropout,
            training=training,
            count=4)
    # One mask per hidden layer, drawn at the hidden-layer dropout rate.
    if (0 < self.hidden_dropout < 1
            and self._hidden_dropout_mask is None):
        self._hidden_dropout_mask = _generate_dropout_mask(
            K.ones((self.hidden_units, )),
            self.hidden_dropout,
            training=training,
            count=2)

    dp_mask = self._dropout_mask  # input units
    hidden_dp_mask = self._hidden_dropout_mask  # hidden units
    rec_dp_mask = self._recurrent_dropout_mask  # recurrent units

    h_tm1 = states[0]  # previous output state
    c_tm1 = states[1]  # previous carry state
    m_tm1 = states[2]  # previous intermediate state

    if self.implementation == 1:
        if 0 < self.dropout < 1.:
            inputs_i = inputs * dp_mask[0]
            inputs_f = inputs * dp_mask[1]
            inputs_c = inputs * dp_mask[2]
            inputs_o = inputs * dp_mask[3]
        else:
            inputs_i = inputs_f = inputs_c = inputs_o = inputs

        x_i = K.dot(inputs_i, self.kernel_i)
        x_f = K.dot(inputs_f, self.kernel_f)
        x_c = K.dot(inputs_c, self.kernel_c)
        x_o = K.dot(inputs_o, self.kernel_o)
        if self.use_bias:
            x_i = K.bias_add(x_i, self.bias_i)
            x_f = K.bias_add(x_f, self.bias_f)
            x_c = K.bias_add(x_c, self.bias_c)
            x_o = K.bias_add(x_o, self.bias_o)

        if 0 < self.recurrent_dropout < 1.:
            h_tm1_i = h_tm1 * rec_dp_mask[0]
            h_tm1_f = h_tm1 * rec_dp_mask[1]
            h_tm1_c = h_tm1 * rec_dp_mask[2]
            h_tm1_o = h_tm1 * rec_dp_mask[3]
        else:
            h_tm1_i = h_tm1_f = h_tm1_c = h_tm1_o = h_tm1

        # Every gate sees the input, the previous output state and the
        # previous intermediate state.
        i = self.recurrent_activation(
            x_i + K.dot(h_tm1_i, self.recurrent_kernel_i) +
            K.dot(m_tm1, self.intermediate_kernel_i))
        f = self.recurrent_activation(
            x_f + K.dot(h_tm1_f, self.recurrent_kernel_f) +
            K.dot(m_tm1, self.intermediate_kernel_f))
        c = f * c_tm1 + i * self.activation(
            x_c + K.dot(h_tm1_c, self.recurrent_kernel_c) +
            K.dot(m_tm1, self.intermediate_kernel_c))
        o = self.recurrent_activation(
            x_o + K.dot(h_tm1_o, self.recurrent_kernel_o) +
            K.dot(m_tm1, self.intermediate_kernel_o))
    else:
        # NOTE(review): this fused path does not use m_tm1, unlike the
        # implementation == 1 path above -- confirm this is intended.
        if 0. < self.dropout < 1.:
            inputs *= dp_mask[0]
        z = K.dot(inputs, self.kernel)
        if 0. < self.recurrent_dropout < 1.:
            h_tm1 *= rec_dp_mask[0]
        z += K.dot(h_tm1, self.recurrent_kernel)
        if self.use_bias:
            z = K.bias_add(z, self.bias)

        z0 = z[:, :self.input_units]
        z1 = z[:, self.input_units:2 * self.input_units]
        z2 = z[:, 2 * self.input_units:3 * self.input_units]
        z3 = z[:, 3 * self.input_units:]

        i = self.recurrent_activation(z0)
        f = self.recurrent_activation(z1)
        c = f * c_tm1 + i * self.activation(z2)
        o = self.recurrent_activation(z3)

    # Intermediate state: what a plain LSTM would emit as its output.
    m = o * self.activation(c)

    # Hidden layer 1
    x_hidden1 = K.dot(m, self.kernel_hidden1)
    x_hidden1 = self.rectifier_activation(
        K.bias_add(x_hidden1, self.bias_hidden1))
    if 0 < self.hidden_dropout < 1.:
        x_hidden1 = x_hidden1 * hidden_dp_mask[0]

    # Hidden layer 2
    x_hidden2 = K.dot(x_hidden1, self.kernel_hidden2)
    x_hidden2 = self.rectifier_activation(
        K.bias_add(x_hidden2, self.bias_hidden2))
    if 0 < self.hidden_dropout < 1.:
        x_hidden2 = x_hidden2 * hidden_dp_mask[1]

    # Rectified dense layer, then the residual connection to h_tm1.
    x_r = K.dot(x_hidden2, self.kernel_r)
    x_r = K.bias_add(x_r, self.bias_r)
    r = self.rectifier_activation(x_r)
    h = r + h_tm1

    if 0 < self.dropout + self.recurrent_dropout + self.hidden_dropout:
        if training is None:
            h._uses_learning_phase = True
    return h, [h, c, m]
def call(self, inputs, states, training=None):
    """Run one step of the eight-branch cell.

    Both implementations produce eight input projections and eight
    recurrent projections. They are combined by a fixed wiring: a first
    layer of eight units (unit 3 multiplies its two projections, the
    others add them), a second layer of four, injection of the previous
    carry state, a third layer, and a final activation with an optional
    projection.

    Args:
        inputs: input tensor for the current time step.
        states: list ``[h_tm1, c_tm1]`` with the previous output and carry
            states.
        training: when ``None`` and dropout is active, the output is
            flagged as learning-phase dependent.

    Returns:
        ``(h, [h, c])``: the new output and the new state list.
    """
    n_branches = 8
    dp_mask = self._dropout_mask  # input units
    rec_dp_mask = self._recurrent_dropout_mask  # recurrent units

    h_tm1 = states[0]  # previous output state
    c_tm1 = states[1]  # previous carry state

    if self.implementation == 1:
        kernels = (self.kernel_0, self.kernel_1, self.kernel_2,
                   self.kernel_3, self.kernel_4, self.kernel_5,
                   self.kernel_6, self.kernel_7)
        recurrent_kernels = (
            self.recurrent_kernel_0, self.recurrent_kernel_1,
            self.recurrent_kernel_2, self.recurrent_kernel_3,
            self.recurrent_kernel_4, self.recurrent_kernel_5,
            self.recurrent_kernel_6, self.recurrent_kernel_7)

        if 0 < self.dropout < 1.:
            branch_inputs = [inputs * dp_mask[b] for b in range(n_branches)]
        else:
            branch_inputs = [inputs] * n_branches

        x = [K.dot(b_in, kern) for b_in, kern in zip(branch_inputs, kernels)]
        if self.use_bias:
            biases = (self.bias_0, self.bias_1, self.bias_2, self.bias_3,
                      self.bias_4, self.bias_5, self.bias_6, self.bias_7)
            x = [K.bias_add(x_b, bias) for x_b, bias in zip(x, biases)]

        if 0 < self.recurrent_dropout < 1.:
            branch_states = [
                h_tm1 * rec_dp_mask[b] for b in range(n_branches)
            ]
        else:
            branch_states = [h_tm1] * n_branches

        rec = [
            K.dot(b_state, kern)
            for b_state, kern in zip(branch_states, recurrent_kernels)
        ]
    else:
        if 0. < self.dropout < 1.:
            inputs *= dp_mask[0]
        z = K.dot(inputs, self.kernel)
        if 0. < self.recurrent_dropout < 1.:
            h_tm1 *= rec_dp_mask[0]
        zr = K.dot(h_tm1, self.recurrent_kernel)
        if self.use_bias:
            zr = K.bias_add(zr, self.bias)

        # Eight column blocks of width `units`; the last one is open-ended.
        units = self.units
        bounds = [(b * units, (b + 1) * units)
                  for b in range(n_branches - 1)]
        bounds.append(((n_branches - 1) * units, None))
        x = [z[:, lo:hi] for lo, hi in bounds]
        rec = [zr[:, lo:hi] for lo, hi in bounds]

    # First layer (unit 3 multiplies its two projections).
    layer1_0 = self.recurrent_activation(x[0] + rec[0])
    layer1_1 = self.cell_activation(x[1] + rec[1])
    layer1_2 = self.recurrent_activation(x[2] + rec[2])
    layer1_3 = self.cell_activation(x[3] * rec[3])
    layer1_4 = self.activation(x[4] + rec[4])
    layer1_5 = self.recurrent_activation(x[5] + rec[5])
    layer1_6 = self.activation(x[6] + rec[6])
    layer1_7 = self.recurrent_activation(x[7] + rec[7])

    # Second layer
    layer2_0 = self.activation(layer1_0 * layer1_1)
    layer2_1 = self.activation(layer1_2 + layer1_3)
    layer2_2 = self.activation(layer1_4 * layer1_5)
    layer2_3 = self.recurrent_activation(layer1_6 + layer1_7)

    # Inject the previous carry state.
    layer2_0 = self.activation(layer2_0 + c_tm1)

    # Third layer; its first unit is also the new carry state.
    c = layer2_0 * layer2_1
    layer3_1 = self.activation(layer2_2 + layer2_3)

    # Final layer
    h = self.activation(c * layer3_1)
    if self.projection_units is not None:
        h = self.projection_activation(K.dot(h, self.projection_kernel))

    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h._uses_learning_phase = True
    return h, [h, c]
def step(self, inputs, states):
    """Compute one GRU step and expose the gate activations.

    The recurrence is that of a GRU; the difference visible here is the
    output, which is the new hidden state concatenated with the update
    gate ``z`` and the reset gate ``r``.

    :param inputs: input tensor for the current time step. With
        ``implementation == 0`` it is sliced directly into the three
        gate pre-activations of width ``self.units``.
    :param states: list ``[h_tm1, dp_mask, rec_dp_mask]``: previous
        hidden state, input dropout masks, recurrent dropout masks.
    :return: ``(concat([h, z, r]), [h])``
    :raises ValueError: if ``self.implementation`` is not 0, 1 or 2.
    """
    h_tm1 = states[0]  # previous memory
    dp_mask = states[1]  # dropout matrices for input units
    rec_dp_mask = states[2]  # dropout matrices for recurrent units

    mode = self.implementation
    if not (mode == 2 or mode == 0 or mode == 1):
        raise ValueError('Unknown `implementation` mode.')

    units = self.units
    if mode == 2:
        # Fused kernels: one product for the input, one for the z/r gates.
        matrix_x = K.dot(inputs * dp_mask[0], self.kernel)
        if self.use_bias:
            matrix_x = K.bias_add(matrix_x, self.bias)
        matrix_inner = K.dot(h_tm1 * rec_dp_mask[0],
                             self.recurrent_kernel[:, :2 * units])

        z = self.recurrent_activation(
            matrix_x[:, :units] + matrix_inner[:, :units])
        r = self.recurrent_activation(
            matrix_x[:, units:2 * units] +
            matrix_inner[:, units:2 * units])

        x_h = matrix_x[:, 2 * units:]
        recurrent_h = K.dot(r * h_tm1 * rec_dp_mask[0],
                            self.recurrent_kernel[:, 2 * units:])
        hh = self.activation(x_h + recurrent_h)
    else:
        if mode == 0:
            # The input already holds the three gate pre-activations.
            x_z = inputs[:, :units]
            x_r = inputs[:, units:2 * units]
            x_h = inputs[:, 2 * units:]
        else:
            x_z = K.dot(inputs * dp_mask[0], self.kernel_z)
            x_r = K.dot(inputs * dp_mask[1], self.kernel_r)
            x_h = K.dot(inputs * dp_mask[2], self.kernel_h)
            if self.use_bias:
                x_z = K.bias_add(x_z, self.bias_z)
                x_r = K.bias_add(x_r, self.bias_r)
                x_h = K.bias_add(x_h, self.bias_h)

        z = self.recurrent_activation(
            x_z + K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_z))
        r = self.recurrent_activation(
            x_r + K.dot(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_r))
        hh = self.activation(
            x_h + K.dot(r * h_tm1 * rec_dp_mask[2],
                        self.recurrent_kernel_h))

    h = z * h_tm1 + (1 - z) * hh
    if 0 < self.dropout + self.recurrent_dropout:
        h._uses_learning_phase = True

    # Emit the hidden activation together with both gate values.
    step_output = K.concatenate([h, z, r])
    return step_output, [h]
def call(self, inputs, states, training=None):
    """Run one step of the two-gate cell with a coupled forget gate.

    A single gate ``f`` blends the previous carry state with the new
    candidate, ``c = f * c_tm1 + (1 - f) * activation(candidate)``, and the
    carry state is also used as the output.

    Args:
        inputs: input tensor for the current time step.
        states: list ``[h_tm1, c_tm1]`` with the previous output and carry
            states.
        training: forwarded to the dropout mask generator; when ``None``
            and dropout is active the output is flagged as
            learning-phase dependent.

    Returns:
        ``(h, [h, c])`` where ``h`` is the same tensor as ``c``.
    """
    input_dropout_on = 0 < self.dropout < 1
    recurrent_dropout_on = 0 < self.recurrent_dropout < 1

    # Masks are created lazily on the first step and then reused.
    if input_dropout_on and self._dropout_mask is None:
        input_ones = _generate_dropout_ones(inputs, K.shape(inputs)[-1])
        self._dropout_mask = _generate_dropout_mask(
            input_ones, self.dropout, training=training, count=2)
    if recurrent_dropout_on and self._recurrent_dropout_mask is None:
        state_ones = _generate_dropout_ones(inputs, self.units)
        self._recurrent_dropout_mask = _generate_dropout_mask(
            state_ones, self.recurrent_dropout, training=training, count=2)

    dp_mask = self._dropout_mask  # input units
    rec_dp_mask = self._recurrent_dropout_mask  # recurrent units

    h_tm1 = states[0]  # previous output state
    c_tm1 = states[1]  # previous carry state

    if self.implementation == 1:
        if input_dropout_on:
            inputs_f, inputs_c = inputs * dp_mask[0], inputs * dp_mask[1]
        else:
            inputs_f = inputs_c = inputs

        x_f = K.dot(inputs_f, self.kernel_f)
        x_c = K.dot(inputs_c, self.kernel_c)
        if self.use_bias:
            x_f = K.bias_add(x_f, self.bias_f)
            x_c = K.bias_add(x_c, self.bias_c)

        if recurrent_dropout_on:
            h_tm1_f, h_tm1_c = h_tm1 * rec_dp_mask[0], h_tm1 * rec_dp_mask[1]
        else:
            h_tm1_f = h_tm1_c = h_tm1

        gate_pre = x_f + K.dot(h_tm1_f, self.recurrent_kernel_f)
        f = self.recurrent_activation(gate_pre)
        candidate_pre = x_c + K.dot(h_tm1_c, self.recurrent_kernel_c)
    else:
        if input_dropout_on:
            inputs *= dp_mask[0]
        z = K.dot(inputs, self.kernel)
        if recurrent_dropout_on:
            h_tm1 *= rec_dp_mask[0]
        z += K.dot(h_tm1, self.recurrent_kernel)
        if self.use_bias:
            z = K.bias_add(z, self.bias)

        f = self.recurrent_activation(z[:, :self.units])
        candidate_pre = z[:, self.units:2 * self.units]

    c = f * c_tm1 + (1. - f) * self.activation(candidate_pre)
    h = c

    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h._uses_learning_phase = True
    return h, [h, c]
def call(self, ins):
    """Advance the field ``u`` by one explicit update step.

    An encoder-decoder stack of convolutions produces a map ``g`` in
    (0, 1). ``u`` is then updated by ``dt`` times a weighted sum of three
    terms: a learned curvature term, an upwind transport term along the
    gradient of ``g``, and a balloon term.

    Fix over the previous version: ``K.pool2d`` was called without
    ``strides``, and the backend default is ``(1, 1)``, so the pooled maps
    kept their resolution. The x2 ``resize_images`` outputs could then not
    be concatenated with the skip tensors ``g2`` / ``g1``. Pooling now uses
    a stride of 2.

    Args:
        ins: sequence whose first item is the current field ``u`` and
            whose second item is an image tensor.

    Returns:
        The updated field tensor.
    """
    u = ins[0]
    # NOTE(review): `img` is unpacked but the network below reads
    # `self.img`; confirm which of the two is intended.
    img = ins[1]

    def conv_bias(x, kernel, bias):
        # 'same' convolution followed by a bias add.
        return K.bias_add(K.conv2d(x, kernel, padding='same'), bias)

    def conv_relu(x, kernel, bias):
        # conv_bias followed by a ReLU.
        return K.relu(conv_bias(x, kernel, bias))

    def downsample(x):
        # 2x2 pooling with an explicit stride of 2 (halves the resolution).
        return K.pool2d(x, (2, 2), strides=(2, 2), padding='same')

    def upsample(x):
        # Bilinear x2 upsampling, inverse of `downsample` for even sizes.
        return K.resize_images(x, 2, 2, data_format='channels_last',
                               interpolation='bilinear')

    ### g learned by net
    g1 = conv_relu(self.img, self.kernel_1, self.bias_1)
    g1 = conv_relu(g1, self.kernel_2, self.bias_2)

    g2 = downsample(g1)
    g2 = conv_relu(g2, self.kernel_3, self.bias_3)
    g2 = conv_relu(g2, self.kernel_4, self.bias_4)

    g3 = downsample(g2)
    g3 = conv_relu(g3, self.kernel_5, self.bias_5)
    g3 = conv_relu(g3, self.kernel_6, self.bias_6)
    g3 = conv_relu(g3, self.kernel_7, self.bias_7)
    g3 = conv_relu(g3, self.kernel_8, self.bias_8)

    # Decoder with skip connections to g2 and g1.
    g4 = K.concatenate([g2, upsample(g3)])
    g4 = conv_relu(g4, self.kernel_9, self.bias_9)
    g4 = conv_relu(g4, self.kernel_10, self.bias_10)

    g5 = K.concatenate([g1, upsample(g4)])
    g5 = conv_relu(g5, self.kernel_11, self.bias_11)
    g5 = conv_bias(g5, self.kernel_12, self.bias_12)
    g = K.sigmoid(g5)

    ### grad(g)
    g_x = K.conv2d(g, kXC, padding='same')
    g_x = scale(g_x, self.shp, self.rhp, self.dx)
    g_y = K.conv2d(g, kYC, padding='same')
    g_y = scale(g_y, self.shp, self.rhp, self.dy)

    ### transport - upwind
    xp = K.conv2d(u, xKP, padding='same')
    xn = K.conv2d(u, xKN, padding='same')
    yp = K.conv2d(u, xYP, padding='same')
    yn = K.conv2d(u, xYN, padding='same')

    # Split each gradient component into its positive and negative part
    # and pair each part with one of the two stencils.
    fxp = K.relu(g_x)
    fxn = -1.0 * K.relu(-1.0 * g_x)
    fyp = K.relu(g_y)
    fyn = -1.0 * K.relu(-1.0 * g_y)

    xterms = scale(fxp * xp + fxn * xn, self.shp, self.rhp, self.dx)
    yterms = scale(fyp * yp + fyn * yn, self.shp, self.rhp, self.dy)
    transport = xterms + yterms

    ### curvature - learned
    grad_u = K.conv2d(u, self.kernel_k1, padding='same')
    norm_grad_u = K.sqrt(
        K.epsilon() + K.sum(K.square(grad_u), axis=-1, keepdims=True))
    grad_u = grad_u / (norm_grad_u + K.epsilon())
    kappa = K.conv2d(grad_u, self.kernel_k2, padding='same')
    curvature = g * kappa * norm_grad_u

    ### balloon
    balloon = g * norm_grad_u

    return u + K.constant(self.dt) * (curvature * self.alpha
                                      + transport * self.beta
                                      + balloon * self.gamma)
def call(self, point_cloud):
    """Apply an edge convolution to a batch of point clouds.

    For every point the ``self.k`` nearest neighbours are looked up
    (optionally using only the feature slice given by ``self.n_ind``),
    edge features ``[center, neighbor - center]`` are built, a stack of
    1x1 convolution / batch-norm / ReLU blocks is applied, and the result
    is averaged over the neighbours.

    Input:
        point_cloud: tensor (batch_size, n_points, n_features)
    Returns:
        tensor with the neighbour axis reduced by a mean
    """

    def pairwise_sq_dists(x):
        """Squared pairwise distances within each cloud.

        Input:
            x: tensor (batch_size, n_points, n_features)
        Returns:
            tensor (batch_size, n_points, n_points) holding
            |a|^2 + |b|^2 - 2 a.b for every pair of points
        """
        cross = -2 * K.batch_dot(x, K.permute_dimensions(x, (0, 2, 1)))
        sq_norm = K.expand_dims(K.sum(x**2, axis=2))
        return cross + K.permute_dimensions(sq_norm, (0, 2, 1)) + sq_norm

    def nearest_neighbors(dists, k):
        """Indices of the k smallest entries of every row.

        Input:
            dists: (batch_size, n_points, n_points) pairwise distances
        Returns:
            (batch_size, n_points, k) nearest neighbor indices
        """
        # top_k picks the largest values, so negate the distances.
        return tf.math.top_k(-dists, k=k)[1]

    def edge_features_of(cloud, nn_idx):
        """Build the input of the edge convolution.

        Input:
            cloud: (batch_size, n_points, n_features)
            nn_idx: (batch_size, n_points, n_neighbors)
        Returns:
            (batch_size, n_points, k, n_features*2)
        """
        k = nn_idx.get_shape()[-1]
        dyn_shape = tf.shape(cloud)
        batch_size, n_points, n_features = (dyn_shape[0], dyn_shape[1],
                                            dyn_shape[2])

        # Per-cloud offsets turn in-cloud indices into indices of the
        # flattened (batch_size * n_points) point list.
        offsets = K.arange(0, stop=batch_size, step=1) * n_points
        offsets = K.reshape(offsets, [-1, 1, 1])

        flat_points = K.reshape(cloud, [-1, n_features])
        neighbors = K.gather(flat_points, nn_idx + offsets)

        # Repeat every center k times: (batch_size, n_points, k, n_features)
        centers = K.tile(K.expand_dims(cloud, axis=-2), [1, 1, k, 1])

        return K.concatenate([centers, neighbors - centers], axis=-1)

    def batch_norm(inputs, gamma, beta, dims, ind):
        """Batch-normalize and update the moving statistics.

        Input:
            inputs: tensor to normalize
            gamma: weight - gamma for batch normalization
            beta: weight - beta for batch normalization
            dims: list - dimensions along which to normalize
            ind: int - which moving mean / variance pair to use
        Returns:
            In the training phase the tensor normalized with the batch
            statistics, otherwise the tensor normalized with the moving
            averages.
        """
        normed, batch_mean, batch_var = K.normalize_batch_in_training(
            x=inputs, gamma=gamma, beta=beta, reduction_axes=dims)

        moving_mean = self.moving_mean[ind]
        moving_var = self.moving_var[ind]
        self.add_update([
            K.moving_average_update(moving_mean, batch_mean, 0.9),
            K.moving_average_update(moving_var, batch_var, 0.9)
        ])

        normed_moving = K.batch_normalization(x=inputs,
                                              mean=self.moving_mean[ind],
                                              var=self.moving_var[ind],
                                              beta=beta,
                                              gamma=gamma)
        return K.in_train_phase(normed, normed_moving)

    def conv_block(tensor, ind):
        """1x1 convolution, bias, batch norm and ReLU with weight set ind."""
        tensor = K.conv2d(tensor, self.kernel[ind], (1, 1), padding='same')
        tensor = K.bias_add(tensor, self.bias[ind])
        tensor = batch_norm(tensor, self.gamma[ind], self.beta[ind],
                            [0, 1, 2], ind)
        return K.relu(tensor)

    if self.n_ind:
        # Distances over the selected feature slice only.
        selected = point_cloud[:, :, slice(self.n_ind[0], self.n_ind[1])]
        dists = pairwise_sq_dists(selected)
    else:
        # Distances over the full feature vector.
        dists = pairwise_sq_dists(point_cloud)

    knn_idx = nearest_neighbors(dists, self.k)
    edge_features = edge_features_of(point_cloud, knn_idx)

    # The first block always runs; the rest follow n_channel_out.
    output = conv_block(edge_features, 0)
    for block_idx in range(1, len(self.n_channel_out)):
        output = conv_block(output, block_idx)

    # Average over the neighbor axis.
    return K.mean(output, axis=-2)
def attention(self, pre_q, pre_v, pre_k, out_seq_len: int, d_model: int,
              training=None):
    """
    Calculates the output of the attention once the affine
    transformations of the inputs are done. Argument shapes:

    :param pre_q: (batch_size, q_seq_len, num_heads, d_model // num_heads)
    :param pre_v: (batch_size, v_seq_len, num_heads, d_model // num_heads)
    :param pre_k: (batch_size, k_seq_len, num_heads, d_model // num_heads)
    :param out_seq_len: the length of the output sequence
    :param d_model: dimensionality of the model (by the paper)
    :param training: Passed by Keras. Should not be defined manually.
      Optional scalar tensor indicating if we're in training
      or inference phase.
    :return: tensor reshaped to (-1, out_seq_len, d_model)
    """
    # Q and V become (batch_size, num_heads, seq_len, d_model//heads)
    q = K.permute_dimensions(pre_q, [0, 2, 1, 3])
    v = K.permute_dimensions(pre_v, [0, 2, 1, 3])

    if self.compression_window_size is None:
        # K becomes (batch_size, num_heads, d_model//heads, seq_len)
        k_transposed = K.permute_dimensions(pre_k, [0, 2, 3, 1])
    else:
        # Memory-compressed attention from
        # "Generating Wikipedia by Summarizing Long Sequences"
        # (https://arxiv.org/pdf/1801.10198.pdf): keys and values are
        # shortened with a strided 1D convolution, which shrinks
        # Q * K_transposed accordingly.
        if self.use_masking:
            raise NotImplementedError(
                "Masked memory-compressed attention has not "
                "been implemented yet")
        k = K.permute_dimensions(pre_k, [0, 2, 1, 3])
        sources = ((k, self.k_conv_kernel, self.k_conv_bias),
                   (v, self.v_conv_kernel, self.v_conv_bias))
        compressed = []
        for item, kernel, bias in sources:
            # Step 1: fold the heads into the batch axis, giving
            # (batch_size * num_heads, seq_len, d_model//heads)
            folded = K.reshape(
                item,
                (-1, K.int_shape(item)[-2], d_model // self.num_heads))
            # Step 2: compress along the sequence with a strided conv
            convolved = K.conv1d(
                folded, kernel,
                strides=self.compression_window_size,
                padding='valid', data_format='channels_last')
            # Step 3: add the bias
            convolved = K.bias_add(
                convolved, bias, data_format='channels_last')
            # Step 4: unfold back to
            # (batch_size, num_heads, new_seq_len, d_model//heads)
            unfolded_shape = K.concatenate(
                [K.shape(item)[:2], [-1, d_model // self.num_heads]])
            compressed.append(K.reshape(convolved, unfolded_shape))
        k, v = compressed
        k_transposed = K.permute_dimensions(k, [0, 1, 3, 2])

    sqrt_d = K.constant(np.sqrt(d_model // self.num_heads),
                        dtype=K.floatx())
    q_shape = K.int_shape(q)
    k_t_shape = K.int_shape(k_transposed)
    v_shape = K.int_shape(v)

    # Everything is flattened to 3D (batch_size * num_heads, rows, cols)
    # before batch_dot so that it behaves identically on all backends.
    q_flat = K.reshape(q, (-1, ) + q_shape[-2:])
    k_t_flat = K.reshape(k_transposed, (-1, ) + k_t_shape[-2:])
    scores = K.batch_dot(q_flat, k_t_flat) / sqrt_d

    weights = K.softmax(self.mask_attention_if_needed(scores))
    weights = self.apply_dropout_if_needed(weights, training=training)

    v_flat = K.reshape(v, (-1, ) + v_shape[-2:])
    attention_heads = K.reshape(
        K.batch_dot(weights, v_flat),
        (-1, self.num_heads, q_shape[-2], v_shape[-1]))

    # Put the heads next to each other again and apply the output weights.
    attention_heads_merged = K.reshape(
        K.permute_dimensions(attention_heads, [0, 2, 1, 3]),
        (-1, d_model))
    projected = K.dot(attention_heads_merged, self.output_weights)
    return K.reshape(projected, (-1, out_seq_len, d_model))
def call(self, inputs, training=None):
    """Apply the convolution, optionally with a spectrally normalised kernel.

    When ``self.spectral_normalization`` is set, the kernel is flattened
    to 2D, one power-iteration step estimates its largest singular value
    ``sigma``, and the kernel is divided by ``sigma`` before use. The
    normalised tensor is stored back on ``self.kernel``.

    Fix over the previous version: the body referenced ``training``, which
    was not a parameter, so enabling spectral normalisation raised a
    NameError. It is now an optional argument.

    Args:
        inputs: input tensor.
        training: when it is ``0`` or ``False`` the stored vector
            ``self.u`` is left untouched; otherwise (including the default
            ``None``) it is updated with the new estimate.

    Returns:
        The convolved, optionally biased and activated tensor.
    """

    def _l2normalize(v, eps=1e-12):
        return v / (K.sum(v ** 2) ** 0.5 + eps)

    def power_iteration(W, u):
        # A single iteration per call; `u` persists between calls.
        _u = u
        _v = _l2normalize(K.dot(_u, K.transpose(W)))
        _u = _l2normalize(K.dot(_v, W))
        return _u, _v

    if self.spectral_normalization:
        W_shape = self.kernel.shape.as_list()
        # Flatten the kernel to (-1, last_dim)
        W_reshaped = K.reshape(self.kernel, [-1, W_shape[-1]])
        _u, _v = power_iteration(W_reshaped, self.u)
        # sigma = v W u^T
        sigma = K.dot(_v, W_reshaped)
        sigma = K.dot(sigma, K.transpose(_u))
        W_bar = W_reshaped / sigma
        # Restore the kernel shape; outside inference also persist u.
        if training in {0, False}:
            W_bar = K.reshape(W_bar, W_shape)
        else:
            with tf.control_dependencies([self.u.assign(_u)]):
                W_bar = K.reshape(W_bar, W_shape)
        self.kernel = W_bar

    if self.rank == 1:
        outputs = K.conv1d(
            inputs,
            self.kernel,
            strides=self.strides[0],
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate[0])
    elif self.rank == 2:
        outputs = K.conv2d(
            inputs,
            self.kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    elif self.rank == 3:
        outputs = K.conv3d(
            inputs,
            self.kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)

    if self.use_bias:
        outputs = K.bias_add(
            outputs,
            self.bias,
            data_format=self.data_format)

    if self.activation is not None:
        return self.activation(outputs)
    return outputs
def call(self, inputs, states, training=None):
    """Run one step of the LSTM controller and its external memory.

    An LSTM step produces the controller output ``h``; the input gate
    additionally receives the previous memory read. The controller output
    then drives a content-based read from ``self.memory``, after which
    the usage weights, the least-used weights and the memory itself are
    updated.

    Side effects: rebinds ``self.controller_ww``, ``self.controller_wr``,
    ``self.controller_wu``, ``self.controller_wlu`` and ``self.memory``,
    and increments ``self.reads``.

    Args:
        inputs: input tensor for the current time step.
        states: list ``[h_tm1, c_tm1, r_tm1]``: previous controller
            output, previous carry state and previous memory read.
        training: when ``None`` and dropout is active, ``h`` is flagged as
            learning-phase dependent.

    Returns:
        ``(r, [h, c, r])``: the memory read is the step output.
    """
    # dropout masks for the inputs, the recurrent state and the controller
    dp_mask = self._dropout_mask
    rec_dp_mask = self._recurrent_dropout_mask
    cont_dp_mask = self._controller_dropout_mask

    h_tm1 = states[0]  # previous controller output
    c_tm1 = states[1]  # previous carry state
    r_tm1 = states[2]  # previous memory read

    if 0 < self.dropout < 1.:
        inputs_i = inputs * dp_mask[0]
        inputs_f = inputs * dp_mask[1]
        inputs_c = inputs * dp_mask[2]
        inputs_o = inputs * dp_mask[3]
    else:
        inputs_i = inputs_f = inputs_c = inputs_o = inputs

    x_i = K.dot(inputs_i, self.kernel_i)
    x_f = K.dot(inputs_f, self.kernel_f)
    x_c = K.dot(inputs_c, self.kernel_c)
    x_o = K.dot(inputs_o, self.kernel_o)
    if self.use_bias:
        x_i = K.bias_add(x_i, self.bias_i)
        x_f = K.bias_add(x_f, self.bias_f)
        x_c = K.bias_add(x_c, self.bias_c)
        x_o = K.bias_add(x_o, self.bias_o)

    if 0 < self.recurrent_dropout < 1.:
        h_tm1_i = h_tm1 * rec_dp_mask[0]
        h_tm1_f = h_tm1 * rec_dp_mask[1]
        h_tm1_c = h_tm1 * rec_dp_mask[2]
        h_tm1_o = h_tm1 * rec_dp_mask[3]
    else:
        h_tm1_i = h_tm1_f = h_tm1_c = h_tm1_o = h_tm1

    # From here on the h_tm1_* names hold the recurrent projections.
    h_tm1_i = K.dot(h_tm1_i, self.recurrent_kernel_i)
    h_tm1_f = K.dot(h_tm1_f, self.recurrent_kernel_f)
    h_tm1_c = K.dot(h_tm1_c, self.recurrent_kernel_c)
    h_tm1_o = K.dot(h_tm1_o, self.recurrent_kernel_o)

    # the previous memory read is fed back, into the input gate only
    r_tm1_i = K.dot(r_tm1, self.recurrent_kernel_r)

    i = self.recurrent_activation(x_i + h_tm1_i + r_tm1_i)
    f = self.recurrent_activation(x_f + h_tm1_f)
    c = f * c_tm1 + i * self.activation(x_c + h_tm1_c)
    o = self.recurrent_activation(x_o + h_tm1_o)

    h = o * self.activation(c)

    if 0 < self.controller_dropout < 1.:
        controller_r = h * cont_dp_mask[0]
    else:
        controller_r = h

    # calculate the write weights: a sigmoid-gated blend of the read
    # weights and least-used weights, both as left by the previous call
    self.controller_ww = K.sigmoid(self.write_gate) * self.controller_wr + \
        (1 - K.sigmoid(self.write_gate)) * self.controller_wlu

    # calculate read weights (softmax over cosine similarity with the
    # memory rows) and retrieve the appropriate memory
    n_controller_r = K.l2_normalize(controller_r, 1)
    n_memory = K.l2_normalize(self.memory, 1)
    t_n_memory = K.transpose(n_memory)
    mem_cos_similarity = K.dot(n_controller_r, t_n_memory)
    self.controller_wr = K.softmax(mem_cos_similarity)
    r = K.dot(self.controller_wr, self.memory)
    self.reads += 1

    # calculate the usage weights: decayed previous usage plus the
    # current read and write weights
    self.controller_wu = self.usage_decay * self.controller_wu + \
        self.controller_wr + self.controller_ww

    # calculate the least used weights
    # NOTE: `i` is rebound here from the input gate to the top_k indices;
    # the gate is no longer needed at this point.
    v, i = tf.nn.top_k(self.controller_wu, self.controller_wu.shape[1])
    n = min(self.reads, self.memory.shape[1])
    nth_smallest = K.reshape(v[:, -n], (self.batch_size, 1))
    smallest_index = tf.reduce_min(i[:, -1])
    # Repeat the per-sample threshold across memory.shape[0] columns.
    nth_smallest = tf.matmul(
        nth_smallest, tf.constant(1., shape=(1, self.memory.shape[0])))
    lt = tf.less_equal(self.controller_wu, nth_smallest)
    self.controller_wlu = tf.cast(lt, tf.float32)

    # zero the least used memory location
    # note this is not correct right now: smallest_index is the smallest
    # index within the vector of indices of smallest values over the
    # batch, not the index of the smallest value over the batch
    # NOTE(review): `i != smallest_index` compares a Python int with a
    # tensor while building a constant -- verify it behaves as intended.
    zero_array = tf.constant([[1.] if i != smallest_index else [0.]
                              for i in range(self.memory.shape[0])])
    ones_array = tf.ones((1, self.units))
    self.memory = tf.matmul(zero_array, ones_array) * self.memory

    # update the memory
    self.memory = tf.matmul(tf.transpose(self.controller_ww), h) + self.memory

    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h._uses_learning_phase = True
    return r, [h, c, r]
def call(self, inputs, training=None):
    """Apply a transposed 2D convolution with a spectrally normalised kernel.

    The dynamic output shape is inferred from the input shape. The kernel
    is then flattened to 2D, one power-iteration step estimates its
    largest singular value ``sigma``, and the kernel divided by ``sigma``
    is used for the transposed convolution. The normalised tensor is
    stored back on ``self.kernel``.

    Fix over the previous version: the body referenced ``training``, which
    was not a parameter, so every call raised a NameError. It is now an
    optional argument.

    Args:
        inputs: 4D input tensor laid out according to ``self.data_format``.
        training: when it is ``0`` or ``False`` the stored vector
            ``self.u`` is left untouched; otherwise (including the default
            ``None``) it is updated with the new estimate.

    Returns:
        The output tensor, optionally biased and activated.
    """
    input_shape = K.shape(inputs)
    batch_size = input_shape[0]
    if self.data_format == 'channels_first':
        h_axis, w_axis = 2, 3
    else:
        h_axis, w_axis = 1, 2

    height, width = input_shape[h_axis], input_shape[w_axis]
    kernel_h, kernel_w = self.kernel_size
    stride_h, stride_w = self.strides
    if self.output_padding is None:
        out_pad_h = out_pad_w = None
    else:
        out_pad_h, out_pad_w = self.output_padding

    # Infer the dynamic output shape:
    out_height = conv_utils.deconv_length(height,
                                          stride_h, kernel_h,
                                          self.padding,
                                          out_pad_h)
    out_width = conv_utils.deconv_length(width,
                                         stride_w, kernel_w,
                                         self.padding,
                                         out_pad_w)
    if self.data_format == 'channels_first':
        output_shape = (batch_size, self.filters, out_height, out_width)
    else:
        output_shape = (batch_size, out_height, out_width, self.filters)

    # Spectral Normalization
    def _l2normalize(v, eps=1e-12):
        return v / (K.sum(v ** 2) ** 0.5 + eps)

    def power_iteration(W, u):
        # A single iteration per call; `u` persists between calls.
        _u = u
        _v = _l2normalize(K.dot(_u, K.transpose(W)))
        _u = _l2normalize(K.dot(_v, W))
        return _u, _v

    W_shape = self.kernel.shape.as_list()
    # Flatten the kernel to (-1, last_dim)
    W_reshaped = K.reshape(self.kernel, [-1, W_shape[-1]])
    _u, _v = power_iteration(W_reshaped, self.u)
    # sigma = v W u^T
    sigma = K.dot(_v, W_reshaped)
    sigma = K.dot(sigma, K.transpose(_u))
    W_bar = W_reshaped / sigma
    # Restore the kernel shape; outside inference also persist u.
    if training in {0, False}:
        W_bar = K.reshape(W_bar, W_shape)
    else:
        with tf.control_dependencies([self.u.assign(_u)]):
            W_bar = K.reshape(W_bar, W_shape)
    self.kernel = W_bar

    outputs = K.conv2d_transpose(
        inputs,
        self.kernel,
        output_shape,
        self.strides,
        padding=self.padding,
        data_format=self.data_format)

    if self.use_bias:
        outputs = K.bias_add(
            outputs,
            self.bias,
            data_format=self.data_format)

    if self.activation is not None:
        return self.activation(outputs)
    return outputs
def call(self, x):
    """Project the first input, combine it with the second, add bias, activate.

    Args:
        x: sequence of two tensors. ``x[0]`` is multiplied by
            ``self.kernel``; the product is then batch-multiplied with
            ``x[1]``.

    Returns:
        ``self.activation`` applied to the biased result.
    """
    projected = K.dot(x[0], self.kernel)
    combined = K.batch_dot(projected, x[1])
    return self.activation(K.bias_add(combined, self.bias))
def step(self, inputs, states, training=None):
    """Run one recurrent step and blend the result with the previous state.

    A GRU-style update is computed first. The attention score carried in
    the last input column then interpolates between that update and the
    previous state (see https://arxiv.org/pdf/1603.01417.pdf).

    Parameters
    ----------
    inputs : K.Tensor
        Tensor that is split along axis 1 into ``self.units`` feature
        columns followed by a single attention-score column.
    states : list of K.Tensor
        Recurrent states; only ``states[0]`` (the previous hidden state)
        is read.
    training : bool, optional
        When ``None`` and the summed dropout rates are positive, the
        output is tagged with ``_uses_learning_phase``.

    Returns
    -------
    tuple
        ``(h, [h])`` where ``h`` is the attention-modulated output.
    """
    features, attention = array_ops.split(
        inputs, num_or_size_splits=[self.units, 1], axis=1)
    prev_h = states[0]
    input_masks = self._dropout_mask
    recurrent_masks = self._recurrent_dropout_mask
    units = self.units

    if self.implementation == 1:
        # One (optionally masked) copy of the features per gate.
        if 0. < self.dropout < 1.:
            in_z, in_r, in_h = (features * input_masks[k] for k in range(3))
        else:
            in_z = in_r = in_h = features

        x_z = K.dot(in_z, self.kernel_z)
        x_r = K.dot(in_r, self.kernel_r)
        x_h = K.dot(in_h, self.kernel_h)
        if self.use_bias:
            x_z = K.bias_add(x_z, self.bias_z)
            x_r = K.bias_add(x_r, self.bias_r)
            x_h = K.bias_add(x_h, self.bias_h)

        # Same for the previous state.
        if 0. < self.recurrent_dropout < 1.:
            prev_z, prev_r, prev_c = (
                prev_h * recurrent_masks[k] for k in range(3))
        else:
            prev_z = prev_r = prev_c = prev_h

        z = self.recurrent_activation(
            x_z + K.dot(prev_z, self.recurrent_kernel_z))
        r = self.recurrent_activation(
            x_r + K.dot(prev_r, self.recurrent_kernel_r))
        candidate = self.activation(
            x_h + K.dot(r * prev_c, self.recurrent_kernel_h))
    else:
        # Fused path: one matmul for the inputs, one for both gates.
        if 0. < self.dropout < 1.:
            features = features * input_masks[0]
        projected = K.dot(features, self.kernel)
        if self.use_bias:
            projected = K.bias_add(projected, self.bias)

        if 0. < self.recurrent_dropout < 1.:
            # The masked state is also the one used in the blends below.
            prev_h = prev_h * recurrent_masks[0]
        gates_inner = K.dot(prev_h, self.recurrent_kernel[:, :2 * units])

        z = self.recurrent_activation(
            projected[:, :units] + gates_inner[:, :units])
        r = self.recurrent_activation(
            projected[:, units:2 * units] + gates_inner[:, units:2 * units])

        candidate_inner = K.dot(r * prev_h,
                                self.recurrent_kernel[:, 2 * units:])
        candidate = self.activation(projected[:, 2 * units:] + candidate_inner)

    updated = z * prev_h + (1 - z) * candidate
    # Attention modulated output.
    h = attention * updated + (1 - attention) * prev_h

    if 0 < self.dropout + self.recurrent_dropout and training is None:
        h._uses_learning_phase = True
    return h, [h]
def call(self, x):
    """Scale the projection of ``x[1]`` by ``x[0]``, add ``x[1]`` and the bias.

    Parameters
    ----------
    x : sequence of tensors
        ``x[1]`` is multiplied by ``self.kernel`` and also added back
        unchanged; ``x[0]`` scales the projection element-wise.

    Returns
    -------
    tensor
        ``(x[1] . kernel) * x[0] + x[1]`` with ``self.bias`` added.
    """
    scale, carried = x[0], x[1]
    projected = K.dot(carried, self.kernel)
    summed = projected * scale + carried
    return K.bias_add(summed, self.bias)
def step(self, inputs, states):
    """Compute one step of the simple recurrent unit.

    Parameters
    ----------
    inputs : tensor
        Step input. With ``implementation == 0`` it is sliced into column
        blocks of width ``self.units``; in the other modes it is multiplied
        by the layer kernel(s) here.
    states : list
        ``[h_tm1, c_tm1, dp_mask, rec_dp_mask]``. The previous output
        ``h_tm1`` is not used by the recurrence.

    Returns
    -------
    tuple
        ``(h, [h, c])``: the step output and the new states.

    Raises
    ------
    ValueError
        If ``self.implementation`` is not 0, 1 or 2.
    """
    c_tm1 = states[1]  # states[0] (previous output) is not needed
    dp_mask = states[2]
    rec_dp_mask = states[3]
    units = self.units

    if self.implementation == 2:
        z = K.dot(inputs * dp_mask[0], self.kernel)
        z = z * rec_dp_mask[0]

        z0 = z[:, :units]

        # Forget and reset gates share one slice. The bias is optional,
        # but the gate activation is applied in both cases; previously it
        # was skipped when `use_bias` was false.
        gates = z[:, units: units * 3]
        if self.use_bias:
            gates = K.bias_add(gates, self.bias)
        gates = self.recurrent_activation(gates)
        f = gates[:, :units]
        r = gates[:, units: 2 * units]

        c = f * c_tm1 + (1 - f) * z0
        if self.kernel_dim == 4:
            # A fourth kernel block supplies the highway term.
            z3 = z[:, 3 * units: 4 * units]
            h = r * self.activation(c) + (1 - r) * z3
        else:
            h = r * self.activation(c) + (1 - r) * inputs
    else:
        if self.implementation == 0:
            # NOTE(review): no gate activation is applied to x_f / x_r in
            # this mode; presumably the inputs were prepared upstream.
            # Confirm against the caller.
            x_w = inputs[:, :units]
            x_f = inputs[:, units: 2 * units]
            x_r = inputs[:, 2 * units: 3 * units]
            if self.kernel_dim == 4:
                x_w_x = inputs[:, 3 * units: 4 * units]
            else:
                x_w_x = None
        elif self.implementation == 1:
            x_w = K.dot(inputs * dp_mask[0], self.kernel_w)
            x_f = K.dot(inputs * dp_mask[1], self.kernel_f)
            x_r = K.dot(inputs * dp_mask[2], self.kernel_r)
            # Only touch the bias slices when the layer uses a bias,
            # matching the `use_bias` check of implementation 2.
            if self.use_bias:
                x_f = x_f + self.bias_f
                x_r = x_r + self.bias_r

            x_f = self.recurrent_activation(x_f)
            x_r = self.recurrent_activation(x_r)

            if self.kernel_dim == 4:
                x_w_x = K.dot(inputs * dp_mask[0], self.kernel_p)
            else:
                x_w_x = None
        else:
            raise ValueError('Unknown `implementation` mode.')

        w = x_w * rec_dp_mask[0]
        f = x_f
        r = x_r

        c = f * c_tm1 + (1 - f) * w
        if self.kernel_dim == 4:
            h = r * self.activation(c) + (1 - r) * x_w_x
        else:
            h = r * self.activation(c) + (1 - r) * inputs

    if 0 < self.dropout + self.recurrent_dropout:
        h._uses_learning_phase = True
    return h, [h, c]
def step(self, inputs, states):
    """Compute one step of the multiplicative LSTM cell.

    Parameters
    ----------
    inputs : tensor
        Step input. With ``implementation == 0`` it is sliced into column
        blocks of width ``self.units``; otherwise it is multiplied by the
        layer kernel(s) here.
    states : list
        ``[h_tm1, c_tm1, dp_mask, rec_dp_mask]``.

    Returns
    -------
    tuple
        ``(h, [h, c])``: the step output and the new states.

    Raises
    ------
    ValueError
        If ``self.implementation`` is not 0, 1 or 2.
    """
    prev_h, prev_c, in_masks, rec_masks = (
        states[0], states[1], states[2], states[3])
    n = self.units
    mode = self.implementation

    if mode == 2:
        fused = K.dot(inputs * in_masks[0], self.kernel)
        # applies m instead of h_tm1 to z
        fused = fused + fused * K.dot(prev_h * rec_masks[0],
                                      self.recurrent_kernel)
        if self.use_bias:
            fused = K.bias_add(fused, self.bias)

        pre_i = fused[:, :n]
        pre_f = fused[:, n:2 * n]
        pre_c = fused[:, 2 * n:3 * n]
        pre_o = fused[:, 3 * n:4 * n]
        # Fifth block: just elementwise multiplication, no activation
        # functions. It is sliced as in the original but not used below.
        pre_m = fused[:, 4 * n:]

        i = self.recurrent_activation(pre_i)
        f = self.recurrent_activation(pre_f)
        c = f * prev_c + i * self.activation(pre_c)
        o = self.recurrent_activation(pre_o)
    else:
        if mode == 0:
            x_i = inputs[:, :n]
            x_f = inputs[:, n:2 * n]
            x_c = inputs[:, 2 * n:3 * n]
            x_o = inputs[:, 3 * n:4 * n]
            x_m = inputs[:, 4 * n:]
        elif mode == 1:
            x_i = K.dot(inputs * in_masks[0], self.kernel_i) + self.bias_i
            x_f = K.dot(inputs * in_masks[1], self.kernel_f) + self.bias_f
            x_c = K.dot(inputs * in_masks[2], self.kernel_c) + self.bias_c
            x_o = K.dot(inputs * in_masks[3], self.kernel_o) + self.bias_o
            x_m = K.dot(inputs * in_masks[4], self.kernel_m) + self.bias_m
        else:
            raise ValueError('Unknown `implementation` mode.')

        # elementwise multiplication m
        m = x_m * K.dot(prev_h * rec_masks[4], self.recurrent_kernel_m)

        # Every gate reads the multiplicative state `m` under its own mask.
        i = self.recurrent_activation(
            x_i + K.dot(m * rec_masks[0], self.recurrent_kernel_i))
        f = self.recurrent_activation(
            x_f + K.dot(m * rec_masks[1], self.recurrent_kernel_f))
        candidate = self.activation(
            x_c + K.dot(m * rec_masks[2], self.recurrent_kernel_c))
        c = f * prev_c + i * candidate
        o = self.recurrent_activation(
            x_o + K.dot(m * rec_masks[3], self.recurrent_kernel_o))

    h = o * self.activation(c)
    if 0 < self.dropout + self.recurrent_dropout:
        h._uses_learning_phase = True
    return h, [h, c]
def layer_without_activation(dense):
    """Return the pre-activation output of a dense layer.

    Parameters
    ----------
    dense : layer
        Object exposing ``input``, ``kernel``, ``use_bias`` and ``bias``.

    Returns
    -------
    tensor
        ``dense.input . dense.kernel``, plus ``dense.bias`` when
        ``dense.use_bias`` is true.
    """
    pre_activation = K.dot(dense.input, dense.kernel)
    if not dense.use_bias:
        return pre_activation
    return K.bias_add(pre_activation, dense.bias,
                      data_format='channels_last')
def call(self, inputs, states, training=None):
    """Compute one step of the layer-normalised LSTM cell with zoneout.

    Parameters
    ----------
    inputs : tensor
        Input for the current step.
    states : list
        ``[h_tm1, c_tm1]``: previous memory state and previous carry state.
    training : optional
        Forwarded to the dropout-mask generators and to
        ``K.in_train_phase``. When ``None`` and any dropout or zoneout
        rate is positive, the returned output is tagged with
        ``_uses_learning_phase``.

    Returns
    -------
    tuple
        ``(h, [h, c])``: the step output and the new states.

    Raises
    ------
    ValueError
        If ``self.implementation`` is not 1; only that mode is implemented
        here.
    """
    # The masks are created lazily and cached on the instance.
    if 0 < self.dropout < 1 and self._dropout_mask is None:
        self._dropout_mask = _generate_dropout_mask(
            _generate_dropout_ones(inputs, K.shape(inputs)[-1]),
            self.dropout,
            training=training,
            count=4)
    if (0 < self.recurrent_dropout < 1 and
            self._recurrent_dropout_mask is None):
        self._recurrent_dropout_mask = _generate_dropout_mask(
            _generate_dropout_ones(inputs, self.units),
            self.recurrent_dropout,
            training=training,
            count=4)
    if (0 < self.zoneout_c < 1 and
            self._zoneout_mask_c is None):
        self._zoneout_mask_c = _generate_dropout_mask(
            _generate_dropout_ones(inputs, self.units),
            self.zoneout_c,
            training=training,
            count=1)
    if (0 < self.zoneout_h < 1 and
            self._zoneout_mask_h is None):
        self._zoneout_mask_h = _generate_dropout_mask(
            _generate_dropout_ones(inputs, self.units),
            self.zoneout_h,
            training=training,
            count=1)

    # dropout matrices for input units
    dp_mask = self._dropout_mask
    # dropout matrices for recurrent units
    rec_dp_mask = self._recurrent_dropout_mask

    h_tm1 = states[0]  # previous memory state
    c_tm1 = states[1]  # previous carry state

    if self.implementation != 1:
        # Fail with a clear message instead of an unbound-local error.
        raise ValueError('Unknown `implementation` mode.')

    if 0 < self.dropout < 1.:
        inputs_i = inputs * dp_mask[0]
        inputs_f = inputs * dp_mask[1]
        inputs_c = inputs * dp_mask[2]
        inputs_o = inputs * dp_mask[3]
    else:
        inputs_i = inputs
        inputs_f = inputs
        inputs_c = inputs
        inputs_o = inputs
    x_i = K.dot(inputs_i, self.kernel_i)
    x_f = K.dot(inputs_f, self.kernel_f)
    x_c = K.dot(inputs_c, self.kernel_c)
    x_o = K.dot(inputs_o, self.kernel_o)
    if self.use_bias:
        x_i = K.bias_add(x_i, self.bias_i)
        x_f = K.bias_add(x_f, self.bias_f)
        x_c = K.bias_add(x_c, self.bias_c)
        x_o = K.bias_add(x_o, self.bias_o)

    if 0 < self.recurrent_dropout < 1.:
        h_tm1_i = h_tm1 * rec_dp_mask[0]
        h_tm1_f = h_tm1 * rec_dp_mask[1]
        h_tm1_c = h_tm1 * rec_dp_mask[2]
        h_tm1_o = h_tm1 * rec_dp_mask[3]
    else:
        h_tm1_i = h_tm1
        h_tm1_f = h_tm1
        h_tm1_c = h_tm1
        h_tm1_o = h_tm1

    # `self.ln` is applied to every pre-activation and to the carry.
    i = self.recurrent_activation(
        self.ln(x_i + K.dot(h_tm1_i, self.recurrent_kernel_i)))
    f = self.recurrent_activation(
        self.ln(x_f + K.dot(h_tm1_f, self.recurrent_kernel_f)))
    c = f * c_tm1 + i * self.activation(
        self.ln(x_c + K.dot(h_tm1_c, self.recurrent_kernel_c)))
    o = self.recurrent_activation(
        self.ln(x_o + K.dot(h_tm1_o, self.recurrent_kernel_o)))

    h = o * self.activation(self.ln(c))

    # Zoneout: drop part of the state *change* during training, then add
    # the previous state back. `training` is forwarded so an explicit
    # flag is honoured.
    if 0 < self.zoneout_h < 1:
        h = K.in_train_phase(K.dropout(h - h_tm1, self.zoneout_h),
                             h - h_tm1,
                             training=training)
        h = h * (1. - self.zoneout_h) + h_tm1
    if 0 < self.zoneout_c < 1:
        c = K.in_train_phase(K.dropout(c - c_tm1, self.zoneout_c),
                             c - c_tm1,
                             training=training)
        c = c * (1. - self.zoneout_c) + c_tm1

    # Tag the tensor that is actually returned. Tagging before zoneout
    # would lose the flag, because zoneout rebinds `h` to a new tensor.
    if 0 < (self.dropout + self.recurrent_dropout +
            self.zoneout_c + self.zoneout_h):
        if training is None:
            h._uses_learning_phase = True
    return h, [h, c]
def call(self, inputs):
    """Apply the directional patch convolution to ``inputs``.

    Two modes are selected by ``self.sync_mode``:

    * ``None`` or ``'radial_sync'``: a 4-D input is first repeated
      ``self.ndirs`` times along a new axis 3.
    * ``'async'``: a 5-D input is first max-reduced over axis 3, and the
      central contribution is repeated ``self.ndirs`` times along axis 3.

    In both modes the central contribution is ``K.dot`` of the patches
    sampled at ``(self.x, self.y)`` with ``self.center_kernel``. Unless
    ``self.pool`` is set, the circular convolution over directions is
    added to it. Bias, activation and an optional max over axis 2 follow.

    Parameters
    ----------
    inputs : tensor
        Input signal (4-D or 5-D, see above).

    Returns
    -------
    tensor
        The layer output. For any other ``sync_mode`` value ``inputs`` is
        returned unchanged.
    """
    y = inputs

    def _directional_conv(signal):
        # Sample the transported patches and convolve them circularly
        # along the direction axis.
        nbatch = K.shape(signal)[0]
        nchannels = K.shape(signal)[-1]
        patches = bilinear_sampler(signal, self.e_x, self.e_y,
                                   self.nrings, self.ndirs)
        patches = K.reshape(
            patches,
            (nbatch * self.sz_y * self.sz_x, self.nrings, self.ndirs,
             nchannels))
        # Pad along the dirs axis so that a 'valid' conv2d produces a
        # circular convolution along that dimension.
        patches = K.concatenate([patches, patches[:, :, :-1, :]], axis=2)
        out = K.conv2d(patches, self.kernel, strides=(1, 1),
                       padding='valid', data_format='channels_last',
                       dilation_rate=(1, 1))
        out = K.reshape(
            out,
            (nbatch, self.sz_y, self.sz_x, 1, self.ndirs, self.nfilters))
        return tf.squeeze(out, [3])

    # Strings are compared with `==`; `is` on a literal tests identity and
    # only matches when the interpreter happens to intern the string.
    if self.sync_mode is None or self.sync_mode == 'radial_sync':
        if K.ndim(y) == 4:
            y = K.expand_dims(y, axis=3)
            y = K.repeat_elements(y, rep=self.ndirs, axis=3)

        y_ = bilinear_sampler(y, self.x, self.y, self.nrings, self.ndirs)
        # contribution of central vertex
        center = K.dot(y_, self.center_kernel)
    elif self.sync_mode == 'async':
        if K.ndim(y) == 5:
            y = K.max(y, axis=3, keepdims=False)

        y_ = bilinear_sampler(y, self.x, self.y, self.nrings, self.ndirs)
        # contribution of central vertex, repeated for every direction
        center = K.dot(y_, self.center_kernel)
        center = K.expand_dims(center, axis=3)
        center = K.repeat_elements(center, rep=self.ndirs, axis=3)
    else:
        # Unrecognised mode: the input is passed through untouched.
        return y

    if not self.pool:
        y = _directional_conv(y) + center
    else:
        y = center

    if self.use_bias:
        y = K.bias_add(y, self.bias, data_format=None)

    if self.activation is not None:
        y = self.activation(y)

    if self.take_max:
        y = K.max(y, axis=2, keepdims=False)

    return y
def step(self, inputs, states):
    """Compute one GRU step with a label-diagonal recurrent mask.

    In ``implementation == 2`` the recurrent kernel is multiplied by a
    constant mask before use. For each of the three gate blocks the mask
    keeps only the diagonal of the top-left
    ``num_labels x num_labels`` sub-block and leaves every other entry
    untouched.

    Parameters
    ----------
    inputs : tensor
        Step input. With ``implementation == 0`` it is sliced into column
        blocks of width ``self.units``; otherwise it is multiplied by the
        layer kernel(s) here.
    states : list
        ``[h_tm1, dp_mask, rec_dp_mask]``: previous memory, masks applied
        to ``inputs`` and masks applied to ``h_tm1``.

    Returns
    -------
    tuple
        ``(h, [h])``: the step output and the new state.

    Raises
    ------
    ValueError
        If ``self.implementation`` is not 0, 1 or 2.
    """
    h_tm1 = states[0]  # previous memory
    dp_mask = states[1]  # masks multiplied into `inputs`
    rec_dp_mask = states[2]  # masks multiplied into `h_tm1`
    units = self.units

    if self.implementation == 2:
        import numpy as np

        matrix_x = K.dot(inputs * dp_mask[0], self.kernel)
        if self.use_bias:
            matrix_x = K.bias_add(matrix_x, self.bias)

        # Adding the semi-diagonal mask as mentioned in the paper.
        # This ensures that all the non-diagonal elements of the weight
        # matrix corresponding to the labels are set to zero.
        # Tensors do not support slice assignment, so the mask is built
        # once in numpy and applied by multiplication.
        # Assumes `self.recurrent_kernel` has shape (units, 3 * units)
        # and num_labels <= units; TODO confirm.
        num_labels = self.num_labels
        gate_mask = np.ones((units, units), dtype='float32')
        gate_mask[:num_labels, :num_labels] = np.eye(num_labels,
                                                     dtype='float32')
        kernel_mask = K.constant(np.tile(gate_mask, (1, 3)),
                                 dtype='float32')
        recurrent_kernel = self.recurrent_kernel * kernel_mask

        matrix_inner = K.dot(h_tm1 * rec_dp_mask[0],
                             recurrent_kernel[:, :2 * units])

        x_z = matrix_x[:, :units]
        x_r = matrix_x[:, units:2 * units]
        recurrent_z = matrix_inner[:, :units]
        recurrent_r = matrix_inner[:, units:2 * units]

        z = self.recurrent_activation(x_z + recurrent_z)
        r = self.recurrent_activation(x_r + recurrent_r)

        x_h = matrix_x[:, 2 * units:]
        recurrent_h = K.dot(r * h_tm1 * rec_dp_mask[0],
                            recurrent_kernel[:, 2 * units:])
        hh = self.activation(x_h + recurrent_h)
    else:
        # NOTE(review): the label mask is not applied in these modes,
        # exactly as before. Confirm whether it should be.
        if self.implementation == 0:
            x_z = inputs[:, :units]
            x_r = inputs[:, units:2 * units]
            x_h = inputs[:, 2 * units:]
        elif self.implementation == 1:
            x_z = K.dot(inputs * dp_mask[0], self.kernel_z)
            x_r = K.dot(inputs * dp_mask[1], self.kernel_r)
            x_h = K.dot(inputs * dp_mask[2], self.kernel_h)
            if self.use_bias:
                x_z = K.bias_add(x_z, self.bias_z)
                x_r = K.bias_add(x_r, self.bias_r)
                x_h = K.bias_add(x_h, self.bias_h)
        else:
            raise ValueError('Unknown `implementation` mode.')
        z = self.recurrent_activation(
            x_z + K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_z))
        r = self.recurrent_activation(
            x_r + K.dot(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_r))

        hh = self.activation(
            x_h + K.dot(r * h_tm1 * rec_dp_mask[2],
                        self.recurrent_kernel_h))
    h = z * h_tm1 + (1 - z) * hh
    if 0 < self.dropout + self.recurrent_dropout:
        h._uses_learning_phase = True
    return h, [h]
def call(self, x):
    """Apply the affine map ``x . kernel + biais``.

    Parameters
    ----------
    x : tensor
        Input multiplied by ``self.kernel``.

    Returns
    -------
    tensor
        The product with ``self.biais`` added through ``K.bias_add``.
    """
    return K.bias_add(K.dot(x, self.kernel), self.biais)
def call(self, inputs):
    # Stacked sparse attention over an entity graph.
    #
    # `inputs` is indexed at positions 0..6:
    #   0: entity embeddings, 1: relation embeddings,
    #   2: indices of the adjacency matrix,
    #   3 / 4: indices and values of a (triple_size, rel_size) sparse matrix,
    #   5: indices of a (node_size, rel_size) sparse matrix,
    #   6: indices of a (node_size, node_size) sparse matrix.
    # Inputs 2..6 are squeezed on axis 0, so they presumably carry a
    # leading batch axis of size 1; TODO confirm against the caller.
    #
    # Returns [concatenated per-depth features, indices of the last
    # attention matrix, values of the last attention matrix].
    # NOTE(review): `att` is only bound inside the loops, so the return
    # statement needs depth >= 1 and attn_heads >= 1.
    ent_emb = inputs[0]
    rel_emb = inputs[1]
    # Adjacency as a sparse matrix whose stored entries are all one.
    adj = tf.SparseTensor(
        K.cast(K.squeeze(inputs[2], axis=0), dtype="int64"),
        K.ones_like(inputs[2][0, :, 0]),
        (self.node_size, self.node_size))
    sparse_indices = tf.squeeze(inputs[3], axis=0)
    sparse_val = tf.squeeze(inputs[4], axis=0)

    # Relation features per node: softmax-normalised sparse matrix of ones
    # multiplied by the relation embeddings.
    rel_adj = K.cast(K.squeeze(inputs[5], axis=0), dtype="int64")
    rel_adj = tf.SparseTensor(
        indices=rel_adj,
        values=tf.ones_like(rel_adj[:, 0], dtype='float32'),
        dense_shape=(self.node_size, self.rel_size))
    rel_adj = tf.sparse_softmax(rel_adj)
    rel_features = tf.sparse_tensor_dense_matmul(rel_adj, rel_emb)

    # Entity features per node, built the same way from input 6.
    ent_adj = K.cast(K.squeeze(inputs[6], axis=0), dtype="int64")
    ent_adj = tf.SparseTensor(
        indices=ent_adj,
        values=tf.ones_like(ent_adj[:, 0], dtype='float32'),
        dense_shape=(self.node_size, self.node_size))
    ent_adj = tf.sparse_softmax(ent_adj)
    ent_features = tf.sparse_tensor_dense_matmul(ent_adj, ent_emb)

    features = K.concatenate([ent_features, rel_features])
    # The activated initial features are part of the final output.
    outputs = [self.activation(features)]

    for _ in range(self.depth):
        features_list = []
        for head in range(self.attn_heads):
            # Kernel 0 scores the node itself, kernel 1 its neighbours and
            # kernel 2 the relations.
            attention_kernel = self.attn_kernels[head]
            # One relation score per row of the (triple_size, rel_size)
            # matrix, re-attached to the sparsity pattern of `adj`.
            attn_for_rels = tf.SparseTensor(
                indices=sparse_indices,
                values=sparse_val,
                dense_shape=(self.triple_size, self.rel_size))
            attn_for_rels = tf.squeeze(tf.sparse_tensor_dense_matmul(
                attn_for_rels, K.dot(rel_emb, attention_kernel[2])), axis=-1)
            attn_for_rels = tf.SparseTensor(
                indices=adj.indices,
                values=attn_for_rels,
                dense_shape=adj.dense_shape)

            attn_for_self = K.dot(features, attention_kernel[0])
            attn_for_neighs = tf.transpose(
                K.dot(features, attention_kernel[1]), [1, 0])

            # Sum the three score terms on the entries of `adj`.
            att = tf.sparse_add(
                tf.sparse_add(attn_for_rels, adj * attn_for_self),
                adj * attn_for_neighs)
            # Leaky ReLU on the stored values, then a sparse softmax.
            att = tf.SparseTensor(
                indices=att.indices,
                values=tf.nn.leaky_relu(att.values),
                dense_shape=att.dense_shape)
            att = tf.sparse_softmax(att)

            # Attention-weighted aggregation of the current features.
            new_features = tf.sparse_tensor_dense_matmul(att, features)
            if self.use_bias:
                new_features = K.bias_add(new_features, self.biases[head])

            features_list.append(new_features)

        # Merge the heads: concatenate, or average for any other setting.
        if self.attn_heads_reduction == 'concat':
            features = K.concatenate(features_list)
        else:
            features = K.mean(K.stack(features_list), axis=0)

        features = self.activation(features)
        outputs.append(features)

    outputs = K.concatenate(outputs)
    return [outputs, att.indices, att.values]
def call(self, inputs):
    """Apply the geodesic convolution using window interpolation.

    Two modes are selected by ``self.sync_mode``:

    * ``None`` or ``'radial_sync'``: a 3-D signal is first repeated
      ``self.ndirs`` times along a new axis 2, then interpolated with
      ``window_interpolation_sync``.
    * ``'async'``: a 4-D signal is first max-reduced over axis 2, then
      interpolated with ``window_interpolation_async``. The central
      contribution is repeated ``self.ndirs`` times along axis 2.

    In both modes ``gcnn_conv`` is applied to the interpolated signal and
    the central contribution (``K.dot`` with ``self.center_kernel``) is
    added. Bias, activation and an optional max over axis 2 follow.

    Parameters
    ----------
    inputs : sequence
        ``[signal, contributors, weights, angles]``. ``angles`` is only
        used in the synchronised mode but is read in every mode.

    Returns
    -------
    tensor
        The layer output. For any other ``sync_mode`` value the signal
        ``inputs[0]`` is returned unchanged.
    """
    y = inputs[0]
    contributors = inputs[1]
    weights = inputs[2]
    angles = inputs[3]

    # Strings are compared with `==`; `is` on a literal tests identity and
    # only matches when the interpreter happens to intern the string.
    if self.sync_mode is None or self.sync_mode == 'radial_sync':
        if K.ndim(y) == 3:
            y = K.expand_dims(y, axis=2)
            y = K.repeat_elements(y, rep=self.ndirs, axis=2)

        nbatch = K.shape(y)[0]
        nchannels = K.shape(y)[-1]

        # contribution of central vertex
        center = K.dot(y, self.center_kernel)

        # synchronize with sync field
        y = window_interpolation_sync(y, contributors, weights, angles)
    elif self.sync_mode == 'async':
        if K.ndim(y) == 4:
            y = K.max(y, axis=2, keepdims=False)

        nbatch = K.shape(y)[0]
        nchannels = K.shape(y)[-1]

        # contribution of central vertex, repeated for every direction
        center = K.dot(y, self.center_kernel)
        center = K.expand_dims(center, axis=2)
        center = K.repeat_elements(center, rep=self.ndirs, axis=2)

        # pull back the input to the fiber product of the tangent bundle
        # by the frame bundle by the frame transporter
        y = window_interpolation_async(y, contributors, weights)
    else:
        # Unrecognised mode: the signal is passed through untouched.
        return y

    # circular convolution
    y = gcnn_conv(y, self.kernel, nbatch, self.nv, self.nrings, self.ndirs,
                  self.nfilters, nchannels)
    y = y + center

    if self.use_bias:
        y = K.bias_add(y, self.bias, data_format=None)

    if self.activation is not None:
        y = self.activation(y)

    if self.take_max:
        y = K.max(y, axis=2, keepdims=False)

    return y