def _multihead_attention_layer(self, layer_idx, query, memory=None, mask=None):
    if memory is None:
        memory = query

    # Linear project to d_model dimension: [batch, q_size/k_size, d_model]
    Q = ne.fully_conn(query, self.att_weights["W_att_Q_l{}_0".format(layer_idx)],
                      self.att_biases["b_att_Q_l{}_0".format(layer_idx)])
    Q = ne.leaky_relu(Q, self.leaky_ratio[layer_idx])
    K = ne.fully_conn(memory, self.att_weights["W_att_K_l{}_0".format(layer_idx)],
                      self.att_biases["b_att_K_l{}_0".format(layer_idx)])
    K = ne.leaky_relu(K, self.leaky_ratio[layer_idx])
    V = ne.fully_conn(memory, self.att_weights["W_att_V_l{}_0".format(layer_idx)],
                      self.att_biases["b_att_V_l{}_0".format(layer_idx)])
    V = ne.leaky_relu(V, self.leaky_ratio[layer_idx])

    # Split the matrices into multiple heads, then concatenate along the batch
    # axis so each head is processed as part of a larger batch:
    # [h*batch, q_size/k_size, d_model/num_heads]
    Q_split = tf.concat(tf.split(Q, self.num_att_header, axis=2), axis=0)
    K_split = tf.concat(tf.split(K, self.num_att_header, axis=2), axis=0)
    V_split = tf.concat(tf.split(V, self.num_att_header, axis=2), axis=0)
    if mask is not None:
        mask = tf.tile(mask, [self.num_att_header, 1, 1])

    # Apply scaled dot-product attention
    d = self.feature_size // self.num_att_header
    assert d == Q_split.shape[-1] == K_split.shape[-1] == V_split.shape[-1]
    out = tf.matmul(Q_split, tf.transpose(K_split, [0, 2, 1]))  # [h*batch, q_size, k_size]
    out = out / tf.sqrt(tf.cast(d, tf.float32))                 # scaled by sqrt(d_k)
    if mask is not None:
        # masked-out positions (mask == 0.0) are pushed towards -inf before softmax
        out = tf.multiply(out, mask) + (1.0 - mask) * (-1e10)
    out = ne.softmax(out)                                       # [h*batch, q_size, k_size]
    out = ne.dropout(out, self.drop_rate[layer_idx], self.is_training)
    out = tf.matmul(out, V_split)                               # [h*batch, q_size, d_model/num_heads]

    # Merge the heads back to the original shape
    out = tf.concat(tf.split(out, self.num_att_header, axis=0), axis=2)  # [batch, q_size, d_model]
    return out
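# A minimal NumPy sketch of the head-split/merge trick and masked scaled
# dot-product attention used in _multihead_attention_layer above. The learned
# Q/K/V projections, the ne.* wrappers, and dropout are omitted; shapes and
# names (batch, q_size, k_size, d_model, num_heads) follow the comments in the
# method, and the concrete sizes below are made-up assumptions.
import numpy as np

def scaled_dot_attention_sketch(Q, K, V, num_heads, mask=None):
    d_model = Q.shape[-1]
    d = d_model // num_heads
    # [batch, q, d_model] -> [h*batch, q, d_model/h]
    Q_s = np.concatenate(np.split(Q, num_heads, axis=2), axis=0)
    K_s = np.concatenate(np.split(K, num_heads, axis=2), axis=0)
    V_s = np.concatenate(np.split(V, num_heads, axis=2), axis=0)
    scores = Q_s @ K_s.transpose(0, 2, 1) / np.sqrt(d)      # [h*batch, q, k]
    if mask is not None:                                     # mask: [batch, q, k], 1 = attend
        mask = np.tile(mask, (num_heads, 1, 1))
        scores = scores * mask + (1.0 - mask) * (-1e10)      # blocked -> ~ -inf
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)           # softmax over the key axis
    out = weights @ V_s                                      # [h*batch, q, d_model/h]
    # merge heads: [h*batch, q, d_model/h] -> [batch, q, d_model]
    return np.concatenate(np.split(out, num_heads, axis=0), axis=2)

batch, q_size, k_size, d_model, heads = 2, 5, 7, 16, 4
Q = np.random.randn(batch, q_size, d_model)
K = np.random.randn(batch, k_size, d_model)
V = np.random.randn(batch, k_size, d_model)
mask = (np.random.rand(batch, q_size, k_size) > 0.3).astype(np.float32)
out = scaled_dot_attention_sketch(Q, K, V, heads, mask)
assert out.shape == (batch, q_size, d_model)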
def in_layer(self, inputs, W_name="W_in_", b_name="b_in_"):
    net = inputs
    # fc
    h, w, c = net.get_shape().as_list()[1:]
    assert h * w * c == self.in_state
    net = tf.reshape(net, [-1, self.in_state])
    for layer_id in range(len(self.in_fc_states)):
        weight_name = "{}{}".format(W_name, layer_id)
        bias_name = "{}{}".format(b_name, layer_id)
        curr_weight = self.in_weight[weight_name]
        curr_bias = self.in_bias[bias_name]
        # normalization (batch or layer)
        if self.in_norm == "BATCH":
            net = ne.batch_norm(net, self.is_training)
        elif self.in_norm == "LAYER":
            net = ne.layer_norm(net, self.is_training)
        #net = ne.leaky_brelu(net, self.conv_leaky_ratio[layer_id], self.layer_low_bound, self.output_up_bound)
        # Nonlinear act
        net = ne.leaky_relu(net, self.in_leaky_ratio)
        net = ne.fully_conn(net, curr_weight, curr_bias)
    out_channel = self.in_fc_states[-1] // h // w
    assert h * w * out_channel == self.in_fc_states[-1]
    net = tf.reshape(net, [-1, h, w, out_channel])
    net = tf.identity(net, name='in_output')
    return net
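# Shape-bookkeeping sketch for in_layer, in NumPy. The sizes are illustrative
# assumptions; the ne.* norm/act/fully_conn stack is replaced by plain matrix
# products. The point is that the last FC width must be divisible by h*w so the
# flat vector can be folded back into a [batch, h, w, out_channel] feature map,
# which is exactly what the asserts in in_layer check.
import numpy as np

batch, h, w, c = 4, 8, 8, 3
in_state = h * w * c                    # 192, matches the first assert
in_fc_states = [256, 128]               # hypothetical widths; 128 = 8 * 8 * 2
x = np.random.randn(batch, h, w, c)

net = x.reshape(-1, in_state)           # [batch, h*w*c]
for width in in_fc_states:              # stand-in for norm -> leaky_relu -> fully_conn
    net = net @ np.random.randn(net.shape[-1], width)

out_channel = in_fc_states[-1] // (h * w)        # 2
assert h * w * out_channel == in_fc_states[-1]   # same check as in_layer
net = net.reshape(-1, h, w, out_channel)         # [batch, 8, 8, 2]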
def out_layer(self, inputs, label=None, W_name="W_out_", b_name="b_out_"):
    net = inputs
    h, w, c = net.get_shape().as_list()[1:]
    assert h * w * c == self.out_state
    net = tf.reshape(net, [-1, self.out_state])
    if self.use_class_label:
        net = tf.concat([net, label], -1)
    for layer_id in range(len(self.out_fc_states)):
        weight_name = "{}{}".format(W_name, layer_id)
        bias_name = "{}{}".format(b_name, layer_id)
        curr_weight = self.out_weight[weight_name]
        curr_bias = self.out_bias[bias_name]
        net = ne.fully_conn(net, curr_weight, curr_bias)
        # normalization (batch or layer)
        if self.out_norm == "BATCH":
            net = ne.batch_norm(net, self.is_training)
        elif self.out_norm == "LAYER":
            net = ne.layer_norm(net, self.is_training)
        #net = ne.leaky_brelu(net, self.conv_leaky_ratio[layer_id], self.layer_low_bound, self.output_up_bound)
        # Nonlinear act
        net = ne.leaky_relu(net, self.out_leaky_ratio)
    out_channel_size = self.out_fc_states[-1] // h // w
    assert h * w * out_channel_size == self.out_fc_states[-1]
    net = tf.reshape(net, [-1, h, w, out_channel_size])
    #net = ne.max_pool_2x2(net)  # Pooling
    net = tf.identity(net, name='out_output')
    return net
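# Hedged sketch of the optional class conditioning in out_layer: when
# use_class_label is on, the label (here assumed to be one-hot) is concatenated
# to the flattened features, so the first out-FC weight must have
# out_state + num_classes input rows. num_classes and the sizes are
# illustrative assumptions, not values from the model.
import numpy as np

batch, out_state, num_classes = 4, 512, 10
feats = np.random.randn(batch, out_state)
label = np.eye(num_classes)[np.random.randint(0, num_classes, size=batch)]

net = np.concatenate([feats, label], axis=-1)        # [batch, out_state + num_classes]
W0 = np.random.randn(out_state + num_classes, 256)   # first FC weight must match
net = net @ W0                                        # [batch, 256]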
def _func(net, layer_id, postfix="", act_func="leaky"):
    # Closure helper: W_name, b_name and self come from the enclosing scope.
    weight_name = "{}{}{}".format(W_name, layer_id, postfix)
    bias_name = "{}{}{}".format(b_name, layer_id, postfix)
    curr_weight = self.enfc_weights[weight_name]
    curr_bias = self.enfc_biases[bias_name]
    net = ne.fully_conn(net, weights=curr_weight, biases=curr_bias)
    # normalization (batch or layer)
    if self.use_norm == "BATCH":
        net = ne.batch_norm(net, self.is_training, axis=1)
    elif self.use_norm == "LAYER":
        net = ne.layer_norm(net, self.is_training)
    #net = ne.leaky_brelu(net, self.enfc_leaky_ratio[layer_id], self.enfc_low_bound[layer_id], self.enfc_up_bound[layer_id])
    # Nonlinear act
    if act_func == "leaky":
        net = ne.leaky_relu(net, self.enfc_leaky_ratio[layer_id])
    elif act_func == "soft":
        net = tf.nn.softplus(net)
    #net = ne.drop_out(net, self.enfc_drop_rate[layer_id], self.is_training)
    return net
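# Minimal sketch of the two activation branches _func switches between.
# softplus(x) = log(1 + exp(x)) is strictly positive, which is the usual reason
# to expose it alongside leaky ReLU (e.g. for outputs that must be positive,
# such as a scale); whether that is the intent here is an assumption. Pure
# NumPy, no ne.* wrappers.
import numpy as np

def leaky_relu(x, ratio):
    return np.where(x > 0, x, ratio * x)

def softplus(x):
    # numerically stable form of log(1 + exp(x))
    return np.maximum(x, 0.0) + np.log1p(np.exp(-np.abs(x)))

x = np.linspace(-3.0, 3.0, 7)
print(leaky_relu(x, 0.2))   # negative side scaled by the leaky ratio
print(softplus(x))          # strictly positive output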
def _fc_layers(self, inputs, weights_dict, biases_dict, fc_leaky_ratio, fc_drop_rate,
               num_fc, W_name, b_name):
    net = inputs
    for layer_id in range(num_fc):
        weight_name = "{}{}".format(W_name, layer_id)
        bias_name = "{}{}".format(b_name, layer_id)
        curr_weight = weights_dict[weight_name]
        curr_bias = biases_dict[bias_name]
        net = ne.fully_conn(net, weights=curr_weight, biases=curr_bias)
        # batch normalization
        if self.use_norm == "BATCH":
            net = ne.batch_norm(net, self.is_training, axis=-1)
        #net = ne.leaky_brelu(net, self.enfc_leaky_ratio[layer_id], self.enfc_low_bound[layer_id], self.enfc_up_bound[layer_id])
        # Nonlinear act
        net = ne.leaky_relu(net, fc_leaky_ratio[layer_id])
        net = ne.drop_out(net, fc_drop_rate[layer_id], self.is_training)
        #net = ne.elu(net)
    net = tf.identity(net, name='output')
    return net
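# A NumPy sketch of one iteration of the _fc_layers loop: affine map, then
# normalization, leaky ReLU, and inverted dropout at train time. The ne.*
# wrappers are assumed to behave roughly like this; details such as batch_norm's
# learned scale/offset and moving averages are omitted, and the sizes are
# illustrative.
import numpy as np

def fc_block(x, W, b, leaky_ratio, drop_rate, is_training, rng):
    net = x @ W + b                                    # ne.fully_conn
    mu, var = net.mean(0, keepdims=True), net.var(0, keepdims=True)
    net = (net - mu) / np.sqrt(var + 1e-5)             # batch-norm style whitening
    net = np.where(net > 0, net, leaky_ratio * net)    # ne.leaky_relu
    if is_training and drop_rate > 0:                  # ne.drop_out (inverted dropout)
        keep = rng.random(net.shape) >= drop_rate
        net = net * keep / (1.0 - drop_rate)
    return net

rng = np.random.default_rng(0)
x = rng.standard_normal((8, 32))
W, b = rng.standard_normal((32, 64)), np.zeros(64)
y = fc_block(x, W, b, leaky_ratio=0.2, drop_rate=0.1, is_training=True, rng=rng)
assert y.shape == (8, 64)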
def defc_layers(self, inputs, W_name="W_defc", b_name="b_defc"):
    net = inputs
    for layer_id in range(self.num_enfc):  # note: reuses the encoder FC count
        weight_name = "{}{}".format(W_name, layer_id)
        bias_name = "{}{}".format(b_name, layer_id)
        curr_weight = self.defc_weights[weight_name]
        curr_bias = self.defc_biases[bias_name]
        net = ne.fully_conn(net, weights=curr_weight, biases=curr_bias)
        # batch normalization
        if self.use_batch_norm:
            net = ne.batch_norm(net, self.is_training, axis=1)
        #net = ne.leaky_brelu(net, self.defc_leaky_ratio[layer_id], self.layer_low_bound, self.layer_up_bound)
        # Nonlinear act
        net = ne.leaky_relu(net, self.defc_leaky_ratio[layer_id])
        net = ne.drop_out(net, self.defc_drop_rate[layer_id], self.is_training)
        #net = ne.elu(net)
    net = tf.identity(net, name='output')
    net = tf.reshape(net, [-1] + self.decv_in_shape)
    return net
def enfc_layers(self, inputs, W_name="W_enfc", b_name="b_enfc"):
    net = tf.reshape(inputs, [
        -1,
        self.conv_out_shape[0] * self.conv_out_shape[1] * self.conv_out_shape[2]
    ])
    for layer_id in range(self.num_enfc):
        weight_name = "{}{}".format(W_name, layer_id)
        bias_name = "{}{}".format(b_name, layer_id)
        curr_weight = self.enfc_weights[weight_name]
        curr_bias = self.enfc_biases[bias_name]
        net = ne.fully_conn(net, weights=curr_weight, biases=curr_bias)
        # normalization (batch or layer)
        if self.use_norm == "BATCH":
            net = ne.batch_norm(net, self.is_training, axis=1)
        elif self.use_norm == "LAYER":
            net = ne.layer_norm(net, self.is_training)
        #net = ne.leaky_brelu(net, self.enfc_leaky_ratio[layer_id], self.enfc_low_bound[layer_id], self.enfc_up_bound[layer_id])
        # Nonlinear act
        net = ne.leaky_relu(net, self.enfc_leaky_ratio[layer_id])
        net = ne.drop_out(net, self.enfc_drop_rate[layer_id], self.is_training)
        #net = ne.elu(net)
    net = tf.identity(net, name='output')
    return net
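# Hedged shape sketch for the encoder/decoder FC stacks: enfc_layers starts by
# flattening the conv feature map (conv_out_shape), and defc_layers ends by
# folding its output back into decv_in_shape, so the first enfc weight and the
# width of the last defc layer must match those products. The shapes below are
# illustrative assumptions only.
import numpy as np

conv_out_shape = [4, 4, 64]     # hypothetical encoder conv output [h, w, c]
decv_in_shape = [4, 4, 64]      # hypothetical decoder deconv input [h, w, c]

flat_in = int(np.prod(conv_out_shape))        # 1024: input rows of W_enfc0
x = np.random.randn(8, *conv_out_shape)
net = x.reshape(-1, flat_in)                  # mirrors enfc_layers' first reshape

flat_out = int(np.prod(decv_in_shape))        # 1024: width of the last defc layer
z = np.random.randn(8, flat_out)
z = z.reshape([-1] + decv_in_shape)           # mirrors defc_layers' final reshape
assert z.shape == (8, 4, 4, 64)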