def _ffn_layer(inputs, hidden_size, output_size, keep_prob=None,
               data_format="NHWC", dtype=None, scope=None):
    with tf.variable_scope(scope, default_name="ffn_layer", values=[inputs],
                           dtype=dtype):
        with tf.variable_scope("input_layer"):
            hidden = linear(inputs, hidden_size, True, data_format=data_format)
            hidden = tf.nn.relu(hidden)

        if keep_prob and keep_prob < 1.0:
            hidden = tf.nn.dropout(hidden, keep_prob)

        with tf.variable_scope("output_layer"):
            output = linear(hidden, output_size, True, data_format=data_format)

        return output

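# Hedged usage sketch (not part of the original source): it only illustrates
# the call shape of `_ffn_layer` above, assuming TensorFlow 1.x, a
# [batch, length, depth] input tensor, and that the project-local `linear`
# helper used inside `_ffn_layer` is importable. The sizes are placeholders.
import tensorflow as tf

ffn_inputs = tf.placeholder(tf.float32, [None, None, 512])    # [batch, length, depth]
ffn_outputs = _ffn_layer(ffn_inputs, hidden_size=2048, output_size=512,
                         keep_prob=0.9, scope="ffn_demo")      # [batch, length, 512]
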
def inference(self, input_):
    conv1 = layers.conv2d_same(input_, self.num_kernel, name='conv1')
    res_block1 = self.res_block(conv1, self.num_kernel * 2,
                                is_downsizing=False, name='res_block1')
    res_block2 = self.res_block(res_block1, self.num_kernel * 4,
                                is_downsizing=True, name='res_block2')
    res_block3 = self.res_block(res_block2, self.num_kernel * 8,
                                is_downsizing=True, name='res_block3')
    act = self.res_act(res_block3)
    pool = layers.avg_pool(act, k_h=self.pool_kernel, k_w=self.pool_kernel,
                           d_h=1, d_w=1, name='pool')
    flat = layers.flatten(pool, 'flat')
    linear = layers.linear(flat, self.num_class, name='linear')
    return linear

def inference(self, input_):
    conv1 = layers.conv2d_same_repeat(input_, self.kernel_num, num_repeat=2,
                                      name="down1")
    pool1 = layers.max_pool(conv1, name="pool1")
    conv2 = layers.conv2d_same_repeat(pool1, self.kernel_num * 2, num_repeat=2,
                                      name="down2")
    pool2 = layers.max_pool(conv2, name="pool2")
    conv3 = layers.conv2d_same_repeat(pool2, self.kernel_num * 4, num_repeat=3,
                                      name="down3")
    pool3 = layers.max_pool(conv3, name="pool3")
    conv4 = layers.conv2d_same_repeat(pool3, self.kernel_num * 8, num_repeat=3,
                                      name="down4")
    pool4 = layers.max_pool(conv4, name="pool4")
    conv5 = layers.conv2d_same_repeat(pool4, self.kernel_num * 8, num_repeat=3,
                                      name="down5")
    pool5 = layers.max_pool(conv5, name="pool5")
    flat = layers.flatten(pool5, 'flat')
    linear = layers.linear(flat, flat.get_shape().as_list()[-1], name='linear')
    logits = layers.linear(linear, self.num_class, name='logits')
    return logits

def inference(self, input_, reuse=False):
    with tf.variable_scope('ResNet') as scope:
        if reuse:
            scope.reuse_variables()
        conv1 = layers.conv2d_same_act(input_, self.num_kernel, k_h=7, k_w=7,
                                       d_h=2, d_w=2, activation_fn=self.act_fn,
                                       name='conv_1')
        pool1 = layers.max_pool(conv1, k_h=self.pool_kernel,
                                k_w=self.pool_kernel, padding='SAME',
                                name='pool1')
        layer_blocks = self.layer_repeat(pool1, self.layer_def, name='layers')
        pool2 = layers.global_avg_pool(layer_blocks, name='pool2')
        flat = layers.flatten(pool2, 'flat')
        linear = layers.linear(flat, self.num_class, name='linear')
        logit = tf.sigmoid(linear, name='logit')
        return logit

def deepatt_model(features, mode, params):
    hparams = params
    params = copy.copy(hparams)

    # disable dropout in evaluation/inference mode
    if mode != tf.contrib.learn.ModeKeys.TRAIN:
        params.attention_dropout = 0.0
        params.residual_dropout = 0.0
        params.relu_dropout = 0.0

    vocab_size = len(params.vocabulary["inputs"])
    label_size = len(params.vocabulary["targets"])
    hidden_size = params.hidden_size
    feature_size = params.feature_size

    tok_seq = features["inputs"]
    pred_seq = features["preds"]
    mask = tf.to_float(tf.not_equal(tok_seq, 0))

    # shared embedding and softmax weights
    initializer = None

    if mode == tf.contrib.learn.ModeKeys.TRAIN:
        if not params.use_global_initializer:
            initializer = tf.random_normal_initializer(0.0,
                                                       feature_size ** -0.5)

    weights = tf.get_variable("weights", [2, feature_size],
                              initializer=initializer)

    if mode == tf.contrib.learn.ModeKeys.TRAIN:
        if params.embedding is not None:
            initializer = lambda shape, dtype, partition_info: params.embedding
        else:
            initializer = None

    embedding = tf.get_variable("embedding", [vocab_size, feature_size],
                                initializer=initializer,
                                trainable=not params.fix_embedding)
    bias = tf.get_variable("bias", [hidden_size])

    # id => embedding
    # src_seq: [batch, max_src_length]
    # tgt_seq: [batch, max_tgt_length]
    inputs = tf.gather(embedding, tok_seq)

    if mode == tf.contrib.learn.ModeKeys.INFER:
        if features.get("mask") is not None:
            keep_mask = features["mask"][:, :, None]
            unk_emb = features["embedding"]
            inputs = inputs * keep_mask + (1.0 - keep_mask) * unk_emb

    preds = tf.gather(weights, pred_seq)
    inputs = tf.concat([inputs, preds], -1)

    if params.multiply_embedding_mode == "sqrt_depth":
        inputs = inputs * (hidden_size ** 0.5)

    inputs = inputs * tf.expand_dims(mask, -1)

    # preparing encoder & decoder input
    encoder_input = tf.nn.bias_add(inputs, bias)

    if params.pos == "timing":
        encoder_input = ops.attention.add_timing_signal(encoder_input)
    elif params.pos == "embedding":
        initializer = tf.random_normal_initializer(0.0, hidden_size ** -0.5)
        embedding = tf.get_variable("position_embedding", [1000, hidden_size],
                                    initializer=initializer)
        indices = tf.range(tf.shape(features["inputs"])[1])[None, :]
        pos_emb = tf.gather(embedding, indices)
        pos_emb = tf.tile(pos_emb, [tf.shape(features["inputs"])[0], 1, 1])
        encoder_input = encoder_input + pos_emb

    if params.residual_dropout:
        keep_prob = 1.0 - params.residual_dropout
        encoder_input = tf.nn.dropout(encoder_input, keep_prob)

    encoder_output = encoder(encoder_input, mask, params)

    initializer = None

    if mode == tf.contrib.learn.ModeKeys.TRAIN:
        if not params.use_global_initializer:
            initializer = tf.random_normal_initializer(0.0, hidden_size ** -0.5)

    with tf.variable_scope("prediction", initializer=initializer):
        logits = linear(encoder_output, label_size, True, scope="logits")

    if mode == tf.contrib.learn.ModeKeys.INFER:
        outputs = tf.to_int32(tf.argmax(logits, axis=-1))
        return outputs, tf.nn.softmax(logits)

    labels = features["targets"]
    targets = features["targets"]
    logits = tf.reshape(logits, [-1, label_size])
    labels = tf.reshape(labels, [-1])

    # label smoothing
    ce = ops.layers.smoothed_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=labels,
        label_smoothing=params.label_smoothing,
        normalize=True
    )
    ce = tf.reshape(ce, tf.shape(targets))

    cost = tf.reduce_sum(ce * mask) / tf.reduce_sum(mask)

    # greedy decoding
    if mode == tf.contrib.learn.ModeKeys.EVAL:
        outputs = tf.to_int32(tf.argmax(logits, axis=-1))
        return cost, tf.reshape(outputs, tf.shape(targets))

    return cost

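# Hedged reference sketch (not part of the original source): it only collects,
# in one place, the `params` fields and `features` keys that `deepatt_model`
# above reads. `SimpleNamespace` stands in for the project's real hyperparameter
# container, and every value below is a placeholder.
from types import SimpleNamespace

example_params = SimpleNamespace(
    vocabulary={"inputs": ["<pad>", "<unk>"], "targets": ["O"]},
    hidden_size=200, feature_size=100,
    use_global_initializer=False, embedding=None, fix_embedding=False,
    multiply_embedding_mode="sqrt_depth", pos="timing",
    attention_dropout=0.1, residual_dropout=0.1, relu_dropout=0.1,
    label_smoothing=0.1,
)
# features dict keys used above: "inputs", "preds", "targets",
# plus optional "mask" and "embedding" in INFER mode.
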
def _inference(self, input_):
    conv1 = layers.conv2d_same_act(input_, 16, activation_fn=self.activation_fn,
                                   name='conv1')

    skip1 = layers.bottleneck_layer(conv1, 32, name='skip1')
    _, conv2 = layers.conv2d_same_repeat(conv1, 32, num_repeat=2,
                                         activation_fn=self.activation_fn,
                                         with_logit=True, name='conv2')
    res1 = tf.add(skip1, conv2, name='res1')
    res_act1 = self.res_act(res1)

    _, conv3 = layers.conv2d_same_repeat(res_act1, 32, num_repeat=2,
                                         activation_fn=self.activation_fn,
                                         with_logit=True, name='conv3')
    res2 = tf.add(conv3, res1, name='res2')
    res_act2 = self.res_act(res2)

    skip2 = layers.bottleneck_layer(res_act2, 64, d_h=2, d_w=2, name='skip2')
    conv4 = layers.conv2d_same_act(res_act2, 64, d_h=2, d_w=2,
                                   activation_fn=self.activation_fn, name='conv4')
    conv5 = layers.conv2d_same(conv4, 64, name='conv5')
    res3 = tf.add(skip2, conv5, name='res3')
    res_act3 = self.res_act(res3)

    _, conv6 = layers.conv2d_same_repeat(res_act3, 64, num_repeat=2,
                                         activation_fn=self.activation_fn,
                                         with_logit=True, name='conv6')
    res4 = tf.add(res3, conv6, name='res4')
    res_act4 = self.res_act(res4)

    skip3 = layers.bottleneck_layer(res_act4, 128, d_h=2, d_w=2, name='skip3')
    conv7 = layers.conv2d_same_act(res_act4, 128, d_h=2, d_w=2,
                                   activation_fn=self.activation_fn, name='conv7')
    conv8 = layers.conv2d_same(conv7, 128, name='conv8')
    res5 = tf.add(skip3, conv8, name='res5')
    res_act5 = self.res_act(res5)

    _, conv9 = layers.conv2d_same_repeat(res_act5, 128, num_repeat=2,
                                         activation_fn=self.activation_fn,
                                         with_logit=True, name='conv9')
    res6 = tf.add(res5, conv9, name='res6')
    res_act6 = self.res_act(res6)

    pool = layers.avg_pool(res_act6, k_h=8, k_w=8, d_h=1, d_w=1, name='pool')
    flat = layers.flatten(pool, 'flat')
    linear = layers.linear(flat, self.num_class, name='linear')
    return linear

def multi_mask_tensorized_self_attn(rep_tensor, rep_mask, final_mask_ft,
                                    hn, head_num, keep_prob=None, scope=None):
    data_format = "NHWC"

    assert hn % head_num == 0, "hn (%d) must be divisible by the number of " \
                               "attention heads (%d)." % (hn, head_num)
    head_dim = int(hn / head_num)
    bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]

    with tf.variable_scope(scope or 'proposed_self_attention'):
        combined = linear(rep_tensor, 3 * hn, True, True,
                          data_format=data_format, scope="qkv_transform")
        q, k, v = tf.split(combined, 3, 2)  # bs,sl,hn

        q = split_head(q, head_num)
        k = split_head(k, head_num)
        v = split_head(v, head_num)  # num,bs,sl,dim

        with tf.name_scope("dot_product_attention"):
            dot_logits = tf.matmul(q, k, transpose_b=True) * (head_dim ** -0.5)  # num,bs,sl,sl
            e_dot_logits = tf.exp(new_exp_mask(dot_logits, final_mask_ft))  # num,bs,sl,sl

        with tf.variable_scope("s2t_multi_dim_attention"):
            multi_logits_before = linear(rep_tensor, hn, True, True,
                                         data_format=data_format,
                                         scope="multi_logits_before")
            multi_logits = split_head(multi_logits_before, head_num)  # num,bs,sl,dim
            e_multi_logits = tf.exp(new_exp_mask(  # mul,bs,sl,dim
                multi_logits, rep_mask, multi_head=True, high_dim=True))

        with tf.name_scope("hybrid_attn"):
            accum_z_deno = tf.matmul(e_dot_logits, e_multi_logits)  # num,bs,sl,dim
            accum_z_deno = tf.where(  # in case of nan
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno, tf.ones_like(accum_z_deno))

            if keep_prob is not None and keep_prob < 1.0:
                real_keep_prob = keep_prob
                e_multi_logits = tf.nn.dropout(e_multi_logits, real_keep_prob)
                e_dot_logits = tf.nn.dropout(e_dot_logits, real_keep_prob)

            rep_mul_score = new_mask(v, rep_mask, multi_head=True,
                                     high_dim=True) * e_multi_logits
            accum_rep_mul_score = tf.matmul(e_dot_logits, rep_mul_score)
            attn_res = accum_rep_mul_score / accum_z_deno

        with tf.variable_scope("output"):
            attn_output = combine_head(attn_res)  # bs,sl,hn
            final_out = linear(attn_output, hn, True, data_format=data_format,
                               scope="output_transform")
            final_out = new_mask(final_out, rep_mask, high_dim=True)  # bs,sl,hn

    return final_out

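# Hedged usage sketch (not part of the original source): it only illustrates
# the tensor shapes this attention layer expects, assuming TensorFlow 1.x and
# that the project-local helpers (`linear`, `split_head`, `new_mask`,
# `new_exp_mask`, `combine_head`) are importable. The sizes and the exact
# rank/dtype of `final_mask_ft` are placeholders; they depend on the project's
# `new_exp_mask` helper.
import tensorflow as tf

rep = tf.placeholder(tf.float32, [None, None, 256])      # [batch, length, hn]
token_mask = tf.placeholder(tf.bool, [None, None])        # [batch, length]
pair_mask = tf.placeholder(tf.float32, [None, None, None])  # broadcastable to [head, batch, length, length]

attn_out = multi_mask_tensorized_self_attn(rep, token_mask, pair_mask,
                                            hn=256, head_num=4, keep_prob=0.9,
                                            scope="multi_mask_attn_demo")  # [batch, length, 256]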