def block(self, wembed, kernel_sz, num_filt, num_layers, reuse=False):
    dilation_rate = 2
    initialization = 'identity'
    nonlinearity = 'relu'
    input_tensor = wembed
    with tf.variable_scope('iterated-block', reuse=reuse):
        for i in range(0, num_layers):
            # The last layer of the block uses no dilation
            if i == num_layers - 1:
                dilation_rate = 1
            filter_shape = [1, kernel_sz, num_filt, num_filt]
            w = tf_utils.initialize_weights(filter_shape, 'conv-' + str(i) + "_w",
                                            init_type=initialization,
                                            gain=nonlinearity,
                                            divisor=self.num_classes)
            b = tf.get_variable('conv-' + str(i) + "_b",
                                initializer=tf.constant(0.0 if initialization in ("identity", "varscale") else 0.001,
                                                        shape=[num_filt]))
            conv = tf.nn.atrous_conv2d(input_tensor, w, rate=dilation_rate ** i,
                                       padding="SAME", name='conv-' + str(i))
            conv_b = tf.nn.bias_add(conv, b)
            # Use a separate name for the activation tensor so the `nonlinearity`
            # string passed to initialize_weights is not shadowed on later iterations
            h = tf_utils.apply_nonlinearity(conv_b, nonlinearity)
            # Residual connection
            input_tensor = h + input_tensor
            tf.summary.histogram('conv-' + str(i), input_tensor)
            # input_tensor = tf.nn.relu(input_tensor, name="relu-"+str(i))
    return input_tensor
def forward(self, hidden_keep, input_keep, middle_keep, reuse=True):
    """
    Build the actual graph. Returns (intermediate_probs, probs); technically
    probs is just the last layer of the intermediate probs.
    """
    block_unflat_scores = []

    with tf.variable_scope("forward", reuse=reuse):
        # Word embeddings
        with tf.control_dependencies([self.we0]):
            wembed = tf.nn.embedding_lookup(self.Ww, self.x, name="embeddings")

        # Character-composed word embeddings
        with tf.control_dependencies([self.ce0]):
            xch_seq = tensorToSeq(self.xch)
            cembed_seq = []
            for i, xch_i in enumerate(xch_seq):
                cembed_seq.append(shared_char_word(self.Wc, xch_i, self.filtsz, self.char_dsz, self.wsz,
                                                   None if (i == 0 and not reuse) else True))
            word_char = seqToTensor(cembed_seq)

        input_feats = tf.concat([wembed, word_char], 2)
        input_feats_expanded = tf.expand_dims(input_feats, 1)
        input_feats_expanded_drop = tf.nn.dropout(input_feats_expanded, self.input_dropout_keep_prob)

        # First projection of embeddings
        filter_shape = [1, self.kernel_size, input_feats.get_shape()[2], self.num_filt]
        w = tf_utils.initialize_weights(filter_shape, "conv_start" + "_w", init_type='xavier', gain='relu')
        b = tf.get_variable("conv_start" + "_b", initializer=tf.constant(0.01, shape=[self.num_filt]))
        conv0 = tf.nn.conv2d(input_feats_expanded_drop, w, strides=[1, 1, 1, 1], padding="SAME", name="conv_start")
        h0 = tf_utils.apply_nonlinearity(tf.nn.bias_add(conv0, b), 'relu')

        initial_inputs = [h0]
        last_dims = self.num_filt
        self.share_repeats = True
        self.projection = False

        # Stacked atrous convolutions
        last_output = tf.concat(axis=3, values=initial_inputs)

        for iteration in range(self.num_iterations):
            hidden_outputs = []
            total_output_width = self.num_filt
            reuse_block = (iteration != 0)
            block_name_suff = "" if self.share_repeats else str(iteration)
            inner_last_dims = last_dims
            inner_last_output = last_output
            with tf.variable_scope("block" + block_name_suff, reuse=reuse_block):
                block_output = self.block(inner_last_output, self.kernel_size, self.num_filt,
                                          self.num_layers, reuse=reuse_block)

            # Legacy Strubell logic: we only grab the last layer of the block here, always.
            h_concat = tf.concat(axis=3, values=[block_output])
            last_output = tf.nn.dropout(h_concat, self.middle_dropout_keep_prob)
            last_dims = total_output_width

            h_concat_squeeze = tf.squeeze(h_concat, [1])
            h_concat_flat = tf.reshape(h_concat_squeeze, [-1, total_output_width])

            # Add dropout
            with tf.name_scope("hidden_dropout"):
                h_drop = tf.nn.dropout(h_concat_flat, self.hidden_dropout_keep_prob)

            def do_projection():
                # Project raw outputs down
                with tf.name_scope("projection"):
                    projection_width = int(total_output_width / (2 * len(hidden_outputs)))
                    w_p = tf_utils.initialize_weights([total_output_width, projection_width], "w_p", init_type="xavier")
                    b_p = tf.get_variable("b_p", initializer=tf.constant(0.01, shape=[projection_width]))
                    projected = tf.nn.xw_plus_b(h_drop, w_p, b_p, name="projected")
                    projected_nonlinearity = tf_utils.apply_nonlinearity(projected, self.nonlinearity)
                return projected_nonlinearity, projection_width

            # Only project if we wanted to, and only apply middle dropout here if projecting
            input_to_pred, proj_width = do_projection() if self.projection else (h_drop, total_output_width)
            input_to_pred_drop = tf.nn.dropout(input_to_pred, self.middle_dropout_keep_prob) if self.projection else input_to_pred

            # Final (unnormalized) scores and predictions
            with tf.name_scope("output" + block_name_suff):
                w_o = tf_utils.initialize_weights([proj_width, self.num_classes], "w_o", init_type="xavier")
                b_o = tf.get_variable("b_o", initializer=tf.constant(0.01, shape=[self.num_classes]))
                self.l2_loss += tf.nn.l2_loss(w_o)
                self.l2_loss += tf.nn.l2_loss(b_o)
                scores = tf.nn.xw_plus_b(input_to_pred_drop, w_o, b_o, name="scores")
                unflat_scores = tf.reshape(scores, tf.stack([-1, self.mxlen, self.num_classes]))
                block_unflat_scores.append(unflat_scores)

    # probs = unflat_scores
    # best = tf.argmax(self.probs, 2)
    # intermediate_probs = tf.stack(block_unflat_scores, -1)
    return block_unflat_scores, unflat_scores