def __init__(self, char_domain_size, char_embedding_dim, hidden_dim, embeddings=None):
    self.char_domain_size = char_domain_size
    self.embedding_size = char_embedding_dim
    self.hidden_dim = hidden_dim

    # char embedding input
    self.input_chars = tf.placeholder(tf.int64, [None, None], name="input_chars")

    # padding mask
    # self.input_mask = tf.placeholder(tf.float32, [None, None], name="input_mask")

    self.batch_size = tf.placeholder(tf.int32, None, name="batch_size")
    self.max_seq_len = tf.placeholder(tf.int32, None, name="max_seq_len")
    self.max_tok_len = tf.placeholder(tf.int32, None, name="max_tok_len")
    self.input_dropout_keep_prob = tf.placeholder_with_default(
        1.0, [], name="input_dropout_keep_prob")

    # sequence lengths
    self.sequence_lengths = tf.placeholder(tf.int32, [None, None], name="sequence_lengths")
    self.token_lengths = tf.placeholder(tf.int32, [None, None], name="tok_lengths")

    self.output_size = 2 * self.hidden_dim

    print("LSTM char embedding model")
    print("embedding dim: ", self.embedding_size)
    print("out dim: ", self.output_size)

    # set the pad token to a constant 0 vector
    # self.char_zero_pad = tf.constant(0.0, dtype=tf.float32, shape=[1, self.embedding_size])

    # Embedding layer
    shape = (char_domain_size - 1, self.embedding_size)
    self.char_embeddings = tf_utils.initialize_embeddings(
        shape, name="char_embeddings", pretrained=embeddings)

    self.outputs = self.forward(self.input_chars, self.input_dropout_keep_prob, reuse=False)
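# --- Illustrative sketch (not part of the model) ---
# The embedding tables above are created with domain_size - 1 rows. A plausible
# reading, suggested by the commented-out char_zero_pad here and the *_zero_pad
# constants in the word-level model below, is that a fixed all-zero row is
# reserved for the padding id 0 so only real symbols get trainable vectors.
# This is an assumption about the convention, not tf_utils.initialize_embeddings itself.
import tensorflow as tf

def _padded_embedding_sketch(domain_size=5, dim=4):
    zero_pad = tf.constant(0.0, dtype=tf.float32, shape=[1, dim])
    table = tf.get_variable("demo_char_embeddings", shape=[domain_size - 1, dim])
    full_table = tf.concat(axis=0, values=[zero_pad, table])
    ids = tf.constant([[0, 2, 4]], dtype=tf.int64)   # id 0 maps to the constant zero row
    return tf.nn.embedding_lookup(full_table, ids)   # shape [1, 3, dim]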
def __init__(self, char_domain_size, char_embedding_dim, hidden_dim, filter_width, embeddings=None):
    self.char_domain_size = char_domain_size
    self.embedding_size = char_embedding_dim
    self.hidden_dim = hidden_dim
    self.filter_width = filter_width

    # char embedding input
    self.input_chars = tf.placeholder(tf.int64, [None, None], name="input_chars")

    # padding mask
    # self.input_mask = tf.placeholder(tf.float32, [None, None], name="input_mask")

    self.batch_size = tf.placeholder(tf.int32, None, name="batch_size")
    self.max_seq_len = tf.placeholder(tf.int32, None, name="max_seq_len")
    self.max_tok_len = tf.placeholder(tf.int32, None, name="max_tok_len")
    self.input_dropout_keep_prob = tf.placeholder_with_default(
        1.0, [], name="input_dropout_keep_prob")

    # sequence lengths
    self.sequence_lengths = tf.placeholder(tf.int32, [None, None], name="sequence_lengths")
    self.token_lengths = tf.placeholder(tf.int32, [None, None], name="tok_lengths")

    print("CNN char embedding model:")
    print("embedding dim: ", self.embedding_size)
    print("out dim: ", self.hidden_dim)

    char_embeddings_shape = (self.char_domain_size - 1, self.embedding_size)
    self.char_embeddings = tf_utils.initialize_embeddings(
        char_embeddings_shape, name="char_embeddings", pretrained=embeddings)

    self.outputs = self.forward(self.input_chars, self.input_dropout_keep_prob, reuse=False)
def forward(self, input_x1, input_x2, max_seq_len, hidden_dropout_keep_prob,
            input_dropout_keep_prob, middle_dropout_keep_prob, reuse=True):
    word_embeddings = tf.nn.embedding_lookup(self.w_e, input_x1)
    with tf.variable_scope("forward", reuse=reuse):
        input_list = [word_embeddings]
        input_size = self.embedding_size
        if self.use_characters:
            input_list.append(self.char_embeddings)
            input_size += self.char_size
        if self.use_shape:
            shape_embeddings_shape = (self.shape_domain_size - 1, self.shape_size)
            w_s = tf_utils.initialize_embeddings(shape_embeddings_shape, name="w_s")
            shape_embeddings = tf.nn.embedding_lookup(w_s, input_x2)
            input_list.append(shape_embeddings)
            input_size += self.shape_size

        input_feats = tf.concat(axis=2, values=input_list)
        # self.input_feats_expanded = tf.expand_dims(self.input_feats, 1)
        input_feats_expanded_drop = tf.nn.dropout(input_feats, input_dropout_keep_prob)

        total_output_width = 2 * self.hidden_dim

        with tf.name_scope("bilstm"):
            # selected_col_embeddings = tf.nn.embedding_lookup(token_embeddings, self.token_batch)
            fwd_cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim, state_is_tuple=True)
            bwd_cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim, state_is_tuple=True)
            lstm_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=fwd_cell, cell_bw=bwd_cell, dtype=tf.float32,
                inputs=input_feats_expanded_drop, parallel_iterations=50,
                sequence_length=self.flat_sequence_lengths)
            hidden_outputs = tf.concat(axis=2, values=lstm_outputs)
            h_concat_flat = tf.reshape(hidden_outputs, [-1, total_output_width])

        # Add dropout
        with tf.name_scope("middle_dropout"):
            h_drop = tf.nn.dropout(h_concat_flat, middle_dropout_keep_prob)

        # second projection
        with tf.name_scope("tanh_proj"):
            w_tanh = tf_utils.initialize_weights(
                [total_output_width, self.hidden_dim], "w_tanh", init_type="xavier")
            b_tanh = tf.get_variable(initializer=tf.constant(
                0.01, shape=[self.hidden_dim]), name="b_tanh")
            self.l2_loss += tf.nn.l2_loss(w_tanh)
            self.l2_loss += tf.nn.l2_loss(b_tanh)
            h2_concat_flat = tf.nn.xw_plus_b(h_drop, w_tanh, b_tanh, name="h2_tanh")
            h2_tanh = tf_utils.apply_nonlinearity(h2_concat_flat, self.nonlinearity)

        # Add dropout
        with tf.name_scope("hidden_dropout"):
            h2_drop = tf.nn.dropout(h2_tanh, hidden_dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            w_o = tf_utils.initialize_weights(
                [self.hidden_dim, self.num_classes], "w_o", init_type="xavier")
            b_o = tf.get_variable(initializer=tf.constant(
                0.01, shape=[self.num_classes]), name="b_o")
            self.l2_loss += tf.nn.l2_loss(w_o)
            self.l2_loss += tf.nn.l2_loss(b_o)
            scores = tf.nn.xw_plus_b(h2_drop, w_o, b_o, name="scores")
            unflat_scores = tf.reshape(
                scores, tf.stack([self.batch_size, max_seq_len, self.num_classes]))
    return unflat_scores, hidden_outputs
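# --- Illustrative sketch (not part of the model) ---
# Why total_output_width is 2 * hidden_dim: bidirectional_dynamic_rnn returns one
# hidden_dim-sized output per direction, and forward() above concatenates them on
# the last axis. The shapes here (batch=2, time=5, feat=8, hidden=16) are
# illustrative only.
import numpy as np
import tensorflow as tf

def _bilstm_width_demo():
    inputs = tf.constant(np.random.randn(2, 5, 8), dtype=tf.float32)
    seq_lens = tf.constant([5, 3], dtype=tf.int32)
    fwd = tf.nn.rnn_cell.BasicLSTMCell(16, state_is_tuple=True)
    bwd = tf.nn.rnn_cell.BasicLSTMCell(16, state_is_tuple=True)
    (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=fwd, cell_bw=bwd, inputs=inputs,
        sequence_length=seq_lens, dtype=tf.float32)
    concat = tf.concat(axis=2, values=[out_fw, out_bw])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(tf.shape(concat)))  # -> [2, 5, 32], i.e. 2 * hidden_dim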
def __init__(self, num_classes, vocab_size, shape_domain_size, char_domain_size, char_size,
             embedding_size, shape_size, nonlinearity, viterbi, hidden_dim, char_embeddings,
             embeddings=None):
    self.num_classes = num_classes
    self.shape_domain_size = shape_domain_size
    self.char_domain_size = char_domain_size
    self.char_size = char_size
    self.embedding_size = embedding_size
    self.shape_size = shape_size
    self.hidden_dim = hidden_dim
    self.nonlinearity = nonlinearity
    self.char_embeddings = char_embeddings
    self.viterbi = viterbi

    # word embedding input
    self.input_x1 = tf.placeholder(tf.int64, [None, None], name="input_x1")
    # shape embedding input
    self.input_x2 = tf.placeholder(tf.int64, [None, None], name="input_x2")
    # labels
    self.input_y = tf.placeholder(tf.int64, [None, None], name="input_y")
    # padding mask
    self.input_mask = tf.placeholder(tf.float32, [None, None], name="input_mask")

    self.batch_size = tf.placeholder(tf.int32, None, name="batch_size")
    self.max_seq_len = tf.placeholder(tf.int32, None, name="max_seq_len")

    # sequence lengths
    self.sequence_lengths = tf.placeholder(tf.int32, [None, None], name="sequence_lengths")

    # dropout and l2 penalties
    self.middle_dropout_keep_prob = tf.placeholder_with_default(
        1.0, [], name="middle_dropout_keep_prob")
    self.hidden_dropout_keep_prob = tf.placeholder_with_default(
        1.0, [], name="hidden_dropout_keep_prob")
    self.input_dropout_keep_prob = tf.placeholder_with_default(
        1.0, [], name="input_dropout_keep_prob")
    self.word_dropout_keep_prob = tf.placeholder_with_default(
        1.0, [], name="word_dropout_keep_prob")

    self.l2_penalty = tf.placeholder_with_default(0.0, [], name="l2_penalty")
    self.projection = tf.placeholder_with_default(False, [], name="projection")
    self.drop_penalty = tf.placeholder_with_default(0.0, [], name="drop_penalty")

    # Keeping track of l2 regularization loss (optional)
    self.l2_loss = tf.constant(0.0)

    # set the pad token to a constant 0 vector
    self.word_zero_pad = tf.constant(0.0, dtype=tf.float32, shape=[1, embedding_size])
    self.shape_zero_pad = tf.constant(0.0, dtype=tf.float32, shape=[1, shape_size])
    self.char_zero_pad = tf.constant(0.0, dtype=tf.float32, shape=[1, char_size])

    self.use_characters = char_size != 0
    self.use_shape = shape_size != 0

    if self.viterbi:
        self.transition_params = tf.get_variable(
            "transitions", [num_classes, num_classes])

    # Embedding layer
    # with tf.device('/cpu:0'), tf.name_scope("embedding"):
    word_embeddings_shape = (vocab_size - 1, embedding_size)
    self.w_e = tf_utils.initialize_embeddings(word_embeddings_shape, name="w_e",
                                              pretrained=embeddings)

    nonzero_elements = tf.not_equal(self.sequence_lengths, tf.zeros_like(self.sequence_lengths))
    count_nonzero_per_row = tf.reduce_sum(tf.to_int32(nonzero_elements), axis=1)
    self.flat_sequence_lengths = tf.add(
        tf.reduce_sum(self.sequence_lengths, 1), tf.scalar_mul(2, count_nonzero_per_row))

    self.unflat_scores, self.hidden_layer = self.forward(
        self.input_x1, self.input_x2, self.max_seq_len, self.hidden_dropout_keep_prob,
        self.input_dropout_keep_prob, self.middle_dropout_keep_prob, reuse=False)

    # Calculate mean cross-entropy loss
    with tf.name_scope("loss"):
        labels = tf.cast(self.input_y, 'int32')
        if viterbi:
            log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
                self.unflat_scores, labels, self.flat_sequence_lengths,
                transition_params=self.transition_params)
            # self.transition_params = transition_params
            self.loss = tf.reduce_mean(-log_likelihood)
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.unflat_scores, labels=labels)
            masked_losses = tf.multiply(losses, self.input_mask)
            self.loss = tf.div(tf.reduce_sum(masked_losses), tf.reduce_sum(self.input_mask))
        self.loss += self.l2_penalty * self.l2_loss

        self.unflat_no_dropout_scores, _ = self.forward(
            self.input_x1, self.input_x2, self.max_seq_len, 1.0, 1.0, 1.0)

        drop_loss = tf.nn.l2_loss(
            tf.subtract(self.unflat_scores, self.unflat_no_dropout_scores))
        self.loss += self.drop_penalty * drop_loss

    # Accuracy
    with tf.name_scope("predictions"):
        if viterbi:
            self.predictions = self.unflat_scores
        else:
            self.predictions = tf.argmax(self.unflat_scores, 2)
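# --- Illustrative sketch (toy shapes, not the model's tensors) ---
# The masked cross-entropy used above when viterbi is False: per-token losses are
# zeroed where input_mask is 0 and then averaged over the number of real tokens.
import numpy as np
import tensorflow as tf

def _masked_loss_demo():
    scores = tf.constant(np.random.randn(2, 4, 3), dtype=tf.float32)  # [batch, seq, classes]
    labels = tf.constant([[0, 1, 2, 0], [1, 0, 0, 0]], dtype=tf.int32)
    mask = tf.constant([[1., 1., 1., 1.], [1., 1., 0., 0.]])          # last two tokens are padding
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=scores, labels=labels)
    masked = tf.multiply(losses, mask)
    loss = tf.div(tf.reduce_sum(masked), tf.reduce_sum(mask))         # mean over real tokens only
    with tf.Session() as sess:
        print(sess.run(loss))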
def forward(self, input_x1, input_x2, max_seq_len, hidden_dropout_keep_prob,
            input_dropout_keep_prob, middle_dropout_keep_prob, reuse=True):
    block_unflat_scores = []

    with tf.variable_scope("forward", reuse=reuse):
        word_embeddings = tf.nn.embedding_lookup(self.w_e, input_x1)

        input_list = [word_embeddings]
        input_size = self.embedding_size
        if self.use_characters:
            char_embeddings_masked = tf.multiply(self.char_embeddings,
                                                 tf.expand_dims(self.input_mask, 2))
            input_list.append(char_embeddings_masked)
            input_size += self.char_size
        if self.use_shape:
            shape_embeddings_shape = (self.shape_domain_size - 1, self.shape_size)
            w_s = tf_utils.initialize_embeddings(shape_embeddings_shape, name="w_s")
            shape_embeddings = tf.nn.embedding_lookup(w_s, input_x2)
            input_list.append(shape_embeddings)
            input_size += self.shape_size

        initial_filter_width = self.layers_map[0][1]['width']
        initial_num_filters = self.layers_map[0][1]['filters']
        filter_shape = [1, initial_filter_width, input_size, initial_num_filters]
        initial_layer_name = "conv0"

        if not reuse:
            print(input_list)
            print("Adding initial layer %s: width: %d; filters: %d" % (
                initial_layer_name, initial_filter_width, initial_num_filters))

        input_feats = tf.concat(axis=2, values=input_list)
        input_feats_expanded = tf.expand_dims(input_feats, 1)
        input_feats_expanded_drop = tf.nn.dropout(input_feats_expanded, input_dropout_keep_prob)

        print("input feats expanded drop", input_feats_expanded_drop.get_shape())

        # first projection of embeddings
        w = tf_utils.initialize_weights(filter_shape, initial_layer_name + "_w",
                                        init_type='xavier', gain='relu')
        b = tf.get_variable(initial_layer_name + "_b",
                            initializer=tf.constant(0.01, shape=[initial_num_filters]))
        conv0 = tf.nn.conv2d(input_feats_expanded_drop, w, strides=[1, 1, 1, 1],
                             padding="SAME", name=initial_layer_name)
        h0 = tf_utils.apply_nonlinearity(tf.nn.bias_add(conv0, b), 'relu')

        initial_inputs = [h0]
        last_dims = initial_num_filters

        # Stacked atrous convolutions
        last_output = tf.concat(axis=3, values=initial_inputs)

        for block in range(self.repeats):
            print("last out shape", last_output.get_shape())
            print("last dims", last_dims)
            hidden_outputs = []
            total_output_width = 0
            reuse_block = (block != 0 and self.share_repeats) or reuse
            block_name_suff = "" if self.share_repeats else str(block)
            inner_last_dims = last_dims
            inner_last_output = last_output
            with tf.variable_scope("block" + block_name_suff, reuse=reuse_block):
                for layer_name, layer in self.layers_map:
                    dilation = layer['dilation']
                    filter_width = layer['width']
                    num_filters = layer['filters']
                    initialization = layer['initialization']
                    take_layer = layer['take']
                    if not reuse:
                        print("Adding layer %s: dilation: %d; width: %d; filters: %d; take: %r" % (
                            layer_name, dilation, filter_width, num_filters, take_layer))
                    with tf.name_scope("atrous-conv-%s" % layer_name):
                        # [filter_height, filter_width, in_channels, out_channels]
                        filter_shape = [1, filter_width, inner_last_dims, num_filters]
                        w = tf_utils.initialize_weights(filter_shape, layer_name + "_w",
                                                        init_type=initialization,
                                                        gain=self.nonlinearity,
                                                        divisor=self.num_classes)
                        b = tf.get_variable(layer_name + "_b",
                                            initializer=tf.constant(
                                                0.0 if initialization == "identity" or initialization == "varscale" else 0.001,
                                                shape=[num_filters]))
                        # h = tf_utils.residual_layer(inner_last_output, w, b, dilation, self.nonlinearity,
                        #                             self.batch_norm, layer_name + "_r",
                        #                             self.batch_size, max_seq_len, self.res_activation, self.training) \
                        #     if last_output != input_feats_expanded_drop \
                        #     else tf_utils.residual_layer(inner_last_output, w, b, dilation, self.nonlinearity,
                        #                                  False, layer_name + "_r",
                        #                                  self.batch_size, max_seq_len, 0, self.training)
                        conv = tf.nn.atrous_conv2d(inner_last_output, w, rate=dilation,
                                                   padding="SAME", name=layer_name)
                        conv_b = tf.nn.bias_add(conv, b)
                        h = tf_utils.apply_nonlinearity(conv_b, self.nonlinearity)

                        # so, only apply "take" to last block (may want to change this later)
                        if take_layer:
                            hidden_outputs.append(h)
                            total_output_width += num_filters
                        inner_last_dims = num_filters
                        inner_last_output = h

                h_concat = tf.concat(axis=3, values=hidden_outputs)
                last_output = tf.nn.dropout(h_concat, middle_dropout_keep_prob)
                last_dims = total_output_width

                h_concat_squeeze = tf.squeeze(h_concat, [1])
                h_concat_flat = tf.reshape(h_concat_squeeze, [-1, total_output_width])

                # Add dropout
                with tf.name_scope("hidden_dropout"):
                    h_drop = tf.nn.dropout(h_concat_flat, hidden_dropout_keep_prob)

                def do_projection():
                    # Project raw outputs down
                    with tf.name_scope("projection"):
                        projection_width = int(total_output_width / (2 * len(hidden_outputs)))
                        w_p = tf_utils.initialize_weights([total_output_width, projection_width],
                                                          "w_p", init_type="xavier")
                        b_p = tf.get_variable("b_p",
                                              initializer=tf.constant(0.01, shape=[projection_width]))
                        projected = tf.nn.xw_plus_b(h_drop, w_p, b_p, name="projected")
                        projected_nonlinearity = tf_utils.apply_nonlinearity(projected, self.nonlinearity)
                    return projected_nonlinearity, projection_width

                # only use projection if we wanted to, and only apply middle dropout here if projection
                input_to_pred, proj_width = do_projection() if self.projection else (h_drop, total_output_width)
                input_to_pred_drop = tf.nn.dropout(input_to_pred, middle_dropout_keep_prob) \
                    if self.projection else input_to_pred

                # Final (unnormalized) scores and predictions
                with tf.name_scope("output" + block_name_suff):
                    w_o = tf_utils.initialize_weights([proj_width, self.num_classes], "w_o",
                                                      init_type="xavier")
                    b_o = tf.get_variable("b_o", initializer=tf.constant(0.01, shape=[self.num_classes]))
                    self.l2_loss += tf.nn.l2_loss(w_o)
                    self.l2_loss += tf.nn.l2_loss(b_o)
                    scores = tf.nn.xw_plus_b(input_to_pred_drop, w_o, b_o, name="scores")
                    unflat_scores = tf.reshape(
                        scores, tf.stack([self.batch_size, max_seq_len, self.num_classes]))
                    block_unflat_scores.append(unflat_scores)

    return block_unflat_scores, h_concat_squeeze
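# --- Illustrative sketch (toy shapes, hypothetical filter sizes) ---
# The atrous (dilated) convolution used in each block above: with rate=2 a 1x3
# filter reads tokens i-2, i, i+2, widening the receptive field without adding
# parameters or changing the sequence length.
import numpy as np
import tensorflow as tf

def _atrous_conv_demo():
    # [batch, height=1, width=seq_len, channels], matching input_feats_expanded above
    tokens = tf.constant(np.random.randn(1, 1, 10, 8), dtype=tf.float32)
    filters = tf.constant(np.random.randn(1, 3, 8, 16), dtype=tf.float32)  # [1, width, in, out]
    out = tf.nn.atrous_conv2d(tokens, filters, rate=2, padding="SAME")
    with tf.Session() as sess:
        print(sess.run(tf.shape(out)))  # -> [1, 1, 10, 16]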
def __init__(self, num_classes, vocab_size, shape_domain_size, char_domain_size, char_size,
             embedding_size, shape_size, nonlinearity, layers_map, viterbi, projection, loss,
             margin, repeats, share_repeats, char_embeddings, embeddings=None):
    self.num_classes = num_classes
    self.shape_domain_size = shape_domain_size
    self.char_domain_size = char_domain_size
    self.char_size = char_size
    self.embedding_size = embedding_size
    self.shape_size = shape_size
    self.nonlinearity = nonlinearity
    self.layers_map = layers_map
    self.projection = projection
    self.which_loss = loss
    self.margin = margin
    self.char_embeddings = char_embeddings
    self.repeats = repeats
    self.viterbi = viterbi
    self.share_repeats = share_repeats

    # word embedding input
    self.input_x1 = tf.placeholder(tf.int64, [None, None], name="input_x1")
    # shape embedding input
    self.input_x2 = tf.placeholder(tf.int64, [None, None], name="input_x2")
    # labels
    self.input_y = tf.placeholder(tf.int64, [None, None], name="input_y")
    # padding mask
    self.input_mask = tf.placeholder(tf.float32, [None, None], name="input_mask")

    # dims
    self.batch_size = tf.placeholder(tf.int32, None, name="batch_size")
    self.max_seq_len = tf.placeholder(tf.int32, None, name="max_seq_len")

    # sequence lengths
    self.sequence_lengths = tf.placeholder(tf.int32, [None, None], name="sequence_lengths")

    # dropout and l2 penalties
    self.hidden_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="hidden_dropout_keep_prob")
    self.input_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="input_dropout_keep_prob")
    self.middle_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="middle_dropout_keep_prob")
    self.training = tf.placeholder_with_default(False, [], name="training")

    self.l2_penalty = tf.placeholder_with_default(0.0, [], name="l2_penalty")
    self.drop_penalty = tf.placeholder_with_default(0.0, [], name="drop_penalty")

    self.l2_loss = tf.constant(0.0)

    self.use_characters = char_size != 0
    self.use_shape = shape_size != 0

    self.ones = tf.ones([self.batch_size, self.max_seq_len, self.num_classes])

    if self.viterbi:
        self.transition_params = tf.get_variable("transitions", [num_classes, num_classes])

    word_embeddings_shape = (vocab_size - 1, embedding_size)
    self.w_e = tf_utils.initialize_embeddings(word_embeddings_shape, name="w_e",
                                              pretrained=embeddings, old=False)

    self.block_unflat_scores, self.hidden_layer = self.forward(
        self.input_x1, self.input_x2, self.max_seq_len, self.hidden_dropout_keep_prob,
        self.input_dropout_keep_prob, self.middle_dropout_keep_prob, reuse=False)

    # Calculate mean cross-entropy loss
    with tf.name_scope("loss"):
        self.loss = tf.constant(0.0)
        self.block_unflat_no_dropout_scores, _ = self.forward(
            self.input_x1, self.input_x2, self.max_seq_len, 1.0, 1.0, 1.0)
        labels = tf.cast(self.input_y, 'int32')
        if self.which_loss == "block":
            for unflat_scores, unflat_no_dropout_scores in zip(self.block_unflat_scores,
                                                               self.block_unflat_no_dropout_scores):
                self.loss += self.compute_loss(unflat_scores, unflat_no_dropout_scores, labels)
            self.unflat_scores = self.block_unflat_scores[-1]
        else:
            self.unflat_scores = self.block_unflat_scores[-1]
            self.unflat_no_dropout_scores = self.block_unflat_no_dropout_scores[-1]
            self.loss = self.compute_loss(self.unflat_scores, self.unflat_no_dropout_scores, labels)

    with tf.name_scope("predictions"):
        if viterbi:
            self.predictions = self.unflat_scores
        else:
            self.predictions = tf.argmax(self.unflat_scores, 2)
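# --- Hedged usage sketch (assumptions, not repo code) ---
# How the placeholders defined above might be fed for one training step. The
# `model` object, `train_op`, array arguments, and the keep probabilities are
# hypothetical; the repo's actual training loop and feed construction live elsewhere.
def _train_step_sketch(sess, model, train_op, word_ids, shape_ids, label_ids, pad_mask, seq_lens):
    feed = {
        model.input_x1: word_ids,        # [batch, max_seq_len] word ids
        model.input_x2: shape_ids,       # [batch, max_seq_len] shape ids
        model.input_y: label_ids,        # [batch, max_seq_len] gold labels
        model.input_mask: pad_mask,      # 1.0 for real tokens, 0.0 for padding
        model.batch_size: word_ids.shape[0],
        model.max_seq_len: word_ids.shape[1],
        model.sequence_lengths: seq_lens,
        model.hidden_dropout_keep_prob: 0.75,
        model.input_dropout_keep_prob: 0.75,
        model.middle_dropout_keep_prob: 0.75,
        model.training: True,
    }
    _, loss = sess.run([train_op, model.loss], feed_dict=feed)
    return loss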