Example #1
    def __init__(self, config, word_ix_to_pat_ixs, need_reuse=False):
        # get hyperparameters
        batch_size = config.batch_size
        num_steps = config.num_steps
        max_word_len = config.max_word_len
        pat_emb_dim = config.pat_emb_dim
        highway_size = config.highway_size
        init_scale = config.init_scale
        num_sampled = config.num_sampled
        pat_vocab_size = config.pat_vocab_size
        hidden_size = config.hidden_size
        num_layers = config.num_layers
        word_vocab_size = config.word_vocab_size
        drop_x = config.drop_x
        drop_i = config.drop_i
        drop_h = config.drop_h
        drop_o = config.drop_o
        weight_decay = config.weight_decay

        # pattern embedding matrix
        with tf.variable_scope('pat_emb', reuse=need_reuse):
            self.pat_embedding = tf.get_variable(
                "pat_embedding", [pat_vocab_size, pat_emb_dim],
                dtype=tf.float32,
                initializer=tf.random_uniform_initializer(
                    -init_scale, init_scale))

        # placeholders for training data and labels
        self.x = tf.placeholder(tf.int32,
                                [batch_size, num_steps, max_word_len])
        self.y = tf.placeholder(tf.int32, [batch_size, num_steps])
        y_float = tf.cast(self.y, tf.float32)

        # we first embed patterns ...
        words_embedded = tf.nn.embedding_lookup(self.pat_embedding, self.x)
        words_embedded = tf.reshape(words_embedded,
                                    [-1, max_word_len, pat_emb_dim])
        # ... and then sum pattern vectors to get a word vector
        words_embedded_sum = tf.reduce_sum(words_embedded, axis=1)

        # we feed the word vector into a stack of two HW layers ...
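        # each highway (HW) layer computes
        #     t = sigmoid(x W_T + b_T)                  (transform gate)
        #     y = t * relu(x W_H + b_H) + (1 - t) * x
        # with b_T initialized near -2 so that the gate starts mostly closed
        # and the layer initially passes its input through almost unchanged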
        def highway_layer(highway_inputs):
            transf_weights = tf.get_variable(
                'transf_weights', [highway_size, highway_size],
                initializer=tf.random_uniform_initializer(
                    -init_scale, init_scale),
                dtype=tf.float32)
            transf_biases = tf.get_variable(
                'transf_biases', [highway_size],
                initializer=tf.random_uniform_initializer(
                    -2 - 0.01, -2 + 0.01),
                dtype=tf.float32)
            highw_weights = tf.get_variable(
                'highw_weights', [highway_size, highway_size],
                initializer=tf.random_uniform_initializer(
                    -init_scale, init_scale),
                dtype=tf.float32)
            highw_biases = tf.get_variable(
                'highw_biases', [highway_size],
                initializer=tf.random_uniform_initializer(
                    -init_scale, init_scale),
                dtype=tf.float32)
            transf_gate = tf.nn.sigmoid(
                tf.matmul(highway_inputs, transf_weights) + transf_biases)
            carry_gate = tf.ones([highway_size], dtype=tf.float32) - transf_gate
            transform = tf.nn.relu(
                tf.matmul(highway_inputs, highw_weights) + highw_biases)
            highw_output = transf_gate * transform + carry_gate * highway_inputs
            return highw_output, transf_gate

        with tf.variable_scope('highway1', reuse=need_reuse):
            highw1_output, self.t1 = highway_layer(words_embedded_sum)

        with tf.variable_scope('highway2', reuse=need_reuse):
            highw2_output, self.t2 = highway_layer(highw1_output)

        highw_output_reshaped = tf.reshape(highw2_output,
                                           [batch_size, num_steps, -1])
        if not need_reuse:
            highw_output_reshaped = tf.nn.dropout(highw_output_reshaped,
                                                  1 - drop_x,
                                                  [batch_size, num_steps, 1])

        # ... and then process it with a stack of two LSTMs
        lstm_input = tf.unstack(highw_output_reshaped, axis=1)

        # basic LSTM cell
        def lstm_cell():
            return tf.contrib.rnn.LSTMCell(hidden_size,
                                           forget_bias=1.0,
                                           reuse=need_reuse)

        cells = []
        for i in range(num_layers):
            with tf.variable_scope('layer' + str(i)):
                if not need_reuse:
                    if i == 0:
                        cells.append(
                            my_dropout.MyDropoutWrapper(
                                lstm_cell(),
                                input_keep_prob=1 - drop_i,
                                state_keep_prob=1 - drop_h,
                                output_keep_prob=1 - drop_o,
                                variational_recurrent=True,
                                input_size=highway_size,
                                dtype=tf.float32))
                    else:
                        cells.append(
                            my_dropout.MyDropoutWrapper(
                                lstm_cell(),
                                state_keep_prob=1 - drop_h,
                                output_keep_prob=1 - drop_o,
                                variational_recurrent=True,
                                input_size=hidden_size,
                                dtype=tf.float32))
                else:
                    cells.append(lstm_cell())
        self.cell = tf.contrib.rnn.MultiRNNCell(cells)

        self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)
        with tf.variable_scope('lstm_rnn', reuse=need_reuse):
            outputs, self.state = tf.contrib.rnn.static_rnn(
                self.cell,
                lstm_input,
                dtype=tf.float32,
                initial_state=self.init_state)
        output = tf.reshape(tf.concat(axis=1, values=outputs),
                            [-1, hidden_size])

        # finally we predict the next word according to a softmax normalization
        with tf.variable_scope('softmax_params', reuse=need_reuse):
            weights = tf.get_variable(
                'weights', [word_vocab_size, hidden_size],
                initializer=tf.random_uniform_initializer(
                    -init_scale, init_scale),
                dtype=tf.float32)
            biases = tf.get_variable('biases', [word_vocab_size],
                                     initializer=tf.random_uniform_initializer(
                                         -init_scale, init_scale),
                                     dtype=tf.float32)

        # and compute the cross-entropy between labels and predictions
        logits = tf.matmul(output, tf.transpose(weights)) + biases
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(self.y, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
        self.cost = tf.reduce_sum(loss) / batch_size

        if not need_reuse:
            tvars = tf.trainable_variables()
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v)
                for v in tvars if 'bias' not in v.name and 'Bias' not in v.name
            ]) * weight_decay
            self.full_cost = self.cost + l2_loss
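
A minimal sketch of how a graph like the one above might be driven at training time (TF 1.x graph mode); `model`, `x_batch`, `y_batch`, and the optimizer choice are illustrative assumptions, not part of the example:

    # hypothetical training step for the model defined above
    train_op = tf.train.AdamOptimizer(1e-3).minimize(model.full_cost)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        state = sess.run(model.init_state)
        feed = {model.x: x_batch, model.y: y_batch}
        # carry the LSTM state over from the previous batch
        for (c, h), (c_val, h_val) in zip(model.init_state, state):
            feed[c], feed[h] = c_val, h_val
        cost, state, _ = sess.run([model.cost, model.state, train_op], feed)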
Example #2
    def __init__(self, config, word_ix_to_morph_ixs, need_reuse=False):
        # get hyperparameters
        batch_size = config.batch_size
        num_steps = config.num_steps
        self.max_word_len = max_word_len = config.max_word_len
        self.morph_emb_dim = morph_emb_dim = config.morph_emb_dim
        self.highway_size = highway_size = config.highway_size
        self.init_scale = init_scale = config.init_scale
        num_sampled = config.num_sampled
        morph_vocab_size = config.morph_vocab_size
        hidden_size = config.hidden_size
        num_layers = config.num_layers
        word_vocab_size = config.word_vocab_size
        drop_x = config.drop_x
        drop_i = config.drop_i
        drop_h = config.drop_h
        drop_o = config.drop_o

        # morpheme embedding matrix
        with tf.variable_scope('morph_emb', reuse=need_reuse):
            self.morph_embedding = tf.get_variable(
                "morph_embedding", [morph_vocab_size, morph_emb_dim],
                dtype=tf.float32)

        # placeholders for training data and labels
        self.x = tf.placeholder(tf.int32,
                                [batch_size, num_steps, max_word_len])
        self.y = tf.placeholder(tf.int32, [batch_size, num_steps])
        y_float = tf.cast(self.y, tf.float32)

        # we first embed morphemes ...
        words_embedded = tf.nn.embedding_lookup(self.morph_embedding, self.x)
        words_emb_as_list = tf.unstack(words_embedded, axis=1)

        words_list = []
        for word_emb in words_emb_as_list:
            # ... and then sum morpheme embeddings to get a word vector
            morph_sum = tf.reduce_sum(word_emb, axis=1)
            words_list.append(morph_sum)

        words_packed_reshaped = tf.reshape(tf.stack(words_list, axis=1),
                                           [-1, morph_emb_dim])

        # we project word vectors to match the dimensionality of
        # the highway layer
        if morph_emb_dim != highway_size:
            with tf.variable_scope('projection', reuse=need_reuse):
                proj_w = tf.get_variable('proj_w',
                                         [morph_emb_dim, highway_size],
                                         dtype=tf.float32)
            words_packed_reshaped_proj = tf.matmul(words_packed_reshaped,
                                                   proj_w)
        else:
            words_packed_reshaped_proj = words_packed_reshaped

        # we feed the word vector into a stack of two HW layers ...
        with tf.variable_scope('highway1', reuse=need_reuse):
            highw1_output = self.highway_layer(words_packed_reshaped_proj)

        with tf.variable_scope('highway2', reuse=need_reuse):
            highw2_output = self.highway_layer(highw1_output)

        highw_output_reshaped = tf.reshape(highw2_output,
                                           [batch_size, num_steps, -1])
        if not need_reuse:
            highw_output_reshaped = tf.nn.dropout(highw_output_reshaped,
                                                  1 - drop_x,
                                                  [batch_size, num_steps, 1])

        # ... and then process it with a stack of two LSTMs
        lstm_input = tf.unstack(highw_output_reshaped, axis=1)

        # basic LSTM cell
        def lstm_cell():
            return tf.nn.rnn_cell.LSTMCell(hidden_size,
                                           forget_bias=1.0,
                                           reuse=need_reuse)

        cells = []
        for i in range(num_layers):
            with tf.variable_scope('layer' + str(i)):
                if not need_reuse:
                    if i == 0:
                        cells.append(
                            my_dropout.MyDropoutWrapper(
                                lstm_cell(),
                                input_keep_prob=1 - drop_i,
                                state_keep_prob=1 - drop_h,
                                output_keep_prob=1 - drop_o,
                                variational_recurrent=True,
                                input_size=highway_size,
                                dtype=tf.float32))
                    else:
                        cells.append(
                            my_dropout.MyDropoutWrapper(
                                lstm_cell(),
                                state_keep_prob=1 - drop_h,
                                output_keep_prob=1 - drop_o,
                                variational_recurrent=True,
                                input_size=hidden_size,
                                dtype=tf.float32))
                else:
                    cells.append(lstm_cell())
        self.cell = tf.nn.rnn_cell.MultiRNNCell(cells)

        self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)
        with tf.variable_scope('lstm_rnn', reuse=need_reuse):
            outputs, self.state = tf.contrib.rnn.static_rnn(
                self.cell,
                lstm_input,
                dtype=tf.float32,
                initial_state=self.init_state)
        output = tf.reshape(tf.concat(axis=1, values=outputs),
                            [-1, hidden_size])

        # finally we predict the next word according to a softmax normalization
        if config.reuse_emb:
            self.morph_embedding_out = self.morph_embedding
        with tf.variable_scope('softmax_params', reuse=need_reuse):
            if morph_emb_dim != highway_size:
                proj_w_out = tf.get_variable('proj_w_out',
                                             [morph_emb_dim, highway_size],
                                             dtype=tf.float32)
            if highway_size != hidden_size:
                proj2_w_out = tf.get_variable('proj2_w_out',
                                              [highway_size, hidden_size],
                                              dtype=tf.float32)
            if not config.reuse_emb:
                self.morph_embedding_out = tf.get_variable(
                    "morph_embedding_out", [morph_vocab_size, morph_emb_dim],
                    dtype=tf.float32)
            biases = tf.get_variable('biases', [word_vocab_size],
                                     dtype=tf.float32)

        if config.ssm == 1 and not need_reuse:

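            # this branch re-implements TensorFlow's sampled-softmax logic
            # (negative sampling + true/sampled logits) so that the output-side
            # word vectors can be composed from morpheme embeddings on the fly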
            def _sum_rows(x):
                """Returns a vector summing up each row of the matrix x."""
                # _sum_rows(x) is equivalent to math_ops.reduce_sum(x, 1) when x is
                # a matrix.  The gradient of _sum_rows(x) is more efficient than
                # reduce_sum(x, 1)'s gradient in today's implementation. Therefore,
                # we use _sum_rows(x) in the nce_loss() computation since the loss
                # is mostly used for training.
                cols = tf.shape(x)[1]
                ones_shape = tf.stack([cols, 1])
                ones = tf.ones(ones_shape, x.dtype)
                return tf.reshape(tf.matmul(x, ones), [-1])

            labels = tf.cast(tf.reshape(self.y, [-1, 1]), tf.int64)
            labels_flat = tf.reshape(labels, [-1])

            # Sample the negative labels.
            #   sampled shape: [num_sampled] tensor
            #   true_expected_count shape = [batch_size, 1] tensor
            #   sampled_expected_count shape = [num_sampled] tensor
            sampled_values = tf.nn.log_uniform_candidate_sampler(
                true_classes=labels,
                num_true=1,
                num_sampled=num_sampled,
                unique=True,
                range_max=word_vocab_size)
            sampled, true_expected_count, sampled_expected_count = (
                tf.stop_gradient(s) for s in sampled_values)
            sampled = tf.cast(sampled, tf.int64)

            # labels_flat is a [batch_size * num_steps] tensor
            # sampled is a [num_sampled] int tensor
            all_ids = tf.concat([labels_flat, sampled], 0)

            words_in_morphs = tf.nn.embedding_lookup(word_ix_to_morph_ixs,
                                                     all_ids)
            words_in_morphs_embedded = tf.nn.embedding_lookup(
                self.morph_embedding_out, words_in_morphs)

            all_w_full = tf.reduce_sum(words_in_morphs_embedded, axis=1)
            if morph_emb_dim != highway_size:
                all_w_proj = tf.matmul(all_w_full, proj_w_out)
            else:
                all_w_proj = all_w_full
            with tf.variable_scope(
                    'highway1' if config.reuse_hw1 else 'highway1_out',
                    reuse=config.reuse_hw1 or need_reuse):
                all_w_hw1_output = self.highway_layer(all_w_proj)
            with tf.variable_scope(
                    'highway2' if config.reuse_hw2 else 'highway2_out',
                    reuse=config.reuse_hw2 or need_reuse):
                all_w_hw2_output = self.highway_layer(all_w_hw1_output)
            if highway_size != hidden_size:
                all_w = tf.matmul(all_w_hw2_output, proj2_w_out)
            else:
                all_w = all_w_hw2_output

            # true_w shape is [batch_size * num_true, dim]
            true_w = tf.slice(all_w, [0, 0],
                              tf.stack([tf.shape(labels_flat)[0], -1]))
            sampled_w = tf.slice(all_w, tf.stack([tf.shape(labels_flat)[0],
                                                  0]), [-1, -1])

            sampled_logits = tf.matmul(output, sampled_w, transpose_b=True)

            all_b = tf.nn.embedding_lookup(biases, all_ids)
            # true_b is a [batch_size * num_true] tensor
            # sampled_b is a [num_sampled] float tensor
            true_b = tf.slice(all_b, [0], tf.shape(labels_flat))
            sampled_b = tf.slice(all_b, tf.shape(labels_flat), [-1])

            dim = tf.shape(true_w)[1:2]
            new_true_w_shape = tf.concat([[-1, 1], dim], 0)
            row_wise_dots = tf.multiply(tf.expand_dims(output, 1),
                                        tf.reshape(true_w, new_true_w_shape))
            # We want the row-wise dot plus biases which yields a
            # [batch_size, num_true] tensor of true_logits.
            dots_as_matrix = tf.reshape(row_wise_dots, tf.concat([[-1], dim],
                                                                 0))
            true_logits = tf.reshape(_sum_rows(dots_as_matrix), [-1, 1])
            true_b = tf.reshape(true_b, [-1, 1])
            true_logits += true_b
            sampled_logits += sampled_b

            true_logits -= tf.log(true_expected_count)
            sampled_logits -= tf.log(sampled_expected_count)

            # Construct output logits and labels. The true labels/logits start at col 0.
            out_logits = tf.concat([true_logits, sampled_logits], 1)
            # true_logits is a float tensor and ones_like(true_logits) is a float
            # tensor of ones; since num_true is 1 here, the per-example labels
            # already sum to 1.0 and form a proper probability distribution.
            out_labels = tf.concat(
                [tf.ones_like(true_logits),
                 tf.zeros_like(sampled_logits)], 1)
            loss = tf.nn.softmax_cross_entropy_with_logits(labels=out_labels,
                                                           logits=out_logits)

        else:
            words_in_morphs_embedded = tf.nn.embedding_lookup(
                self.morph_embedding_out, word_ix_to_morph_ixs)
            weights_full = tf.reduce_sum(words_in_morphs_embedded, axis=1)
            if morph_emb_dim != highway_size:
                weights_proj = tf.matmul(weights_full, proj_w_out)
            else:
                weights_proj = weights_full
            with tf.variable_scope(
                    'highway1' if config.reuse_hw1 else 'highway1_out',
                    reuse=config.reuse_hw1 or need_reuse):
                weights_highw1_output = self.highway_layer(weights_proj)
            with tf.variable_scope(
                    'highway2' if config.reuse_hw2 else 'highway2_out',
                    reuse=config.reuse_hw2 or need_reuse):
                weights_highw2_output = self.highway_layer(
                    weights_highw1_output)
            if highway_size != hidden_size:
                weights = tf.matmul(weights_highw2_output, proj2_w_out)
            else:
                weights = weights_highw2_output

            logits = tf.matmul(output, tf.transpose(weights)) + biases
            loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
                [logits], [tf.reshape(self.y, [-1])],
                [tf.ones([batch_size * num_steps], dtype=tf.float32)])

        self.cost = tf.reduce_sum(loss) / batch_size
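
For reference, `word_ix_to_morph_ixs` is passed to `tf.nn.embedding_lookup` above, so it is expected to be an integer matrix of shape [word_vocab_size, max_word_len] mapping each word id to its padded morpheme ids. A hypothetical way to build it, assuming a `word_to_morphs` dict and a padding id of 0:

    import numpy as np

    def build_word_ix_to_morph_ixs(word_to_morphs, word_vocab_size,
                                   max_word_len, pad_id=0):
        # word_to_morphs: {word_id: [morph_id, ...]}
        table = np.full((word_vocab_size, max_word_len), pad_id, dtype=np.int32)
        for word_ix, morph_ixs in word_to_morphs.items():
            morph_ixs = morph_ixs[:max_word_len]         # truncate long words
            table[word_ix, :len(morph_ixs)] = morph_ixs  # rest stays padded
        return table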
Example #3
    def __init__(self, config, word_ix_to_syl_ixs, need_reuse=False):
        # get hyperparameters
        batch_size = config.batch_size
        num_steps = config.num_steps
        self.max_word_len = max_word_len = config.max_word_len
        self.syl_emb_dim = syl_emb_dim = config.syl_emb_dim
        self.highway_size = highway_size = config.highway_size
        self.init_scale = init_scale = config.init_scale
        num_sampled = config.num_sampled
        syl_vocab_size = config.syl_vocab_size
        hidden_size = config.hidden_size
        num_layers = config.num_layers
        word_vocab_size = config.word_vocab_size
        drop_x = config.drop_x
        drop_i = config.drop_i
        drop_h = config.drop_h
        drop_o = config.drop_o

        # syllable embedding matrix
        with tf.variable_scope('syl_emb', reuse=need_reuse):
            self.syl_embedding = tf.get_variable("syl_embedding",
                                                 [syl_vocab_size, syl_emb_dim],
                                                 dtype=tf.float32)

        # placeholders for training data and labels
        self.x = tf.placeholder(tf.int32,
                                [batch_size, num_steps, max_word_len])
        self.y = tf.placeholder(tf.int32, [batch_size, num_steps])
        y_float = tf.cast(self.y, tf.float32)

        # we first embed syllables ...
        words_embedded = tf.nn.embedding_lookup(self.syl_embedding, self.x)
        words_embedded_unrolled = tf.unstack(words_embedded, axis=1)

        # ... and then concatenate them to obtain word vectors
        words_list = []
        for word in words_embedded_unrolled:
            syls = tf.unstack(word, axis=1)
            syls_concat = tf.concat(axis=1, values=syls)
            words_list.append(syls_concat)

        words_packed_reshaped = tf.reshape(tf.stack(words_list, axis=1),
                                           [-1, max_word_len * syl_emb_dim])

        # we project word vectors to match the dimensionality of
        # the highway layer
        with tf.variable_scope('projection', reuse=need_reuse):
            proj_w = tf.get_variable(
                'proj_w', [max_word_len * syl_emb_dim, highway_size],
                dtype=tf.float32)
        words_packed_reshaped_proj = tf.matmul(words_packed_reshaped, proj_w)

        # we feed the word vector into a stack of two HW layers ...
        with tf.variable_scope('highway1', reuse=need_reuse):
            highw1_output = self.highway_layer(words_packed_reshaped_proj)

        with tf.variable_scope('highway2', reuse=need_reuse):
            highw2_output = self.highway_layer(highw1_output)

        highw_output_reshaped = tf.reshape(highw2_output,
                                           [batch_size, num_steps, -1])
        if not need_reuse:
            highw_output_reshaped = tf.nn.dropout(highw_output_reshaped,
                                                  1 - drop_x,
                                                  [batch_size, num_steps, 1])

        # ... and then process it with a stack of two LSTMs
        lstm_input = tf.unstack(highw_output_reshaped, axis=1)

        # basic LSTM cell
        def lstm_cell():
            return tf.nn.rnn_cell.LSTMCell(hidden_size,
                                           forget_bias=1.0,
                                           reuse=need_reuse)

        cells = []
        for i in range(num_layers):
            with tf.variable_scope('layer' + str(i)):
                if not need_reuse:
                    if i == 0:
                        cells.append(
                            my_dropout.MyDropoutWrapper(
                                lstm_cell(),
                                input_keep_prob=1 - drop_i,
                                state_keep_prob=1 - drop_h,
                                output_keep_prob=1 - drop_o,
                                variational_recurrent=True,
                                input_size=highway_size,
                                dtype=tf.float32))
                    else:
                        cells.append(
                            my_dropout.MyDropoutWrapper(
                                lstm_cell(),
                                state_keep_prob=1 - drop_h,
                                output_keep_prob=1 - drop_o,
                                variational_recurrent=True,
                                input_size=hidden_size,
                                dtype=tf.float32))
                else:
                    cells.append(lstm_cell())
        self.cell = tf.nn.rnn_cell.MultiRNNCell(cells)

        self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)
        with tf.variable_scope('lstm_rnn', reuse=need_reuse):
            outputs, self.state = tf.contrib.rnn.static_rnn(
                self.cell,
                lstm_input,
                dtype=tf.float32,
                initial_state=self.init_state)
        output = tf.reshape(tf.concat(axis=1, values=outputs),
                            [-1, hidden_size])

        # finally we predict the next word according to a softmax normalization
        if config.reuse_emb:
            self.syl_embedding_out = self.syl_embedding
            proj_w_out = proj_w
        with tf.variable_scope('softmax_params', reuse=need_reuse):
            if highway_size != hidden_size:
                proj2_w_out = tf.get_variable('proj2_w_out',
                                              [highway_size, hidden_size],
                                              dtype=tf.float32)
            if not config.reuse_emb:
                self.syl_embedding_out = tf.get_variable(
                    "syl_embedding_out", [syl_vocab_size, syl_emb_dim],
                    dtype=tf.float32)
                proj_w_out = tf.get_variable(
                    'proj_w_out', [max_word_len * syl_emb_dim, highway_size],
                    dtype=tf.float32)
            biases = tf.get_variable('biases', [word_vocab_size],
                                     dtype=tf.float32)

        words_in_syls = []
        for word_ix in range(word_vocab_size):
            words_in_syls.append(word_ix_to_syl_ixs[word_ix])
        words_in_syls_embedded = tf.nn.embedding_lookup(
            self.syl_embedding_out, words_in_syls)
        syls = tf.unstack(words_in_syls_embedded, axis=1)
        weights_full = tf.concat(syls, axis=1)
        weights_proj = tf.matmul(weights_full, proj_w_out)
        with tf.variable_scope(
                'highway1' if config.reuse_hw1 else 'highway1_out',
                reuse=config.reuse_hw1 or need_reuse):
            weights_highw1_output = self.highway_layer(weights_proj)
        with tf.variable_scope(
                'highway2' if config.reuse_hw2 else 'highway2_out',
                reuse=config.reuse_hw2 or need_reuse):
            weights_highw2_output = self.highway_layer(weights_highw1_output)
        if highway_size != hidden_size:
            weights = tf.matmul(weights_highw2_output, proj2_w_out)
        else:
            weights = weights_highw2_output

        # and compute the cross-entropy between labels and predictions
        if config.ssm == 1 and not need_reuse:
            loss = tf.nn.sampled_softmax_loss(weights,
                                              biases,
                                              tf.reshape(y_float, [-1, 1]),
                                              output,
                                              num_sampled,
                                              word_vocab_size,
                                              partition_strategy="div")
        else:
            logits = tf.matmul(output, tf.transpose(weights)) + biases
            loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
                [logits], [tf.reshape(self.y, [-1])],
                [tf.ones([batch_size * num_steps], dtype=tf.float32)])
        self.cost = tf.reduce_sum(loss) / batch_size
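
Since `self.cost` above is the cross-entropy summed over `num_steps` positions and averaged over the batch, word-level perplexity for an evaluation run can be recovered as exp(total cost / total steps). A sketch, where `eval_model`, `sess`, `config`, and `data_iterator` are assumptions:

    import numpy as np

    costs, iters = 0.0, 0
    for x_batch, y_batch in data_iterator:
        cost = sess.run(eval_model.cost,
                        {eval_model.x: x_batch, eval_model.y: y_batch})
        costs += cost
        iters += config.num_steps
    print('perplexity: %.3f' % np.exp(costs / iters))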
Example #4
  def __init__(self, config, word_ix_to_char_ixs, need_reuse=False):
    # get hyperparameters
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.max_word_len = max_word_len = config.max_word_len
    self.char_emb_dim = char_emb_dim = config.char_emb_dim
    self.highway_size = highway_size = config.highway_size
    self.init_scale = init_scale = config.init_scale
    num_sampled = config.num_sampled
    char_vocab_size = config.char_vocab_size
    hidden_size = config.hidden_size
    num_layers = config.num_layers
    word_vocab_size = config.word_vocab_size
    drop_x = config.drop_x
    drop_i = config.drop_i
    drop_h = config.drop_h
    drop_o = config.drop_o
    filter_widths = config.filter_widths
    filters_per_width = config.filters_per_width
    cnn_output_dim = config.cnn_output_dim

    # character embedding matrix
    with tf.variable_scope('char_emb', reuse=need_reuse):
      self.char_embedding = tf.get_variable("char_embedding", 
        [char_vocab_size, char_emb_dim], dtype=tf.float32)
    
    # placeholders for training data and labels
    self.x = tf.placeholder(tf.int32, [batch_size, num_steps, max_word_len])
    self.y = tf.placeholder(tf.int32, [batch_size, num_steps])
    y_float = tf.cast(self.y, tf.float32)
    
    # we first embed characters ...
    words_embedded = tf.nn.embedding_lookup(self.char_embedding, self.x)
    words_embedded = tf.reshape(words_embedded, [-1, max_word_len, char_emb_dim])
    
    # convolve the character embeddings of a word with filters of one width,
    # then max-pool over character positions ("max-over-time" pooling)
    def conv_layer(cur_char_inputs, filt_shape, bias_shape):
      new_filt_shape = [1, 1] + filt_shape
      filt = tf.get_variable('filt', new_filt_shape)
      bias = tf.get_variable('bias', bias_shape)
      cur_char_inputs = tf.expand_dims(tf.expand_dims(cur_char_inputs, 1), 1)
      conv = tf.nn.conv3d(cur_char_inputs, filt, [1, 1, 1, 1, 1],
                          padding='VALID')
      feature_map = tf.nn.tanh(conv + bias)
      feature_map_reshaped = tf.squeeze(feature_map, axis=1)
      pool = tf.nn.max_pool(feature_map_reshaped,
                            [1, 1, max_word_len - filt_shape[0] + 1, 1],
                            [1, 1, 1, 1], 'VALID')
      return tf.squeeze(pool, axis=[1, 2])

    # apply filters of every configured width and concatenate the pooled
    # feature maps into one vector per word
    def words_filter(cur_char_inputs):
      pools = []
      for w in filter_widths:
        with tf.variable_scope('filter' + str(w)):
          pools.append(conv_layer(cur_char_inputs,
                                  [w, char_emb_dim, filters_per_width[w]],
                                  [filters_per_width[w]]))
      return tf.concat(axis=1, values=pools)
       
    with tf.variable_scope('cnn_output', reuse=need_reuse) as scope:
      cnn_output = tf.reshape(words_filter(words_embedded), [-1, cnn_output_dim])
    
    # we feed the word vector into a stack of two HW layers ...
    with tf.variable_scope('highway1', reuse=need_reuse):
      highw1_output = self.highway_layer(cnn_output)
    
    with tf.variable_scope('highway2', reuse=need_reuse):
      highw2_output = self.highway_layer(highw1_output)
        
    highw_output_reshaped = tf.reshape(highw2_output, 
                                       [batch_size, num_steps, -1])
    if not need_reuse:
      highw_output_reshaped = tf.nn.dropout(
          highw_output_reshaped, 1-drop_x, [batch_size, num_steps, 1])
    
    # ... and then process it with a stack of two LSTMs
    lstm_input = tf.unstack(highw_output_reshaped, axis=1)
    # basic LSTM cell
    def lstm_cell():
      return tf.nn.rnn_cell.LSTMCell(hidden_size, 
                                     forget_bias=1.0,
                                     reuse=need_reuse)
    cells = []
    for i in range(num_layers):
      with tf.variable_scope('layer' + str(i)):
        if not need_reuse:
          if i == 0:
            cells.append(
                my_dropout.MyDropoutWrapper(lstm_cell(), 
                                            input_keep_prob=1-drop_i,
                                            state_keep_prob=1-drop_h,
                                            output_keep_prob=1-drop_o,
                                            variational_recurrent=True,
                                            input_size=highway_size,
                                            dtype=tf.float32))
          else:
            cells.append(
                my_dropout.MyDropoutWrapper(lstm_cell(),
                                            state_keep_prob=1-drop_h,
                                            output_keep_prob=1-drop_o,
                                            variational_recurrent=True,
                                            input_size=hidden_size,
                                            dtype=tf.float32))
        else:
          cells.append(lstm_cell())
    self.cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    
    self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)
    with tf.variable_scope('lstm_rnn', reuse=need_reuse):
      outputs, self.state = tf.contrib.rnn.static_rnn(
          self.cell, 
          lstm_input, 
          dtype=tf.float32, 
          initial_state=self.init_state)
    output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, hidden_size])

    # finally we predict the next word according to a softmax normalization
    if config.reuse_emb:
      self.char_embedding_out = self.char_embedding
    with tf.variable_scope('softmax_params', reuse=need_reuse):
      if highway_size != hidden_size:
        proj2_w_out = tf.get_variable('proj2_w_out', 
          [highway_size, hidden_size],
          dtype=tf.float32)
      if not config.reuse_emb:
        self.char_embedding_out = tf.get_variable("char_embedding_out", 
          [char_vocab_size, char_emb_dim], dtype=tf.float32)
      biases = tf.get_variable('biases', [word_vocab_size], dtype=tf.float32)

    # build the output word-representation matrix in slices of 1000 words,
    # running each word's characters through the (possibly reused) CNN and
    # highway stack
    slice_num = word_vocab_size // 1000
    for i in range(slice_num):
      a = i * 1000
      b = i * 1000 + 1000 if i != slice_num - 1 else word_vocab_size
      word_in_chars = [word_ix_to_char_ixs[word_ix] for word_ix in range(a, b)]
      word_in_chars_embedded = tf.nn.embedding_lookup(self.char_embedding_out,
                                                      word_in_chars)
      with tf.variable_scope(
          'cnn_output' if config.reuse_cnn else 'cnn_output_out',
          reuse=config.reuse_cnn or (i > 0) or need_reuse):
        weight_full = words_filter(word_in_chars_embedded)
      with tf.variable_scope(
          'highway1' if config.reuse_hw1 else 'highway1_out',
          reuse=config.reuse_hw1 or (i > 0) or need_reuse):
        weight_highw1_output = self.highway_layer(weight_full)
      with tf.variable_scope(
          'highway2' if config.reuse_hw2 else 'highway2_out',
          reuse=config.reuse_hw2 or (i > 0) or need_reuse):
        weight_highw2_output = self.highway_layer(weight_highw1_output)
      if highway_size != hidden_size:
        weight = tf.matmul(weight_highw2_output, proj2_w_out)
      else:
        weight = weight_highw2_output
      # stack this slice onto the full matrix of output word representations
      weights = weight if i == 0 else tf.concat(values=[weights, weight], axis=0)
    self.weights = weights
        
    # and compute the cross-entropy between labels and predictions
    if config.ssm == 1 and not need_reuse:
      loss = tf.nn.sampled_softmax_loss(weights, biases, 
        tf.reshape(y_float, [-1, 1]), output, num_sampled, word_vocab_size, 
        partition_strategy="div")
    else:
      logits = tf.matmul(output, tf.transpose(weights)) + biases
      loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
              [logits],
              [tf.reshape(self.y, [-1])],
              [tf.ones([batch_size * num_steps], dtype=tf.float32)])
    self.cost = tf.reduce_sum(loss) / batch_size
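
All four variants take a `need_reuse` flag so that a second copy of the graph (for validation or test) can share the variables created by the training copy, since every variable scope and LSTM cell is built with `reuse=need_reuse`. A minimal sketch of that pattern, assuming the class above is named `Model` (the name is illustrative):

    # both instances live in the same graph; the second one reuses every
    # variable created by the first
    train_model = Model(config, word_ix_to_char_ixs, need_reuse=False)
    valid_model = Model(config, word_ix_to_char_ixs, need_reuse=True)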