Code example #1
File: recurrent.py Project: JocelynSong/SSBRAE
 def forward_scan(self, x):
     h0 = shared_zero_matrix((self.hidden_dim, ), 'h0_forward')
     c0 = shared_zero_matrix((self.hidden_dim, ), 'c0_forward')
     hs, _ = theano.scan(
         fn=self._step,
         sequences=x,
         outputs_info=[h0, c0],
         non_sequences=[self.W, self.U, self.b],
     )
     # hs is the list [hidden_states, cell_states]; return the hidden-state sequence
     return hs[0]
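The per-timestep callback `self._step` is not included in these snippets. If this `forward_scan` belongs to the LSTM encoder set up in code example #11 (W: (4*hidden, in), U: (4*hidden, hidden), b: (4*hidden,)), a compatible step could look like the sketch below. It uses plain Theano ops instead of the project's Activation helpers and follows the [in, forget, output, recurrent] gate layout noted in the weight comments, so treat it as an illustrative assumption, not the project's actual `_step`.

def _step(x_t, h_tm1, c_tm1, W, U, b):
    # Pre-activations for all four gate blocks at once: shape (4 * hidden_dim,)
    pre = T.dot(W, x_t) + T.dot(U, h_tm1) + b
    d = h_tm1.shape[0]
    i = T.nnet.sigmoid(pre[0 * d:1 * d])   # input gate
    f = T.nnet.sigmoid(pre[1 * d:2 * d])   # forget gate
    o = T.nnet.sigmoid(pre[2 * d:3 * d])   # output gate
    g = T.tanh(pre[3 * d:4 * d])           # candidate cell state
    c_t = f * c_tm1 + i * g                # new cell state
    h_t = o * T.tanh(c_t)                  # new hidden state
    return h_t, c_t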
Code example #2
File: recurrent.py Project: JocelynSong/SSBRAE
 def backward_scan(self, x):
     h0_backward = shared_zero_matrix(self.hidden_dim, 'h0_backward')
     c0_backward = shared_zero_matrix(self.hidden_dim, 'c0_backward')
     h_backwards, _ = theano.scan(
         fn=self._step,
         sequences=x,
         outputs_info=[h0_backward, c0_backward],
         non_sequences=[self.W_backward, self.U_backward, self.b_backward],
         go_backwards=True,
     )
     # scan ran over the reversed sequence; take the hidden-state sequence and
     # flip it back to the original time order
     return h_backwards[0][::-1]
Code example #3
File: recurrent.py Project: JocelynSong/SSBRAE
 def backward_scan_batch(self, x, mask, batch_size):
     h0_backward = shared_zero_matrix((batch_size, self.hidden_dim),
                                      'h0_backward')
     c0_backward = shared_zero_matrix((batch_size, self.hidden_dim),
                                      'c0_backward')
     h_backwards, _ = theano.scan(
         fn=self._step_batch,
         sequences=[T.transpose(x, (1, 0, 2)),
                    T.transpose(mask, (1, 0))],
         outputs_info=[h0_backward, c0_backward],
         non_sequences=[self.W_backward, self.U_backward, self.b_backward],
         go_backwards=True,
     )
     # (max_len, batch, dim) -> (batch, max_len, dim), then undo the reversal
     # introduced by go_backwards along the time axis
     return T.transpose(h_backwards[0], (1, 0, 2))[:, ::-1]
Code example #4
File: optimizer.py Project: JocelynSong/SSBRAE
 def get_update(self, loss, params, norm_exc_params=[]):
     logger.info("Update Parameters: %s" % params)
     updates = OrderedDict({})
     accumulators = OrderedDict({})
     grad_params = []
     for param in params:
         accumulators[param] = shared_zero_matrix(param.get_value().shape,
                                                  name="acc_%s" %
                                                  param.name)
         gp = T.grad(loss, param)
         grad_params.append(gp)
     for param, gp in zip(params, grad_params):
         exp_sr = accumulators[param]
         up_exp_sr = exp_sr + T.sqr(gp).sum()
         updates[exp_sr] = up_exp_sr
         step = (self.lr / (T.sqrt(up_exp_sr) + self.epsilon)) * gp
         stepped_param = param - step
         param_name = param.name
         if self.norm_lim > 0 and (param.get_value(borrow=True).ndim
                                   == 2) and (param_name
                                              not in norm_exc_params):
             col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
             desired_norms = T.clip(col_norms, 0, T.sqrt(self.norm_lim))
             scale = desired_norms / (1e-7 + col_norms)
             updates[param] = stepped_param * scale
         else:
             updates[param] = stepped_param
     return updates
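The OrderedDict returned here is meant to be passed to theano.function as its updates argument. A minimal hedged usage sketch follows; loss, params, x_sym and y_sym are placeholders rather than names from the project, and the constructor arguments mirror code example #14.

# Illustrative only: loss, params, x_sym and y_sym stand in for a real model's
# scalar loss, list of shared parameters and symbolic inputs.
optimizer = AdaGradOptimizer(lr=0.95, norm_lim=16)   # constructor args as in code example #14
updates = optimizer.get_update(loss, params)
train_fn = theano.function(inputs=[x_sym, y_sym], outputs=loss, updates=updates)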
Code example #5
File: recurrent.py Project: JocelynSong/SSBRAE
 def forward_scan_batch(self, x, mask, batch_size):
     h0 = shared_zero_matrix((batch_size, self.hidden_dim), 'h0_forward')
     c0 = shared_zero_matrix((batch_size, self.hidden_dim), 'c0_forward')
     hs, _ = theano.scan(
         fn=self._step_batch,
         sequences=[T.transpose(x, (1, 0, 2)),
                    T.transpose(mask, (1, 0))],
         outputs_info=[h0, c0],
         non_sequences=[self.W, self.U, self.b],
     )
     return T.transpose(hs[0], (1, 0, 2))
Code example #6
    def __init__(self, num_in, initializer=default_initializer):
        self.W = shared_rand_matrix(shape=(num_in, 1),
                                    name="logistic_W",
                                    initializer=initializer)
        self.b = shared_zero_matrix((1, ), name='logistic_b')  # single bias term
        self.params = [self.W, self.b]

        self.l1_norm = T.sum(T.abs_(self.W))
        self.l2_norm = T.sum(self.W**2)
Code example #7
File: recurrent.py Project: JocelynSong/SSBRAE
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 pooling,
                 activation,
                 gates=("sigmoid", "sigmoid", "sigmoid"),
                 prefix="",
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        super(BiLSTMEncoder,
              self).__init__(in_dim, hidden_dim, pooling, activation, gates,
                             prefix, initializer, dropout, verbose)
        self.out_dim = hidden_dim * 2
        # Composition Function Weight -- Gates
        # W [in, forget, output, recurrent]
        self.W_forward = self.W
        self.W_forward.name = prefix + "W_forward"
        self.W_backward = shared_rand_matrix(
            (self.hidden_dim * 4, self.in_dim), prefix + 'W_backward',
            initializer)
        # U [in, forget, output, recurrent]

        self.U_forward = self.U
        self.U_forward.name = prefix + "U_forward"
        self.U_backward = shared_rand_matrix(
            (self.hidden_dim * 4, self.hidden_dim), prefix + 'U_backward',
            initializer)
        # b [in, forget, output, recurrent]
        self.b_forward = self.b
        self.b_forward.name = prefix + "b_forward"
        self.b_backward = shared_zero_matrix((self.hidden_dim * 4, ),
                                             prefix + 'b_backward')

        self.params = [
            self.W_forward, self.U_forward, self.b_forward, self.W_backward,
            self.U_backward, self.b_backward
        ]
        self.norm_params = [
            self.W_forward, self.U_forward, self.W_backward, self.U_backward
        ]
        self.l1_norm = T.sum(
            [T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = T.sum([T.sum(param**2) for param in self.norm_params])

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods:  %s' % self.pooling)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Input Gate:       %s' % self.in_gate.method)
            logger.debug('Forget Gate:      %s' % self.forget_gate.method)
            logger.debug('Output Gate:      %s' % self.out_gate.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)
Code example #8
    def __init__(self, num_in, num_out, initializer=default_initializer):
        self.num_in = num_in
        self.num_out = num_out

        self.W = shared_rand_matrix(shape=(num_in, num_out),
                                    name="softmax_W",
                                    initializer=initializer)
        self.b = shared_zero_matrix((num_out, ), 'softmax_b')
        self.params = [self.W, self.b]
        self.l1_norm = T.sum(T.abs_(self.W))
        self.l2_norm = T.sum(self.W**2)
Code example #9
File: recurrent.py Project: JocelynSong/SSBRAE
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 pooling,
                 activation,
                 prefix="",
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        super(BiRecurrentEncoder,
              self).__init__(in_dim, hidden_dim, pooling, activation, prefix,
                             initializer, dropout, verbose)
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        self.out_dim = hidden_dim * 2
        # Forward Direction - Backward Direction
        # Feed-Forward Matrix (hidden, in)
        self.W_forward = self.W
        self.W_forward.name = prefix + "W_forward"
        self.W_backward = shared_rand_matrix(
            (self.hidden_dim, self.in_dim), prefix + 'W_backward', initializer)
        # Bias Term (hidden,)
        self.b_forward = self.b
        self.b_forward.name = prefix + "b_forward"
        self.b_backward = shared_zero_matrix((self.hidden_dim, ),
                                             prefix + 'b_backward')
        # Recurrent Matrix (hidden, hidden)
        self.U_forward = self.U
        self.U_forward.name = prefix + "U_forward"
        self.U_backward = shared_rand_matrix(
            (self.hidden_dim, self.hidden_dim), prefix + 'U_backward',
            initializer)

        self.params = [
            self.W_forward, self.W_backward, self.U_forward, self.U_backward,
            self.b_forward, self.b_backward
        ]
        self.norm_params = [
            self.W_forward, self.W_backward, self.U_forward, self.U_backward
        ]
        # L1, L2 Norm
        self.l1_norm = T.sum(
            [T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = T.sum([T.sum(param**2) for param in self.norm_params])

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods:  %s' % self.pooling)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)
Code example #10
File: optimizer.py Project: JocelynSong/SSBRAE
 def get_update(self, loss, params, norm_exc_params=[]):
     logger.info("Update Parameters: %s" % params)
     rho = self.lr  # AdaDelta decay rate; the optimizer's lr field is reused as rho
     epsilon = self.epsilon
     norm_lim = self.norm_lim
     updates = OrderedDict({})
     exp_sqr_grads = OrderedDict({})
     exp_sqr_ups = OrderedDict({})
     gparams = []
     for param in params:
         exp_sqr_grads[param] = shared_zero_matrix(param.get_value().shape,
                                                   name="exp_grad_%s" %
                                                   param.name)
         gp = T.grad(loss, param)
         exp_sqr_ups[param] = shared_zero_matrix(param.get_value().shape,
                                                 name="exp_ups_%s" %
                                                 param.name)
         gparams.append(gp)
     for param, gp in zip(params, gparams):
         exp_sg = exp_sqr_grads[param]
         exp_su = exp_sqr_ups[param]
         up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
         updates[exp_sg] = up_exp_sg
         step = -(T.sqrt(exp_su + epsilon) /
                  T.sqrt(up_exp_sg + epsilon)) * gp
         updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
         stepped_param = param + step
         param_name = param.name
         if self.norm_lim > 0 and (param.get_value(borrow=True).ndim
                                   == 2) and (param_name
                                              not in norm_exc_params):
             col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
             desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
             scale = desired_norms / (1e-7 + col_norms)
             updates[param] = stepped_param * scale
         else:
             updates[param] = stepped_param
     return updates
Code example #11
File: recurrent.py Project: JocelynSong/SSBRAE
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 pooling,
                 activation,
                 gates=("sigmoid", "sigmoid", "sigmoid"),
                 prefix="",
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = hidden_dim
        self.pooling = pooling
        self.act = Activation(activation)
        self.in_gate, self.forget_gate, self.out_gate = Activation(
            gates[0]), Activation(gates[1]), Activation(gates[2])
        self.dropout = dropout

        # W [in, forget, output, recurrent] (4 * hidden, in)
        self.W = shared_rand_matrix((self.hidden_dim * 4, self.in_dim),
                                    prefix + 'W', initializer)
        # U [in, forget, output, recurrent] (4 * hidden, hidden)
        self.U = shared_rand_matrix((self.hidden_dim * 4, self.hidden_dim),
                                    prefix + 'U', initializer)
        # b [in, forget, output, recurrent] (4 * hidden,)
        self.b = shared_zero_matrix((self.hidden_dim * 4, ), prefix + 'b')

        self.params = [self.W, self.U, self.b]
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.l2_norm = T.sum(self.W**2) + T.sum(self.U**2)

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods:  %s' % self.pooling)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Input Gate:       %s' % self.in_gate.method)
            logger.debug('Forget Gate:      %s' % self.forget_gate.method)
            logger.debug('Output Gate:      %s' % self.out_gate.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)
Code example #12
File: recurrent.py Project: JocelynSong/SSBRAE
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 pooling,
                 activation,
                 prefix="",
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        self.in_dim = in_dim
        self.out_dim = hidden_dim
        self.hidden_dim = hidden_dim
        self.pooling = pooling
        self.dropout = dropout
        self.act = Activation(activation)
        # Composition Function Weight
        # Feed-Forward Matrix (hidden, in)
        self.W = shared_rand_matrix((self.hidden_dim, self.in_dim),
                                    prefix + 'W_forward', initializer)
        # Bias Term (hidden)
        self.b = shared_zero_matrix((self.hidden_dim, ), prefix + 'b_forward')
        # Recurrent Matrix (hidden, hidden)
        self.U = shared_rand_matrix((self.hidden_dim, self.hidden_dim),
                                    prefix + 'U_forward', initializer)

        self.params = [self.W, self.U, self.b]
        self.norm_params = [self.W, self.U]

        # L1, L2 Norm
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.l2_norm = T.sum(self.W**2) + T.sum(self.U**2)

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods:  %s' % self.pooling)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)
Code example #13
File: recurrent.py Project: JocelynSong/SSBRAE
 def forward_sequence_batch(self, x, mask, batch_size):
     """
     :param x: (batch, max_len, dim)
     :param mask:  (batch, max_len)
     :param batch_size:
     """
     h0 = shared_zero_matrix((batch_size, self.hidden_dim), 'h0')
     hs, _ = theano.scan(
         fn=self._step_batch,
         sequences=[
             T.transpose(x, (1, 0, 2)),   # (batch, max_len, dim) -> (max_len, batch, dim)
             T.transpose(mask, (1, 0)),   # (batch, max_len) -> (max_len, batch)
         ],
         outputs_info=[h0],
         non_sequences=[self.W, self.U, self.b],
     )
     # (max_len, batch, dim) -> (batch, max_len, dim)
     return T.transpose(hs, (1, 0, 2))
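`self._step_batch` is likewise not shown in these snippets. A minimal masked step matching the shapes used here (W: (hidden, in), U: (hidden, hidden), b: (hidden,)) might look like the following sketch; it hard-codes T.tanh instead of the encoder's configurable activation, so it is an illustration rather than the project's implementation.

def _step_batch(x_t, m_t, h_tm1, W, U, b):
    # x_t: (batch, in_dim), m_t: (batch,), h_tm1: (batch, hidden_dim)
    h_t = T.tanh(T.dot(x_t, W.T) + T.dot(h_tm1, U.T) + b)
    # Where the mask marks padding, carry the previous hidden state forward.
    return m_t[:, None] * h_t + (1.0 - m_t)[:, None] * h_tm1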
Code example #14
File: recurrent.py Project: JocelynSong/SSBRAE
    def __init__(self,
                 lookup_table,
                 recurrent_encoder,
                 in_dim,
                 hidden_dim,
                 num_label,
                 pooling,
                 activation,
                 batch_size=64,
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        self.batch_size = batch_size
        word_index = T.imatrix()  # (batch, max_len)
        gold_truth = T.ivector()  # (batch, )
        rnn_encoder = recurrent_encoder(in_dim=in_dim,
                                        hidden_dim=hidden_dim,
                                        pooling=pooling,
                                        activation=activation,
                                        initializer=initializer,
                                        dropout=dropout,
                                        verbose=verbose)
        mask = (word_index > 0) * one_float32
        word_embedding = lookup_table.W[word_index]
        rnn_output = rnn_encoder.forward_batch(word_embedding, mask,
                                               batch_size)
        classifier = SoftmaxClassifier(num_in=rnn_encoder.out_dim,
                                       num_out=num_label,
                                       initializer=initializer)
        classifier_output = classifier.forward(rnn_output)
        loss = classifier.loss(rnn_output, gold_truth)
        params = lookup_table.params + classifier.params + rnn_encoder.params
        sgd_optimizer = AdaGradOptimizer(lr=0.95, norm_lim=16)
        except_norm_list = [param.name for param in lookup_table.params]
        updates = sgd_optimizer.get_update(loss, params, except_norm_list)

        self.train_x = shared_zero_matrix((batch_size, 1), dtype=np.int32)
        self.train_y = shared_zero_matrix(1, dtype=np.int32)
        self.dev_x = shared_zero_matrix((batch_size, 1), dtype=np.int32)
        self.test_x = shared_zero_matrix((batch_size, 1), dtype=np.int32)

        index = T.ivector()
        self.train_batch = theano.function(inputs=[index],
                                           outputs=[classifier_output, loss],
                                           updates=updates,
                                           givens={
                                               word_index: self.train_x[index],
                                               gold_truth: self.train_y[index]
                                           })
        self.get_norm = theano.function(
            inputs=[], outputs=[lookup_table.l2_norm, classifier.l2_norm])
        self.pred_train_batch = theano.function(
            inputs=[index],
            outputs=classifier_output,
            givens={word_index: self.train_x[index]})
        self.pred_dev_batch = theano.function(
            inputs=[index],
            outputs=classifier_output,
            givens={word_index: self.dev_x[index]})
        self.pred_test_batch = theano.function(
            inputs=[index],
            outputs=classifier_output,
            givens={word_index: self.test_x[index]})
Code example #15
    def __init__(self,
                 lookup_table,
                 in_dim,
                 hidden_dims,
                 labels_nums,
                 activation,
                 highway=False,
                 batch_size=64,
                 initializer=default_initializer,
                 optimizer=None,
                 dropout=0,
                 verbose=True):
        self.batch_size = batch_size
        self.num_task = len(labels_nums)
        word_index = T.itensor3()  # (batch, n_query, max_len)
        gold_truth = T.ivector()  # (batch, )

        one = T.constant(1, dtype=theano.config.floatX)
        mask_query = (word_index > 0) * one
        mask_user = (T.sum(word_index, axis=2) > 0) * one
        word_embedding = lookup_table.W[word_index]
        # max sum averaging
        hidden = get_pooling_batch_word(word_embedding, mask_query,
                                        "averaging")
        hidden = get_pooling_batch(hidden, mask_user, "averaging")
        # hidden = T.mean(hidden, axis=1)
        if len(hidden_dims) == 0 or hidden_dims[0] == 0:
            nn_output = hidden
            nn_output_dim = in_dim
        elif highway:
            encoder = HighwayLayer(in_dim=in_dim,
                                   activation=activation,
                                   initializer=initializer,
                                   dropout=dropout,
                                   verbose=verbose)
            nn_output = encoder.forward_batch(hidden)
            nn_output_dim = encoder.out_dim
        else:
            encoder = MultiHiddenLayer(in_dim=in_dim,
                                       hidden_dims=hidden_dims,
                                       activation=activation,
                                       initializer=initializer,
                                       dropout=dropout,
                                       verbose=verbose)
            nn_output = encoder.forward_batch(hidden)
            nn_output_dim = encoder.out_dim
        if optimizer is None:
            sgd_optimizer = AdaGradOptimizer(lr=0.95, norm_lim=16)
        else:
            sgd_optimizer = optimizer
        self.train_x = shared_zero_matrix((batch_size, 1, 1), dtype=np.int32)
        self.train_y = shared_zero_matrix((1, 1), dtype=np.int32)
        self.dev_x = shared_zero_matrix((batch_size, 1, 1), dtype=np.int32)
        self.test_x = shared_zero_matrix((batch_size, 1, 1), dtype=np.int32)
        self.train_batch_list = list()
        self.pred_train_batch_list = list()
        self.pred_dev_batch_list = list()
        self.pred_test_batch_list = list()
        self.get_y_list = list()
        index = T.ivector()
        classifier_list = list()
        classifier_output_list = list()
        classifier_loss_list = list()
        classifier_param_list = list()
        classifier_updates_list = list()
        for i in xrange(len(labels_nums)):
            classifier = SoftmaxClassifier(num_in=nn_output_dim,
                                           num_out=labels_nums[i],
                                           initializer=initializer)
            classifier_list.append(classifier)
            classifier_output_list.append(
                classifier_list[i].forward(nn_output))
            classifier_loss_list.append(classifier_list[i].loss(
                nn_output, gold_truth))
            if len(hidden_dims) == 0 or hidden_dims[0] == 0:
                classifier_param_list.append(lookup_table.params +
                                             classifier.params)
            else:
                classifier_param_list.append(lookup_table.params +
                                             classifier.params +
                                             encoder.params)
            except_norm_list = [param.name for param in lookup_table.params]
            classifier_updates_list.append(
                sgd_optimizer.get_update(classifier_loss_list[i],
                                         classifier_param_list[i],
                                         except_norm_list))
            train_batch = theano.function(
                inputs=[index],
                outputs=[classifier_output_list[i], classifier_loss_list[i]],
                updates=classifier_updates_list[i],
                givens={
                    word_index: self.train_x[index],
                    gold_truth: self.train_y[index, i]
                })
            self.train_batch_list.append(train_batch)
            pred_train_batch = theano.function(
                inputs=[index],
                outputs=classifier_output_list[i],
                givens={word_index: self.train_x[index]})
            self.pred_train_batch_list.append(pred_train_batch)
            pred_dev_batch = theano.function(
                inputs=[index],
                outputs=classifier_output_list[i],
                givens={word_index: self.dev_x[index]})
            self.pred_dev_batch_list.append(pred_dev_batch)
            pred_test_batch = theano.function(
                inputs=[index],
                outputs=classifier_output_list[i],
                givens={word_index: self.test_x[index]})
            self.pred_test_batch_list.append(pred_test_batch)
            self.get_y_list.append(
                theano.function(inputs=[index], outputs=self.train_y[index,
                                                                     i]))
Code example #16
    def __init__(self,
                 key_index,
                 label_num,
                 pretrain_name=None,
                 encoder='lstm',
                 word_dim=300,
                 hidden='100_100',
                 dropout=0.5,
                 regularization_weight=0.0001,
                 optimizer_name='adagrad',
                 lr=0.1,
                 norm_lim=-1,
                 label2index_filename=None):
        self.label2index, self.index2label = self.load_label_index(
            label2index_filename, label_num)

        self.indexs = T.imatrix()  # (batch, max_len)
        self.golden = T.ivector()  # (batch, )
        self.max_len = T.iscalar()  # max length

        self.s1_mask = self.indexs[:, :self.max_len] > 0
        self.s1_mask = self.s1_mask * T.constant(1.0,
                                                 dtype=theano.config.floatX)

        if pretrain_name is None:
            self.embedding = WordEmbedding(
                key_index,
                dim=word_dim,
                initializer=UniformInitializer(scale=0.01))
        else:
            self.embedding = WordEmbedding(key_index,
                                           filename=pretrain_name,
                                           normalize=False,
                                           binary=True)
            assert self.embedding.dim == word_dim

        self.word_embeddings = self.embedding[self.indexs[:, :self.max_len]]

        if type(hidden) is str:
            hidden_dims = [int(hid) for hid in hidden.split('_')]
        else:
            hidden_dims = [hidden]

        if encoder == 'lstm':
            encoder_layer = LSTMEncoder(in_dim=word_dim,
                                        hidden_dim=hidden_dims[0],
                                        pooling='final',
                                        prefix="LSTM_",
                                        dropout=dropout)
        elif encoder == 'bilstm':
            encoder_layer = BiLSTMEncoder(in_dim=word_dim,
                                          hidden_dim=hidden_dims[0],
                                          pooling='final',
                                          prefix="BiLSTM_",
                                          bidirection_shared=True,
                                          dropout=dropout)
        elif encoder == 'recurrent':
            encoder_layer = RecurrentEncoder(in_dim=word_dim,
                                             hidden_dim=hidden_dims[0],
                                             pooling='final',
                                             prefix="Recurrent_",
                                             dropout=dropout)
        elif encoder == 'birecurrent':
            encoder_layer = BiRecurrentEncoder(in_dim=word_dim,
                                               hidden_dim=hidden_dims[0],
                                               pooling='final',
                                               prefix="BiRecurrent_",
                                               bidirection_shared=True,
                                               dropout=dropout)
        elif encoder == 'gru':
            encoder_layer = GRUEncoder(in_dim=word_dim,
                                       hidden_dim=hidden_dims[0],
                                       pooling='final',
                                       prefix="GRU_",
                                       dropout=dropout)
        elif encoder == 'bigru':
            encoder_layer = BiGRUEncoder(in_dim=word_dim,
                                         hidden_dim=hidden_dims[0],
                                         pooling='final',
                                         prefix="BiGRU_",
                                         bidirection_shared=True,
                                         dropout=dropout)
        elif encoder == 'cbow':
            encoder_layer = CBOWLayer(in_dim=word_dim)
        elif encoder == 'cnn':
            encoder_layer = MultiFilterConvolutionLayer(
                in_dim=word_dim,
                hidden_dim=hidden_dims[0],
                pooling='max',
                prefix="ConvLayer_",
                kernel_sizes=CONV_FILTER_SIZES)
        else:
            raise NotImplementedError

        self.text_embedding = encoder_layer.forward_batch(
            self.word_embeddings, self.s1_mask)

        if len(hidden_dims) > 1:
            hidden_layer = MultiHiddenLayer(in_dim=encoder_layer.out_dim,
                                            hidden_dims=hidden_dims[1:],
                                            dropout=dropout,
                                            prefix='Full_Connected_Layer_')
            classifier_input = hidden_layer.forward_batch(self.text_embedding)
            classifier_input_dim = hidden_layer.out_dim
        else:
            classifier_input = self.text_embedding
            classifier_input_dim = encoder_layer.out_dim

        self.classifier = SoftmaxClassifier(classifier_input_dim,
                                            label_num,
                                            dropout=dropout)
        self.predict_loss = self.classifier.loss(classifier_input, self.golden)
        self.predict_prob = self.classifier.forward_batch(classifier_input)
        self.predict_label = T.argmax(self.predict_prob, axis=1)
        """Params in TextClassifier"""
        self.params = self.classifier.params + encoder_layer.params
        self.l2_norm = self.classifier.l2_norm + encoder_layer.l2_norm
        if len(hidden_dims) > 1:
            self.params += hidden_layer.params
            self.l2_norm += hidden_layer.l2_norm

        self.l2_loss = regularization_weight * self.l2_norm / 2
        self.loss = self.predict_loss + self.l2_loss
        """Opimizer and Loss"""
        if optimizer_name == 'adagrad':
            sgd_optimizer = AdaGradOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'adadelta':
            sgd_optimizer = AdaDeltaOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'sgd':
            sgd_optimizer = SGDOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'momentum':
            sgd_optimizer = SGDMomentumOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'adam':
            sgd_optimizer = AdamOptimizer(lr=lr, norm_lim=norm_lim)
        else:
            raise NotImplementedError

        self.train_indexs = T.ivector()
        self.train_data_x = shared_zero_matrix(shape=(5, 5),
                                               name="train_data_x",
                                               dtype=np.int32)
        self.train_data_y = shared_zero_matrix(shape=(5, ),
                                               name="train_data_y",
                                               dtype=np.int32)

        self.model_params = self.params + self.embedding.params
        """Theano Function"""
        if EMBEDDING_LR > 0:
            embedding_updates = SGDOptimizer(lr=EMBEDDING_LR,
                                             norm_lim=-1).get_update(
                                                 self.loss,
                                                 self.embedding.params)
            updates = sgd_optimizer.get_update(
                self.loss, self.params, norm_exc_params=self.embedding.params)
            updates.update(embedding_updates)
        elif EMBEDDING_LR < 0:
            # Optimize Embedding using Global Optimizer
            self.params += self.embedding.params
            updates = sgd_optimizer.get_update(
                self.loss, self.params, norm_exc_params=self.embedding.params)
        else:
            # Fix Embedding
            updates = sgd_optimizer.get_update(
                self.loss, self.params, norm_exc_params=self.embedding.params)

        self.train_batch = theano.function(
            inputs=[self.train_indexs, self.max_len],
            outputs=[self.loss, self.predict_loss, self.l2_loss],
            updates=updates,
            givens=[(self.indexs, self.train_data_x[self.train_indexs]),
                    (self.golden, self.train_data_y[self.train_indexs])])

        self.loss_batch = theano.function(
            inputs=[self.indexs, self.golden, self.max_len],
            outputs=[self.loss, self.predict_loss, self.l2_loss],
        )

        self.pred_prob_batch = theano.function(
            inputs=[self.indexs, self.max_len],
            outputs=[self.predict_prob],
        )

        self.pred_label_batch = theano.function(
            inputs=[self.indexs, self.max_len],
            outputs=[self.predict_label],
        )

        self.get_l2_loss = theano.function(
            inputs=[],
            outputs=[self.l2_loss, self.l2_norm],
        )
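Only the constructor is shown above; the compiled functions are meant to be driven from an outer training loop. A hedged usage sketch follows; the class name TextClassifier is taken from the docstring marker in the constructor, and the data-preparation and batching steps are assumptions.

# Illustrative only; data loading, padding and index batching are assumptions.
clf = TextClassifier(key_index, label_num)
clf.train_data_x.set_value(train_x)     # padded np.int32 index matrix, (n_samples, max_len)
clf.train_data_y.set_value(train_y)     # np.int32 label vector, (n_samples, )
for batch_indexs in index_batches:      # np.int32 index arrays into the training data
    loss, pred_loss, l2_loss = clf.train_batch(batch_indexs, max_len)
labels = clf.pred_label_batch(dev_x, max_len)[0]   # predict on a padded dev index matrix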