Beispiel #1
0
    def __init__(self, n_tokens, n_cells, db, emb, max_gen=10):
        """Create the sub-layers of the model and register their parameters.

        Args:
            n_tokens: output width of the token classifier's linear layer.
            n_cells: hidden size used by both LSTMs, the attention layer and
                as input width of the two output heads.
            db: kept on the instance as ``self.db``; not used in this method.
            emb: embedding object; ``emb.size()`` provides the LSTM input
                width and the object itself is kept as ``self.emb``.
            max_gen: kept as ``self.max_gen`` (presumably a cap on the number
                of generated tokens -- confirm against the callers).
        """
        self.n_tokens, self.n_cells, self.max_gen = n_tokens, n_cells, max_gen
        self.db, self.emb = db, emb

        # Both recurrent layers consume embeddings of the same width.
        emb_dim = emb.size()
        self.input_rnn = LSTM(n_in=emb_dim, n_out=n_cells)
        self.output_rnn = LSTM(n_in=emb_dim, n_out=n_cells)

        # Head 1: linear + softmax over n_tokens outputs.
        token_head = [LinearLayer(n_in=n_cells, n_out=n_tokens), Softmax()]
        self.output_rnn_clf = Sequential(token_head)

        # Head 2: linear + sigmoid producing a single value.
        switch_head = [LinearLayer(n_in=n_cells, n_out=1), Sigmoid()]
        self.output_switch_p = Sequential(switch_head)

        self.att = Attention(n_hidden=n_cells)

        # Layers that own parameters, paired with the name each is
        # registered under; stored as two parallel tuples.
        named_layers = (
            ('switch', self.output_switch_p),
            ('out_rnn_clf', self.output_rnn_clf),
            ('out_rnn', self.output_rnn),
            ('att', self.att),
            ('in_rnn', self.input_rnn),
        )
        self.param_layers = tuple(layer for _, layer in named_layers)
        self.param_layers_names = tuple(name for name, _ in named_layers)

        self.print_widths = defaultdict(dict)

        self.parametrize_from_layers(self.param_layers,
                                     self.param_layers_names)
Beispiel #2
0
    def ready(self, args, train):
        """Assemble the symbolic graph of the LSTM language model.

        Creates the symbolic inputs (``self.idxs``, ``self.idys``,
        ``self.init_state``), an embedding -> LSTM -> softmax pipeline with
        dropout applied before and after the recurrent layer, and the
        per-token negative log-likelihood ``self.nll``.  Also records the
        layers, their parameters and the total parameter count.

        Args:
            args: mapping read for the keys "dropout", "hidden_dim" and
                "activation".
            train: iterable of tokens; its distinct items form the vocabulary
                handed to the embedding layer.
        """
        # Symbolic inputs; the index matrices are laid out as len * batch.
        self.idxs = T.imatrix()
        self.idys = T.imatrix()
        self.init_state = T.matrix(dtype=theano.config.floatX)

        # Dropout rate lives in a shared variable, cast to floatX.
        rate = np.float64(args["dropout"]).astype(theano.config.floatX)
        self.dropout = theano.shared(rate)

        self.n_d = args["hidden_dim"]

        embedding_layer = EmbeddingLayer(n_d=self.n_d, vocab=set(train))
        self.n_V = embedding_layer.n_V

        say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))

        rnn_layer = LSTM(
            n_in=self.n_d,
            n_out=self.n_d,
            activation=get_activation_by_name(args["activation"]),
        )
        output_layer = Layer(n_in=self.n_d,
                             n_out=self.n_V,
                             activation=T.nnet.softmax)

        # (len*batch) * n_d
        flat_emb = embedding_layer.forward(self.idxs.ravel())

        # len * batch * n_d
        seq_input = apply_dropout(flat_emb, self.dropout).reshape(
            (self.idxs.shape[0], self.idxs.shape[1], self.n_d))

        # len * batch * (n_d+n_d): return_c=True makes the layer return a
        # last axis of width 2*n_d (presumably [c, h] -- confirm in LSTM).
        states = rnn_layer.forward_all(seq_input, self.init_state,
                                       return_c=True)

        self.last_state = states[-1]
        # Only the second half of the last axis feeds the output layer.
        hidden = apply_dropout(states[:, :, self.n_d:], self.dropout)

        self.p_y_given_x = output_layer.forward(
            hidden.reshape(flat_emb.shape))

        # Negative log-probability of every target token, one per row.
        targets = self.idys.ravel()
        row_ids = T.arange(targets.shape[0])
        self.nll = -T.log(self.p_y_given_x[row_ids, targets])

        self.layers = [embedding_layer, rnn_layer, output_layer]
        self.params = (embedding_layer.params + rnn_layer.params +
                       output_layer.params)

        # Count scalar entries across every parameter of every layer.
        total = 0
        for layer in self.layers:
            for param in layer.params:
                total += len(param.get_value(borrow=True).ravel())
        self.num_params = total
        say("# of params in total: {}\n".format(self.num_params))
Beispiel #3
0
    def ready(self):
        """Build the symbolic graph of the aspect-level attention model.

        Reads ``self.args``, ``self.embedding_layer``, ``self.num_aspects``
        and ``self.query`` (all expected to be set before this call) and
        creates:

        * the symbolic inputs ``self.x``, ``self.w_masks``, ``self.w_lens``,
          ``self.s_maxlen``, ``self.s_num``, ``self.y``, ``self.ay``,
          ``self.ay_mask`` and ``self.aay``;
        * ``self.layers`` -- embedding, query representation, word LSTM,
          tanh projection, word attention, sentence LSTM, tanh projection,
          sentence attention, then one softmax layer per aspect;
        * ``self.p_y_given_a``, ``self.nll_loss_ay``, ``self.pred_ay``;
        * ``self.params`` and the L2 penalty ``self.l2_sqr`` (``None`` when
          there are no parameters).

        Layer construction order is kept exactly as before so that any
        order-dependent initialisation stays unchanged.
        """
        args = self.args
        embedding_layer = self.embedding_layer
        num_aspects = self.num_aspects

        self.n_emb = embedding_layer.n_d

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX)
        )

        # Symbolic inputs (some are only consumed outside this method).
        self.x = T.imatrix('x')
        self.w_masks = T.fmatrix('mask')
        self.w_lens = T.fvector('sent_len')
        self.s_maxlen = T.iscalar('sent_max_len')
        self.s_num = T.iscalar('sent_num')
        self.y = T.ivector('y')
        self.ay = T.imatrix('ay')
        self.ay_mask = T.fmatrix('ay_mask')
        self.aay = T.itensor3('aay')

        x = self.x
        query = self.query
        w_masks = self.w_masks
        s_num = self.s_num
        n_emb = self.n_emb
        ay_mask = self.ay_mask
        aay = self.aay

        layers = self.layers = [embedding_layer]

        # Embed the input tokens and restore the two leading axes of x.
        slices = embedding_layer.forward(x.ravel())
        self.slices = slices = slices.reshape((x.shape[0], x.shape[1], n_emb))

        # Embed the aspect queries the same way.
        slices_query = embedding_layer.forward(query.flatten(), is_node=False)
        slices_query = slices_query.reshape(
            (query.shape[0], query.shape[1], n_emb))

        layers.append(Query_Repr_Layer(slices_query))
        slices_query_tmp = slices_query = layers[-1].forward()

        # Word-level LSTM over the (dropped-out) embeddings.
        layers.append(LSTM(n_in=n_emb, n_out=n_emb))
        prev_output = slices
        prev_output = apply_dropout(prev_output, dropout, v2=True)
        prev_output = layers[-1].forward_all(prev_output, w_masks)

        # tanh projection of the query used by the word-level attention.
        layers.append(Layer(n_in=n_emb, n_out=n_emb, activation=tanh))
        self.slices_query = slices_query = layers[-1].forward(slices_query)

        # Word-level attention.
        layers.append(IterAttentionLayer(n_in=n_emb, n_out=n_emb))
        prev_output = layers[-1].forward(prev_output,
                                         slices_query,
                                         is_word=True,
                                         hop=args.hop_word,
                                         masks=w_masks,
                                         aspect_num=num_aspects)
        prev_output = prev_output.reshape(
            (prev_output.shape[0] * prev_output.shape[1],
             prev_output.shape[2]))
        prev_output = apply_dropout(prev_output, dropout, v2=True)

        # Regroup the rows per aspect and per sentence.  `//` keeps the
        # shape arithmetic an integer division; `/` on two integer tensors
        # is deprecated in Theano and is a true division on Python 3.
        prev_output = prev_output.reshape(
            (num_aspects,
             prev_output.shape[0] // (num_aspects * s_num),
             s_num,
             prev_output.shape[1]))
        prev_output = prev_output.dimshuffle(2, 0, 1, 3)
        prev_output = prev_output.reshape(
            (prev_output.shape[0],
             prev_output.shape[1] * prev_output.shape[2],
             prev_output.shape[3]))

        # Sentence-level LSTM.
        layers.append(LSTM(n_in=n_emb * args.hop_word, n_out=n_emb))
        prev_output = layers[-1].forward_all(prev_output)

        # Second tanh projection of the query.
        # NOTE(review): the original author tagged the next statement with
        # "# bug" and no explanation.  It feeds the un-projected query
        # representation (slices_query_tmp) into this layer; behaviour is
        # kept as is -- confirm the intent.
        layers.append(Layer(n_in=n_emb, n_out=n_emb, activation=tanh))
        slices_query = layers[-1].forward(slices_query_tmp)

        # Sentence-level attention.
        layers.append(IterAttentionLayer(n_in=n_emb, n_out=n_emb))
        prev_output = layers[-1].forward(prev_output,
                                         slices_query,
                                         is_word=False,
                                         hop=args.hop_sent,
                                         aspect_num=num_aspects)
        prev_output = prev_output.reshape(
            (prev_output.shape[0] * prev_output.shape[1],
             prev_output.shape[2]))
        prev_output = apply_dropout(prev_output, dropout, v2=True)

        # One slab of rows per aspect (integer division, see above).
        prev_output = prev_output.reshape(
            (num_aspects,
             prev_output.shape[0] // num_aspects,
             prev_output.shape[1]))

        softmax_inputs = [prev_output[i] for i in range(num_aspects)]

        size = n_emb * args.hop_sent

        p_y_given_a = []
        pred_ay = []
        nll_loss_ay = []

        # One bias-free softmax classifier per aspect.
        for i in range(num_aspects):
            layers.append(Layer(n_in=size,
                                n_out=args.score_scale,
                                activation=softmax,
                                has_bias=False))

            p_y_given_a.append(layers[-1].forward(softmax_inputs[i]))
            # Cross-entropy against aay[:, i, :], weighted by ay_mask[:, i].
            nll_loss_ay.append(
                T.mean(T.sum(-T.log(p_y_given_a[-1]) * aay[:, i, :] *
                             ay_mask[:, i].dimshuffle(0, 'x'))))
            pred_ay.append(T.argmax(p_y_given_a[-1], axis=1))

        self.p_y_given_a = p_y_given_a
        self.nll_loss_ay = T.sum(nll_loss_ay)
        self.pred_ay = T.stack(pred_ay).dimshuffle(1, 0)

        # Report layer sizes from the fifth layer onwards, numbered from 0.
        for i, l in enumerate(layers[4:]):
            say("layer {}: n_in={}\tn_out={}\n".format(
                i, l.n_in, l.n_out
            ))

        # Collect all parameters and accumulate the L2 penalty over them.
        self.l2_sqr = None
        self.params = []
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        nparams = sum(len(p.get_value(borrow=True).ravel())
                      for p in self.params)
        say("total # parameters: {}\n".format(nparams))
Beispiel #4
0
    def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim,
              word_lstm_dim, word_bidirect, pos_dim, pos_lstm_dim, lr_method,
              lr_rate, clip_norm, crf, is_train, **kwargs):
        """
        Build the network.

        Creates the input placeholders as attributes of ``self``, the
        word / POS / character input features, the word-level LSTMs, the
        tag scoring layer, the training cost (cross-entropy or CRF) and the
        training op.

        Args:
            dropout: dropout rate, must satisfy ``0 <= dropout < 1``; the
                keep probability passed to ``tf.nn.dropout`` is
                ``1 - dropout``.
            char_dim: character embedding size; a falsy value disables the
                character features.
            char_lstm_dim: hidden size of the character LSTMs.
            char_bidirect: if true, the reversed character LSTM output is
                appended to the inputs as well.
            word_dim: word embedding size; a falsy value disables the word
                embedding feature.
            word_lstm_dim: hidden size of the word LSTMs.
            word_bidirect: if true, forward and backward word LSTM outputs
                are concatenated and passed through a tanh layer.
            pos_dim: POS tag embedding size; a falsy value disables it.
            pos_lstm_dim: not used by this method.
            lr_method: one of 'sgd', 'adagrad', 'adadelta', 'adam',
                'rmsprop'.
            lr_rate: learning rate handed to the optimizer.
            clip_norm: if > 0, gradients are clipped to this global norm.
            crf: if true use the CRF cost and Viterbi decoding, otherwise
                softmax cross-entropy.
            is_train: if true (and ``dropout`` is non-zero) the dropped-out
                inputs feed the network.
            **kwargs: ignored.

        Returns:
            Tuple ``(cost, f_score, train_op)``.

        Raises:
            AssertionError: if ``dropout`` is outside ``[0, 1)``.
            NotImplementedError: if ``lr_method`` is not a supported name.
        """
        # Number of distinct values of each kind of symbol
        n_words = len(self.id_to_word)
        n_pos_tags = len(self.id_to_pos)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Network variables
        self.word_ids = tf.placeholder(
            tf.int32, shape=[None, None],
            name='word_ids')  # word indices, shape: [batch_size, max_word_len]
        self.word_pos_ids = tf.placeholder(
            tf.int32, shape=[None],
            name='word_pos_ids')  # word position index, shape: [batch_size]
        self.pos_ids = tf.placeholder(
            tf.int32, shape=[None, None],
            name='pos_ids')  # POS tag indices, shape: [batch_size, max_pos_len]
        self.char_for_ids = tf.placeholder(
            tf.int32, shape=[None, None, None], name='char_for_ids'
        )  # forward char indices, shape: [batch_size, word_max_len, char_max_len]
        self.char_rev_ids = tf.placeholder(
            tf.int32, shape=[None, None, None], name='char_rev_ids'
        )  # reversed char indices, shape: [batch_size, word_max_len, char_max_len]
        self.char_pos_ids = tf.placeholder(
            tf.int32, shape=[None, None], name='char_pos_ids'
        )  # char position indices, shape: [batch_size*word_max_len, char_max_len]
        self.tag_ids = tf.placeholder(
            tf.int32, shape=[None, None],
            name='tag_ids')  # NER tag indices, shape: [batch_size, word_max_len]
        self.tag_id_trans = tf.placeholder(
            tf.int32, shape=[None, None, None], name='tag_id_trans'
        )  # indices into the tag transition matrix, shape: [batch_size, word_max_len+1, 2]
        self.tag_id_index = tf.placeholder(
            tf.int32, shape=[None, None, None],
            name='tag_id_index')  # shape: [batch_size, word_max_len, 2]

        # Final input (features of all words)
        input_dim = 0
        inputs = []

        #
        # Word input vectors
        #
        if word_dim:
            input_dim += word_dim
            with tf.device("/cpu:0"):
                word_layer = EmbeddingLayer(n_words,
                                            word_dim,
                                            name='word_layer')
                word_input = word_layer.link(self.word_ids)
                inputs.append(word_input)

        #
        # POS tag input vectors
        #
        if pos_dim:
            input_dim += pos_dim
            with tf.device("/cpu:0"):
                pos_layer = EmbeddingLayer(n_pos_tags,
                                           pos_dim,
                                           name='pos_layer')
                pos_input = pos_layer.link(self.pos_ids)
                inputs.append(pos_input)

        #
        # Character input vectors
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            with tf.device("/cpu:0"):
                char_for_embedding_batch = char_layer.link(self.char_for_ids)
                char_rev_embedding_batch = char_layer.link(self.char_rev_ids)
            shape_for = tf.shape(char_for_embedding_batch)
            # reshape from [batch_size, word_max_len, char_max_len, char_dim]
            # to [batch_size*word_max_len, char_max_len, char_dim]
            char_for_embedding = tf.reshape(
                char_for_embedding_batch,
                (shape_for[0] * shape_for[1], shape_for[2], shape_for[3]))
            shape_rev = tf.shape(char_rev_embedding_batch)
            char_rev_embedding = tf.reshape(
                char_rev_embedding_batch,
                (shape_rev[0] * shape_rev[1], shape_rev[2], shape_rev[3]))
            char_lstm_for_states = char_lstm_for.link(char_for_embedding)
            char_lstm_rev_states = char_lstm_rev.link(char_rev_embedding)
            # Element 1 of the LSTM result is transposed so that the first
            # two axes are swapped before gathering by char_pos_ids.
            char_lstm_for_h_trans = tf.transpose(char_lstm_for_states[1],
                                                 (1, 0, 2),
                                                 name='char_lstm_for_h_trans')
            char_lstm_rev_h_trans = tf.transpose(char_lstm_rev_states[1],
                                                 (1, 0, 2),
                                                 name='char_lstm_rev_h_trans')
            char_for_output = tf.gather_nd(char_lstm_for_h_trans,
                                           self.char_pos_ids,
                                           name='char_for_output')
            char_rev_output = tf.gather_nd(char_lstm_rev_h_trans,
                                           self.char_pos_ids,
                                           name='char_rev_output')
            # Back to one feature vector per (sentence, word).
            char_for_output_batch = tf.reshape(
                char_for_output, (shape_for[0], shape_for[1], char_lstm_dim))
            char_rev_output_batch = tf.reshape(
                char_rev_output, (shape_rev[0], shape_rev[1], char_lstm_dim))
            inputs.append(char_for_output_batch)
            if char_bidirect:
                inputs.append(char_rev_output_batch)
                input_dim += char_lstm_dim

        inputs = tf.concat(inputs, axis=-1)

        # Apply dropout to the final input
        assert dropout < 1 and 0.0 <= dropout
        if dropout:
            input_train = tf.nn.dropout(inputs, 1 - dropout)
            if is_train:
                inputs = input_train
        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=True,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=True,
                             name='word_lstm_rev')
        # Output of the forward hidden layer
        word_states_for = word_lstm_for.link(inputs)
        word_lstm_for_output = tf.transpose(word_states_for[1], (1, 0, 2),
                                            name='word_lstm_for_h_trans')

        # Output of the backward hidden layer
        # NOTE(review): word_pos_ids is passed here as the per-sentence
        # seq_lengths, while the CRF code below treats it as a last index
        # (it adds 1 before slicing) -- confirm what the feeder supplies.
        inputs_rev = tf.reverse_sequence(inputs,
                                         self.word_pos_ids,
                                         seq_dim=1,
                                         batch_dim=0)
        word_states_rev = word_lstm_rev.link(inputs_rev)
        word_lstm_rev_h_trans = tf.transpose(word_states_rev[1], (1, 0, 2),
                                             name='word_lstm_rev_h_trans')
        word_lstm_rev_output = tf.reverse_sequence(word_lstm_rev_h_trans,
                                                   self.word_pos_ids,
                                                   seq_dim=1,
                                                   batch_dim=0)
        if word_bidirect:
            final_output = tf.concat(
                [word_lstm_for_output, word_lstm_rev_output], axis=-1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_lstm_for_output
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer')
        tags_scores = final_layer.link(final_output)

        if not crf:
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.tag_ids, logits=tags_scores, name='xentropy')
            cost = tf.reduce_mean(cross_entropy, name='xentropy_mean')
        else:
            # Two extra rows/columns hold the begin and end pseudo-tags.
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)

            # Compute the score of the given tag path, one sentence at a time
            def recurrence_real_score(prev, obs):
                tags_score = obs[0]
                tag_id_index_ = obs[1]
                tag_id_trans_ = obs[2]
                word_pos_ = obs[3] + 1
                # Cut the padded rows off before gathering.
                tags_score_slice = tags_score[0:word_pos_, :]
                tag_id_index_slice = tag_id_index_[0:word_pos_, :]
                tag_id_trans_slice = tag_id_trans_[0:(word_pos_ + 1), :]
                real_path_score = tf.reduce_sum(
                    tf.gather_nd(tags_score_slice, tag_id_index_slice))
                real_path_score += tf.reduce_sum(
                    tf.gather_nd(transitions, tag_id_trans_slice))
                return tf.reshape(real_path_score, [])

            real_path_score_list = tf.scan(fn=recurrence_real_score,
                                           elems=[
                                               tags_scores, self.tag_id_index,
                                               self.tag_id_trans,
                                               self.word_pos_ids
                                           ],
                                           initializer=0.0)

            # Score over all paths, one sentence at a time
            def recurrence_all_path(prev, obs):
                tags_score = obs[0]
                word_pos_ = obs[1] + 1
                tags_score_slice = tags_score[0:word_pos_, :]
                s_len = tf.shape(tags_score_slice)[0]
                # Pad with the begin/end columns and add begin/end rows.
                obvs = tf.concat(
                    [tags_score_slice, small * tf.ones((s_len, 2))], axis=1)
                observations = tf.concat([b_s, obvs, e_s], axis=0)
                all_paths_scores = forward(observations, transitions)
                return tf.reshape(all_paths_scores, [])

            all_paths_scores_list = tf.scan(
                fn=recurrence_all_path,
                elems=[tags_scores, self.word_pos_ids],
                initializer=0.0)

            cost = -tf.reduce_mean(real_path_score_list -
                                   all_paths_scores_list)
        # Prediction output
        if not crf:
            f_score = tf.nn.softmax(tags_scores)
        else:

            def recurrence_predict(prev, obs):
                tags_score = obs[0]
                word_pos_ = obs[1] + 1
                tags_score_slice = tags_score[0:word_pos_, :]
                s_len = tf.shape(tags_score_slice)[0]
                obvs = tf.concat(
                    [tags_score_slice, small * tf.ones((s_len, 2))], axis=1)
                observations = tf.concat([b_s, obvs, e_s], axis=0)
                all_paths_scores = forward(observations,
                                           transitions,
                                           viterbi=True,
                                           return_alpha=False,
                                           return_best_sequence=True)
                # Zero-pad the decoded sequence so every sentence yields a
                # vector of the same length as the scan initializer.
                all_paths_scores = tf.concat([
                    all_paths_scores,
                    tf.zeros([tf.shape(tags_score)[0] - s_len], tf.int32)
                ],
                                             axis=0)
                return all_paths_scores

            f_score = tf.scan(fn=recurrence_predict,
                              elems=[tags_scores, self.word_pos_ids],
                              initializer=tf.zeros(
                                  [tf.shape(tags_scores)[1] + 2], tf.int32))

        # Choose the optimization method
        tvars = tf.trainable_variables()
        grads = tf.gradients(cost, tvars)
        if clip_norm > 0:
            grads, _ = tf.clip_by_global_norm(grads, clip_norm)

        if lr_method == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(lr_rate)
        elif lr_method == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(lr_rate)
        elif lr_method == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(lr_rate)
        elif lr_method == 'adam':
            optimizer = tf.train.AdamOptimizer(lr_rate)
        elif lr_method == 'rmsprop':
            optimizer = tf.train.RMSPropOptimizer(lr_rate)
        else:
            # Raising a plain string is itself a TypeError and hides the
            # message; raise a real exception instead.
            raise NotImplementedError(
                "Not implemented learning method: %s" % lr_method)

        train_op = optimizer.apply_gradients(zip(grads, tvars))

        # Log the cost so its trend can be visualized in TensorBoard
        tf.summary.scalar('loss', cost)

        return cost, f_score, train_op
Beispiel #5
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              model_type,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        layer_weighting = "fixed"
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)

        print "-------------------------------MODEL INFO---------------------------------------"
        print "** model_type", model_type
        print "** n_words, n_chars:", n_words, n_chars
        print "** self.feature_maps:"
        for f in self.feature_maps:
            print f["name"], f
        print "** self.tag_maps:"
        for tm in self.tag_maps:
            print tm
        print "---------------------------------------------------------------------------------"

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')

        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')

        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        features_ids = []
        for f in self.feature_maps:
            features_ids.append(T.ivector(name=f['name'] + '_ids'))

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            print "** input_dim (input_dim += word_dim)", input_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            print "** input_dim (input_dim += char_lstm_dim)", input_dim

            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim
                print "** input_dim (input_dim += char_lstm_dim: char_bidirect)", input_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            print "** input_dim (input_dim += cap_dim)", input_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        f_layers = []
        for ilayer in range(len(self.feature_maps)):
            f = self.feature_maps[ilayer]
            input_dim += f['dim']
            print "** input_dim (input_dim += f['dim'])", input_dim

            af_layer = EmbeddingLayer(len(f['id_to_ftag']),
                                      f['dim'],
                                      name=f['name'] + '_layer')
            f_layers.append(af_layer)
            inputs.append(af_layer.link(features_ids[ilayer]))

        # Prepare final input
        inputs = T.concatenate(inputs, axis=1)
        # inputs_nodropout = inputs

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        assert model_type in {
            "struct", "struct_mlp", "struct_mlp2", "multilayer", "single"
        }

        # Network parameters: Part 1 (Common parameters)
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)

        for af_layer in f_layers:
            self.add_component(af_layer)
            params.extend(af_layer.params)

        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)

        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)

        if model_type == "multilayer" or model_type == "single":
            tags_scores_list = []
            tag_ids_list = []
            cost_list = []

            observations_list = []
            transitions_list = []

            prev_input_dim = input_dim
            prev_ntags = 0
            prev_tags_cores = None
            previous_inputs = inputs

            for ilayer in range(len(self.tag_maps)):
                inputs_i = previous_inputs if prev_tags_cores == None else T.concatenate(
                    [previous_inputs, prev_tags_cores], axis=1)
                previous_inputs = inputs_i
                input_dim_i = prev_input_dim + prev_ntags
                print "input_dim_i for layer %d: %d" % (ilayer, input_dim_i)

                word_lstm_for_i = LSTM(input_dim_i,
                                       word_lstm_dim,
                                       with_batch=False,
                                       name='word_lstm_for' + str(ilayer))
                word_lstm_rev_i = LSTM(input_dim_i,
                                       word_lstm_dim,
                                       with_batch=False,
                                       name='word_lstm_rev' + str(ilayer))
                word_lstm_for_i.link(inputs_i)
                word_lstm_rev_i.link(inputs_i[::-1, :])
                word_for_output_i = word_lstm_for_i.h
                word_rev_output_i = word_lstm_rev_i.h[::-1, :]

                if word_bidirect:
                    final_output_i = T.concatenate(
                        [word_for_output_i, word_rev_output_i], axis=1)
                    tanh_layer_i = HiddenLayer(2 * word_lstm_dim,
                                               word_lstm_dim,
                                               name='tanh_layer' + str(ilayer),
                                               activation='tanh')
                    final_output_i = tanh_layer_i.link(final_output_i)
                else:
                    final_output_i = word_for_output_i

                n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])

                final_layer_i = HiddenLayer(
                    word_lstm_dim,
                    n_tags_i,
                    name='final_layer' + str(ilayer),
                    activation=(None if crf else 'softmax'))
                tags_scores_i = final_layer_i.link(final_output_i)
                tag_ids_i = T.ivector(name='tag_ids' +
                                      str(ilayer))  # input tags of layer i

                # No CRF
                if not crf:
                    cost_i = T.nnet.categorical_crossentropy(
                        tags_scores_i, tag_ids_i).mean()
                # CRF
                else:
                    transitions_i = shared((n_tags_i + 2, n_tags_i + 2),
                                           'transitions' + str(ilayer))
                    small1 = -1000
                    b_s1 = np.array([[small1] * n_tags_i + [0, small1]
                                     ]).astype(np.float32)

                    e_s1 = np.array([[small1] * n_tags_i + [small1, 0]
                                     ]).astype(np.float32)

                    observations_i = T.concatenate(
                        [tags_scores_i, small1 * T.ones((s_len, 2))], axis=1)
                    observations_i = T.concatenate(
                        [b_s1, observations_i, e_s1], axis=0)

                    # Score from tags
                    real_path_score1 = tags_scores_i[T.arange(s_len),
                                                     tag_ids_i].sum()

                    # Score from transitions
                    b_id1 = theano.shared(
                        value=np.array([n_tags_i], dtype=np.int32))
                    e_id1 = theano.shared(
                        value=np.array([n_tags_i + 1], dtype=np.int32))
                    padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1],
                                                     axis=0)
                    real_path_score1 += transitions_i[
                        padded_tags_ids1[T.arange(s_len + 1)],
                        padded_tags_ids1[T.arange(s_len + 1) + 1]].sum()

                    all_paths_scores1 = forward(observations_i, transitions_i)

                    cost_i = -(real_path_score1 - all_paths_scores1)

                    observations_list.append(observations_i)
                    transitions_list.append(transitions_i)

                prev_input_dim = input_dim_i
                prev_ntags = n_tags_i
                prev_tags_cores = tags_scores_i * 1

                cost_list.append(cost_i)  # add cost of layer i into cost list
                tags_scores_list.append(tags_scores_i)
                tag_ids_list.append(tag_ids_i)

                # Network parameters: Part 2 (add parameters of multilayer architectures)

                self.add_component(word_lstm_for_i)
                params.extend(word_lstm_for_i.params)  #1

                if word_bidirect:
                    self.add_component(word_lstm_rev_i)
                    params.extend(word_lstm_rev_i.params)  #2

                self.add_component(final_layer_i)
                params.extend(final_layer_i.params)  #3

                if crf:
                    self.add_component(transitions_i)
                    params.append(transitions_i)  #4

                if word_bidirect:
                    self.add_component(tanh_layer_i)
                    params.extend(tanh_layer_i.params)  #5

            # end for loop

        elif model_type == "struct" or model_type.startswith("struct_mlp"):
            # begin step 1: Using BI-LSTM to encode the sequence

            word_lstm_for = LSTM(input_dim,
                                 word_lstm_dim,
                                 with_batch=False,
                                 name='word_lstm_for')
            word_lstm_rev = LSTM(input_dim,
                                 word_lstm_dim,
                                 with_batch=False,
                                 name='word_lstm_rev')

            word_lstm_for.link(inputs)
            word_lstm_rev.link(inputs[::-1, :])
            word_for_output = word_lstm_for.h
            word_rev_output = word_lstm_rev.h[::-1, :]
            if word_bidirect:
                lstm_output = T.concatenate([word_for_output, word_rev_output],
                                            axis=1)
                tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                         word_lstm_dim,
                                         name='tanh_layer',
                                         activation='tanh')
                lstm_output = tanh_layer.link(lstm_output)
            else:
                lstm_output = word_for_output

            # end step 1: lstm_output (aliased below as final_output) holds the hidden states;
            # presumably one row per token with word_lstm_dim columns -- TODO confirm

            prev_ntags = 0
            tags_scores_list = []
            prev_tags_cores = None
            final_layer_list = []
            final_output = lstm_output
            mlp_list = []

            if model_type == "struct":
                for ilayer in range(0, len(self.tag_maps)):
                    n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
                    final_output = final_output if prev_tags_cores == None else T.concatenate(
                        [final_output, prev_tags_cores], axis=1)
                    final_layer_i = HiddenLayer(
                        word_lstm_dim + prev_ntags,
                        n_tags_i,
                        name='final_layer_' + str(ilayer),
                        activation=(None if crf else 'softmax'))
                    tags_scores_i = final_layer_i.link(final_output)

                    prev_ntags += n_tags_i
                    prev_tags_cores = tags_scores_i
                    tags_scores_list.append(tags_scores_i)
                    final_layer_list.append(final_layer_i)
            elif model_type.startswith("struct_mlp"):

                for ilayer in range(0, len(self.tag_maps)):
                    n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
                    final_output = final_output if prev_tags_cores == None else T.concatenate(
                        [final_output, prev_tags_cores], axis=1)

                    if model_type == "struct_mlp2":
                        mlp_sizes = [
                            word_lstm_dim + prev_ntags, word_lstm_dim,
                            word_lstm_dim
                        ]
                    else:
                        mlp_sizes = [word_lstm_dim + prev_ntags, word_lstm_dim]

                    mlp_input = final_output
                    for j in range(len(mlp_sizes) - 1):
                        mlp_layer = HiddenLayer(mlp_sizes[j],
                                                mlp_sizes[j + 1],
                                                name="mlp" + str(j + 1) +
                                                "_layer_" + str(ilayer),
                                                activation="tanh")
                        mlp_input = mlp_layer.link(mlp_input)
                        mlp_list.append(mlp_layer)
                    final_layer_i = HiddenLayer(
                        word_lstm_dim,
                        n_tags_i,
                        name='final_layer_' + str(ilayer),
                        activation=(None if crf else 'softmax'))
                    tags_scores_i = final_layer_i.link(mlp_input)

                    # # unroll version
                    # mlp1_layer_i = HiddenLayer(word_lstm_dim + prev_ntags, word_lstm_dim,
                    #                            name="mlp1_layer_" + str(ilayer), activation="tanh")
                    # mlp1_layer_i_out = mlp1_layer_i.link(final_output)
                    #
                    # mlp2_layer_i = HiddenLayer(word_lstm_dim, word_lstm_dim,
                    #                            name="mlp2_layer_" + str(ilayer), activation="tanh")
                    # mlp2_layer_i_out = mlp2_layer_i.link(mlp1_layer_i_out)
                    # mlp_list.append(mlp1_layer_i)
                    # mlp_list.append(mlp2_layer_i)
                    #
                    # final_layer_i = HiddenLayer(word_lstm_dim, n_tags_i, name='final_layer_' + str(ilayer),
                    #                             activation=(None if crf else 'softmax'))
                    # tags_scores_i = final_layer_i.link(mlp2_layer_i_out)

                    prev_ntags += n_tags_i
                    prev_tags_cores = tags_scores_i
                    tags_scores_list.append(tags_scores_i)
                    final_layer_list.append(final_layer_i)
            else:
                print(model_type, " is not exits !")
                raise

            # # unroll code
            # n_tags_0 = len(self.tag_maps[0]['id_to_tag'])
            # final_layer_0 = HiddenLayer(word_lstm_dim, n_tags_0, name='final_layer_0', activation=(None if crf else 'softmax'))
            # tags_scores_0 = final_layer_0.link(final_output)
            #
            # n_tags_1 = len(self.tag_maps[1]['id_to_tag'])
            # final_layer_1 = HiddenLayer(word_lstm_dim + n_tags_0, n_tags_1, name='final_layer_1', activation=(None if crf else 'softmax'))
            # final_output = T.concatenate( [final_output, tags_scores_0], axis=1 )
            # tags_scores_1 = final_layer_1.link(final_output)
            #
            # n_tags_2 = len(self.tag_maps[2]['id_to_tag'])
            # final_layer_2 = HiddenLayer(word_lstm_dim + n_tags_0 + n_tags_1, n_tags_2, name='final_layer_2',
            #                         activation=(None if crf else 'softmax'))
            # final_output = T.concatenate([final_output, tags_scores_1], axis=1)
            # tags_scores_2 = final_layer_2.link(final_output)
            # tags_scores_list = [tags_scores_0, tags_scores_1, tags_scores_2]

            tag_ids_list = []
            observations_list = []
            transitions_list = []
            cost_list = []

            for ilayer in range(0, len(self.tag_maps)):
                tag_ids_i = T.ivector(name='tag_ids' +
                                      str(ilayer))  # input tags
                tag_ids_list.append(tag_ids_i)
                tags_scores_i = tags_scores_list[ilayer]
                n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
                # No CRF
                if not crf:
                    cost_i = T.nnet.categorical_crossentropy(
                        tags_scores_i, tag_ids_i).mean()
                # CRF
                else:
                    transitions_i = shared((n_tags_i + 2, n_tags_i + 2),
                                           'transitions' + str(ilayer))
                    small1 = -1000
                    b_s1 = np.array([[small1] * n_tags_i + [0, small1]
                                     ]).astype(np.float32)
                    e_s1 = np.array([[small1] * n_tags_i + [small1, 0]
                                     ]).astype(np.float32)
                    observations_i = T.concatenate(
                        [tags_scores_i, small1 * T.ones((s_len, 2))], axis=1)
                    observations_i = T.concatenate(
                        [b_s1, observations_i, e_s1], axis=0)

                    # Score from tags
                    real_path_score1 = tags_scores_i[T.arange(s_len),
                                                     tag_ids_i].sum()

                    # Score from transitions
                    b_id1 = theano.shared(
                        value=np.array([n_tags_i], dtype=np.int32))
                    e_id1 = theano.shared(
                        value=np.array([n_tags_i + 1], dtype=np.int32))
                    padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1],
                                                     axis=0)
                    real_path_score1 += transitions_i[
                        padded_tags_ids1[T.arange(s_len + 1)],
                        padded_tags_ids1[T.arange(s_len + 1) + 1]].sum()

                    all_paths_scores1 = forward(observations_i, transitions_i)

                    cost_i = -(real_path_score1 - all_paths_scores1)

                    observations_list.append(observations_i)
                    transitions_list.append(transitions_i)

                cost_list.append(cost_i)  # add cost of layer i into cost list

            # Network parameters: Part 2 (add parameters of struct architectures)

            self.add_component(word_lstm_for)
            params.extend(word_lstm_for.params)

            if word_bidirect:
                self.add_component(word_lstm_rev)
                params.extend(word_lstm_rev.params)

            for mlp_layer in mlp_list:
                self.add_component(mlp_layer)
                params.extend(mlp_layer.params)

            for final_layer in final_layer_list:
                self.add_component(final_layer)
                params.extend(final_layer.params)

            # # unroll code
            # self.add_component(final_layer_0)
            # params.extend(final_layer_0.params)
            #
            # self.add_component(final_layer_1)
            # params.extend(final_layer_1.params)
            #
            # self.add_component(final_layer_2)
            # params.extend(final_layer_2.params)

            if crf:
                for transitions in transitions_list:
                    self.add_component(transitions)
                    params.append(transitions)

            if word_bidirect:
                self.add_component(tanh_layer)
                params.extend(tanh_layer.params)

        # elif model_type == "multilayer_original":
        #     print "** input_dim FOR LAYER 0 ", input_dim
        #     # LSTM for words
        #     word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
        #                          name='word_lstm_for')
        #     word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
        #                          name='word_lstm_rev')
        #
        #     word_lstm_for.link(inputs)
        #     word_lstm_rev.link(inputs[::-1, :])
        #     word_for_output = word_lstm_for.h
        #     word_rev_output = word_lstm_rev.h[::-1, :]
        #     if word_bidirect:
        #         final_output = T.concatenate(
        #             [word_for_output, word_rev_output],
        #             axis=1
        #         )
        #         tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
        #                                  name='tanh_layer', activation='tanh')
        #         final_output = tanh_layer.link(final_output)
        #     else:
        #         final_output = word_for_output
        #
        #     # Sentence to Named Entity tags - Score
        #     n_tags = len(self.tag_maps[0]['id_to_tag'])
        #
        #     final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
        #                               activation=(None if crf else 'softmax'))
        #     tags_scores = final_layer.link(final_output)
        #     tag_ids = T.ivector(name='tag_ids0')  # input tags of layer i
        #
        #     # No CRF
        #     if not crf:
        #         cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        #     # CRF
        #     else:
        #         transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
        #
        #         small = -1000
        #         b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
        #         e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
        #         observations = T.concatenate(
        #             [tags_scores, small * T.ones((s_len, 2))],
        #             axis=1
        #         )
        #         observations = T.concatenate(
        #             [b_s, observations, e_s],
        #             axis=0
        #         )
        #
        #         # Score from tags
        #         real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
        #
        #         # Score from transitions
        #         b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
        #         e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
        #         padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
        #         real_path_score += transitions[
        #             padded_tags_ids[T.arange(s_len + 1)],
        #             padded_tags_ids[T.arange(s_len + 1) + 1]
        #         ].sum()
        #
        #         all_paths_scores = forward(observations, transitions)
        #         cost = - (real_path_score - all_paths_scores)
        #
        #     print "cost: ", cost
        #     # Network parameters
        #
        #
        #     self.add_component(word_lstm_for)
        #     params.extend(word_lstm_for.params)  #1
        #
        #     if word_bidirect:
        #         self.add_component(word_lstm_rev)
        #         params.extend(word_lstm_rev.params)  #2
        #
        #     self.add_component(final_layer)
        #     params.extend(final_layer.params)  #3
        #
        #     if crf:
        #         self.add_component(transitions)
        #         params.append(transitions)  #4
        #
        #     if word_bidirect:
        #         self.add_component(tanh_layer)
        #         params.extend(tanh_layer.params)  #5
        #
        #     #
        #     #    layer 1 to n
        #     #
        #     tags_scores_list = [tags_scores]
        #     tag_ids_list = [tag_ids]
        #     cost_list = [cost]
        #     observations_list = [observations]
        #     transitions_list = [transitions]
        #     prev_input_dim = input_dim
        #     prev_ntags = n_tags
        #     prev_tags_cores = tags_scores * 1
        #
        #     for ilayer in range(1, len(self.tag_maps)):
        #         inputs_i = previous_inputs * 1
        #         inputs_i.append(prev_tags_cores)
        #         previous_inputs = inputs_i * 1
        #
        #         inputs_i = T.concatenate(inputs_i, axis=1)
        #         input_dim_i = prev_input_dim + prev_ntags
        #
        #         word_lstm_for_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_for' + str(ilayer))
        #         word_lstm_rev_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_rev' + str(ilayer))
        #         word_lstm_for_i.link(inputs_i)
        #         word_lstm_rev_i.link(inputs_i[::-1, :])
        #         word_for_output_i = word_lstm_for_i.h
        #         word_rev_output_i = word_lstm_rev_i.h[::-1, :]
        #
        #         if word_bidirect:
        #             final_output_i = T.concatenate(
        #                 [word_for_output_i, word_rev_output_i],
        #                 axis=1
        #             )
        #             tanh_layer_i = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
        #                                        name='tanh_layer' + str(ilayer), activation='tanh')
        #             final_output_i = tanh_layer_i.link(final_output_i)
        #         else:
        #             final_output_i = word_for_output_i
        #
        #         n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
        #
        #         final_layer_i = HiddenLayer(word_lstm_dim, n_tags_i, name='final_layer' + str(ilayer),
        #                                     activation=(None if crf else 'softmax'))
        #         tags_scores_i = final_layer_i.link(final_output_i)
        #         tags_scores_list.append(tags_scores_i)
        #         tag_ids_i = T.ivector(name='tag_ids' + str(ilayer))  # input tags
        #         tag_ids_list.append(tag_ids_i)
        #
        #         # No CRF
        #         if not crf:
        #             cost_i = T.nnet.categorical_crossentropy(tags_scores_i, tag_ids_i).mean()
        #         # CRF
        #         else:
        #             transitions_i = shared((n_tags_i + 2, n_tags_i + 2), 'transitions' + str(ilayer))
        #             small1 = -1000
        #             b_s1 = np.array([[small1] * n_tags_i + [0, small1]]).astype(np.float32)
        #             e_s1 = np.array([[small1] * n_tags_i + [small1, 0]]).astype(np.float32)
        #             observations_i = T.concatenate([tags_scores_i, small1 * T.ones((s_len, 2))], axis=1)
        #             observations_i = T.concatenate([b_s1, observations_i, e_s1], axis=0)
        #
        #             # Score from tags
        #             real_path_score1 = tags_scores_i[T.arange(s_len), tag_ids_i].sum()
        #
        #             # Score from transitions
        #             b_id1 = theano.shared(value=np.array([n_tags_i], dtype=np.int32))
        #             e_id1 = theano.shared(value=np.array([n_tags_i + 1], dtype=np.int32))
        #             padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1], axis=0)
        #             real_path_score1 += transitions_i[
        #                 padded_tags_ids1[T.arange(s_len + 1)],
        #                 padded_tags_ids1[T.arange(s_len + 1) + 1]
        #             ].sum()
        #
        #             all_paths_scores1 = forward(observations_i, transitions_i)
        #
        #             cost_i = - (real_path_score1 - all_paths_scores1)
        #
        #             observations_list.append(observations_i)
        #             transitions_list.append(transitions_i)
        #
        #         prev_input_dim = input_dim_i
        #         prev_ntags = n_tags_i
        #         prev_tags_cores = tags_scores_i * 1
        #         cost_list.append(cost_i)  # add cost of layer i into cost list
        #
        #         # add parameters
        #
        #         self.add_component(word_lstm_for_i)
        #         params.extend(word_lstm_for_i.params)
        #
        #         if word_bidirect:
        #             self.add_component(word_lstm_rev_i)
        #             params.extend(word_lstm_rev_i.params)
        #
        #         self.add_component(final_layer_i)
        #         params.extend(final_layer_i.params)
        #
        #         if crf:
        #             self.add_component(transitions_i)
        #             params.append(transitions_i)
        #
        #         if word_bidirect:
        #             self.add_component(tanh_layer_i)
        #             params.extend(tanh_layer_i.params)
        #
        #     # end for loop

        if layer_weighting == "fixed":
            if len(self.tag_maps) == 2:
                cost_weights = np.array([0.4, 0.6])
            elif len(self.tag_maps) == 3:
                cost_weights = np.array([0.4, 0.3, 0.3])
            else:
                cost_weights = np.ones(
                    (len(self.tag_maps), )) / len(self.tag_maps)
            costall = np.sum(cost_weights * np.array(cost_list))

        else:
            # https://groups.google.com/forum/#!topic/theano-users/XDG6MM83grI
            weights = np.ones((len(self.tag_maps), )) / len(self.tag_maps)
            cost_weights = theano.shared(weights.astype(theano.config.floatX),
                                         name="layer_weights")
            layer_weights = theano.tensor.nnet.sigmoid(cost_weights)
            params.extend([cost_weights])
            xx = theano.tensor.mul(layer_weights,
                                   theano.tensor.as_tensor_variable(cost_list))
            costall = theano.tensor.sum(xx)

        # Prepare train and eval inputs
        eval_inputs = []

        if word_dim:
            eval_inputs.append(word_ids)

        for ilayer in range(len(self.feature_maps)):
            eval_inputs.append(features_ids[ilayer])

        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)

        if cap_dim:
            eval_inputs.append(cap_ids)

        train_inputs = eval_inputs + tag_ids_list

        print "-- train_inputs: ",
        print train_inputs  # [word_ids, pos_ids, chunk_ids, wh_ids, if_ids, s_ids, tag_ids, tag_ids1, tag_ids2]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            # print "train_inputs[9]", train_inputs[9]
            print "-- len(cost_list): ", len(cost_list)
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, costall, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=costall,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))

        else:
            f_train = None

        # Compile evaluation function
        tags_scores_out = tags_scores_list
        print "-- len(tags_scores_list): ", len(tags_scores_list)

        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores_out,
                givens=({
                    is_train: np.cast['int32'](0)
                } if dropout else {})  #,
                # on_unused_input='ignore'
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward_n(zip(observations_list, transitions_list),
                                  viterbi=True,
                                  return_alpha=False,
                                  return_best_sequence=True),
                givens=({
                    is_train: np.cast['int32'](0)
                } if dropout else {})  #,
                # on_unused_input='ignore'
            )

        from pprint import pprint
        print "--------------------------------------------------------------"
        pprint(self.components)

        return f_train, f_eval  # return f_train, f_eval, f_test
Beispiel #6
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                #for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)

            #s_len # of words in sentence
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            #add padding to exist tag_scores(sentencelength * tag_ids)
            observations = T.concatenate([b_s, observations, e_s], axis=0)
            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)

            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
Beispiel #7
0
    def ready(self):
        """
        Assemble the symbolic graph of the classifier.

        Embeds the input ``x``, runs it through ``args.depth`` stacked
        feature-extraction layers (``args.layer`` selects LSTM, StrCNN or
        RCNN), concatenates one summary vector per layer and feeds the result
        to a bias-free softmax layer.  The symbolic inputs, prediction, NLL
        loss, L2 penalty, layer list and parameter list are stored on
        ``self``; nothing is returned.

        Raises:
            Exception: if ``args.layer`` is not one of the supported types.
        """
        args = self.args
        embedding_layer = self.embedding_layer
        n_hidden = self.n_hidden = args.hidden_dim
        n_in = self.n_in = embedding_layer.n_d
        dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX))

        # symbolic inputs: x is length * batch_size, y is batch_size
        x = self.x = T.imatrix('x')
        y = self.y = T.ivector('y')

        # embeddings of the flattened ids, (len * batch_size) * n_in
        flat_embs = embedding_layer.forward(x.ravel())
        self.slices = flat_embs

        # back to a 3-d tensor, len * batch_size * n_in
        embs = flat_embs.reshape((x.shape[0], x.shape[1], n_in))

        # stack the feature extraction layers
        pooling = args.pooling
        depth = args.depth
        layers = self.layers = []
        hidden = apply_dropout(embs, dropout, v2=True)
        size = 0
        summaries = []
        activation = get_activation_by_name(args.act)
        for level in range(depth):
            layer_type = args.layer.lower()
            # only the first layer reads the embeddings directly
            fan_in = n_in if level == 0 else n_hidden
            if layer_type == "lstm":
                layer = LSTM(n_in=fan_in, n_out=n_hidden)
            elif layer_type == "strcnn":
                layer = StrCNN(n_in=fan_in,
                               n_out=n_hidden,
                               activation=activation,
                               decay=args.decay,
                               order=args.order)
            elif layer_type == "rcnn":
                layer = RCNN(n_in=fan_in,
                             n_out=n_hidden,
                             activation=activation,
                             order=args.order,
                             mode=args.mode)
            else:
                raise Exception("unknown layer type: {}".format(args.layer))

            layers.append(layer)
            hidden = layer.forward_all(hidden)
            # one summary per layer: sum over axis 0 when pooling,
            # otherwise the last slice along axis 0
            summaries.append(T.sum(hidden, axis=0) if pooling else hidden[-1])
            hidden = apply_dropout(hidden, dropout)
            size += n_hidden

        # the final representation concatenates the summaries of all layers;
        # with pooling the sums are divided by x.shape[0] to get an average
        softmax_input = T.concatenate(summaries, axis=1)
        if pooling:
            softmax_input = softmax_input / x.shape[0]
        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        # softmax output layer on top of the feature representation
        output_layer = Layer(n_in=size,
                             n_out=self.nclasses,
                             activation=softmax,
                             has_bias=False)
        layers.append(output_layer)

        for idx, lyr in enumerate(layers):
            say("layer {}: n_in={}\tn_out={}\n".format(idx, lyr.n_in,
                                                       lyr.n_out))

        # output of the softmax layer for y given x
        self.p_y_given_x = output_layer.forward(softmax_input)
        self.pred = T.argmax(self.p_y_given_x, axis=1)
        self.nll_loss = T.mean(
            T.nnet.categorical_crossentropy(self.p_y_given_x, y))

        # collect parameters and build the L2 penalty over all of them
        self.l2_sqr = None
        self.params = []
        for lyr in layers:
            self.params += lyr.params
        for param in self.params:
            penalty = args.l2_reg * T.sum(param**2)
            if self.l2_sqr is None:
                self.l2_sqr = penalty
            else:
                self.l2_sqr = self.l2_sqr + penalty

        nparams = sum(
            len(param.get_value(borrow=True).ravel())
            for param in self.params)
        say("total # parameters: {}\n".format(nparams))
Beispiel #8
0
    def ready(self):
        """
        Construct the symbolic computation graph of the model.

        The embedded input is passed through ``args.depth`` stacked layers
        whose type is chosen by ``args.layer`` (LSTM, StrCNN or RCNN).  Each
        layer contributes one summary vector; their concatenation goes into a
        bias-free softmax layer.  Inputs, prediction, NLL loss, L2 penalty,
        layers and parameters are attached to ``self``; the method returns
        nothing.

        Raises:
            Exception: if ``args.layer`` names an unsupported layer type.
        """
        args = self.args
        embedding_layer = self.embedding_layer
        n_hidden = self.n_hidden = args.hidden_dim
        n_in = self.n_in = embedding_layer.n_d
        dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX))

        # x is length * batch_size, y is batch_size
        x = self.x = T.imatrix('x')
        y = self.y = T.ivector('y')

        # word embeddings of the flattened ids, (len * batch_size) * n_in
        flat_embs = embedding_layer.forward(x.ravel())
        self.slices = flat_embs

        # reshape into a 3-d tensor, len * batch_size * n_in
        embs = flat_embs.reshape((x.shape[0], x.shape[1], n_in))

        # stacking the feature extraction layers
        pooling = args.pooling
        depth = args.depth
        layers = self.layers = []
        hidden = apply_dropout(embs, dropout, v2=True)
        size = 0
        summaries = []
        activation = get_activation_by_name(args.act)
        for level in range(depth):
            layer_type = args.layer.lower()
            # the first layer consumes embeddings, later ones hidden states
            fan_in = n_in if level == 0 else n_hidden
            if layer_type == "lstm":
                layer = LSTM(n_in=fan_in, n_out=n_hidden)
            elif layer_type == "strcnn":
                layer = StrCNN(n_in=fan_in,
                               n_out=n_hidden,
                               activation=activation,
                               decay=args.decay,
                               order=args.order)
            elif layer_type == "rcnn":
                layer = RCNN(n_in=fan_in,
                             n_out=n_hidden,
                             activation=activation,
                             order=args.order,
                             mode=args.mode)
            else:
                raise Exception("unknown layer type: {}".format(args.layer))

            layers.append(layer)
            hidden = layer.forward_all(hidden)
            # summary of this layer: sum over axis 0 when pooling,
            # otherwise the last slice along axis 0
            summaries.append(T.sum(hidden, axis=0) if pooling else hidden[-1])
            hidden = apply_dropout(hidden, dropout)
            size += n_hidden

        # concatenate the per-layer summaries; with pooling the sums are
        # divided by x.shape[0] so that they become averages
        softmax_input = T.concatenate(summaries, axis=1)
        if pooling:
            softmax_input = softmax_input / x.shape[0]
        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        # feed the feature representation to the softmax output layer
        output_layer = Layer(n_in=size,
                             n_out=self.nclasses,
                             activation=softmax,
                             has_bias=False)
        layers.append(output_layer)

        for idx, lyr in enumerate(layers):
            say("layer {}: n_in={}\tn_out={}\n".format(idx, lyr.n_in,
                                                       lyr.n_out))

        # softmax-layer output for y given x
        self.p_y_given_x = output_layer.forward(softmax_input)
        self.pred = T.argmax(self.p_y_given_x, axis=1)
        self.nll_loss = T.mean(
            T.nnet.categorical_crossentropy(self.p_y_given_x, y))

        # gather every parameter and accumulate the L2 regularization term
        self.l2_sqr = None
        self.params = []
        for lyr in layers:
            self.params += lyr.params
        for param in self.params:
            penalty = args.l2_reg * T.sum(param**2)
            if self.l2_sqr is None:
                self.l2_sqr = penalty
            else:
                self.l2_sqr = self.l2_sqr + penalty

        nparams = sum(
            len(param.get_value(borrow=True).ravel())
            for param in self.params)
        say("total # parameters: {}\n".format(nparams))
Beispiel #9
0
    def ready(self):
        """
        Build the two-level (word encoder -> sentence encoder) classifier.

        A first encoder (LSTM or CNN, chosen by ``args.layer``) runs over the
        embedded input ``x``; its output is reduced to one vector per column
        of ``x`` either by user attention (``args.user_atten``), by length-
        normalised sum pooling (``args.pooling``) or by taking the state at
        position ``w_lens - 1``.  These vectors are regrouped with ``s_num``
        and passed through a second encoder that is reduced the same way, and
        the result goes into a bias-free softmax layer.

        The symbolic inputs (``x``, ``w_masks``, ``w_lens``, ``s_ml``,
        ``s_num``, ``y``, ``usr``), the prediction, the NLL loss, the L2
        penalty, the layers and the parameters are stored on ``self``; the
        method returns nothing.

        Raises:
            Exception: if ``args.layer`` is neither ``lstm`` nor ``cnn``.
        """
        args = self.args
        embedding_layer = self.embedding_layer
        user_embedding_layer = self.user_embedding_layer
        self.n_hidden = args.hidden_dim
        self.n_in = embedding_layer.n_d
        dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX)
        )

        # x is length * batch_size
        # y is batch_size
        self.x = T.imatrix('x')
        self.w_masks = T.fmatrix('mask')
        self.w_lens = T.fvector('lens')
        # s_ml is only declared here; it is not used in the graph below
        self.s_ml = T.iscalar('sent_maxlen')
        self.s_num = T.iscalar('sent_num')
        self.y = T.ivector('y')
        self.usr = T.ivector('users')

        x = self.x
        y = self.y
        usr = self.usr
        w_masks = self.w_masks
        w_lens = self.w_lens
        s_num = self.s_num
        n_hidden = self.n_hidden
        n_emb = n_in = self.n_in

        layers = self.layers = []

        slicesu = user_embedding_layer.forward(usr)
        slices = embedding_layer.forward(x.ravel())
        self.slices = slices  # important for updating word embeddings

        # 3-d tensor, len * batch_size * n_in
        slices = slices.reshape((x.shape[0], x.shape[1], n_in))

        pooling = args.pooling
        prev_output = apply_dropout(slices, dropout, v2=True)

        # the encoders are given twice the width when running bidirectionally
        n_hidden_t = 2 * n_hidden if args.direction == "bi" else n_hidden

        activation = get_activation_by_name(args.act)

        def new_encoder(fan_in):
            # Create one sequence encoder of the type selected by args.layer.
            layer_type = args.layer.lower()
            if layer_type == "lstm":
                return LSTM(n_in=fan_in,
                            n_out=n_hidden_t,
                            direction=args.direction
                            )
            if layer_type == "cnn":
                return CNN(n_in=fan_in,
                           n_out=n_hidden_t,
                           activation=activation,
                           order=args.order
                           )
            raise Exception("unknown layer type: {}".format(args.layer))

        #
        # First level: encoder over the embedded input
        #
        layer = new_encoder(n_in)
        layers.append(layer)
        prev_output = layer.forward_all(prev_output, masks=w_masks)
        prev_output = apply_dropout(prev_output, dropout)

        if args.user_atten:
            layer = IterAttentionLayer(
                n_in=n_emb,
                n_out=n_hidden_t
            )
            layers.append(layer)
            # the "base" variant runs the attention without user embeddings
            if args.user_atten_base:
                slicesu = None
            softmax_input = layers[-1].multi_hop_forward(
                prev_output, user_embs=slicesu, isWord=True, masks=w_masks)
        else:
            if pooling:
                # sum over axis 0, normalised by the per-column lengths
                softmax_input = T.sum(prev_output, axis=0) / w_lens.dimshuffle(0, 'x')
            else:
                # Pick, for every column, the state at its last valid
                # position.  The tensor is laid out with the position on
                # axis 0 (see the reshape above and the axis-0 sum in the
                # pooling branch), so the position index goes first and the
                # column index second; the original had them swapped.
                ind = T.cast(w_lens - T.ones_like(w_lens), 'int32')
                softmax_input = prev_output[ind, T.arange(ind.shape[0])]

        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        #
        # Second level: encoder over the first-level vectors
        #
        n_in = n_hidden_t
        [sentlen, emblen] = T.shape(softmax_input)
        # Regroup the rows into (rows // s_num, s_num, emblen) and move the
        # s_num axis to the front.  Floor division keeps the shape integral
        # regardless of Python version or Theano's int_division setting.
        # NOTE(review): presumably each row is one sentence and s_num is the
        # number of sentences per document -- confirm against the caller.
        prev_output = softmax_input.reshape(
            (sentlen // s_num, s_num, emblen)).dimshuffle(1, 0, 2)

        layer = new_encoder(n_in)
        layers.append(layer)
        prev_output = layer.forward_all(prev_output)
        prev_output = apply_dropout(prev_output, dropout)

        if args.user_atten:
            layer = IterAttentionLayer(
                n_in=n_emb,
                n_out=n_hidden_t
            )
            layers.append(layer)

            if args.user_atten_base:
                slicesu = None
            softmax_input = layers[-1].multi_hop_forward(
                prev_output, user_embs=slicesu, isWord=False)
        else:
            if pooling:
                softmax_input = T.sum(prev_output, axis=0) / \
                    T.cast(s_num, 'float32')
            else:
                softmax_input = prev_output[-1]
        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        #
        # Output layer
        #
        layers.append(Layer(
            n_in=n_hidden_t,
            n_out=self.nclasses,
            activation=softmax,
            has_bias=False
        ))

        # with fixed embeddings the first layer is left out of the report
        reported = layers[1:] if args.fix_emb else layers
        for i, l in enumerate(reported):
            say("layer {}: n_in={}\tn_out={}\n".format(
                i, l.n_in, l.n_out
            ))

        # softmax-layer output for y given x
        self.p_y_given_x = layers[-1].forward(softmax_input)
        self.pred = T.argmax(self.p_y_given_x, axis=1)
        self.nll_loss = T.mean(T.nnet.categorical_crossentropy(
            self.p_y_given_x,
            y
        ))

        # adding regularizations
        self.l2_sqr = None
        self.params = []
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        nparams = sum(len(p.get_value(borrow=True).ravel())
                      for p in self.params)
        say("total # parameters: {}\n".format(nparams))
Beispiel #10
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs):
        """
        Build functions that return the gradients of the two char LSTMs.

        A forward and a reverse character LSTM share one character embedding
        layer.  Each LSTM output goes through its own softmax layer over the
        character vocabulary and is scored with a categorical cross-entropy.

        The symbolic input names are reused for other roles in this graph:
        ``word_ids`` is the id sequence fed to the forward LSTM, ``cap_ids``
        the id sequence fed to the reverse LSTM, ``char_pos_ids`` the targets
        of the forward cost and ``tag_ids`` the targets of the reverse cost.
        NOTE(review): presumably the names are kept so the compiled functions
        take the same argument list as the tagging model -- confirm.

        ``char_lstm_dim``, ``char_dim``, ``char_bidirect``, ``word_dim`` and
        ``dropout`` shape the graph or the input list; the remaining
        parameters (``word_lstm_dim``, ``word_bidirect``, ``lr_method``,
        ``pre_emb``, ``crf``, ``cap_dim``, ``training``, ``kwargs``) are
        accepted for signature compatibility and are not used.

        Returns:
            A pair ``(f_eval, f_eval_rev)`` of compiled Theano functions that
            return the gradients of the forward cost w.r.t. the forward LSTM
            parameters and of the reverse cost w.r.t. the reverse LSTM
            parameters, respectively.
        """
        n_chars = len(self.id_to_char)

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        cap_ids = T.ivector(name='cap_ids')

        #
        # Chars inputs
        #
        char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

        char_lstm_for = LSTM(char_dim,
                             char_lstm_dim,
                             with_batch=False,
                             name='char_lstm_for')
        char_lstm_rev = LSTM(char_dim,
                             char_lstm_dim,
                             with_batch=False,
                             name='char_lstm_rev')

        char_lstm_for.link(char_layer.link(word_ids))
        char_lstm_rev.link(char_layer.link(cap_ids))

        # One softmax layer over the character vocabulary per direction.
        final_layer = HiddenLayer(char_lstm_dim,
                                  n_chars,
                                  name='final_char_layer',
                                  activation=('softmax'))
        chars_final = final_layer.link(char_lstm_for.h)

        final_rev_layer = HiddenLayer(char_lstm_dim,
                                      n_chars,
                                      name='final_char_rev_layer',
                                      activation=('softmax'))
        # The reverse LSTM goes through its own output layer; the original
        # code created final_rev_layer but linked final_layer here, leaving
        # the reverse layer unused.
        chars_rev_final = final_rev_layer.link(char_lstm_rev.h)

        cost_chars = T.nnet.categorical_crossentropy(chars_final,
                                                     char_pos_ids).mean()
        cost_chars_rev = T.nnet.categorical_crossentropy(
            chars_rev_final, tag_ids).mean()

        # Register the character components on the model.  The two softmax
        # layers are not registered, exactly as in the original.
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            self.add_component(char_lstm_rev)

        # Input list shared by both compiled functions.
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        eval_inputs.append(tag_ids)
        eval_inputs.append(cap_ids)

        # Fetch gradients from both char_lstms
        gradients = T.grad(cost_chars, char_lstm_for.params)
        gradients_rev = T.grad(cost_chars_rev, char_lstm_rev.params)

        givens = {is_train: np.cast['int32'](0)} if dropout else {}

        # Several inputs (e.g. char_for_ids) do not appear in the graph, so
        # unused inputs have to be tolerated when compiling.
        # Return forward char_lstm grads
        f_eval = theano.function(inputs=eval_inputs,
                                 outputs=gradients,
                                 givens=givens,
                                 on_unused_input='ignore')

        # Return reverse char_lstm grads
        f_eval_rev = theano.function(inputs=eval_inputs,
                                     outputs=gradients_rev,
                                     givens=givens,
                                     on_unused_input='ignore')

        return f_eval, f_eval_rev
Beispiel #11
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              word_to_id=None,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 6

        if self.parameters['pos_dim']:
            n_pos = len(self.id_to_pos)
        if self.parameters['ortho_dim']:
            n_ortho = len(self.id_to_ortho)
        if self.parameters['multi_task']:
            n_segment_tags = len(self.id_to_segment)
        if self.parameters['pre_emb_1_dim']:
            n_words_1 = len(self.id_to_word_1)

            # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')
        if self.parameters['pos_dim']:
            pos_ids = T.ivector(name='pos_ids')
        if self.parameters['ortho_dim']:
            ortho_ids = T.ivector(name='ortho_ids')
        if self.parameters['multi_task']:
            segment_tags_ids = T.ivector(name='segment_tags_ids')
        if self.parameters['pre_emb_1_dim']:
            word_ids_1 = T.ivector(name='doc_ids_dn')
        if self.parameters['language_model']:
            y_fwd_ids = T.ivector(name='y_fwd_ids')
            y_bwd_ids = T.ivector(name='y_bwd_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            print('word_dim: {}'.format(word_dim))
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training and not self.parameters['reload']:
                new_weights = word_layer.embeddings.get_value()
                print(
                    'Loading pretrained embeddings from {}...'.format(pre_emb))
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print('WARNING: {} invalid lines'.format(emb_invalid))
                c_found = 0
                c_lower = 0
                c_zeros = 0
                oov_words = 0
                if self.parameters['emb_of_unk_words']:
                    # TODO
                    # add path as a parameter
                    fast_text_model_p = '/home/ubuntu/usama_ws/resources/Spanish-Corporas/embeddings/fasttext/' \
                                        'fasttext-100d.bin'
                    ft_model = load_model(fast_text_model_p)
                # Lookup table initialization
                for i in range(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                    else:
                        if self.parameters['emb_of_unk_words']:
                            new_weights[i] = ft_model.get_word_vector(word)
                        oov_words += 1

                # set row corresponding to padding token to 0
                new_weights[word_to_id['<PADDING>']] = np.zeros(word_dim)

                word_layer.embeddings.set_value(new_weights)
                print('Loaded {} pretrained embeddings.'.format(
                    len(pretrained)))
                print('{} / {} ({} percent) words have been initialized with '
                      'pretrained embeddings.'.format(
                          c_found + c_lower + c_zeros, n_words,
                          100. * (c_found + c_lower + c_zeros) / n_words))
                print('{} found directly, {} after lowercasing, '
                      '{} after lowercasing + zero.'.format(
                          c_found, c_lower, c_zeros))
                print('oov words count: {}'.format(oov_words))

        #
        # Word inputs
        #
        if self.parameters['pre_emb_1']:
            print('pre_emb_1_dim: {}'.format(self.parameters['pre_emb_1_dim']))
            input_dim += self.parameters['pre_emb_1_dim']
            word_layer_1 = EmbeddingLayer(n_words_1,
                                          word_dim,
                                          name='word_layer_1')
            word_input_1 = word_layer_1.link(word_ids_1)
            inputs.append(word_input_1)
            if training and not self.parameters['reload']:
                # Initialize with pretrained embeddings
                new_weights_1 = word_layer_1.embeddings.get_value()
                print('Loading pretrained embeddings from {}...'.format(
                    self.parameters['pre_emb_1']))
                pretrained_1 = {}
                emb_invalid_1 = 0
                for i, line in enumerate(
                        codecs.open(self.parameters['pre_emb_1'], 'r',
                                    'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == self.parameters['pre_emb_1_dim'] + 1:
                        pretrained_1[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid_1 += 1
                if emb_invalid_1 > 0:
                    print('WARNING: {} invalid lines'.format(emb_invalid_1))
                c_found = 0
                c_lower = 0
                c_zeros = 0
                oov_words = 0
                # Lookup table initialization
                for i in range(n_words_1):
                    word_1 = self.id_to_word_1[i]
                    if word_1 in pretrained_1:
                        new_weights_1[i] = pretrained_1[word_1]
                        c_found += 1
                    elif word_1.lower() in pretrained_1:
                        new_weights_1[i] = pretrained_1[word_1.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word_1.lower()) in pretrained_1:
                        new_weights_1[i] = pretrained_1[re.sub(
                            '\d', '0', word_1.lower())]
                        c_zeros += 1
                    else:
                        oov_words += 1

                word_layer_1.embeddings.set_value(new_weights_1)
                print('Loaded {} pretrained embeddings.'.format(
                    len(pretrained_1)))
                print('{} / {} ({} percent) words have been initialized with '
                      'pretrained embeddings.'.format(
                          c_found + c_lower + c_zeros, n_words,
                          100. * (c_found + c_lower + c_zeros) / n_words))
                print('{} found directly, {} after lowercasing, '
                      '{} after lowercasing + zero.'.format(
                          c_found, c_lower, c_zeros))
                print('oov words count: {}'.format(oov_words))

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))
        if self.parameters['pos_dim']:
            input_dim += self.parameters['pos_dim']
            pos_layer = EmbeddingLayer(n_pos,
                                       self.parameters['pos_dim'],
                                       name='pos_layer')
            inputs.append(pos_layer.link(pos_ids))
            # zeroing the '<UNK>' pos tag row
            # loading reverse mappings
            pos_to_id = {y: x for x, y in self.id_to_pos.items()}
            unk_idx = pos_to_id['<UNK>']
            _pos_wts = pos_layer.embeddings.get_value()
            _pos_wts[unk_idx] = [0.] * self.parameters['pos_dim']
            pos_layer.embeddings.set_value(_pos_wts)
        if self.parameters['ortho_dim']:
            input_dim += self.parameters['ortho_dim']
            ortho_layer = EmbeddingLayer(n_ortho,
                                         self.parameters['ortho_dim'],
                                         name='ortho_layer')
            inputs.append(ortho_layer.link(ortho_ids))
            ortho_to_id = {y: x for x, y in self.id_to_ortho.items()}
            unk_idx = ortho_to_id['<UNK>']
            _pos_wts = ortho_layer.embeddings.get_value()
            _pos_wts[unk_idx] = [0.] * self.parameters['ortho_dim']
            ortho_layer.embeddings.set_value(_pos_wts)

        print('input_dim: {}'.format(input_dim))
        # Prepare final input
        inputs = T.concatenate(inputs,
                               axis=1) if len(inputs) != 1 else inputs[0]
        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            n_h = 2 * word_lstm_dim
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(n_h,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)
        if self.parameters['multi_task']:
            # Sentence to Named Entity Segmentation tags - Score
            segment_layer = HiddenLayer(
                word_lstm_dim,
                n_segment_tags,
                name='segment_layer',
                activation=(None if crf else 'softmax'))
            segment_tags_scores = segment_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
            if self.parameters['multi_task']:
                cost_segment = T.nnet.categorical_crossentropy(
                    segment_tags_scores, segment_tags_ids).mean()
                cost += cost_segment
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

            if self.parameters['multi_task']:
                segment_transitions = shared(
                    (n_segment_tags + 2, n_segment_tags + 2),
                    'segment_transitions')

                seg_small = -1000
                seg_b_s = np.array([[seg_small] * n_segment_tags +
                                    [0, seg_small]]).astype(np.float32)
                seg_e_s = np.array([[seg_small] * n_segment_tags +
                                    [seg_small, 0]]).astype(np.float32)
                segment_observations = T.concatenate(
                    [segment_tags_scores, seg_small * T.ones((s_len, 2))],
                    axis=1)
                segment_observations = T.concatenate(
                    [seg_b_s, segment_observations, seg_e_s], axis=0)

                # Score from tags
                seg_real_path_score = segment_tags_scores[
                    T.arange(s_len), segment_tags_ids].sum()

                # Score from transitions
                seg_b_id = theano.shared(
                    value=np.array([n_segment_tags], dtype=np.int32))
                seg_e_id = theano.shared(
                    value=np.array([n_segment_tags + 1], dtype=np.int32))
                seg_padded_tags_ids = T.concatenate(
                    [seg_b_id, segment_tags_ids, seg_e_id], axis=0)
                seg_real_path_score += segment_transitions[
                    seg_padded_tags_ids[T.arange(s_len + 1)],
                    seg_padded_tags_ids[T.arange(s_len + 1) + 1]].sum()

                seg_all_paths_scores = forward(segment_observations,
                                               segment_transitions)
                cost_segment = -(seg_real_path_score - seg_all_paths_scores)
                cost += cost_segment

        if training and self.parameters['ranking_loss']:

            def recurrence(x_t, y_t):
                token_prob_pos = x_t[y_t]
                arg_max_1 = T.argmax(x_t)
                arg_max_2 = T.argsort(-x_t)[1]
                token_prob_neg = ifelse(T.eq(y_t, arg_max_1), x_t[arg_max_2],
                                        x_t[arg_max_1])
                cost_t = T.max([0, 1.0 - token_prob_pos + token_prob_neg])
                return cost_t

            cost_r, _ = theano.scan(recurrence,
                                    sequences=[tags_scores, tag_ids])
            cum_cost = T.sum(cost_r)
            cost += cum_cost

        if self.parameters['language_model']:
            lm_fwd_layer = HiddenLayer(word_lstm_dim,
                                       n_words,
                                       name='lm_fwd_layer',
                                       activation='softmax')
            lm_fwd_scores = lm_fwd_layer.link(final_output)
            lm_fwd_cost = T.nnet.categorical_crossentropy(
                lm_fwd_scores, y_fwd_ids).mean()
            lm_bwd_layer = HiddenLayer(word_lstm_dim,
                                       n_words,
                                       name='lm_bwd_layer',
                                       activation='softmax')
            lm_bwd_scores = lm_bwd_layer.link(final_output)
            lm_bwd_cost = T.nnet.categorical_crossentropy(
                lm_bwd_scores, y_bwd_ids).mean()
            cost_lm = lm_fwd_cost + lm_bwd_cost
            cost += cost_lm

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if self.parameters['pre_emb_1']:
            self.add_component(word_layer_1)
            params.extend(word_layer_1.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        if self.parameters['pos_dim']:
            self.add_component(pos_layer)
            params.extend(pos_layer.params)
        if self.parameters['ortho_dim']:
            self.add_component(ortho_layer)
            params.extend(ortho_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if self.parameters['multi_task']:
            self.add_component(segment_layer)
            params.extend(segment_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
            if self.parameters['multi_task']:
                self.add_component(segment_transitions)
                params.append(segment_transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)
        if self.parameters['language_model']:
            self.add_component(lm_fwd_layer)
            params.extend(lm_fwd_layer.params)
            self.add_component(lm_bwd_layer)
            params.extend(lm_bwd_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        if self.parameters['pos_dim']:
            eval_inputs.append(pos_ids)
        if self.parameters['ortho_dim']:
            eval_inputs.append(ortho_ids)
        if self.parameters['pre_emb_1']:
            eval_inputs.append(word_ids_1)
        train_inputs = eval_inputs + [tag_ids]
        if self.parameters['multi_task']:
            train_inputs += [segment_tags_ids]
        if self.parameters['language_model']:
            train_inputs.append(y_fwd_ids)
            train_inputs.append(y_bwd_ids)

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print('Compiling...')
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}),
                                      allow_input_downcast=True)
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     allow_input_downcast=True)
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     allow_input_downcast=True)

        return f_train, f_eval
Beispiel #12
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs
              ):
        """
        Build the network.

        The tagger stacks a new softmax output layer ('final_layer_new') on
        top of the word-level LSTM output concatenated with the scores of a
        layer sized for the previously loaded tag set ('final_layer').

        Arguments:
            dropout: dropout probability applied to the concatenated input
                features; a falsy value disables dropout.
            char_dim: character embedding size; falsy disables the
                character-level LSTM features.
            char_lstm_dim: hidden size of the character LSTMs.
            char_bidirect: also feed the reverse character LSTM output.
            word_dim: word embedding size; falsy disables word embeddings.
            word_lstm_dim: hidden size of the word LSTMs.
            word_bidirect: combine forward and reverse word LSTMs through a
                tanh layer.
            lr_method: optimizer spec, either "name" or
                "name-key_value-key_value..." (values parsed as floats).
            pre_emb: accepted for signature compatibility; unused here.
            crf: must be falsy; this variant only implements the softmax
                cross-entropy objective.
            cap_dim: capitalization embedding size; falsy disables it.
            training: when true, also compile the training function.
            **kwargs: ignored.

        Returns:
            (f_train, f_eval): compiled Theano functions. f_train is None
            when `training` is false; f_eval outputs the tag scores.

        Raises:
            NotImplementedError: if `crf` is truthy.
        """
        # The original code only compiled f_eval when `crf` was falsy and
        # then crashed with UnboundLocalError on return. Fail early, before
        # the costly graph compilation, with an explicit message instead.
        if crf:
            raise NotImplementedError(
                'This build variant has no CRF layer; call it with crf '
                'disabled.')

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags_loaded = len(self.id_to_tag_old)
        n_tags = len(self.id_to_tag)

        # Single-argument print calls behave the same on Python 2 and 3; the
        # double spaces reproduce the output of the original print statement.
        print('n_words:  {} n_chars:  {} n_tags_loaded:  {} '
              'n_tags(new ones):  {}'.format(
                  n_words, n_chars, n_tags_loaded, n_tags))
        print(self.id_to_tag)
        print(self.id_to_tag_old)

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            # Number of capitalization features
            n_cap = 4
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            inputs.append(word_layer.link(word_ids))

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            # For every word, pick the hidden state at index char_pos_ids.
            char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]
            char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input. A single feature must be unwrapped from the
        # list, otherwise the slicing and dropout arithmetic below would be
        # applied to a Python list instead of a tensor.
        if len(inputs) == 1:
            inputs = inputs[0]
        else:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        # Flip the reverse LSTM output back to sentence order.
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )
            tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        # NOTE(review): final_layer_init is neither registered through
        # add_component nor added to `params` below - confirm that leaving
        # it out of saving/optimization is intended.
        final_layer_init = HiddenLayer(word_lstm_dim, n_tags_loaded,
                                       name='final_layer', activation=None)
        tags_loaded_scores = final_layer_init.link(final_output)

        print(word_lstm_dim + n_tags_loaded)
        final_layer = HiddenLayer(word_lstm_dim + n_tags_loaded, n_tags,
                                  name='final_layer_new',
                                  activation='softmax')
        final_out_new = T.concatenate([final_output, tags_loaded_scores],
                                      axis=1)
        tags_scores = final_layer.link(final_out_new)

        # No CRF: plain cross-entropy over the softmax tag distribution
        cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters: "name-key_value-key_value"
        lr_pieces = lr_method.split('-')
        lr_method_name = lr_pieces[0]
        lr_method_parameters = {}
        for piece in lr_pieces[1:]:
            key_value = piece.split('_')
            assert len(key_value) == 2
            lr_method_parameters[key_value[0]] = float(key_value[1])

        # is_train is only part of the graph when dropout is enabled.
        train_givens = {is_train: np.cast['int32'](1)} if dropout else {}
        eval_givens = {is_train: np.cast['int32'](0)} if dropout else {}

        # Compile training function
        print('Compiling...')
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=train_givens
            )
        else:
            f_train = None

        # Compile evaluation function (crf is known to be falsy here)
        f_eval = theano.function(
            inputs=eval_inputs,
            outputs=tags_scores,
            givens=eval_givens
        )

        return f_train, f_eval
Beispiel #13
0
    def build(self, parameters):
        #{{{
        """
        Build the network.
        """
        #some parameters
        dropout = parameters['dropout']
        char_dim = parameters['char_dim']
        char_lstm_dim = parameters['char_lstm_dim']
        char_bidirect = parameters['char_bidirect']
        word_dim = parameters['word_dim']
        word_lstm_dim = parameters['word_lstm_dim']
        word_bidirect = parameters['word_bidirect']
        lr_method = parameters['lr_method']
        pre_emb = parameters['pre_emb']
        crf = parameters['crf']
        cap_dim = parameters['cap_dim']
        training = parameters['training']
        features = parameters['features']

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        self.output_dim = len(self.id_to_tag)
        self.transitions = shared((self.output_dim + 1, self.output_dim),
                                  'transitions')

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        if features is not None and features['lemma']['isUsed']:
            lemma_ids = T.ivector(name='lemma_ids')
        if features is not None and features['pos']['isUsed']:
            pos_ids = T.ivector(name='pos_ids')
        if features is not None and features['chunk']['isUsed']:
            chunk_ids = T.ivector(name='chunk_ids')
        if features is not None and features['NER']['isUsed']:
            dic_ids = T.ivector(name='dic_ids')

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        # Word inputs
        #{{{
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            #for attention
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (
                        c_found, c_lower, c_zeros)  #}}}

        # Chars inputs
#{{{
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim
#}}}

# Capitalization feature
#
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:

            #all_paths_scores = forward(observations, self.transitions)
            #cost = - (self.modelScore(tag_ids,tags_scores,s_len) - all_paths_scores)
            #real_path_score=self.modelScore(tag_ids,tags_scores,tag_ids.shape[0]) ;
            #error=real_path_score+self.noiseLoss(tags_scores,tag_ids,0.5);
            #cost=-error;
            #cost=self.likehoodLoss(tags_scores,tag_ids,observations,2)

            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0)
            real_path_score += self.transitions[
                padded_tags_ids[T.arange(s_len)],
                padded_tags_ids[T.arange(s_len) + 1]].sum()

            all_paths_scores = forward(tags_scores, self.transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(self.transitions)
            params.append(self.transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            import optimizers
            self.optimizer = optimizers.RMSprop(lr=0.001)
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            self.constraints = {}
            #updates = self.optimizer.get_updates(params,self.constraints,cost);
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
            #for debug
            #f_Debug = theano.function(
            #    inputs=train_inputs,
            #    outputs=cost,
            #    updates=self.update,
            #    givens=({is_train: np.cast['int32'](1)} if dropout else {})
            #)
            #debug end
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         tags_scores,
                                         self.transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
Beispiel #14
0
    def build4(self, parameters):
        """
        Build the tagging network and compile its Theano functions.

        Each token is represented by the concatenation of the enabled
        features (word embedding, character-LSTM states, capitalization
        embedding and the optional lemma / pos / chunk / dic embeddings).
        A forward and a backward word-level LSTM run over that input, an
        attention layer is optionally applied, and a tanh layer followed by
        a final layer produces one score per tag.  The cost is a softmax
        cross-entropy without CRF, or "all paths score - real path score"
        with CRF.

        Args:
            parameters: dict of hyper-parameters.  Keys read here:
                'dropout', 'char_dim', 'char_lstm_dim', 'char_bidirect',
                'word_dim', 'word_lstm_dim', 'word_bidirect', 'lr_method',
                'pre_emb', 'crf', 'cap_dim', 'training', 'features',
                'useAttend', 'sentencesLevelLoss'; additionally 'loading'
                and 'attenScoreFun' when 'useAttend' is set, and
                'loading_path' when 'loading' is not None.

        Returns:
            (f_train, f_eval).  f_train is None when
            parameters['training'] is false; otherwise it is a compiled
            function returning [cost] (plus the attention energy when
            attention is enabled).  f_eval returns the tag scores without
            CRF, or the result of
            forward(..., viterbi=True, return_best_sequence=True) with CRF.

        Side effects:
            Sets self.output_dim and self.transitions; self.energy when
            attention is used; self.constraints, self.optimizer and
            self.f_test when training.  Layers are registered through
            self.add_component, and self.reload is called when attention
            is used together with a truthy 'loading' parameter.
        """
        # Hyper-parameters
        dropout = parameters['dropout']
        char_dim = parameters['char_dim']
        char_lstm_dim = parameters['char_lstm_dim']
        char_bidirect = parameters['char_bidirect']
        word_dim = parameters['word_dim']
        word_lstm_dim = parameters['word_lstm_dim']
        word_bidirect = parameters['word_bidirect']
        lr_method = parameters['lr_method']
        pre_emb = parameters['pre_emb']
        crf = parameters['crf']
        cap_dim = parameters['cap_dim']
        training = parameters['training']
        features = parameters['features']
        useAttend = parameters['useAttend']
        # Reloading pre-trained parameters is only considered with attention.
        reloadParam = parameters['loading'] if useAttend else None
        if reloadParam is not None:
            reloadPath = parameters['loading_path']
        sentencesLevelLoss = parameters['sentencesLevelLoss']

        def feature_flag(name, key):
            """Return the flag `key` of optional feature `name` (falsy when
            no feature configuration was given)."""
            return features is not None and features[name][key]

        def is_train_givens(flag):
            """Fix the is_train switch to `flag`; only needed with dropout."""
            return {is_train: np.cast['int32'](flag)} if dropout else {}

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        self.output_dim = len(self.id_to_tag)
        # One extra row: index n_tags is used below as the padding tag that
        # precedes the first real tag of a sequence.
        self.transitions = shared((self.output_dim + 1, self.output_dim),
                                  'transitions')

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        wordTrue_ids = T.ivector(name='wordTrue_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        docLen = T.ivector(name='docLen')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Optional feature inputs
        if feature_flag('lemma', 'isUsed'):
            lemma_ids = T.ivector(name='lemma_ids')
        if feature_flag('pos', 'isUsed'):
            pos_ids = T.ivector(name='pos_ids')
        if feature_flag('chunk', 'isUsed'):
            chunk_ids = T.ivector(name='chunk_ids')
        if feature_flag('dic', 'isUsed'):
            dic_ids = T.ivector(name='dic_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        # Word inputs
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            wordTrue_input = word_layer.link(wordTrue_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print('Loading pretrained embeddings from %s...' % pre_emb)
                pretrained = {}
                emb_invalid = 0
                # The context manager closes the embeddings file even when
                # a line fails to parse.
                with codecs.open(pre_emb, 'r', 'utf-8') as emb_file:
                    for line in emb_file:
                        line = line.rstrip().split()
                        if len(line) == word_dim + 1:
                            pretrained[line[0]] = np.array(
                                [float(x)
                                 for x in line[1:]]).astype(np.float32)
                        else:
                            emb_invalid += 1
                if emb_invalid > 0:
                    print('WARNING: %i invalid lines' % emb_invalid)
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization: exact match first, then the
                # lowercased word, then the lowercased word with every
                # digit replaced by '0'.
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                        continue
                    lowered = word.lower()
                    if lowered in pretrained:
                        new_weights[i] = pretrained[lowered]
                        c_lower += 1
                        continue
                    zeroed = re.sub(r'\d', '0', lowered)
                    if zeroed in pretrained:
                        new_weights[i] = pretrained[zeroed]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                c_total = c_found + c_lower + c_zeros
                print('Loaded %i pretrained embeddings.' % len(pretrained))
                print('%i / %i (%.4f%%) words have been initialized with '
                      'pretrained embeddings.' %
                      (c_total, n_words, 100. * c_total / n_words))
                print('%i found directly, %i after lowercasing, '
                      '%i after lowercasing + zero.' %
                      (c_found, c_lower, c_zeros))

        # Chars inputs
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            # For every word, pick the LSTM state at position char_pos_ids
            # (presumably the last character of the word - TODO confirm).
            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            # Both directions, only consumed by the attention layer.
            char_output = T.concatenate([char_for_output, char_rev_output],
                                        axis=-1)
            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        # Capitalization feature
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Additional features (lemma, pos, chunk, dic).  Each one gets its
        # own embedding layer and is fed to the word LSTM only when its
        # 'lstm-input' flag is set.
        if feature_flag('lemma', 'isUsed'):
            lemma_layer = EmbeddingLayer(features['lemma']['num'],
                                         features['lemma']['dim'],
                                         name='lemma_layer')
            # Compare by value: an identity test against a string literal
            # is not reliable (e.g. for unicode or computed empty strings).
            if features['lemma']['pre_emb'] != "":
                new_weights = lemma_layer.embeddings.get_value()
                loadPreEmbFeatures(features['lemma']['pre_emb'],
                                   features['feature_to_id_map']['lemma'],
                                   new_weights,
                                   lower=True)
                lemma_layer.embeddings.set_value(new_weights)
            lemma_output = lemma_layer.link(lemma_ids)
            if features['lemma']['lstm-input']:
                input_dim += features['lemma']['dim']
                inputs.append(lemma_output)
        if feature_flag('pos', 'isUsed'):
            pos_layer = EmbeddingLayer(features['pos']['num'],
                                       features['pos']['dim'],
                                       name='pos_layer')
            if features['pos']['pre_emb'] != "":
                new_weights = pos_layer.embeddings.get_value()
                loadPreEmbFeatures(features['pos']['pre_emb'],
                                   features['feature_to_id_map']['pos'],
                                   new_weights)
                pos_layer.embeddings.set_value(new_weights)
            pos_output = pos_layer.link(pos_ids)
            if features['pos']['lstm-input']:
                input_dim += features['pos']['dim']
                inputs.append(pos_output)
        if feature_flag('chunk', 'isUsed'):
            chunk_layer = EmbeddingLayer(features['chunk']['num'],
                                         features['chunk']['dim'],
                                         name='chunk_layer')
            chunk_output = chunk_layer.link(chunk_ids)
            if features['chunk']['lstm-input']:
                input_dim += features['chunk']['dim']
                inputs.append(chunk_output)
        if feature_flag('dic', 'isUsed'):
            dic_layer = EmbeddingLayer(features['dic']['num'],
                                       features['dic']['dim'],
                                       name='dic_layer')
            dic_output = dic_layer.link(dic_ids)
            if features['dic']['lstm-input']:
                input_dim += features['dic']['dim']
                inputs.append(dic_output)

        # Prepare final input.  A single feature must be unwrapped from the
        # list, otherwise the slicing and layer links below would receive a
        # Python list instead of a tensor.
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)
        else:
            inputs = inputs[0]

        # Dropout on final input: dropout layer while training, rescaled
        # input otherwise, selected at run time through is_train.
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        if sentencesLevelLoss:

            def sentLSTM(i, output, doc_input, lenVec):
                """Run both word LSTMs over sentence i only and write the
                concatenated states into its rows of `output`."""
                Len = lenVec[i]
                accLen = lenVec[:i].sum()
                currentInput = doc_input[accLen:accLen + Len]
                word_lstm_for.link(currentInput)
                word_lstm_rev.link(currentInput[::-1, :])
                wordForOutput = word_lstm_for.h
                wordRevOutput = word_lstm_rev.h[::-1, :]
                finalOutput = T.concatenate([wordForOutput, wordRevOutput],
                                            axis=-1)
                return T.set_subtensor(output[accLen:accLen + Len],
                                       finalOutput)

            result, _ = theano.scan(
                fn=sentLSTM,
                outputs_info=T.zeros((inputs.shape[0], word_lstm_dim * 2),
                                     dtype='float32'),
                sequences=[T.arange(docLen.shape[0])],
                non_sequences=[inputs, docLen])

            # The layers are linked once more on the whole input, as in the
            # original code; only the cell states are read from this pass.
            word_lstm_for.link(inputs)
            word_lstm_rev.link(inputs[::-1, :])
            word_for_c = word_lstm_for.c
            word_rev_c = word_lstm_rev.c[::-1, :]

            # final_c is only needed by the (disabled) variant that computes
            # attention from the LSTM cell state.
            final_c = T.concatenate([word_for_c, word_rev_c], axis=-1)
            # The last scan step holds the buffer filled for all sentences.
            final_output = result[-1]
        else:
            word_lstm_for.link(inputs)
            word_lstm_rev.link(inputs[::-1, :])
            word_for_output = word_lstm_for.h
            word_for_c = word_lstm_for.c
            word_rev_output = word_lstm_rev.h[::-1, :]
            word_rev_c = word_lstm_rev.c[::-1, :]
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=-1)
            final_c = T.concatenate([word_for_c, word_rev_c], axis=-1)

        if useAttend:
            # Attention layer over the features flagged as 'attended'.
            attended = []
            attendedDim = 0
            if feature_flag('word', 'attended'):
                attended.append(wordTrue_input)
                attendedDim += word_dim
            if feature_flag('char', 'attended'):
                attended.append(char_output)
                attendedDim += char_lstm_dim * 2
            if feature_flag('lemma', 'attended'):
                attended.append(lemma_output)
                attendedDim += features['lemma']['dim']
            if feature_flag('pos', 'attended'):
                attended.append(pos_output)
                attendedDim += features['pos']['dim']
            if feature_flag('chunk', 'attended'):
                attended.append(chunk_output)
                attendedDim += features['chunk']['dim']
            if feature_flag('dic', 'attended'):
                attended.append(dic_output)
                attendedDim += features['dic']['dim']

            attention_layer = AttentionLayer(
                attended_dim=attendedDim,
                state_dim=attendedDim,
                source_dim=word_lstm_dim * 2,
                scoreFunName=parameters['attenScoreFun'],
                name='attention_layer')

            if len(attended) > 1:
                attendedInput = T.concatenate(attended, axis=-1)
            else:
                attendedInput = attended[0]

            final_output = attention_layer.link(attendedInput, attendedInput,
                                                final_output)
            # Alternative (disabled): compute attention from the LSTM state,
            # final_output = attention_layer.link(final_output, final_c,
            #                                     final_output)
            self.energy = attention_layer.energy

        tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                 word_lstm_dim,
                                 name='tanh_layer',
                                 activation='tanh')
        final_output = tanh_layer.link(final_output)

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            if sentencesLevelLoss:
                # Compute the loss sentence by sentence instead of over the
                # whole document, then sum the per-sentence costs.
                def sentLoss(i, scores, trueIds, transitions, lenVec):
                    """Return the CRF cost of sentence i."""
                    Len = lenVec[i]
                    accLen = lenVec[:i].sum()
                    currentTagsScores = scores[accLen:accLen + Len]
                    currentIds = trueIds[accLen:accLen + Len]
                    real_path_score = currentTagsScores[T.arange(Len),
                                                        currentIds].sum()
                    # Score from transitions
                    padded_tags_ids = T.concatenate([[n_tags], currentIds],
                                                    axis=0)
                    real_path_score += transitions[
                        padded_tags_ids[T.arange(Len)],
                        padded_tags_ids[T.arange(Len) + 1]].sum()

                    all_paths_scores = forward(currentTagsScores, transitions)
                    return -(real_path_score - all_paths_scores)

                result, _ = theano.scan(
                    fn=sentLoss,
                    outputs_info=None,
                    sequences=[T.arange(docLen.shape[0])],
                    non_sequences=[
                        tags_scores, tag_ids, self.transitions, docLen
                    ])
                cost = result.sum()
            else:
                real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

                # Score from transitions
                padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0)
                real_path_score += self.transitions[
                    padded_tags_ids[T.arange(s_len)],
                    padded_tags_ids[T.arange(s_len) + 1]].sum()

                all_paths_scores = forward(tags_scores, self.transitions)
                cost = -(real_path_score - all_paths_scores)

        # Network parameters
        # NOTE(review): word_lstm_rev and tanh_layer are always part of the
        # graph built above, yet they are only registered and trained when
        # word_bidirect is set (likewise char_lstm_rev vs. char_bidirect
        # when character features are attended).  Left unchanged because
        # saved models depend on the registered components - confirm.
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(self.transitions)
            params.append(self.transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)
        # Feature layers
        if feature_flag('lemma', 'isUsed'):
            self.add_component(lemma_layer)
            params.extend(lemma_layer.params)
        if feature_flag('pos', 'isUsed'):
            self.add_component(pos_layer)
            params.extend(pos_layer.params)
        if feature_flag('chunk', 'isUsed'):
            self.add_component(chunk_layer)
            params.extend(chunk_layer.params)
        if feature_flag('dic', 'isUsed'):
            self.add_component(dic_layer)
            params.extend(dic_layer.params)

        if useAttend and reloadParam:
            # Reload pre-trained parameters from reloadPath, then restore
            # the model path.  This runs before the attention layer is
            # registered below.
            model_path = self.model_path
            self.model_path = reloadPath
            print("loading: %s" % (self.model_path,))
            self.reload(features)
            self.model_path = model_path

        if useAttend:
            # Register the attention layer
            self.add_component(attention_layer)
            params.extend(attention_layer.params)

        # Prepare train and eval inputs.  The order of this list is the
        # positional signature of the compiled functions.
        # NOTE(review): docLen is only added when useAttend is set, although
        # the sentence-level graph needs it whenever sentencesLevelLoss is
        # set - confirm against the code that builds the function inputs.
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        if useAttend:
            eval_inputs.append(wordTrue_ids)
            if sentencesLevelLoss:
                eval_inputs.append(docLen)
        # Feature inputs
        if feature_flag('lemma', 'isUsed'):
            eval_inputs.append(lemma_ids)
        if feature_flag('pos', 'isUsed'):
            eval_inputs.append(pos_ids)
        if feature_flag('chunk', 'isUsed'):
            eval_inputs.append(chunk_ids)
        if feature_flag('dic', 'isUsed'):
            eval_inputs.append(dic_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters, written as
        # "<name>-<key>_<value>-<key>_<value>...".
        lr_method_parameters = {}
        if "-" in lr_method:
            lr_method_name, _, raw_parameters = lr_method.partition('-')
            for x in raw_parameters.split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method

        # Compile training function
        print('Compiling...')
        if training:
            # Constraints
            if useAttend:
                self.constraints = attention_layer.constraints
            else:
                self.constraints = {}
            from keras import optimizers
            # Not used for the updates below; kept as an attribute because
            # the original code exposes it.
            self.optimizer = optimizers.RMSprop()
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name,
                cost,
                params,
                constraints=self.constraints,
                **lr_method_parameters)
            f_train_outputs = [cost]
            if useAttend:
                f_train_outputs.append(self.energy)

            f_train = theano.function(inputs=train_inputs,
                                      outputs=f_train_outputs,
                                      updates=updates,
                                      on_unused_input='ignore',
                                      givens=is_train_givens(1))

            # Same cost without parameter updates and with is_train off.
            f_test = theano.function(inputs=train_inputs,
                                     outputs=cost,
                                     on_unused_input='ignore',
                                     givens=is_train_givens(0))
            self.f_test = f_test
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            # Unused inputs are ignored here too, consistently with the
            # other compiled functions (e.g. a feature that is enabled but
            # neither fed to the LSTM nor attended).
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     on_unused_input='ignore',
                                     givens=is_train_givens(0))
        else:
            if sentencesLevelLoss:

                def sentVitebe(i, predictTag, scores, transitions, lenVec):
                    """Decode sentence i and write its predicted tags into
                    the matching rows of `predictTag`."""
                    Len = lenVec[i]
                    accLen = lenVec[:i].sum()
                    currentTagsScores = scores[accLen:accLen + Len]
                    currentPredictIds = forward(currentTagsScores,
                                                transitions,
                                                viterbi=True,
                                                return_alpha=False,
                                                return_best_sequence=True)
                    return T.set_subtensor(predictTag[accLen:accLen + Len],
                                           currentPredictIds)

                predictTag, _ = theano.scan(
                    fn=sentVitebe,
                    outputs_info=T.zeros((tags_scores.shape[0], ),
                                         dtype='int32'),
                    sequences=[T.arange(docLen.shape[0])],
                    non_sequences=[tags_scores, self.transitions, docLen])
                predictTag = predictTag[-1]
            else:
                predictTag = forward(tags_scores,
                                     self.transitions,
                                     viterbi=True,
                                     return_alpha=False,
                                     return_best_sequence=True)
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=predictTag,
                                     on_unused_input='ignore',
                                     givens=is_train_givens(0))

        return f_train, f_eval
Beispiel #15
0
    def ready(self, args, train):
        """
        Build the symbolic graph of the LSTM language model.

        Args:
            args: mapping providing "dropout", "hidden_dim" and
                "activation".
            train: iterable of tokens; its distinct items form the
                vocabulary handed to the embedding layer.

        Sets on self: idxs, idys, init_state, dropout, n_d, n_V,
        last_state, p_y_given_x, nll, layers, params and num_params.
        """
        float_type = theano.config.floatX

        # Symbolic inputs; the index matrices are laid out as len * batch.
        self.idxs = T.imatrix()
        self.idys = T.imatrix()
        self.init_state = T.matrix(dtype=float_type)

        # The dropout rate lives in a shared variable.
        self.dropout = theano.shared(
            np.float64(args["dropout"]).astype(float_type))

        self.n_d = args["hidden_dim"]
        n_d = self.n_d

        # Layers are created in the same order as before: embedding,
        # recurrent layer, output layer.
        embedding_layer = EmbeddingLayer(n_d=n_d, vocab=set(train))
        self.n_V = embedding_layer.n_V

        say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, n_d))

        activation = get_activation_by_name(args["activation"])
        rnn_layer = LSTM(n_in=n_d, n_out=n_d, activation=activation)
        output_layer = Layer(n_in=n_d,
                             n_out=self.n_V,
                             activation=T.nnet.softmax)

        # Embed the flattened indices: (len*batch) * n_d
        embedded_flat = embedding_layer.forward(self.idxs.ravel())

        # Apply dropout, then restore the layout: len * batch * n_d
        seq_len = self.idxs.shape[0]
        batch_size = self.idxs.shape[1]
        embedded = apply_dropout(embedded_flat, self.dropout)
        embedded = embedded.reshape((seq_len, batch_size, n_d))

        # With return_c=True the last axis is n_d+n_d wide (per the
        # original comment); everything from column n_d on is kept below.
        states = rnn_layer.forward_all(embedded,
                                       self.init_state,
                                       return_c=True)
        self.last_state = states[-1]
        hidden = apply_dropout(states[:, :, n_d:], self.dropout)

        self.p_y_given_x = output_layer.forward(
            hidden.reshape(embedded_flat.shape))

        # Negative log-probability assigned to each target index.
        targets = self.idys.ravel()
        target_probs = self.p_y_given_x[T.arange(targets.shape[0]), targets]
        self.nll = -T.log(target_probs)

        self.layers = [embedding_layer, rnn_layer, output_layer]
        self.params = (embedding_layer.params + rnn_layer.params +
                       output_layer.params)

        # Total number of scalar parameters over all layers.
        total = 0
        for layer in self.layers:
            for param in layer.params:
                total += len(param.get_value(borrow=True).ravel())
        self.num_params = total
        say("# of params in total: {}\n".format(self.num_params))
Beispiel #16
0
    def build(
            self,
            dropout,
            ortho_char_input_dim,  # Should be inferred from the input
            ortho_char_dim,
            ortho_char_lstm_dim,
            char_bidirect,
            word_vec_input_dim,  # Should be inferred from the input wvecs
            word_dim,  # The vector size after projection of the input vector
            word_lstm_dim,
            word_bidirect,
            lr_method,
            crf,
            use_type_sparse_feats,
            type_sparse_feats_input_dim,  # Can be inferred from the output of the feature extractors
            type_sparse_feats_proj_dim,  # This is a hyper-parameter
            use_token_sparse_feats,
            token_sparse_feats_input_dim,  # Can be inferred from the output of the feature extractors
            use_ortho_attention,
            use_phono_attention,
            phono_char_input_dim,  # Can be inferred
            phono_char_dim,
            phono_char_lstm_dim,
            training=True,
            **kwargs):
        """
        Build the network and compile its Theano training / evaluation functions.

        The per-word input is the concatenation of whichever of the following
        are enabled: a tanh projection of the word vectors, the outputs of the
        orthographic and phonological character LSTMs, and a tanh projection
        of the type-level sparse features. That input goes through a word-level
        LSTM (optionally bidirectional), optional token-level sparse features
        and character-level attention vectors are merged in, and a final layer
        scores every tag (softmax, or raw scores fed to a CRF).

        Parameters:
            dropout: dropout probability handed to ``DropoutLayer``; a falsy
                value disables dropout on the word-LSTM input.
            ortho_char_input_dim / phono_char_input_dim: size of the incoming
                character vectors.
            ortho_char_dim / phono_char_dim: size of the projected character
                vectors; a falsy value disables that character channel.
            ortho_char_lstm_dim / phono_char_lstm_dim: hidden size of the
                character LSTMs.
            char_bidirect: also use the reverse character LSTMs.
            word_vec_input_dim: size of the incoming word vectors.
            word_dim: size of the projected word vectors; falsy disables the
                word channel.
            word_lstm_dim: hidden size of the word LSTM.
            word_bidirect: also use the reverse word LSTM.
            lr_method: optimizer spec of the form ``name`` or
                ``name-key_value-key_value...``; values are parsed as floats.
            crf: use a CRF output layer instead of a per-token softmax.
            use_type_sparse_feats: append projected type-level sparse features
                to the word-LSTM input.
            type_sparse_feats_input_dim / type_sparse_feats_proj_dim: input and
                projected size of the type-level sparse features.
            use_token_sparse_feats: append the raw token-level sparse features
                to the word-LSTM output.
            token_sparse_feats_input_dim: size of the token-level features.
            use_ortho_attention / use_phono_attention: add attention vectors
                over the corresponding character LSTM states (only effective
                when that character channel is enabled).
            training: when False, no training function is compiled.
            **kwargs: accepted and ignored.

        Returns:
            ``(f_train, f_eval)``; ``f_train`` is ``None`` when ``training`` is
            False.
        """
        assert word_dim or phono_char_dim or ortho_char_dim, "No input selected while building the network!"
        # Training parameters
        n_tags = len(self.id_to_tag)

        # Network variables
        is_train = T.iscalar('is_train')
        # A vector for each word in the sentence
        # => matrix: (len_sent, w_emb_dim)
        word_vecs = T.dmatrix(name="word_vecs")
        # For each char of each word in the sentence, a char vector
        # => tensor of form: (len_sent, max_wchar_len, char_emb_dim)
        ortho_char_for_vecs = T.dtensor3(name="ortho_char_for_vecs")
        ortho_char_rev_vecs = T.dtensor3(name="ortho_char_rev_vecs")
        phono_char_for_vecs = T.dtensor3(name="phono_char_for_vecs")
        phono_char_rev_vecs = T.dtensor3(name="phono_char_rev_vecs")
        # The word len for each word in the sentence => vect of form: (len_sent,)
        ortho_char_pos_ids = T.ivector(name='ortho_char_pos_ids')
        phono_char_pos_ids = T.ivector(name='phono_char_pos_ids')
        # Type sparse features are appended to the input to the word lstm
        # For each word, a vector of type level sparse feats => mat of form: (len_sent, type_sparse_dim)
        type_sparse_feats = T.imatrix(name="type_sparse_feats")
        # Token sparse features are appended to the pre-crf layer
        # For each word, a vector of token level sparse feats => mat of form: (len_sent, token_sparse_dim)
        token_sparse_feats = T.imatrix(name="token_sparse_feats")

        # The tag id for each word in the sentence => vect of form: (len_sent,)
        tag_ids = T.ivector(name='tag_ids')

        # Sentence length, read from the first enabled input channel
        s_len = (word_vecs if word_dim else ortho_char_pos_ids
                 if ortho_char_dim else phono_char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = HiddenLayer(word_vec_input_dim,
                                     word_dim,
                                     activation="tanh",
                                     name="word_emb_proj")
            # TO DO : Try not using the bias term in the hidden layer
            word_input = word_layer.link(word_vecs)
            inputs.append(word_input)

        #
        # Chars inputs
        #
        if ortho_char_dim:
            input_dim += ortho_char_lstm_dim
            ortho_char_layer = HiddenLayer(ortho_char_input_dim,
                                           ortho_char_dim,
                                           activation="tanh",
                                           name="ortho_char_emb_proj")
            # TO DO : Try not using bias in the hidden layer
            ortho_char_lstm_for = LSTM(ortho_char_dim,
                                       ortho_char_lstm_dim,
                                       with_batch=True,
                                       name='ortho_char_lstm_for')
            ortho_char_lstm_rev = LSTM(ortho_char_dim,
                                       ortho_char_lstm_dim,
                                       with_batch=True,
                                       name='ortho_char_lstm_rev')
            ortho_char_lstm_for.link(
                ortho_char_layer.link(ortho_char_for_vecs))
            ortho_char_lstm_rev.link(
                ortho_char_layer.link(ortho_char_rev_vecs))

            # Pick, for every word, the LSTM state at that word's position id
            ortho_char_for_output = ortho_char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids]
            ortho_char_rev_output = ortho_char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids]

            inputs.append(ortho_char_for_output)
            if char_bidirect:
                inputs.append(ortho_char_rev_output)
                input_dim += ortho_char_lstm_dim

        if phono_char_dim:
            input_dim += phono_char_lstm_dim
            phono_char_layer = HiddenLayer(phono_char_input_dim,
                                           phono_char_dim,
                                           activation="tanh",
                                           name="phono_char_emb_proj")
            # TO DO : Try not using bias in the hidden layer
            phono_char_lstm_for = LSTM(phono_char_dim,
                                       phono_char_lstm_dim,
                                       with_batch=True,
                                       name='phono_char_lstm_for')
            phono_char_lstm_rev = LSTM(phono_char_dim,
                                       phono_char_lstm_dim,
                                       with_batch=True,
                                       name='phono_char_lstm_rev')

            phono_char_lstm_for.link(
                phono_char_layer.link(phono_char_for_vecs))
            phono_char_lstm_rev.link(
                phono_char_layer.link(phono_char_rev_vecs))

            phono_char_for_output = phono_char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), phono_char_pos_ids]
            phono_char_rev_output = phono_char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), phono_char_pos_ids]

            inputs.append(phono_char_for_output)
            if char_bidirect:
                inputs.append(phono_char_rev_output)
                input_dim += phono_char_lstm_dim

        #
        # Type level sparse feats
        #
        if use_type_sparse_feats:
            # The tensor appended below is the *projected* feature vector, so
            # the word-LSTM input grows by the projection size, not by the raw
            # feature size.
            input_dim += type_sparse_feats_proj_dim
            type_level_sparse_layer = HiddenLayer(
                type_sparse_feats_input_dim,
                type_sparse_feats_proj_dim,
                activation="tanh",
                name='type_level_sparse_layer')
            # TO DO : Try not using the hidden layer here
            inputs.append(type_level_sparse_layer.link(type_sparse_feats))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)
            # TO DO : If using type sparse features, then apply hidden layer after concatenating all inputs
        else:
            inputs = inputs[0]

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            # Dropout samples a Bernoulli mask with parameter 1-p, so the
            # expected value of the dropped-out input is
            # p * (0*x) + (1-p) * (1*x) = (1-p) * x. Since biases will on
            # average respond to the expected input value, at test time the
            # inputs are multiplied by (1-p) to supply that expected input.
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        lstm_outputs = [word_for_output]
        post_word_lstm_output_size = word_lstm_dim
        if use_token_sparse_feats:
            # Raw token-level features are appended without a projection
            lstm_outputs.append(token_sparse_feats)
            post_word_lstm_output_size += token_sparse_feats_input_dim
        if word_bidirect:
            lstm_outputs.append(word_rev_output)
            post_word_lstm_output_size += word_lstm_dim

        if len(lstm_outputs) > 1:
            final_output = T.concatenate(lstm_outputs, axis=1)
            tanh_layer = HiddenLayer(post_word_lstm_output_size,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        final_pre_crf_input_size = word_lstm_dim
        attention_vectors = []
        attention_vector_size = 0
        if use_ortho_attention and ortho_char_dim:
            final_ortho_attention_input_layer = HiddenLayer(
                word_lstm_dim,
                ortho_char_lstm_dim,
                name='final_ortho_attention_input_layer',
                activation='tanh')
            final_ortho_attention_input = final_ortho_attention_input_layer.link(
                final_output)
            # Evaluating attentional vector using a linear projection from final_output since the attention vector
            # must be conditioned on it and dimension must match the char lstm hidden dim.
            ortho_for_attention = self.get_TDAttention_vector(
                final_ortho_attention_input,
                ortho_char_lstm_for.h.dimshuffle((1, 0, 2)),
                ortho_char_pos_ids)
            if char_bidirect:
                ortho_rev_attention = self.get_TDAttention_vector(
                    final_ortho_attention_input,
                    ortho_char_lstm_rev.h.dimshuffle((1, 0, 2)),
                    ortho_char_pos_ids)
                attention_vectors.append(ortho_rev_attention)
                attention_vector_size += ortho_char_lstm_dim
            attention_vectors.append(ortho_for_attention)
            attention_vector_size += ortho_char_lstm_dim
        if use_phono_attention and phono_char_dim:
            final_phono_attention_input_layer = HiddenLayer(
                word_lstm_dim,
                phono_char_lstm_dim,
                name='final_phono_attention_input_layer',
                activation='tanh')
            # Evaluating attentional vector using a linear projection from final_output since the attention vector
            # must be conditioned on it and dimension must match the char lstm hidden dim.
            final_phono_attention_input = final_phono_attention_input_layer.link(
                final_output)
            phono_for_attention = self.get_TDAttention_vector(
                final_phono_attention_input,
                phono_char_lstm_for.h.dimshuffle((1, 0, 2)),
                phono_char_pos_ids)
            if char_bidirect:
                phono_rev_attention = self.get_TDAttention_vector(
                    final_phono_attention_input,
                    phono_char_lstm_rev.h.dimshuffle((1, 0, 2)),
                    phono_char_pos_ids)
                attention_vectors.append(phono_rev_attention)
                attention_vector_size += phono_char_lstm_dim
            attention_vectors.append(phono_for_attention)
            attention_vector_size += phono_char_lstm_dim

        # Merge whatever attention vectors were actually built. Keying on the
        # list itself (rather than on the use_*_attention flags) keeps this
        # correct when there is exactly one vector, and when a flag is set but
        # its character channel is disabled so that no vector exists.
        if attention_vectors:
            final_output = T.concatenate([final_output] + attention_vectors,
                                         axis=1)
            final_pre_crf_input_size += attention_vector_size

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(final_pre_crf_input_size,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            # n_tags + 2 to accommodate start and end symbols
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000  # = -log(inf)
            # Score of starting at start symbol is 1 => -log(1) = 0. Score of start symbol emitting any other NER
            # tag is -log(inf) = small
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            # Score of ending at end symbol is 1 => -log(1) = 0. Score of end symbol emitting any other NER
            # tag is -log(inf) = small
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            # observations is the emission energy (-log potential) between each token and each tag.
            # Emission score of intermediate words towards start and end tags is -log(inf)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)

            # observations now contains the emission energies for start token, sentence tokens and end token
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags: sum of energies associated with the gold tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions: label_i to label_{i+1} over the padded path
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if ortho_char_dim:
            self.add_component(ortho_char_layer)
            self.add_component(ortho_char_lstm_for)
            params.extend(ortho_char_layer.params)
            params.extend(ortho_char_lstm_for.params)
            if char_bidirect:
                self.add_component(ortho_char_lstm_rev)
                params.extend(ortho_char_lstm_rev.params)

        if phono_char_dim:
            self.add_component(phono_char_layer)
            self.add_component(phono_char_lstm_for)
            params.extend(phono_char_layer.params)
            params.extend(phono_char_lstm_for.params)
            if char_bidirect:
                self.add_component(phono_char_lstm_rev)
                params.extend(phono_char_lstm_rev.params)

        if use_type_sparse_feats:
            self.add_component(type_level_sparse_layer)
            params.extend(type_level_sparse_layer.params)

        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)

        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)

        # tanh_layer exists exactly when more than one word-level output was merged
        if len(lstm_outputs) > 1:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        if use_ortho_attention and ortho_char_dim:
            self.add_component(final_ortho_attention_input_layer)
            params.extend(final_ortho_attention_input_layer.params)
        if use_phono_attention and phono_char_dim:
            self.add_component(final_phono_attention_input_layer)
            params.extend(final_phono_attention_input_layer.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)

        # Prepare train and eval inputs (order defines the compiled functions' signatures)
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_vecs)
        if ortho_char_dim:
            eval_inputs.append(ortho_char_for_vecs)
            if char_bidirect:
                eval_inputs.append(ortho_char_rev_vecs)
            eval_inputs.append(ortho_char_pos_ids)
        if phono_char_dim:
            eval_inputs.append(phono_char_for_vecs)
            if char_bidirect:
                eval_inputs.append(phono_char_rev_vecs)
            eval_inputs.append(phono_char_pos_ids)

        if use_type_sparse_feats:
            eval_inputs.append(type_sparse_feats)
        if use_token_sparse_feats:
            eval_inputs.append(token_sparse_feats)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters: "name-key_value-key_value..."
        lr_method_name, has_parameters, parameter_spec = lr_method.partition('-')
        lr_method_parameters = {}
        if has_parameters:
            for item in parameter_spec.split('-'):
                key_value = item.split('_')
                assert len(key_value) == 2
                lr_method_parameters[key_value[0]] = float(key_value[1])

        # Compile training function
        print('Compiling...')
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            # is_train is only part of the graph when dropout is enabled
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        print("Finished Compiling")
        return f_train, f_eval
Beispiel #17
0
    def ready(self):
        """
        Build the symbolic graph that samples a binary selection z for each token.

        Two recurrent layers (RCNN or LSTM, chosen by ``args.layer``) read the
        embedded input forwards and backwards; their states are concatenated
        and passed to a ``ZLayer`` that samples ``z_pred`` and gives its
        probabilities. The method stores on ``self``: ``dropout``, ``x``,
        ``layers``, ``word_embs``, ``output_layer``, ``z_pred``,
        ``sample_updates``, ``logpz``, ``probs``, ``zsum``, ``zdiff``,
        ``params`` and ``l2_cost``.

        Raises:
            ValueError: if ``args.layer`` is neither "rcnn" nor "lstm"
                (case-insensitive).
        """
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layer_type = args.layer.lower()
        # Fail early with a clear message; otherwise the loop below would hit
        # an unbound local on its first iteration.
        if layer_type not in ("rcnn", "lstm"):
            raise ValueError(
                "unsupported layer type: {}".format(args.layer))

        # One layer for the forward pass and one for the reversed sequence
        layers = self.layers = []
        for _ in xrange(2):
            if layer_type == "rcnn":
                layer = RCNN(n_in=n_e,
                             n_out=n_d,
                             activation=activation,
                             order=args.order)
            else:
                layer = LSTM(n_in=n_e, n_out=n_d, activation=activation)
            layers.append(layer)

        # len * batch; 1.0 for real tokens, 0.0 for padding
        masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        # Re-reverse the backward states so both directions align per position
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = ZLayer(
            n_in=size, n_hidden=args.hidden_dimension2, activation=activation)

        # sample z given text (i.e. x)
        z_pred, sample_updates = output_layer.sample_all(h_final)

        # we are computing approximated gradient by sampling z;
        # so should mark sampled z not part of the gradient propagation path
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        self.sample_updates = sample_updates
        print("z_pred {}".format(z_pred.ndim))

        probs = output_layer.forward_all(h_final, z_pred)
        print("probs {}".format(probs.ndim))

        # Log-probability of the sampled z, with padding positions zeroed out
        logpz = -T.nnet.binary_crossentropy(probs, z_pred) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                           axis=0,
                           dtype=theano.config.floatX)

        params = self.params = []
        for layer in layers + [output_layer]:
            for p in layer.params:
                params.append(p)
        nparams = sum(len(p.get_value(borrow=True).ravel())
                      for p in params)
        say("total # parameters: {}\n".format(nparams))

        # L2 penalty: sum of squared parameters scaled by args.l2_reg
        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
Beispiel #18
0
    def ready(self):
        """
        Build the tagging graph with a CRF output and compile train / eval functions.

        Word embeddings are optionally concatenated with a length-normalised
        sum of character-LSTM states, optionally passed through
        ``args.clayer`` ``GKNNMultiHeadGate`` layers (when ``args.conv`` is
        non-zero), and projected to ``args.classes`` tag scores. A CRF with
        start/end symbols gives the negative log-likelihood loss and the
        Viterbi prediction.

        The method stores on ``self``: the dimension attributes, ``dropout``,
        all symbolic inputs, ``layers``, ``inputs``, ``nll_loss``, ``pred``,
        ``l2_sqr`` and ``params``.

        Returns:
            ``(f_train, f_eval)``: ``f_train`` takes every entry of
            ``self.inputs`` and returns ``[cost, nll_loss]``; ``f_eval`` takes
            all inputs except the trailing ``tag_ids`` and returns the
            predicted tag sequence.
        """
        args = self.args
        w_emb_layer = self.w_emb_layer
        c_emb_layer = self.c_emb_layer
        r_emb_layers = self.r_emb_layers
        r_matrix_layers = self.r_matrix_layers

        char_dim = self.char_dim = args.char_dim
        char_lstm_dim = self.char_lstm_dim = args.char_lstm_dim
        word_dim = self.word_dim = args.word_dim
        self.word_lstm_dim = args.word_lstm_dim

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX)
        )

        word_ids = self.word_ids = T.ivector('word_ids')
        char_ids = self.char_ids = T.imatrix('char_ids')
        char_lens = self.char_lens = T.fvector('char_lens')
        char_masks = self.char_masks = T.imatrix('char_masks')
        up_ids = self.up_ids = T.imatrix('up_ids')
        up_rels = self.up_rels = T.imatrix('up_rels')
        up_id_masks = self.up_id_masks = T.imatrix('up_id_masks')
        down_ids = self.down_ids = T.imatrix('down_ids')
        down_rels = self.down_rels = T.imatrix('down_rels')
        down_id_masks = self.down_id_masks = T.imatrix('down_id_masks')
        tag_ids = self.tag_ids = T.ivector('tag_ids')

        layers = self.layers = [w_emb_layer, c_emb_layer]
        layers.extend(r_emb_layers)
        layers.extend(r_matrix_layers)
        # Index of the first layer created by this method (used for the summary below)
        first_new_layer = len(layers)

        # Order defines the signature of the compiled functions; tag_ids must stay last
        self.inputs = [
            word_ids,
            char_ids,
            char_lens,
            char_masks,
            up_ids,
            up_rels,
            up_id_masks,
            down_ids,
            down_rels,
            down_id_masks,
            tag_ids,
        ]

        wslices = w_emb_layer.forward(word_ids)
        cslices = c_emb_layer.forward(char_ids.ravel())
        cslices = cslices.reshape((char_ids.shape[0], char_ids.shape[1], char_dim))
        cslices = cslices.dimshuffle(1, 0, 2)

        bv_ur_slicess = []
        bv_dr_slicess = []
        b_ur_slicess = []
        b_dr_slicess = []

        bv_ur_matrixss = []
        bv_dr_matrixss = []
        b_ur_matrixss = []
        b_dr_matrixss = []

        # NOTE(review): of everything collected in the two loops below, only
        # bv_ur_slicess[0] and bv_dr_slicess[0] are consumed later in this method.
        up_matrix_shape = (up_rels.shape[0], up_rels.shape[1], word_dim, word_dim)
        down_matrix_shape = (down_rels.shape[0], down_rels.shape[1], word_dim, word_dim)
        for r_matrix_layer in r_matrix_layers:
            bv_ur_matrixs = r_matrix_layer.forward1(up_rels.ravel())
            bv_dr_matrixs = r_matrix_layer.forward1(down_rels.ravel())
            b_ur_matrixs = r_matrix_layer.forward2(up_rels.ravel())
            b_dr_matrixs = r_matrix_layer.forward2(down_rels.ravel())
            bv_ur_matrixss.append(bv_ur_matrixs.reshape(up_matrix_shape))
            bv_dr_matrixss.append(bv_dr_matrixs.reshape(down_matrix_shape))
            b_ur_matrixss.append(b_ur_matrixs.reshape(up_matrix_shape))
            b_dr_matrixss.append(b_dr_matrixs.reshape(down_matrix_shape))

        up_slice_shape = (up_rels.shape[0], up_rels.shape[1], word_dim)
        down_slice_shape = (down_rels.shape[0], down_rels.shape[1], word_dim)
        for r_emb_layer in r_emb_layers:
            bv_ur_slices = r_emb_layer.forward(up_rels.ravel())
            bv_dr_slices = r_emb_layer.forward(down_rels.ravel())
            b_ur_slices = r_emb_layer.forward2(up_rels.ravel())
            b_dr_slices = r_emb_layer.forward2(down_rels.ravel())
            bv_ur_slicess.append(bv_ur_slices.reshape(up_slice_shape))
            bv_dr_slicess.append(bv_dr_slices.reshape(down_slice_shape))
            b_ur_slicess.append(b_ur_slices.reshape(up_slice_shape))
            b_dr_slicess.append(b_dr_slices.reshape(down_slice_shape))

        # Same axis order as the dimshuffled cslices
        char_masks_t = char_masks.dimshuffle(1, 0)

        prev_output = wslices
        prev_size = word_dim

        if char_dim:
            layers.append(LSTM(
                n_in=char_dim,
                n_out=char_lstm_dim,
                direction='bi' if args.char_bidirect else 'si'
            ))
            # Feed the dropped-out embeddings to the LSTM. Previously the
            # dropout result was computed and then discarded because the raw
            # cslices were passed to forward_all.
            char_input = apply_dropout(cslices, dropout, v2=True)
            char_hidden = layers[-1].forward_all(char_input, char_masks_t)
            # Sum over axis 0, then divide by the (epsilon-padded) char count
            char_hidden = T.sum(char_hidden, axis=0)
            char_hidden = char_hidden / (1e-6 * T.ones_like(char_lens) + char_lens).dimshuffle(0, 'x')

            prev_size += char_lstm_dim
            prev_output = T.concatenate([prev_output, char_hidden], axis=1)

        prev_output = apply_dropout(prev_output, dropout)
        if args.conv != 0:
            for _ in range(args.clayer):
                layers.append(GKNNMultiHeadGate(
                    n_in=prev_size,
                    n_out=prev_size,
                    n_head=args.head
                ))
                prev_output = layers[-1].forward_all(
                    prev_output,
                    up_ids, up_id_masks, bv_ur_slicess[0],
                    down_ids, down_id_masks, bv_dr_slicess[0])
                prev_output = apply_dropout(prev_output, dropout)

        # Linear projection to tag scores
        layers.append(Layer(
            n_in=prev_size,
            n_out=args.classes,
            activation=linear,  # ReLU,
            has_bias=False
        ))

        n_tags = args.classes
        s_len = char_ids.shape[0]
        tags_scores = layers[-1].forward(prev_output)

        # CRF: two extra symbols (start, end) are appended to the tag set
        transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
        small = -1000
        b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
        e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
        observations = T.concatenate(
            [tags_scores, small * T.ones((s_len, 2))],
            axis=1
        )
        observations = T.concatenate(
            [b_s, observations, e_s],
            axis=0
        )

        # Score of the gold path: emission scores plus transition scores
        real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
        b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
        e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
        padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)

        pre_ids = T.arange(s_len + 1)
        s_ids = T.arange(s_len + 1) + 1
        real_path_score += transitions[
            padded_tags_ids[pre_ids],
            padded_tags_ids[s_ids]
        ].sum()

        all_paths_scores = CRFForward(observations, transitions)
        self.nll_loss = nll_loss = - (real_path_score - all_paths_scores)
        preds = CRFForward(observations, transitions, viterbi=True,
                           return_alpha=False, return_best_sequence=True)

        # Drop the start and end symbols from the decoded sequence
        self.pred = preds[1:-1]

        self.l2_sqr = None
        # `params` aliases self.params, so the in-place extension below updates both
        params = self.params = [transitions]
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        # Report only the layers created here, not the embedding layers passed in
        for i, l in enumerate(layers[first_new_layer:]):
            say("layer {}: n_in={}\tn_out={}\n".format(
                i, l.n_in, l.n_out
            ))

        nparams = sum(len(x.get_value(borrow=True).ravel())
                      for x in self.params)
        say("total # parameters: {}\n".format(nparams))

        cost = self.nll_loss + self.l2_sqr

        lr_method_name = args.learning
        lr_method_parameters = {}
        lr_method_parameters['lr'] = args.learning_rate
        updates = Optimization(clip=5.0).get_updates(
            lr_method_name, cost, params, **lr_method_parameters)

        f_train = theano.function(
            inputs=self.inputs,
            outputs=[cost, nll_loss],
            updates=updates,
            allow_input_downcast=True
        )

        # Evaluation does not take the gold tags (last entry of self.inputs)
        f_eval = theano.function(
            inputs=self.inputs[:-1],
            outputs=self.pred,
            allow_input_downcast=True
        )

        return f_train, f_eval
Beispiel #19
0
    def build(self,
              dropout,
              char_dim,
              char_hidden_dim,
              char_bidirect,
              layer2_hidden_dim,
              lr_method,
              layer2,
              batch_size,
              pre_emb,
              use_gaze,
              crf,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Network variables
        is_train = T.iscalar('is_train')  # declare variable,声明整型变量is_train
        char_ids = T.ivector(name='char_ids')  #声明整型一维向量
        if use_gaze:
            gaze = T.imatrix(name='gaze')
        #hamming_cost = T.matrix('hamming_cost', theano.config.floatX) # 声明整型二维矩阵
        # tag_ids = T.imatrix(name='tag_ids')
        tag_ids = T.ivector(name='tag_ids')
        # Sentence length
        s_len = char_ids.shape[0]  #每个句子中的字数

        # Final input (all word features)
        #
        # Char inputs
        #
        if char_dim:
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
            char_input = char_layer.link(char_ids)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = char_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(
                        codecs.open(pre_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == char_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_chars):
                    char = self.id_to_char[i]
                    if char in pretrained:
                        new_weights[i] = pretrained[char]
                        c_found += 1
                    elif char.lower() in pretrained:
                        new_weights[i] = pretrained[char.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', char) in pretrained:
                        new_weights[i] = pretrained[re.sub('\d', '0', char)]
                        c_zeros += 1
                char_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) chars have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_chars, 100. *
                        (c_found + c_lower + c_zeros) / n_chars)
                print('%i found directly, %i after lower, %i after zero.') % (
                    c_found, c_lower, c_zeros)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(char_input)
            input_test = (1 - dropout) * char_input
            char_input = T.switch(T.neq(is_train, 0), input_train,
                                  input_test)  # 条件句

        # LSTM for chars, first layer
        char_lstm_for1 = LSTM(char_dim,
                              char_hidden_dim,
                              with_batch=False,
                              name='first_char_lstm_for')
        char_lstm_rev1 = LSTM(char_dim,
                              char_hidden_dim,
                              with_batch=False,
                              name='first_char_lstm_rev')
        char_lstm_for1.link(char_input)  # char的顺序: l i k e
        char_lstm_rev1.link(char_input[::-1, :])  # 单词的顺序: e k i l
        char_for_output1 = char_lstm_for1.h
        char_rev_output1 = char_lstm_rev1.h[::-1, :]

        if char_bidirect:
            final_output = T.concatenate([char_for_output1, char_rev_output1],
                                         axis=1)
            tanh_layer1 = HiddenLayer(2 * char_hidden_dim,
                                      char_hidden_dim,
                                      name='tanh_layer1',
                                      activation='tanh')
            final_output = tanh_layer1.link(final_output)
        else:
            final_output = char_for_output1

        if layer2:
            #
            # Dropout on final input
            #
            if dropout:
                dropout_layer = DropoutLayer(p=dropout)
                input_train = dropout_layer.link(final_output)
                input_test = (1 - dropout) * final_output
                final_output = T.switch(T.neq(is_train, 0), input_train,
                                        input_test)  # 条件句

            # LSTM for chars, second layer
            char_lstm_for2 = LSTM(char_hidden_dim,
                                  layer2_hidden_dim,
                                  with_batch=False,
                                  name='second_char_lstm_for')
            char_lstm_rev2 = LSTM(char_hidden_dim,
                                  layer2_hidden_dim,
                                  with_batch=False,
                                  name='second_char_lstm_rev')
            char_lstm_for2.link(final_output)
            char_lstm_rev2.link(final_output[::-1, :])
            char_for_output2 = char_lstm_for2.h
            char_rev_output2 = char_lstm_rev2.h[::-1, :]

            if char_bidirect:
                final_output = T.concatenate(
                    [char_for_output2, char_rev_output2], axis=1)
                tanh_layer2 = HiddenLayer(2 * layer2_hidden_dim,
                                          layer2_hidden_dim,
                                          name='tanh_layer2',
                                          activation='tanh')
                final_output = tanh_layer2.link(final_output)
            else:
                final_output = char_for_output2

        if layer2:
            dims = layer2_hidden_dim
        else:
            dims = char_hidden_dim

        if use_gaze:
            final_output = T.concatenate([final_output, gaze], axis=1)
            dims = dims + n_tags

        # final_output = T.reshape(final_output, (-1, input_dim))

        # Sentence to Named Entity tags - Score,ci与CRF之间的隐含层
        final_layer = HiddenLayer(dims,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len),
                                          tag_ids].sum()  # P中对应元素的求和好

            # Score from add_componentnsitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]].sum()  # A中对应元素的求和

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if char_dim:
            self.add_component(char_layer)
            params.extend(char_layer.params)

        self.add_component(char_lstm_for1)
        params.extend(char_lstm_for1.params)
        if char_bidirect:
            self.add_component(char_lstm_rev1)
            params.extend(char_lstm_rev1.params)

            self.add_component(tanh_layer1)
            params.extend(tanh_layer1.params)

        if layer2:
            self.add_component(char_lstm_for2)
            params.extend(char_lstm_for2.params)
            if char_bidirect:
                self.add_component(char_lstm_rev2)
                params.extend(char_lstm_rev2.params)

                self.add_component(tanh_layer2)
                params.extend(tanh_layer2.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)

        # Prepare train and eval inputs
        eval_inputs = []
        if char_dim:
            eval_inputs.append(char_ids)
        if use_gaze:
            eval_inputs.append(gaze)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
Beispiel #20
0
    def ready(self):
        """
        Build the symbolic graph of the generator on top of `self.encoder`.

        Two recurrent layers (RCNN or LSTM, chosen by ``args.layer``) read
        the embedded input forwards and backwards; their concatenated states
        go through a sigmoid output layer that yields one probability per
        token. From those probabilities the method derives a sampled binary
        selection (`self.z_pred`), the log-probability of the encoder's
        selection `z` (`self.logpz`), and the training costs.

        Sets on `self`: dropout, x, z, layers, output_layer, MRG_rng, z_pred,
        logpz, probs, loss_vec, loss, sparsity_cost, obj, params, cost and
        cost_e. Returns nothing.
        """
        encoder = self.encoder
        emb_layer = self.embedding_layer
        args = self.args
        pad_id = emb_layer.vocab_map["<padding>"]

        # Share the dropout variable and the inputs with the encoder.
        self.dropout = encoder.dropout
        dropout = self.dropout

        # len*batch
        self.x = encoder.x
        self.z = encoder.z
        x = self.x
        z = self.z

        hidden_dim = args.hidden_dimension
        emb_dim = emb_layer.n_d
        activation = get_activation_by_name(args.activation)

        # One recurrent layer per reading direction; both take embeddings.
        self.layers = []
        layers = self.layers
        kind = args.layer.lower()
        for _ in range(2):
            if kind == "rcnn":
                recurrent = RCNN(n_in=emb_dim,
                                 n_out=hidden_dim,
                                 activation=activation,
                                 order=args.order)
            elif kind == "lstm":
                recurrent = LSTM(n_in=emb_dim,
                                 n_out=hidden_dim,
                                 activation=activation)
            layers.append(recurrent)

        # len*batch*1 -- 1 where the token is not padding, 0 otherwise
        is_token = T.neq(x, pad_id)
        masks = T.cast(is_token, "int8").dimshuffle((0, 1, "x"))

        # (len*batch)*n_e, then reshaped to len*batch*n_e
        flat_embs = emb_layer.forward(x.ravel())
        embs = flat_embs.reshape((x.shape[0], x.shape[1], emb_dim))
        embs = apply_dropout(embs, dropout)

        # len*batch*n_d for each direction; the backward states are flipped
        # back so both directions are aligned before concatenation.
        fwd_states = layers[0].forward_all(embs)
        bwd_states = layers[1].forward_all(embs[::-1])
        h_final = T.concatenate([fwd_states, bwd_states[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)

        self.output_layer = Layer(n_in=hidden_dim * 2,
                                  n_out=1,
                                  activation=sigmoid)
        output_layer = self.output_layer

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        flat_probs = probs.reshape(x.shape)
        self.MRG_rng = MRG_RandomStreams()
        sampled = T.cast(
            self.MRG_rng.binomial(size=flat_probs.shape, p=flat_probs),
            "int8")

        # The gradient is approximated by sampling z, so the sample itself
        # must not be part of the gradient propagation path.
        self.z_pred = theano.gradient.disconnected_grad(sampled)

        # Masked log-probability of the encoder's selection z.
        z_col = z.dimshuffle((0, 1, "x"))
        masked_logpz = -T.nnet.binary_crossentropy(probs, z_col) * masks
        self.logpz = masked_logpz.reshape(x.shape)
        logpz = self.logpz
        self.probs = probs.reshape(x.shape)

        # batch -- number of selected tokens and number of 0/1 switches
        zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        z_steps = (z[1:] - z[:-1]) * 1.0
        zdiff = T.sum(abs(z_steps), axis=0, dtype=theano.config.floatX)

        # Per-example loss: the mean over aspects, or one chosen aspect.
        loss_mat = encoder.loss_mat
        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:, args.aspect]
        self.loss_vec = loss_vec

        coherent_factor = args.sparsity * args.coherent
        self.loss = T.mean(loss_vec)
        loss = self.loss
        self.sparsity_cost = (T.mean(zsum) * args.sparsity +
                              T.mean(zdiff) * coherent_factor)
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        # Collect the trainable parameters of the generator only.
        self.params = []
        params = self.params
        for layer in layers + [output_layer]:
            params.extend(layer.params)
        nparams = sum(len(param.get_value(borrow=True).ravel())
                      for param in params)
        say("total # parameters: {}\n".format(nparams))

        # L2 penalty over all generator parameters.
        l2_cost = None
        for param in params:
            squared = T.sum(param**2)
            l2_cost = squared if l2_cost is None else l2_cost + squared
        l2_cost = l2_cost * args.l2_reg

        self.cost = cost_logpz * 10 + l2_cost
        cost = self.cost
        print("cost.dtype", cost.dtype)

        self.cost_e = loss * 10 + encoder.l2_cost
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs
              ):
        """
        Build two character LSTM predictors and compile gradient functions.

        A forward and a reverse LSTM share one embedding layer. Each LSTM
        output goes through its own softmax layer over the character
        vocabulary, and a categorical cross-entropy cost is computed for each
        direction. The compiled functions return the gradients of those costs
        with respect to the corresponding LSTM's parameters.

        NOTE(review): the symbolic inputs appear to be repurposed -- the
        forward LSTM reads `word_ids` with targets `char_pos_ids`, and the
        reverse LSTM reads `cap_ids` with targets `tag_ids`. Confirm what
        callers actually pass in these slots.

        Args:
            dropout: only decides whether `is_train` is substituted in the
                compiled functions; no dropout layer is built here.
            char_dim: character embedding size; when truthy the layers are
                registered with `add_component` and the char inputs are
                added to the function inputs.
            char_lstm_dim: output size of both LSTMs.
            char_bidirect: when truthy (and `char_dim` is truthy),
                `char_rev_ids` is added to the function inputs.
            word_dim: when truthy, `word_ids` is added to the function
                inputs.
            lr_method: optimizer spec, ``"name"`` or
                ``"name-key_value-..."``; it is parsed (and thus validated)
                but no optimizer is created in this method.
            word_lstm_dim, word_bidirect, pre_emb, crf, cap_dim, training:
                not used by this method.
            **kwargs: ignored.

        Returns:
            ``(f_eval, f_eval_rev)``: Theano functions returning the
            gradients for the forward and the reverse LSTM respectively.
        """
        # Training parameters
        n_chars = len(self.id_to_char)

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        cap_ids = T.ivector(name='cap_ids')

        #
        # Chars inputs
        #
        char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

        char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=False,
                             name='char_lstm_for')
        char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=False,
                             name='char_lstm_rev')

        char_lstm_for.link(char_layer.link(word_ids))
        char_lstm_rev.link(char_layer.link(cap_ids))

        final_layer = HiddenLayer(char_lstm_dim, n_chars, name='final_char_layer',
                              activation=('softmax'))
        chars_final = final_layer.link(char_lstm_for.h)

        final_rev_layer = HiddenLayer(char_lstm_dim, n_chars, name='final_char_rev_layer',
                              activation=('softmax'))
        # The reverse LSTM gets its own output layer. It used to be routed
        # through `final_layer`, which left `final_rev_layer` unused.
        chars_rev_final = final_rev_layer.link(char_lstm_rev.h)

        cost_chars = T.nnet.categorical_crossentropy(chars_final, char_pos_ids).mean()
        cost_chars_rev = T.nnet.categorical_crossentropy(chars_rev_final, tag_ids).mean()

        # Network parameters
        params = []
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            self.add_component(char_lstm_rev)
            params.extend(char_lstm_rev.params)

        # Prepare eval inputs. Both compiled functions take the same input
        # list; inputs a given graph does not use are tolerated through
        # on_unused_input='ignore'.
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)

        eval_inputs.append(tag_ids)
        eval_inputs.append(cap_ids)

        # Parse optimization method parameters. The result is not used here;
        # the parsing is kept so a malformed `lr_method` still fails early.
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Fetch gradients from both char_lstms
        gradients = T.grad(cost_chars, char_lstm_for.params)
        gradients_rev = T.grad(cost_chars_rev, char_lstm_rev.params)

        # Return forward char_lstm grads
        f_eval = theano.function(
            inputs=eval_inputs,
            outputs=gradients,
            givens=({is_train: np.cast['int32'](0)} if dropout else {}), on_unused_input='ignore'
        )

        # Return reverse char_lstm grads
        f_eval_rev = theano.function(
            inputs=eval_inputs,
            outputs=gradients_rev,
            givens=({is_train: np.cast['int32'](0)} if dropout else {}), on_unused_input='ignore'
        )

        return f_eval, f_eval_rev
Beispiel #22
0
    def ready(self):
        global total_generate_time
        #say("in generator ready: \n")
        #start_generate_time = time.time()
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(n_in=n_e,
                         n_out=n_d,
                         activation=activation,
                         order=args.order)
            elif layer_type == "lstm":
                l = LSTM(n_in=n_e, n_out=n_d, activation=activation)

            l = Layer(n_in=n_e, n_out=n_d, activation=sigmoid)

            layers.append(l)

        # len * batch
        #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
        masks = T.cast(T.neq(x, padding_id), theano.config.floatX).dimshuffle(
            (0, 1, "x"))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        flipped_embs = embs[::-1]

        # len*bacth*n_d
        h1 = layers[0].forward(embs)
        h2 = layers[1].forward(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        #size = n_e

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=1,
                                                 activation=sigmoid)

        # len*batch*1
        probs = output_layer.forward(h_final)
        #probs = output_layer.forward(embs)
        #probs1 = probs.reshape(x.shape)

        #probs_rev = output_layer.forward(flipped_embs)
        #probs1_rev = probs.reshape(x.shape)

        #probs = T.concatenate([probs1, probs1_rev[::-1]], axis=2)

        # len*batch
        probs2 = probs.reshape(x.shape)
        if self.args.seed is not None:
            self.MRG_rng = MRG_RandomStreams(self.args.seed)
        else:
            self.MRG_rng = MRG_RandomStreams()
        z_pred = self.z_pred = T.cast(
            self.MRG_rng.binomial(size=probs2.shape, p=probs2),
            theano.config.floatX)  #"int8")

        # we are computing approximated gradient by sampling z;
        # so should mark sampled z not part of the gradient propagation path
        #
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        #self.sample_updates = sample_updates
        print "z_pred", z_pred.ndim

        z2 = z_pred.dimshuffle((0, 1, "x"))
        logpz = -T.nnet.binary_crossentropy(probs, z2) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                           axis=0,
                           dtype=theano.config.floatX)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
Beispiel #23
0
    def build(
            self,
            dropout,
            char_dim,
            char_hidden_dim,
            char_bidirect,
            word_dim,
            word_hidden_dim,
            word_bidirect,
            tagger_hidden_dim,
            hamming_cost,
            L2_reg,
            lr_method,
            pre_word_emb,
            pre_char_emb,
            tagger,
            use_gaze,
            POS,
            plot_cost,
            #cap_dim,
            training=True,
            **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        # n_pos = len(self.id_to_pos) + 1

        # Number of capitalization features
        #if cap_dim:
        #    n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')  # declare variable,声明整型变量is_train
        word_ids = T.ivector(name='word_ids')  #声明整型一维向量
        char_for_ids = T.imatrix(name='char_for_ids')  # 声明整型二维矩阵
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        if use_gaze:
            gaze = T.imatrix(name='gaze')
        if POS:
            # pos_ids = T.ivector(name='pos_ids')
            pos_one_hot = T.imatrix(name='pos_one_hot')
        #hamming_cost = T.matrix('hamming_cost', theano.config.floatX) # 声明整型二维矩阵
        tag_ids = T.ivector(name='tag_ids')
        #if cap_dim:
        #    cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]  #句子中的单词数

        # Final input (all word features)
        input_dim = 0
        inputs = []
        L2_norm = 0.0

        theano.config.compute_test_value = 'off'
        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_word_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained word embeddings from %s...' % pre_word_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(
                        codecs.open(pre_word_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid word embedding lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word) in pretrained:
                        new_weights[i] = pretrained[re.sub('\d', '0', word)]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained word embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained word embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print('%i found directly, %i after lowercasing + zero.') % (
                    c_found, c_lower + c_zeros)
            L2_norm += (word_layer.embeddings**2).sum()

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_hidden_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
            char_for_input = char_layer.link(char_for_ids)
            char_rev_input = char_layer.link(char_rev_ids)

            # Initialize with pretrained char embeddings
            if pre_char_emb and training:
                new_weights = char_layer.embeddings.get_value()
                print 'Loading pretrained char embeddings from %s...' % pre_char_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(
                        codecs.open(pre_char_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == char_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid char embedding lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_chars):
                    char = self.id_to_char[i]
                    if char in pretrained:
                        new_weights[i] = pretrained[char]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', char) in pretrained:
                        new_weights[i] = pretrained[re.sub('\d', '0', char)]
                        c_zeros += 1
                char_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained char embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained char embeddings.') % (
                        c_found + c_lower + c_zeros, n_chars, 100. *
                        (c_found + +c_lower + c_zeros) / n_chars)
                print('%i found directly, %i after lowercasing + zero.') % (
                    c_found, c_lower + c_zeros)
            L2_norm += (char_layer.embeddings**2).sum()

            char_lstm_for = LSTM(char_dim,
                                 char_hidden_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_hidden_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_for_input)
            char_lstm_rev.link(char_rev_input)

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            for param in char_lstm_for.params[:8]:
                L2_norm += (param**2).sum()

            if char_bidirect:
                char_lstm_hidden = T.concatenate(
                    [char_for_output, char_rev_output], axis=1)
                input_dim += char_hidden_dim
                for param in char_lstm_rev.params[:8]:
                    L2_norm += (param**2).sum()

            else:
                char_lstm_hidden = char_for_output

            inputs.append(char_lstm_hidden)

        # if POS:
        # pos_dim = 20
        # input_dim += pos_dim
        # pos_layer = EmbeddingLayer(n_pos, pos_dim, name='pos_layer')
        # pos_input = pos_layer.link(pos_ids)
        # inputs.append(pos_input)
        # L2_norm += (pos_layer.embeddings ** 2).sum()

        #if len(inputs) != 1:
        inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train,
                              input_test)  # 条件句

        # if POS:
        #     inputs = T.concatenate([inputs, pos_one_hot], axis= 1)
        #     input_dim += 6

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_hidden_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_hidden_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)  # 单词的顺序: I like dog
        word_lstm_rev.link(inputs[::-1, :])  # 单词的顺序: dog like I
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]

        for param in word_lstm_for.params[:8]:
            L2_norm += (param**2).sum()

        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)

            tanh_layer = HiddenLayer(2 * word_hidden_dim,
                                     word_hidden_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
            for param in word_lstm_rev.params[:8]:
                L2_norm += (param**2).sum()

        else:
            final_output = word_for_output

        dims = word_hidden_dim
        if use_gaze:
            final_output = T.concatenate([final_output, gaze], axis=1)
            dims = word_hidden_dim + n_tags

        if POS:
            final_output = T.concatenate([final_output, pos_one_hot], axis=1)
            dims += 6

        # if word_bidirect:
        #     final_output = T.concatenate(
        #         [word_for_output, word_rev_output],
        #         axis=1
        #     )
        #     tanh_layer = HiddenLayer(2 * word_hidden_dim, word_hidden_dim,
        #                              name='tanh_layer', activation='tanh')
        #     final_output = tanh_layer.link(final_output)
        # else:
        #     final_output = word_for_output

        # Sentence to Named Entity tags
        ## final_layer = HiddenLayer(dims, n_tags, name='final_layer',
        ##                           activation=(None if crf else 'softmax'))
        # final_layer = HiddenLayer(word_hidden_dim, n_tags, name='final_layer',
        #                           activation=(None if crf else 'softmax'))
        ## tags_scores = final_layer.link(final_output)
        ## L2_norm += (final_layer.params[0] ** 2).sum()

        # No CRF
        if tagger == 'lstm':
            tagger_layer = LSTM_d(dims,
                                  tagger_hidden_dim,
                                  with_batch=False,
                                  name='LSTM_d')
            tagger_layer.link(final_output)
            final_output = tagger_layer.t

            dims = tagger_hidden_dim

            for param in tagger_layer.params[:8]:
                L2_norm += (param**2).sum()

        final_layer = HiddenLayer(
            dims,
            n_tags,
            name='final_layer',
            activation=(None if tagger == 'crf' else 'softmax'))
        tags_scores = final_layer.link(final_output)
        L2_norm += (final_layer.params[0]**2).sum()

        if tagger != 'crf':
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len),
                                          tag_ids].sum()  # P中对应元素的求和好

            # Score from add_componentnsitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]].sum()  # A中对应元素的求和
            all_paths_scores = forward(observations,
                                       transitions,
                                       hamming_cost=hamming_cost,
                                       n_tags=n_tags,
                                       padded_tags_ids=padded_tags_ids)
            L2_norm += (transitions**2).sum()
            cost = -(real_path_score - all_paths_scores) + L2_reg * L2_norm

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            params.extend(char_layer.params)
            self.add_component(char_lstm_for)
            params.extend(char_lstm_for.params)

            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)

        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)

        # if POS:
        #     self.add_component(pos_layer)
        #     params.extend(pos_layer.params)

        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)

        if tagger == 'lstm':
            self.add_component(tagger_layer)
            params.extend(tagger_layer.params)
        elif tagger == 'crf':
            self.add_component(transitions)
            params.append(transitions)

        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if use_gaze:
            eval_inputs.append(gaze)
        if POS:
            # eval_inputs.append(pos_ids)
            eval_inputs.append(pos_one_hot)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        #if cap_dim:
        #    eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}),
                                      on_unused_input='warn')
        else:
            f_train = None

        if plot_cost:
            f_plot_cost = theano.function(inputs=train_inputs,
                                          outputs=cost,
                                          givens=({
                                              is_train: np.cast['int32'](1)
                                          } if dropout else {}),
                                          on_unused_input='warn')
        else:
            f_plot_cost = None

        # Compile evaluation function
        if tagger != 'crf':
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     on_unused_input='warn')
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         hamming_cost=0,
                                         n_tags=None,
                                         padded_tags_ids=None,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     on_unused_input='warn')

        return f_train, f_eval, f_plot_cost
Beispiel #24
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              pre_voc,
              crf,
              pos_dim,
              n_pos,
              training=1,
              **kwargs
              ):
        """
        Build the network and compile its Theano functions.

        Parameters:
            dropout: dropout probability applied to the concatenated input
                features; a falsy value disables dropout.
            char_dim: size of the character embeddings; a falsy value
                disables the character-level features.
            char_lstm_dim: hidden size of the character LSTMs.
            char_bidirect: if truthy, the reverse character LSTM output is
                also fed to the word LSTM.
            word_dim: size of the word embeddings; a falsy value disables
                the word embedding input. The same value is used as the
                embedding size of the cue layer and of the POS layer.
            word_lstm_dim: hidden size of the word LSTMs.
            word_bidirect: if truthy, forward and reverse word LSTM outputs
                are concatenated and projected through a tanh layer.
            lr_method: optimizer spec, either "name" or
                "name-key_value-key_value..." (values parsed as floats).
            pre_emb: path given to ``np.load`` that yields the pretrained
                embedding matrix; only used when ``training`` is truthy.
            pre_voc: path given to ``np.load`` that yields the words of the
                pretrained vocabulary; position ``i`` names row ``i`` of the
                matrix loaded from ``pre_emb``.
            crf: if truthy, train with a transition matrix and decode with
                ``forward(..., viterbi=True)``; otherwise use a softmax
                output and categorical cross-entropy.
            pos_dim: only used as a flag enabling the POS embedding input
                (the layer itself is ``word_dim`` wide).
            n_pos: number of rows of the POS embedding layer.
            training: if truthy, pretrained embeddings are loaded and the
                training function is compiled.
            **kwargs: accepted and ignored.

        Returns:
            ``(f_train, f_eval)``. ``f_train`` is ``None`` when ``training``
            is falsy. Inputs of ``f_eval`` are, in order and when enabled:
            word_ids, char_for_ids, char_rev_ids, char_pos_ids, cap_ids,
            pos_ids; ``f_train`` takes the same inputs followed by tag_ids.
        """

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_y)
        n_cap = 2

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        cap_ids = T.ivector(name='cap_ids')
        if pos_dim:
            pos_ids = T.ivector(name='pos_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                # Start from the layer's current values so that words
                # missing from the pretrained vocabulary keep them.
                new_weights = word_layer.embeddings.get_value()
                print('Loading pretrained embeddings from %s...' % pre_emb)
                emb_matrix = np.load(pre_emb)
                pre_w2idxs = dict(
                    (w, i) for i, w in enumerate(np.load(pre_voc)))
                # Debug output kept from the original implementation.
                print(pre_w2idxs.items()[:10])
                assert emb_matrix[0].shape[0] == word_dim
                # Pretrained vectors are stored under the lowercased word.
                pretrained = {}
                for w in pre_w2idxs:
                    pretrained[w.lower()] = np.array(
                        [float(x) for x in emb_matrix[pre_w2idxs[w]]]
                    ).astype(np.float32)
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization: exact match first, then the
                # lowercased word, then lowercased with digits mapped to '0'.
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    lowered = word.lower()
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif lowered in pretrained:
                        new_weights[i] = pretrained[lowered]
                        c_lower += 1
                    else:
                        zeroed = re.sub(r'\d', '0', lowered)
                        if zeroed in pretrained:
                            new_weights[i] = pretrained[zeroed]
                            c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                c_total = c_found + c_lower + c_zeros
                print('Loaded %i pretrained embeddings.' % len(pretrained))
                print(('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') % (
                           c_total, n_words, 100. * c_total / n_words))
                print(('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') % (
                           c_found, c_lower, c_zeros))

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            # For each word of the sentence, pick the LSTM state at the
            # position given by char_pos_ids.
            char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]
            char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Cue feature (always present; embedded with word_dim dimensions)
        #
        input_dim += word_dim
        cap_layer = EmbeddingLayer(n_cap, word_dim, name='cap_layer')
        inputs.append(cap_layer.link(cap_ids))

        #
        # POS feature (embedded with word_dim dimensions)
        #
        if pos_dim:
            input_dim += word_dim
            pos_layer = EmbeddingLayer(n_pos, word_dim, name="pos_layer")
            inputs.append(pos_layer.link(pos_ids))

        # Prepare final input
        inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            # At test time the input is rescaled instead of being dropped.
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        # The reverse LSTM reads the sentence backwards; its states are
        # flipped back so both outputs are aligned word by word.
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )
            tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            # Two extra rows/columns stand for the begin and end tags.
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))],
                axis=1
            )
            observations = T.concatenate(
                [b_s, observations, e_s],
                axis=0
            )

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()

            # forward() is defined elsewhere in the project; its result is
            # used here as the score of all tag paths.
            all_paths_scores = forward(observations, transitions)
            cost = - (real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        # Add cue layer (cap for the moment)
        self.add_component(cap_layer)
        params.extend(cap_layer.params)
        # Add pos tag layer
        if pos_dim:
            self.add_component(pos_layer)
            params.extend(pos_layer.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        # add cue vector to the inputs
        eval_inputs.append(cap_ids)
        # add pos vector to the inputs
        if pos_dim:
            eval_inputs.append(pos_ids)

        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters:
        # "name-key_value-key_value" -> name, {key: float(value), ...}
        lr_parts = lr_method.split('-')
        lr_method_name = lr_parts[0]
        lr_method_parameters = {}
        for x in lr_parts[1:]:
            split = x.split('_')
            assert len(split) == 2
            lr_method_parameters[split[0]] = float(split[1])

        # is_train is only part of the graph when dropout is enabled.
        train_givens = {is_train: np.cast['int32'](1)} if dropout else {}
        eval_givens = {is_train: np.cast['int32'](0)} if dropout else {}

        # Compile training function
        print('Compiling...')
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=train_givens
            )
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores,
                givens=eval_givens
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward(observations, transitions, viterbi=True,
                                return_alpha=False, return_best_sequence=True),
                givens=eval_givens
            )

        return f_train, f_eval
Beispiel #25
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs
              ):
        """
        Build the network and compile its Theano functions.

        Parameters:
            dropout: dropout probability applied to the concatenated input
                features; a falsy value disables dropout.
            char_dim: size of the character embeddings; a falsy value
                disables the character-level features.
            char_lstm_dim: hidden size of the character LSTMs.
            char_bidirect: if truthy, the reverse character LSTM output is
                also fed to the word LSTM.
            word_dim: size of the word embeddings; a falsy value disables
                the word embedding input.
            word_lstm_dim: hidden size of the word LSTMs.
            word_bidirect: if truthy, forward and reverse word LSTM outputs
                are concatenated and projected through a tanh layer.
            lr_method: optimizer spec, either "name" or
                "name-key_value-key_value..." (values parsed as floats).
            pre_emb: path of the pretrained embeddings, read through
                ``get_embedding_dict`` with the Word2Vec format; only used
                when ``training`` is truthy.
            crf: if truthy, train with a transition matrix and decode with
                ``forward(..., viterbi=True)``; otherwise use a softmax
                output and categorical cross-entropy.
            cap_dim: size of the capitalization embeddings; a falsy value
                disables the capitalization feature.
            training: if truthy, pretrained embeddings are loaded and the
                training and printing functions are compiled.
            **kwargs: accepted and ignored.

        Returns:
            ``(f_train, f_eval, print_tuple)``. ``f_train`` is ``None`` when
            ``training`` is falsy. ``print_tuple`` is the list
            ``[f_print, embeddings]`` where ``f_print`` is the compiled
            printing function (``None`` when not training) and
            ``embeddings`` is the shared variable of the word embedding
            layer (``None`` when ``word_dim`` is falsy).

        Note:
            The word embedding layer is registered as a component but its
            parameters are deliberately left out of the optimized
            parameters, so the optimizer never updates the word embeddings.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)

            # Initialize with pretrained embeddings
            if pre_emb and training:
                # Start from the layer's current values so that words
                # missing from the pretrained vocabulary keep them.
                new_weights = word_layer.embeddings.get_value()
                print('Loading pretrained embeddings from %s...' % pre_emb)

                # Call shape used here:
                # get_embedding_dict(emb_path, emb_format, first_n, vocab)
                emb_format = pyemblib2.Format.Word2Vec
                pretrained = get_embedding_dict(pre_emb, emb_format, 0, None)

                c_found = 0
                c_lower = 0
                c_zeros = 0

                # Lookup table initialization: exact match first, then the
                # lowercased word, then lowercased with digits mapped to '0'.
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    lowered = word.lower()
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif lowered in pretrained:
                        new_weights[i] = pretrained[lowered]
                        c_lower += 1
                    else:
                        zeroed = re.sub(r'\d', '0', lowered)
                        if zeroed in pretrained:
                            new_weights[i] = pretrained[zeroed]
                            c_zeros += 1

                # word_layer.embeddings is the shared variable that is later
                # returned in print_tuple.
                word_layer.embeddings.set_value(new_weights)
                c_total = c_found + c_lower + c_zeros
                print('Loaded %i pretrained embeddings.' % len(pretrained))
                print(('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') % (
                           c_total, n_words, 100. * c_total / n_words))
                print(('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') % (
                           c_found, c_lower, c_zeros))

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            # For each word of the sentence, pick the LSTM state at the
            # position given by char_pos_ids.
            char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]
            char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        inputs = T.concatenate(inputs, axis=1) if len(inputs) != 1 else inputs[0]

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            # At test time the input is rescaled instead of being dropped.
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        # The reverse LSTM reads the sentence backwards; its states are
        # flipped back so both outputs are aligned word by word.
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )
            tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            # Two extra rows/columns stand for the begin and end tags.
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))],
                axis=1
            )
            observations = T.concatenate(
                [b_s, observations, e_s],
                axis=0
            )

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()

            # forward() is defined elsewhere in the project; its result is
            # used here as the score of all tag paths.
            all_paths_scores = forward(observations, transitions)
            cost = - (real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            # The layer is registered, but word_layer.params is intentionally
            # NOT added to params: this keeps the optimizer from updating
            # the (pretrained) word embeddings.
            self.add_component(word_layer)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters:
        # "name-key_value-key_value" -> name, {key: float(value), ...}
        lr_parts = lr_method.split('-')
        lr_method_name = lr_parts[0]
        lr_method_parameters = {}
        for x in lr_parts[1:]:
            split = x.split('_')
            assert len(split) == 2
            lr_method_parameters[split[0]] = float(split[1])

        # is_train is only part of the graph when dropout is enabled.
        train_givens = {is_train: np.cast['int32'](1)} if dropout else {}
        eval_givens = {is_train: np.cast['int32'](0)} if dropout else {}

        # Compile training function
        print('Compiling...')
        if training:
            # Only the variables collected in params are optimized.
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=train_givens
            )
            # Helper function used to print the embedding matrix: it takes
            # one float64 matrix and passes it through a Print op labelled
            # 'print message'.
            print_matrix = T.dmatrix()
            print_op = printing.Print('print message')
            printed_x = print_op(print_matrix)
            f_print = function([print_matrix], printed_x)
        else:
            f_train = None
            f_print = None

        # Bundle what a caller needs to print the word embeddings. The word
        # layer only exists when word_dim is set, so fall back to None
        # instead of raising a NameError in a character-only model.
        print_tuple = [f_print, word_layer.embeddings if word_dim else None]

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores,
                givens=eval_givens
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward(observations, transitions, viterbi=True,
                                return_alpha=False, return_best_sequence=True),
                givens=eval_givens
            )

        return f_train, f_eval, print_tuple