Example #1
    def __init__(self, input_dim, output_dim, bias=True, activation='tanh',
                 name='hidden_layer'):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.bias = bias
        self.name = name
        if activation is None:
            self.activation = None
        elif activation == 'tanh':
            self.activation = T.tanh
        elif activation == 'sigmoid':
            self.activation = T.nnet.sigmoid
        elif activation == 'softmax':
            self.activation = T.nnet.softmax
        else:
            raise Exception("Unknown activation function: %s" % activation)

        # Initialize weights, and a bias vector only when requested
        # (creating the shared bias unconditionally would overwrite the
        # `bias` flag and make the check below always true)
        self.weights = shared((input_dim, output_dim), name + '__weights')

        # Define parameters
        if bias:
            self.bias = shared((output_dim,), name + '__bias')
            self.params = [self.weights, self.bias]
        else:
            self.params = [self.weights]
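
The snippet stops at the constructor. A minimal sketch of the link method such
a layer typically exposes (an assumption based on comparable Theano taggers,
not code shown above):

        def link(self, input):
            """Affine transform followed by the configured activation (sketch)."""
            self.linear_output = T.dot(input, self.weights)
            if self.bias:
                self.linear_output = self.linear_output + self.bias
            if self.activation is None:
                self.output = self.linear_output
            else:
                self.output = self.activation(self.linear_output)
            return self.output
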
Example #2
    def __init__(self, input_dim, output_dim, name='embedding_layer'):
        """
        Typically, input_dim is the vocabulary size,
        and output_dim the embedding dimension.
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.name = name

        # Randomly generate weights
        self.embeddings = shared((input_dim, output_dim),
                                 self.name + '__embeddings')

        # Define parameters
        self.params = [self.embeddings]
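
    # Note: only the constructor appears in this snippet; the layer's link
    # method is presumably a plain lookup into the embedding table, roughly:
    #
    #     def link(self, input):
    #         """Return the embeddings of the given word/char indices."""
    #         return self.embeddings[input]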
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                # word_dim = 300 # for now making sure the if condition is satisfied to check if the code proceeds
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    # print " Line is ",line
                    # print "len(line) == word_dim + 1",len(line), word_dim + 1
                    # print "Word Embeddings are line[0:]....",line[0]
                    # print "Word Embeddings are line[1:]....",line[1:]
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
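            # Note: h from each character LSTM has shape (chars, words, hidden);
            # the dimshuffle makes it (words, chars, hidden), and char_pos_ids
            # (presumably the index of each word's last character) selects one
            # character-level summary vector per word.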

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)
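            # Note: the is_train switch applies the dropout mask during training
            # and rescales the inputs by (1 - dropout) at test time so that the
            # expected activations match.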

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)
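            # Note: this is the CRF negative log-likelihood. real_path_score is
            # the unnormalized score of the gold tag sequence (emissions plus
            # transitions, including the extra begin/end tags), and forward()
            # returns the log-sum-exp over all tag sequences, so
            # cost = -log P(gold sequence | sentence).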

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}
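        # For example, lr_method = "sgd-lr_.005" yields lr_method_name = "sgd"
        # and lr_method_parameters = {"lr": 0.005}; a bare method name leaves
        # the parameter dict empty.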

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
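
The returned functions operate on one sentence at a time: f_train takes the id
arrays in train_inputs (word, character and capitalization ids plus tag_ids)
and returns the cost after one parameter update, while f_eval returns either
the softmax tag scores or, when a CRF is used, the best tag sequence decoded
by Viterbi.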
Example #4
    def build(
            self,
            dropout,
            char_dim,
            char_hidden_dim,
            char_bidirect,
            word_dim,
            word_hidden_dim,
            word_bidirect,
            tagger_hidden_dim,
            hamming_cost,
            L2_reg,
            lr_method,
            pre_word_emb,
            pre_char_emb,
            tagger,
            use_gaze,
            POS,
            plot_cost,
            #cap_dim,
            training=True,
            **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        # n_pos = len(self.id_to_pos) + 1

        # Number of capitalization features
        #if cap_dim:
        #    n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')  # integer scalar flag: training vs. test mode
        word_ids = T.ivector(name='word_ids')  # integer vector (one id per word)
        char_for_ids = T.imatrix(name='char_for_ids')  # integer matrix (words x chars)
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        if use_gaze:
            gaze = T.imatrix(name='gaze')
        if POS:
            # pos_ids = T.ivector(name='pos_ids')
            pos_one_hot = T.imatrix(name='pos_one_hot')
        #hamming_cost = T.matrix('hamming_cost', theano.config.floatX)  # declare a 2-D matrix
        tag_ids = T.ivector(name='tag_ids')
        #if cap_dim:
        #    cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]  # number of words in the sentence

        # Final input (all word features)
        input_dim = 0
        inputs = []
        L2_norm = 0.0

        theano.config.compute_test_value = 'off'
        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_word_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained word embeddings from %s...' % pre_word_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(
                        codecs.open(pre_word_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid word embedding lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word) in pretrained:
                        new_weights[i] = pretrained[re.sub('\d', '0', word)]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained word embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained word embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print('%i found directly, %i after lowercasing + zero.') % (
                    c_found, c_lower + c_zeros)
            L2_norm += (word_layer.embeddings**2).sum()

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_hidden_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
            char_for_input = char_layer.link(char_for_ids)
            char_rev_input = char_layer.link(char_rev_ids)

            # Initialize with pretrained char embeddings
            if pre_char_emb and training:
                new_weights = char_layer.embeddings.get_value()
                print 'Loading pretrained char embeddings from %s...' % pre_char_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(
                        codecs.open(pre_char_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == char_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid char embedding lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_chars):
                    char = self.id_to_char[i]
                    if char in pretrained:
                        new_weights[i] = pretrained[char]
                        c_found += 1
                    elif char.lower() in pretrained:
                        new_weights[i] = pretrained[char.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', char) in pretrained:
                        new_weights[i] = pretrained[re.sub('\d', '0', char)]
                        c_zeros += 1
                char_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained char embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) chars have been initialized with '
                    'pretrained char embeddings.') % (
                        c_found + c_lower + c_zeros, n_chars, 100. *
                        (c_found + c_lower + c_zeros) / n_chars)
                print('%i found directly, %i after lowercasing + zero.') % (
                    c_found, c_lower + c_zeros)
            L2_norm += (char_layer.embeddings**2).sum()

            char_lstm_for = LSTM(char_dim,
                                 char_hidden_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_hidden_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_for_input)
            char_lstm_rev.link(char_rev_input)

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            for param in char_lstm_for.params[:8]:
                L2_norm += (param**2).sum()

            if char_bidirect:
                char_lstm_hidden = T.concatenate(
                    [char_for_output, char_rev_output], axis=1)
                input_dim += char_hidden_dim
                for param in char_lstm_rev.params[:8]:
                    L2_norm += (param**2).sum()

            else:
                char_lstm_hidden = char_for_output

            inputs.append(char_lstm_hidden)

        # if POS:
        # pos_dim = 20
        # input_dim += pos_dim
        # pos_layer = EmbeddingLayer(n_pos, pos_dim, name='pos_layer')
        # pos_input = pos_layer.link(pos_ids)
        # inputs.append(pos_input)
        # L2_norm += (pos_layer.embeddings ** 2).sum()

        #if len(inputs) != 1:
        inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train,
                              input_test)  # switch between train/test inputs

        # if POS:
        #     inputs = T.concatenate([inputs, pos_one_hot], axis= 1)
        #     input_dim += 6

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_hidden_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_hidden_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)  # word order: I like dog
        word_lstm_rev.link(inputs[::-1, :])  # word order reversed: dog like I
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]

        for param in word_lstm_for.params[:8]:
            L2_norm += (param**2).sum()

        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)

            tanh_layer = HiddenLayer(2 * word_hidden_dim,
                                     word_hidden_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
            for param in word_lstm_rev.params[:8]:
                L2_norm += (param**2).sum()

        else:
            final_output = word_for_output

        dims = word_hidden_dim
        if use_gaze:
            final_output = T.concatenate([final_output, gaze], axis=1)
            dims = word_hidden_dim + n_tags

        if POS:
            final_output = T.concatenate([final_output, pos_one_hot], axis=1)
            dims += 6

        # if word_bidirect:
        #     final_output = T.concatenate(
        #         [word_for_output, word_rev_output],
        #         axis=1
        #     )
        #     tanh_layer = HiddenLayer(2 * word_hidden_dim, word_hidden_dim,
        #                              name='tanh_layer', activation='tanh')
        #     final_output = tanh_layer.link(final_output)
        # else:
        #     final_output = word_for_output

        # Sentence to Named Entity tags
        ## final_layer = HiddenLayer(dims, n_tags, name='final_layer',
        ##                           activation=(None if crf else 'softmax'))
        # final_layer = HiddenLayer(word_hidden_dim, n_tags, name='final_layer',
        #                           activation=(None if crf else 'softmax'))
        ## tags_scores = final_layer.link(final_output)
        ## L2_norm += (final_layer.params[0] ** 2).sum()

        # Optional LSTM tagger layer on top of the encoder output
        if tagger == 'lstm':
            tagger_layer = LSTM_d(dims,
                                  tagger_hidden_dim,
                                  with_batch=False,
                                  name='LSTM_d')
            tagger_layer.link(final_output)
            final_output = tagger_layer.t

            dims = tagger_hidden_dim

            for param in tagger_layer.params[:8]:
                L2_norm += (param**2).sum()

        final_layer = HiddenLayer(
            dims,
            n_tags,
            name='final_layer',
            activation=(None if tagger == 'crf' else 'softmax'))
        tags_scores = final_layer.link(final_output)
        L2_norm += (final_layer.params[0]**2).sum()

        if tagger != 'crf':
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len),
                                          tag_ids].sum()  # sum of the gold entries of the emission scores P

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]].sum()  # sum of the gold entries of the transition matrix A
            all_paths_scores = forward(observations,
                                       transitions,
                                       hamming_cost=hamming_cost,
                                       n_tags=n_tags,
                                       padded_tags_ids=padded_tags_ids)
            L2_norm += (transitions**2).sum()
            cost = -(real_path_score - all_paths_scores) + L2_reg * L2_norm

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            params.extend(char_layer.params)
            self.add_component(char_lstm_for)
            params.extend(char_lstm_for.params)

            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)

        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)

        # if POS:
        #     self.add_component(pos_layer)
        #     params.extend(pos_layer.params)

        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)

        if tagger == 'lstm':
            self.add_component(tagger_layer)
            params.extend(tagger_layer.params)
        elif tagger == 'crf':
            self.add_component(transitions)
            params.append(transitions)

        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if use_gaze:
            eval_inputs.append(gaze)
        if POS:
            # eval_inputs.append(pos_ids)
            eval_inputs.append(pos_one_hot)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        #if cap_dim:
        #    eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}),
                                      on_unused_input='warn')
        else:
            f_train = None

        if plot_cost:
            f_plot_cost = theano.function(inputs=train_inputs,
                                          outputs=cost,
                                          givens=({
                                              is_train: np.cast['int32'](1)
                                          } if dropout else {}),
                                          on_unused_input='warn')
        else:
            f_plot_cost = None

        # Compile evaluation function
        if tagger != 'crf':
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     on_unused_input='warn')
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         hamming_cost=0,
                                         n_tags=None,
                                         padded_tags_ids=None,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     on_unused_input='warn')

        return f_train, f_eval, f_plot_cost
Example #5
    def __init__(self, input_dim, hidden_dim, with_batch=True, name='LSTM'):
        """
        Initialize neural network.
        """
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.with_batch = with_batch
        self.name = name

        # Input gate weights
        self.w_xi = shared((input_dim, hidden_dim), name + '__w_xi')
        self.w_hi = shared((hidden_dim, hidden_dim), name + '__w_hi')
        self.w_ci = shared((hidden_dim, hidden_dim), name + '__w_ci')

        # Forget gate weights
        # self.w_xf = shared((input_dim, hidden_dim), name + '__w_xf')
        # self.w_hf = shared((hidden_dim, hidden_dim), name + '__w_hf')
        # self.w_cf = shared((hidden_dim, hidden_dim), name + '__w_cf')

        # Output gate weights
        self.w_xo = shared((input_dim, hidden_dim), name + '__w_xo')
        self.w_ho = shared((hidden_dim, hidden_dim), name + '__w_ho')
        self.w_co = shared((hidden_dim, hidden_dim), name + '__w_co')

        # Cell weights
        self.w_xc = shared((input_dim, hidden_dim), name + '__w_xc')
        self.w_hc = shared((hidden_dim, hidden_dim), name + '__w_hc')

        # Initialize the bias vectors, c_0 and h_0 to zero vectors
        self.b_i = shared((hidden_dim, ), name + '__b_i')
        # self.b_f = shared((hidden_dim,), name + '__b_f')
        self.b_c = shared((hidden_dim, ), name + '__b_c')
        self.b_o = shared((hidden_dim, ), name + '__b_o')
        self.c_0 = shared((hidden_dim, ), name + '__c_0')
        self.h_0 = shared((hidden_dim, ), name + '__h_0')

        # Define parameters
        self.params = [
            self.w_xi,
            self.w_hi,
            self.w_ci,
            # self.w_xf, self.w_hf, self.w_cf,
            self.w_xo,
            self.w_ho,
            self.w_co,
            self.w_xc,
            self.w_hc,
            self.b_i,
            self.b_c,
            self.b_o,  # self.b_f,
            self.c_0,
            self.h_0
        ]
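
The forget-gate weights above are commented out, so these parameters describe
a coupled-gate LSTM (the forget gate is tied to one minus the input gate) with
peephole connections. A sketch of the per-timestep recurrence such parameters
support (an assumption; the link method is not shown in this snippet):

        def recurrence(x_t, c_tm1, h_tm1):
            # Input gate with a peephole connection to the previous cell state.
            i_t = T.nnet.sigmoid(T.dot(x_t, self.w_xi) +
                                 T.dot(h_tm1, self.w_hi) +
                                 T.dot(c_tm1, self.w_ci) + self.b_i)
            # Coupled input/forget gate: forget = 1 - input.
            c_t = ((1 - i_t) * c_tm1 +
                   i_t * T.tanh(T.dot(x_t, self.w_xc) +
                                T.dot(h_tm1, self.w_hc) + self.b_c))
            # Output gate peeks at the new cell state.
            o_t = T.nnet.sigmoid(T.dot(x_t, self.w_xo) +
                                 T.dot(h_tm1, self.w_ho) +
                                 T.dot(c_t, self.w_co) + self.b_o)
            h_t = o_t * T.tanh(c_t)
            return [c_t, h_t]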
Example #6
    def ready(self):
        args = self.args
        w_emb_layer = self.w_emb_layer
        c_emb_layer = self.c_emb_layer
        r_emb_layers = self.r_emb_layers

        char_dim = self.char_dim = args.char_dim
        char_lstm_dim = self.char_lstm_dim = args.char_lstm_dim
        word_dim = self.word_dim = args.word_dim
        word_lstm_dim = self.word_lstm_dim = args.word_lstm_dim

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        word_ids = self.word_ids = T.ivector('word_ids')
        char_ids = self.char_ids = T.imatrix('char_ids')
        char_lens = self.char_lens = T.fvector('char_lens')
        char_masks = self.char_masks = T.imatrix('char_masks')
        up_ids = self.up_ids = T.imatrix('up_ids')
        up_rels = self.up_rels = T.imatrix('up_rels')
        up_id_masks = self.up_id_masks = T.imatrix('up_id_masks')
        down_ids = self.down_ids = T.imatrix('down_ids')
        down_rels = self.down_rels = T.imatrix('down_rels')
        down_id_masks = self.down_id_masks = T.imatrix('down_id_masks')
        tag_ids = self.tag_ids = T.ivector('tag_ids')

        layers = self.layers = [w_emb_layer, c_emb_layer]
        layers.extend(r_emb_layers)

        inputs = self.inputs = []

        inputs.append(self.word_ids)
        inputs.append(self.char_ids)
        inputs.append(self.char_lens)
        inputs.append(self.char_masks)
        inputs.append(self.up_ids)
        inputs.append(self.up_rels)
        inputs.append(self.up_id_masks)
        inputs.append(self.down_ids)
        inputs.append(self.down_rels)
        inputs.append(self.down_id_masks)
        inputs.append(self.tag_ids)
        wslices = w_emb_layer.forward(word_ids)
        cslices = c_emb_layer.forward(char_ids.ravel())
        cslices = cslices.reshape(
            (char_ids.shape[0], char_ids.shape[1], char_dim))
        cslices = cslices.dimshuffle(1, 0, 2)

        bv_ur_slicess = []
        bv_dr_slicess = []
        b_ur_slicess = []
        b_dr_slicess = []

        for r_emb_layer in r_emb_layers:
            bv_ur_slices = r_emb_layer.forward(up_rels.ravel())
            bv_dr_slices = r_emb_layer.forward(down_rels.ravel())
            b_ur_slices = r_emb_layer.forward2(up_rels.ravel())
            b_dr_slices = r_emb_layer.forward2(down_rels.ravel())
            bv_ur_slicess.append(
                bv_ur_slices.reshape(
                    (up_rels.shape[0], up_rels.shape[1], word_dim)))
            bv_dr_slicess.append(
                bv_dr_slices.reshape(
                    (down_rels.shape[0], down_rels.shape[1], word_dim)))
            b_ur_slicess.append(
                b_ur_slices.reshape(
                    (up_rels.shape[0], up_rels.shape[1], word_dim)))
            b_dr_slicess.append(
                b_dr_slices.reshape(
                    (down_rels.shape[0], down_rels.shape[1], word_dim)))

        char_masks = char_masks.dimshuffle(1, 0)

        prev_output = wslices
        prev_size = word_dim

        if char_dim:
            layers.append(
                LSTM(n_in=char_dim,
                     n_out=char_lstm_dim,
                     direction='bi' if args.char_bidirect else 'si'))
            prev_output_2 = cslices
            prev_output_2 = apply_dropout(prev_output_2, dropout, v2=True)
            prev_output_2 = layers[-1].forward_all(cslices, char_masks)
            prev_output_2 = T.sum(prev_output_2, axis=0)
            prev_output_2 = prev_output_2 / (1e-6 * T.ones_like(char_lens) +
                                             char_lens).dimshuffle(0, 'x')

            prev_size += char_lstm_dim
            prev_output = T.concatenate([prev_output, prev_output_2], axis=1)

        #prev_output = apply_dropout(prev_output, dropout)
        prev_output = apply_dropout(prev_output, dropout)
        if args.conv != 0:
            for ind in range(args.clayer):
                layers.append(GraphCNN(
                    n_in=prev_size,
                    n_out=prev_size,
                ))
                residual = True
                if ind == 0:
                    residual = False
                prev_output = layers[-1].forward_all(prev_output,
                                                     up_ids,
                                                     up_id_masks,
                                                     bv_ur_slicess[ind],
                                                     b_ur_slicess[ind],
                                                     down_ids,
                                                     down_id_masks,
                                                     bv_dr_slicess[ind],
                                                     b_dr_slicess[ind],
                                                     residual=residual)
                prev_output = apply_dropout(prev_output, dropout)

        #prev_size *= 3
        layers.append(
            LSTM(n_in=prev_size,
                 n_out=word_lstm_dim,
                 direction='bi' if args.word_bidirect else 'si'))

        prev_output = prev_output.dimshuffle(0, 'x', 1)
        prev_output = layers[-1].forward_all(prev_output)
        prev_output = prev_output.reshape(
            (prev_output.shape[0], prev_output.shape[-1]))

        prev_size = word_lstm_dim

        layers.append(
            Layer(
                n_in=prev_size,
                n_out=args.classes,
                activation=linear,  #ReLU,
                has_bias=False))

        n_tags = args.classes
        s_len = char_ids.shape[0]
        tags_scores = layers[-1].forward(prev_output)
        transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
        small = -1000
        b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
        e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
        observations = T.concatenate([tags_scores, small * T.ones((s_len, 2))],
                                     axis=1)

        observations = T.concatenate([b_s, observations, e_s], axis=0)

        real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
        b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
        e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
        padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)

        pre_ids = T.arange(s_len + 1)

        s_ids = T.arange(s_len + 1) + 1

        real_path_score += transitions[padded_tags_ids[pre_ids],
                                       padded_tags_ids[s_ids]].sum()

        all_paths_scores = CRFForward(observations, transitions)
        self.nll_loss = nll_loss = -(real_path_score - all_paths_scores)
        preds = CRFForward(observations,
                           transitions,
                           viterbi=True,
                           return_alpha=False,
                           return_best_sequence=True)

        self.pred = preds[1:-1]
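        # preds includes the padded begin/end tags added via b_s/e_s, so the
        # first and last positions are stripped to recover the sentence tags.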

        self.l2_sqr = None
        params = self.params = [transitions]
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        #for l, i in zip(layers[3:], range(len(layers[3:]))):
        for l, i in zip(layers[2 + len(r_emb_layers):],
                        range(len(layers[2 + len(r_emb_layers):]))):
            say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out))

            #say("layer {}: n_in={}\tn_out={}\n".format(
            #    i, l.n_in, l.n_out
            #))

        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                        for x in self.params)
        say("total # parameters: {}\n".format(nparams))

        cost = self.nll_loss + self.l2_sqr

        lr_method_name = args.learning
        lr_method_parameters = {}
        lr_method_parameters['lr'] = args.learning_rate
        updates = Optimization(clip=5.0).get_updates(lr_method_name, cost,
                                                     params,
                                                     **lr_method_parameters)

        f_train = theano.function(inputs=self.inputs,
                                  outputs=[cost, nll_loss],
                                  updates=updates,
                                  allow_input_downcast=True)

        f_eval = theano.function(inputs=self.inputs[:-1],
                                 outputs=self.pred,
                                 allow_input_downcast=True)

        return f_train, f_eval
Example #7
    def fprop(self, input, extra_input):
        cap_out = []
        for i in xrange(num_capsules):
            out, prob = self.capsules[i].fprop(input, extra_input)
            cap_out.append((out, prob))
        #prob_sum = sum([result[1] for result in cap_out])
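        # caps_out below is the probability-weighted sum of the capsule outputs;
        # adding the shared output bias and applying a sigmoid gives the
        # predicted (shifted) image.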
        caps_out = sum([result[0] * result[1] for result in cap_out])
        shifted_img = T.nnet.sigmoid(caps_out + self.b_out)
        return shifted_img


if __name__ == "__main__":
    train, valid, test = load('mnist.pkl.gz')
    trans_train, shift_train, ori_train = translation(train[0], 28)
    trans_train, shift_train, ori_train = shared(
        (trans_train, shift_train, ori_train))
    trans_valid, shift_valid, ori_valid = translation(valid[0], 28)
    trans_valid, shift_valid, ori_valid = shared(
        (trans_valid, shift_valid, ori_valid))
    trans_test, shift_test, ori_test = translation(test[0], 28)
    trans_test, shift_test, ori_test = shared(
        (trans_test, shift_test, ori_test))

    num_capsules = 60
    in_dim = 784
    recog_dim = 10
    gener_dim = 20
    activation = 'sigmoid'

    input = T.matrix('input')
    extra_input = T.matrix('extra')
Example #8
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              model_type,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        layer_weighting = "fixed"
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)

        print "-------------------------------MODEL INFO---------------------------------------"
        print "** model_type", model_type
        print "** n_words, n_chars:", n_words, n_chars
        print "** self.feature_maps:"
        for f in self.feature_maps:
            print f["name"], f
        print "** self.tag_maps:"
        for tm in self.tag_maps:
            print tm
        print "---------------------------------------------------------------------------------"

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')

        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')

        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        features_ids = []
        for f in self.feature_maps:
            features_ids.append(T.ivector(name=f['name'] + '_ids'))

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            print "** input_dim (input_dim += word_dim)", input_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            print "** input_dim (input_dim += char_lstm_dim)", input_dim

            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim
                print "** input_dim (input_dim += char_lstm_dim: char_bidirect)", input_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            print "** input_dim (input_dim += cap_dim)", input_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        f_layers = []
        for ilayer in range(len(self.feature_maps)):
            f = self.feature_maps[ilayer]
            input_dim += f['dim']
            print "** input_dim (input_dim += f['dim'])", input_dim

            af_layer = EmbeddingLayer(len(f['id_to_ftag']),
                                      f['dim'],
                                      name=f['name'] + '_layer')
            f_layers.append(af_layer)
            inputs.append(af_layer.link(features_ids[ilayer]))

        # Prepare final input
        inputs = T.concatenate(inputs, axis=1)
        # inputs_nodropout = inputs

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        assert model_type in {
            "struct", "struct_mlp", "struct_mlp2", "multilayer", "single"
        }
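        # "multilayer"/"single": one BiLSTM + output layer per tag map, each
        #     layer receiving the previous layer's tag scores as extra input;
        # "struct", "struct_mlp", "struct_mlp2": a single shared BiLSTM encoder
        #     with chained output layers (optionally one or two MLP hidden
        #     layers in front of each output layer).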

        # Network parameters: Part 1 (Common parameters)
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)

        for af_layer in f_layers:
            self.add_component(af_layer)
            params.extend(af_layer.params)

        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)

        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)

        if model_type == "multilayer" or model_type == "single":
            tags_scores_list = []
            tag_ids_list = []
            cost_list = []

            observations_list = []
            transitions_list = []

            prev_input_dim = input_dim
            prev_ntags = 0
            prev_tags_cores = None
            previous_inputs = inputs

            for ilayer in range(len(self.tag_maps)):
                inputs_i = previous_inputs if prev_tags_cores is None else T.concatenate(
                    [previous_inputs, prev_tags_cores], axis=1)
                previous_inputs = inputs_i
                input_dim_i = prev_input_dim + prev_ntags
                print "input_dim_i for layer %d: %d" % (ilayer, input_dim_i)

                word_lstm_for_i = LSTM(input_dim_i,
                                       word_lstm_dim,
                                       with_batch=False,
                                       name='word_lstm_for' + str(ilayer))
                word_lstm_rev_i = LSTM(input_dim_i,
                                       word_lstm_dim,
                                       with_batch=False,
                                       name='word_lstm_rev' + str(ilayer))
                word_lstm_for_i.link(inputs_i)
                word_lstm_rev_i.link(inputs_i[::-1, :])
                word_for_output_i = word_lstm_for_i.h
                word_rev_output_i = word_lstm_rev_i.h[::-1, :]

                if word_bidirect:
                    final_output_i = T.concatenate(
                        [word_for_output_i, word_rev_output_i], axis=1)
                    tanh_layer_i = HiddenLayer(2 * word_lstm_dim,
                                               word_lstm_dim,
                                               name='tanh_layer' + str(ilayer),
                                               activation='tanh')
                    final_output_i = tanh_layer_i.link(final_output_i)
                else:
                    final_output_i = word_for_output_i

                n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])

                final_layer_i = HiddenLayer(
                    word_lstm_dim,
                    n_tags_i,
                    name='final_layer' + str(ilayer),
                    activation=(None if crf else 'softmax'))
                tags_scores_i = final_layer_i.link(final_output_i)
                tag_ids_i = T.ivector(name='tag_ids' +
                                      str(ilayer))  # input tags of layer i

                # No CRF
                if not crf:
                    cost_i = T.nnet.categorical_crossentropy(
                        tags_scores_i, tag_ids_i).mean()
                # CRF
                else:
                    transitions_i = shared((n_tags_i + 2, n_tags_i + 2),
                                           'transitions' + str(ilayer))
                    small1 = -1000
                    b_s1 = np.array([[small1] * n_tags_i + [0, small1]
                                     ]).astype(np.float32)

                    e_s1 = np.array([[small1] * n_tags_i + [small1, 0]
                                     ]).astype(np.float32)

                    observations_i = T.concatenate(
                        [tags_scores_i, small1 * T.ones((s_len, 2))], axis=1)
                    observations_i = T.concatenate(
                        [b_s1, observations_i, e_s1], axis=0)

                    # Score from tags
                    real_path_score1 = tags_scores_i[T.arange(s_len),
                                                     tag_ids_i].sum()

                    # Score from transitions
                    b_id1 = theano.shared(
                        value=np.array([n_tags_i], dtype=np.int32))
                    e_id1 = theano.shared(
                        value=np.array([n_tags_i + 1], dtype=np.int32))
                    padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1],
                                                     axis=0)
                    real_path_score1 += transitions_i[
                        padded_tags_ids1[T.arange(s_len + 1)],
                        padded_tags_ids1[T.arange(s_len + 1) + 1]].sum()

                    all_paths_scores1 = forward(observations_i, transitions_i)

                    cost_i = -(real_path_score1 - all_paths_scores1)

                    observations_list.append(observations_i)
                    transitions_list.append(transitions_i)

                prev_input_dim = input_dim_i
                prev_ntags = n_tags_i
                prev_tags_cores = tags_scores_i * 1

                cost_list.append(cost_i)  # add cost of layer i into cost list
                tags_scores_list.append(tags_scores_i)
                tag_ids_list.append(tag_ids_i)

                # Network parameters: Part 2 (add parameters of mutilayer architectures)

                self.add_component(word_lstm_for_i)
                params.extend(word_lstm_for_i.params)  #1

                if word_bidirect:
                    self.add_component(word_lstm_rev_i)
                    params.extend(word_lstm_rev_i.params)  #2

                self.add_component(final_layer_i)
                params.extend(final_layer_i.params)  #3

                if crf:
                    self.add_component(transitions_i)
                    params.append(transitions_i)  #4

                if word_bidirect:
                    self.add_component(tanh_layer_i)
                    params.extend(tanh_layer_i.params)  #5

            # end for loop

        elif model_type == "struct" or model_type.startswith("struct_mlp"):
            # begin step 1: Using BI-LSTM to encode the sequence

            word_lstm_for = LSTM(input_dim,
                                 word_lstm_dim,
                                 with_batch=False,
                                 name='word_lstm_for')
            word_lstm_rev = LSTM(input_dim,
                                 word_lstm_dim,
                                 with_batch=False,
                                 name='word_lstm_rev')

            word_lstm_for.link(inputs)
            word_lstm_rev.link(inputs[::-1, :])
            word_for_output = word_lstm_for.h
            word_rev_output = word_lstm_rev.h[::-1, :]
            if word_bidirect:
                lstm_output = T.concatenate([word_for_output, word_rev_output],
                                            axis=1)
                tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                         word_lstm_dim,
                                         name='tanh_layer',
                                         activation='tanh')
                lstm_output = tanh_layer.link(lstm_output)
            else:
                lstm_output = word_for_output

            # end step 1: lstm_output holds the Bi-LSTM hidden states of the sentence,
            # one vector of size word_lstm_dim per token (shape: (s_len, word_lstm_dim))

            prev_ntags = 0
            tags_scores_list = []
            prev_tags_cores = None
            final_layer_list = []
            final_output = lstm_output
            mlp_list = []

            if model_type == "struct":
                for ilayer in range(0, len(self.tag_maps)):
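                    # "struct": one tag classifier per layer on top of a single shared
                    # Bi-LSTM. From the second layer on, the tag scores of the previous
                    # layers are concatenated to the LSTM features, so each layer can
                    # condition on the lower-level predictions.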
                    n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
                    final_output = final_output if prev_tags_cores is None else T.concatenate(
                        [final_output, prev_tags_cores], axis=1)
                    final_layer_i = HiddenLayer(
                        word_lstm_dim + prev_ntags,
                        n_tags_i,
                        name='final_layer_' + str(ilayer),
                        activation=(None if crf else 'softmax'))
                    tags_scores_i = final_layer_i.link(final_output)

                    prev_ntags += n_tags_i
                    prev_tags_cores = tags_scores_i
                    tags_scores_list.append(tags_scores_i)
                    final_layer_list.append(final_layer_i)
            elif model_type.startswith("struct_mlp"):

                for ilayer in range(0, len(self.tag_maps)):
                    n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
                    final_output = final_output if prev_tags_cores is None else T.concatenate(
                        [final_output, prev_tags_cores], axis=1)

                    if model_type == "struct_mlp2":
                        mlp_sizes = [
                            word_lstm_dim + prev_ntags, word_lstm_dim,
                            word_lstm_dim
                        ]
                    else:
                        mlp_sizes = [word_lstm_dim + prev_ntags, word_lstm_dim]
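                    # "struct_mlp" additionally inserts a stack of tanh hidden layers
                    # (one for struct_mlp, two for struct_mlp2) between the concatenated
                    # features and the final tag classifier.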

                    mlp_input = final_output
                    for j in range(len(mlp_sizes) - 1):
                        mlp_layer = HiddenLayer(mlp_sizes[j],
                                                mlp_sizes[j + 1],
                                                name="mlp" + str(j + 1) +
                                                "_layer_" + str(ilayer),
                                                activation="tanh")
                        mlp_input = mlp_layer.link(mlp_input)
                        mlp_list.append(mlp_layer)
                    final_layer_i = HiddenLayer(
                        word_lstm_dim,
                        n_tags_i,
                        name='final_layer_' + str(ilayer),
                        activation=(None if crf else 'softmax'))
                    tags_scores_i = final_layer_i.link(mlp_input)

                    # # unroll version
                    # mlp1_layer_i = HiddenLayer(word_lstm_dim + prev_ntags, word_lstm_dim,
                    #                            name="mlp1_layer_" + str(ilayer), activation="tanh")
                    # mlp1_layer_i_out = mlp1_layer_i.link(final_output)
                    #
                    # mlp2_layer_i = HiddenLayer(word_lstm_dim, word_lstm_dim,
                    #                            name="mlp2_layer_" + str(ilayer), activation="tanh")
                    # mlp2_layer_i_out = mlp2_layer_i.link(mlp1_layer_i_out)
                    # mlp_list.append(mlp1_layer_i)
                    # mlp_list.append(mlp2_layer_i)
                    #
                    # final_layer_i = HiddenLayer(word_lstm_dim, n_tags_i, name='final_layer_' + str(ilayer),
                    #                             activation=(None if crf else 'softmax'))
                    # tags_scores_i = final_layer_i.link(mlp2_layer_i_out)

                    prev_ntags += n_tags_i
                    prev_tags_cores = tags_scores_i
                    tags_scores_list.append(tags_scores_i)
                    final_layer_list.append(final_layer_i)
            else:
                print model_type, "is not a valid model_type!"
                raise Exception("Unknown model_type: %s" % model_type)

            # # unroll code
            # n_tags_0 = len(self.tag_maps[0]['id_to_tag'])
            # final_layer_0 = HiddenLayer(word_lstm_dim, n_tags_0, name='final_layer_0', activation=(None if crf else 'softmax'))
            # tags_scores_0 = final_layer_0.link(final_output)
            #
            # n_tags_1 = len(self.tag_maps[1]['id_to_tag'])
            # final_layer_1 = HiddenLayer(word_lstm_dim + n_tags_0, n_tags_1, name='final_layer_1', activation=(None if crf else 'softmax'))
            # final_output = T.concatenate( [final_output, tags_scores_0], axis=1 )
            # tags_scores_1 = final_layer_1.link(final_output)
            #
            # n_tags_2 = len(self.tag_maps[2]['id_to_tag'])
            # final_layer_2 = HiddenLayer(word_lstm_dim + n_tags_0 + n_tags_1, n_tags_2, name='final_layer_2',
            #                         activation=(None if crf else 'softmax'))
            # final_output = T.concatenate([final_output, tags_scores_1], axis=1)
            # tags_scores_2 = final_layer_2.link(final_output)
            # tags_scores_list = [tags_scores_0, tags_scores_1, tags_scores_2]

            tag_ids_list = []
            observations_list = []
            transitions_list = []
            cost_list = []

            for ilayer in range(0, len(self.tag_maps)):
                tag_ids_i = T.ivector(name='tag_ids' +
                                      str(ilayer))  # input tags
                tag_ids_list.append(tag_ids_i)
                tags_scores_i = tags_scores_list[ilayer]
                n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
                # No CRF
                if not crf:
                    cost_i = T.nnet.categorical_crossentropy(
                        tags_scores_i, tag_ids_i).mean()
                # CRF
                else:
                    transitions_i = shared((n_tags_i + 2, n_tags_i + 2),
                                           'transitions' + str(ilayer))
                    small1 = -1000
                    b_s1 = np.array([[small1] * n_tags_i + [0, small1]
                                     ]).astype(np.float32)
                    e_s1 = np.array([[small1] * n_tags_i + [small1, 0]
                                     ]).astype(np.float32)
                    observations_i = T.concatenate(
                        [tags_scores_i, small1 * T.ones((s_len, 2))], axis=1)
                    observations_i = T.concatenate(
                        [b_s1, observations_i, e_s1], axis=0)

                    # Score from tags
                    real_path_score1 = tags_scores_i[T.arange(s_len),
                                                     tag_ids_i].sum()

                    # Score from transitions
                    b_id1 = theano.shared(
                        value=np.array([n_tags_i], dtype=np.int32))
                    e_id1 = theano.shared(
                        value=np.array([n_tags_i + 1], dtype=np.int32))
                    padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1],
                                                     axis=0)
                    real_path_score1 += transitions_i[
                        padded_tags_ids1[T.arange(s_len + 1)],
                        padded_tags_ids1[T.arange(s_len + 1) + 1]].sum()

                    all_paths_scores1 = forward(observations_i, transitions_i)

                    cost_i = -(real_path_score1 - all_paths_scores1)

                    observations_list.append(observations_i)
                    transitions_list.append(transitions_i)

                cost_list.append(cost_i)  # add cost of layer i into cost list

            # Network parameters: Part 2 (add parameters of struct architectures)

            self.add_component(word_lstm_for)
            params.extend(word_lstm_for.params)

            if word_bidirect:
                self.add_component(word_lstm_rev)
                params.extend(word_lstm_rev.params)

            for mlp_layer in mlp_list:
                self.add_component(mlp_layer)
                params.extend(mlp_layer.params)

            for final_layer in final_layer_list:
                self.add_component(final_layer)
                params.extend(final_layer.params)

            # # unroll code
            # self.add_component(final_layer_0)
            # params.extend(final_layer_0.params)
            #
            # self.add_component(final_layer_1)
            # params.extend(final_layer_1.params)
            #
            # self.add_component(final_layer_2)
            # params.extend(final_layer_2.params)

            if crf:
                for transitions in transitions_list:
                    self.add_component(transitions)
                    params.append(transitions)

            if word_bidirect:
                self.add_component(tanh_layer)
                params.extend(tanh_layer.params)

        # elif model_type == "multilayer_original":
        #     print "** input_dim FOR LAYER 0 ", input_dim
        #     # LSTM for words
        #     word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
        #                          name='word_lstm_for')
        #     word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
        #                          name='word_lstm_rev')
        #
        #     word_lstm_for.link(inputs)
        #     word_lstm_rev.link(inputs[::-1, :])
        #     word_for_output = word_lstm_for.h
        #     word_rev_output = word_lstm_rev.h[::-1, :]
        #     if word_bidirect:
        #         final_output = T.concatenate(
        #             [word_for_output, word_rev_output],
        #             axis=1
        #         )
        #         tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
        #                                  name='tanh_layer', activation='tanh')
        #         final_output = tanh_layer.link(final_output)
        #     else:
        #         final_output = word_for_output
        #
        #     # Sentence to Named Entity tags - Score
        #     n_tags = len(self.tag_maps[0]['id_to_tag'])
        #
        #     final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
        #                               activation=(None if crf else 'softmax'))
        #     tags_scores = final_layer.link(final_output)
        #     tag_ids = T.ivector(name='tag_ids0')  # input tags of layer i
        #
        #     # No CRF
        #     if not crf:
        #         cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        #     # CRF
        #     else:
        #         transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
        #
        #         small = -1000
        #         b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
        #         e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
        #         observations = T.concatenate(
        #             [tags_scores, small * T.ones((s_len, 2))],
        #             axis=1
        #         )
        #         observations = T.concatenate(
        #             [b_s, observations, e_s],
        #             axis=0
        #         )
        #
        #         # Score from tags
        #         real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
        #
        #         # Score from transitions
        #         b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
        #         e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
        #         padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
        #         real_path_score += transitions[
        #             padded_tags_ids[T.arange(s_len + 1)],
        #             padded_tags_ids[T.arange(s_len + 1) + 1]
        #         ].sum()
        #
        #         all_paths_scores = forward(observations, transitions)
        #         cost = - (real_path_score - all_paths_scores)
        #
        #     print "cost: ", cost
        #     # Network parameters
        #
        #
        #     self.add_component(word_lstm_for)
        #     params.extend(word_lstm_for.params)  #1
        #
        #     if word_bidirect:
        #         self.add_component(word_lstm_rev)
        #         params.extend(word_lstm_rev.params)  #2
        #
        #     self.add_component(final_layer)
        #     params.extend(final_layer.params)  #3
        #
        #     if crf:
        #         self.add_component(transitions)
        #         params.append(transitions)  #4
        #
        #     if word_bidirect:
        #         self.add_component(tanh_layer)
        #         params.extend(tanh_layer.params)  #5
        #
        #     #
        #     #    layer 1 to n
        #     #
        #     tags_scores_list = [tags_scores]
        #     tag_ids_list = [tag_ids]
        #     cost_list = [cost]
        #     observations_list = [observations]
        #     transitions_list = [transitions]
        #     prev_input_dim = input_dim
        #     prev_ntags = n_tags
        #     prev_tags_cores = tags_scores * 1
        #
        #     for ilayer in range(1, len(self.tag_maps)):
        #         inputs_i = previous_inputs * 1
        #         inputs_i.append(prev_tags_cores)
        #         previous_inputs = inputs_i * 1
        #
        #         inputs_i = T.concatenate(inputs_i, axis=1)
        #         input_dim_i = prev_input_dim + prev_ntags
        #
        #         word_lstm_for_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_for' + str(ilayer))
        #         word_lstm_rev_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_rev' + str(ilayer))
        #         word_lstm_for_i.link(inputs_i)
        #         word_lstm_rev_i.link(inputs_i[::-1, :])
        #         word_for_output_i = word_lstm_for_i.h
        #         word_rev_output_i = word_lstm_rev_i.h[::-1, :]
        #
        #         if word_bidirect:
        #             final_output_i = T.concatenate(
        #                 [word_for_output_i, word_rev_output_i],
        #                 axis=1
        #             )
        #             tanh_layer_i = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
        #                                        name='tanh_layer' + str(ilayer), activation='tanh')
        #             final_output_i = tanh_layer_i.link(final_output_i)
        #         else:
        #             final_output_i = word_for_output_i
        #
        #         n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
        #
        #         final_layer_i = HiddenLayer(word_lstm_dim, n_tags_i, name='final_layer' + str(ilayer),
        #                                     activation=(None if crf else 'softmax'))
        #         tags_scores_i = final_layer_i.link(final_output_i)
        #         tags_scores_list.append(tags_scores_i)
        #         tag_ids_i = T.ivector(name='tag_ids' + str(ilayer))  # input tags
        #         tag_ids_list.append(tag_ids_i)
        #
        #         # No CRF
        #         if not crf:
        #             cost_i = T.nnet.categorical_crossentropy(tags_scores_i, tag_ids_i).mean()
        #         # CRF
        #         else:
        #             transitions_i = shared((n_tags_i + 2, n_tags_i + 2), 'transitions' + str(ilayer))
        #             small1 = -1000
        #             b_s1 = np.array([[small1] * n_tags_i + [0, small1]]).astype(np.float32)
        #             e_s1 = np.array([[small1] * n_tags_i + [small1, 0]]).astype(np.float32)
        #             observations_i = T.concatenate([tags_scores_i, small1 * T.ones((s_len, 2))], axis=1)
        #             observations_i = T.concatenate([b_s1, observations_i, e_s1], axis=0)
        #
        #             # Score from tags
        #             real_path_score1 = tags_scores_i[T.arange(s_len), tag_ids_i].sum()
        #
        #             # Score from transitions
        #             b_id1 = theano.shared(value=np.array([n_tags_i], dtype=np.int32))
        #             e_id1 = theano.shared(value=np.array([n_tags_i + 1], dtype=np.int32))
        #             padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1], axis=0)
        #             real_path_score1 += transitions_i[
        #                 padded_tags_ids1[T.arange(s_len + 1)],
        #                 padded_tags_ids1[T.arange(s_len + 1) + 1]
        #             ].sum()
        #
        #             all_paths_scores1 = forward(observations_i, transitions_i)
        #
        #             cost_i = - (real_path_score1 - all_paths_scores1)
        #
        #             observations_list.append(observations_i)
        #             transitions_list.append(transitions_i)
        #
        #         prev_input_dim = input_dim_i
        #         prev_ntags = n_tags_i
        #         prev_tags_cores = tags_scores_i * 1
        #         cost_list.append(cost_i)  # add cost of layer i into cost list
        #
        #         # add parameters
        #
        #         self.add_component(word_lstm_for_i)
        #         params.extend(word_lstm_for_i.params)
        #
        #         if word_bidirect:
        #             self.add_component(word_lstm_rev_i)
        #             params.extend(word_lstm_rev_i.params)
        #
        #         self.add_component(final_layer_i)
        #         params.extend(final_layer_i.params)
        #
        #         if crf:
        #             self.add_component(transitions_i)
        #             params.append(transitions_i)
        #
        #         if word_bidirect:
        #             self.add_component(tanh_layer_i)
        #             params.extend(tanh_layer_i.params)
        #
        #     # end for loop

        if layer_weighting == "fixed":
            if len(self.tag_maps) == 2:
                cost_weights = np.array([0.4, 0.6])
            elif len(self.tag_maps) == 3:
                cost_weights = np.array([0.4, 0.3, 0.3])
            else:
                cost_weights = np.ones(
                    (len(self.tag_maps), )) / len(self.tag_maps)
            costall = np.sum(cost_weights * np.array(cost_list))

        else:
            # https://groups.google.com/forum/#!topic/theano-users/XDG6MM83grI
            weights = np.ones((len(self.tag_maps), )) / len(self.tag_maps)
            cost_weights = theano.shared(weights.astype(theano.config.floatX),
                                         name="layer_weights")
            layer_weights = theano.tensor.nnet.sigmoid(cost_weights)
            params.extend([cost_weights])
            xx = theano.tensor.mul(layer_weights,
                                   theano.tensor.as_tensor_variable(cost_list))
            costall = theano.tensor.sum(xx)
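            # Learned layer weighting: cost_weights is a trainable shared variable
            # (added to params above), squashed through a sigmoid, so the relative
            # importance of each tag layer's cost is learned jointly with the network.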

        # Prepare train and eval inputs
        eval_inputs = []

        if word_dim:
            eval_inputs.append(word_ids)

        for ilayer in range(len(self.feature_maps)):
            eval_inputs.append(features_ids[ilayer])

        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)

        if cap_dim:
            eval_inputs.append(cap_ids)

        train_inputs = eval_inputs + tag_ids_list

        print "-- train_inputs: ",
        print train_inputs  # [word_ids, pos_ids, chunk_ids, wh_ids, if_ids, s_ids, tag_ids, tag_ids1, tag_ids2]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            # print "train_inputs[9]", train_inputs[9]
            print "-- len(cost_list): ", len(cost_list)
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, costall, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=costall,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))

        else:
            f_train = None

        # Compile evaluation function
        tags_scores_out = tags_scores_list
        print "-- len(tags_scores_list): ", len(tags_scores_list)

        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores_out,
                givens=({
                    is_train: np.cast['int32'](0)
                } if dropout else {})  #,
                # on_unused_input='ignore'
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward_n(zip(observations_list, transitions_list),
                                  viterbi=True,
                                  return_alpha=False,
                                  return_best_sequence=True),
                givens=({
                    is_train: np.cast['int32'](0)
                } if dropout else {})  #,
                # on_unused_input='ignore'
            )

        from pprint import pprint
        print "--------------------------------------------------------------"
        pprint(self.components)

        return f_train, f_eval  # return f_train, f_eval, f_test
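
For reference, a minimal usage sketch (not part of the original code) of how the two compiled functions might be called for a word-only model (no feature, character, or capitalization inputs) with three tag layers; the arrays and their values below are hypothetical:

import numpy as np

# One sentence of length 5, encoded as int32 ids in the same order as eval_inputs.
x_word_ids = np.array([12, 7, 3, 85, 2], dtype=np.int32)
y_layer0 = np.array([0, 1, 2, 0, 0], dtype=np.int32)  # gold tags of layer 0
y_layer1 = np.array([0, 0, 3, 0, 0], dtype=np.int32)  # gold tags of layer 1
y_layer2 = np.array([0, 0, 0, 4, 0], dtype=np.int32)  # gold tags of layer 2

# train_inputs = eval_inputs + tag_ids_list, so the tag vectors come last.
train_args = [x_word_ids, y_layer0, y_layer1, y_layer2]
cost_value = f_train(*train_args)   # one update step, returns the weighted total cost
predictions = f_eval(x_word_ids)    # one score matrix (or Viterbi path) per tag layer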
Beispiel #9
    def build(self):
#{{{
        super(AttentionLSTM, self).build()
        self.W_A = shared((self.input_dim + self.output_dim, 1),
                          name='{}_W_A'.format(self.name))
        self.b_A = shared((1,), name='{}_b_A'.format(self.name))
        self.params += [self.W_A, self.b_A]
        self.params.append(self.b_out)

    def fprop(self, input, extra_input):
        cap_out = []
        for i in xrange(num_capsules):
            out, prob = self.capsules[i].fprop(input, extra_input)
            cap_out.append((out, prob))
        #prob_sum = sum([result[1] for result in cap_out])
        caps_out = sum([result[0]*result[1] for result in cap_out])
        shifted_img = T.nnet.sigmoid(caps_out + self.b_out)
        return shifted_img

if __name__ == "__main__":
    train, valid, test = load('mnist.pkl.gz')
    trans_train, shift_train, ori_train = translation(train[0], 28)
    trans_train, shift_train, ori_train = shared((trans_train, shift_train, ori_train))
    trans_valid, shift_valid, ori_valid = translation(valid[0], 28)
    trans_valid, shift_valid, ori_valid = shared((trans_valid, shift_valid, ori_valid))
    trans_test, shift_test, ori_test = translation(test[0], 28)
    trans_test, shift_test, ori_test = shared((trans_test, shift_test, ori_test))

    num_capsules = 60
    in_dim = 784
    recog_dim = 10
    gener_dim = 20
    activation = 'sigmoid'

    input = T.matrix('input')
    extra_input = T.matrix('extra')
    output = T.matrix('output')
    transae = TransAE(num_capsules, in_dim, recog_dim, gener_dim, activation)
Beispiel #11
    def __init__(self, input_dim, hidden_dim, with_batch=True, name='LSTM', lstm_type="CIFG"):
        """
        Initialize neural network.
        """
        if lstm_type not in {"vanilla", "CIFG"}:
            raise Exception("lstm_type must be: <vanilla> | <CIFG>")

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.with_batch = with_batch
        self.lstm_type = lstm_type
        self.name = name

        # Input gate weights
        self.w_xi = shared((input_dim, hidden_dim), name + '__w_xi')
        self.w_hi = shared((hidden_dim, hidden_dim), name + '__w_hi')
        self.w_ci = shared((hidden_dim, hidden_dim), name + '__w_ci')

        # Forget gate weights
        if lstm_type == "vanilla":
            self.w_xf = shared((input_dim, hidden_dim), name + '__w_xf')
            self.w_hf = shared((hidden_dim, hidden_dim), name + '__w_hf')
            self.w_cf = shared((hidden_dim, hidden_dim), name + '__w_cf')
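        # (CIFG couples the forget gate to the input gate, presumably as f_t = 1 - i_t
        #  in the recurrence, so no separate forget-gate parameters are created.)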

        # Output gate weights
        self.w_xo = shared((input_dim, hidden_dim), name + '__w_xo')
        self.w_ho = shared((hidden_dim, hidden_dim), name + '__w_ho')
        self.w_co = shared((hidden_dim, hidden_dim), name + '__w_co')

        # Cell weights
        self.w_xc = shared((input_dim, hidden_dim), name + '__w_xc')
        self.w_hc = shared((hidden_dim, hidden_dim), name + '__w_hc')

        # Initialize the bias vectors, c_0 and h_0 to zero vectors
        self.b_i = shared((hidden_dim,), name + '__b_i')

        if lstm_type == "vanilla":
            self.b_f = shared((hidden_dim,), name + '__b_f')

        self.b_c = shared((hidden_dim,), name + '__b_c')
        self.b_o = shared((hidden_dim,), name + '__b_o')
        self.c_0 = shared((hidden_dim,), name + '__c_0')
        self.h_0 = shared((hidden_dim,), name + '__h_0')

        # Define parameters
        if lstm_type == "vanilla":
            self.params = [self.w_xi, self.w_hi, self.w_ci,
                           self.w_xf, self.w_hf, self.w_cf,
                           self.w_xo, self.w_ho, self.w_co,
                           self.w_xc, self.w_hc,
                           self.b_i, self.b_c, self.b_o,  self.b_f,
                           self.c_0, self.h_0]
        elif lstm_type == "CIFG":
            self.params = [self.w_xi, self.w_hi, self.w_ci,
                           self.w_xo, self.w_ho, self.w_co,
                           self.w_xc, self.w_hc,
                           self.b_i, self.b_c, self.b_o,
                           self.c_0, self.h_0]
    def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim,
              word_lstm_dim, word_bidirect, lr_method, lr_rate, clip_norm, crf,
              is_train, **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Network variables
        self.word_ids = tf.placeholder(
            tf.int32, shape=[None, None],
            name='word_ids')  # shape:[batch_size, max_word_len]
        self.word_pos_ids = tf.placeholder(
            tf.int32, shape=[None], name='word_pos_ids')  # shape: [batch_size]
        self.char_for_ids = tf.placeholder(
            tf.int32, shape=[None, None, None], name='char_for_ids'
        )  # shape: [batch_size, word_max_len, char_max_len]
        self.char_rev_ids = tf.placeholder(
            tf.int32, shape=[None, None, None], name='char_rev_ids'
        )  # shape: [batch_size, word_max_len, char_max_len]
        self.char_pos_ids = tf.placeholder(
            tf.int32, shape=[None, None], name='char_pos_ids'
        )  # shape: [batch_size*word_max_len, char_max_len]
        self.tag_ids = tf.placeholder(
            tf.int32, shape=[None, None],
            name='tag_ids')  # shape: [batch_size,word_max_len]
        self.tag_id_trans = tf.placeholder(
            tf.int32, shape=[None, None, None],
            name='tag_id_trans')  # shape: [batch_size,word_max_len+1,2]
        self.tag_id_index = tf.placeholder(
            tf.int32, shape=[None, None, None],
            name='tag_id_index')  # shape: [batch_size,word_max_len,2]
        # Final input (all word features)
        input_dim = 0
        inputs = []
        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            with tf.device("/cpu:0"):
                word_layer = EmbeddingLayer(n_words,
                                            word_dim,
                                            name='word_layer')
                word_input = word_layer.link(self.word_ids)
                inputs.append(word_input)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            with tf.device("/cpu:0"):
                char_for_embedding_batch = char_layer.link(self.char_for_ids)
                char_rev_embedding_batch = char_layer.link(self.char_rev_ids)
            shape_for = tf.shape(char_for_embedding_batch)
            # reshape from [batch_size, word_max_len, char_max_len, char_dim] to [batch_size*word_max_len, char_max_len, char_dim]
            char_for_embedding = tf.reshape(
                char_for_embedding_batch,
                (shape_for[0] * shape_for[1], shape_for[2], shape_for[3]))
            shape_rev = tf.shape(char_rev_embedding_batch)
            char_rev_embedding = tf.reshape(
                char_rev_embedding_batch,
                (shape_rev[0] * shape_rev[1], shape_rev[2], shape_rev[3]))
            char_lstm_for_states = char_lstm_for.link(char_for_embedding)
            char_lstm_rev_states = char_lstm_rev.link(char_rev_embedding)
            char_lstm_for_h_trans = tf.transpose(char_lstm_for_states[1],
                                                 (1, 0, 2),
                                                 name='char_lstm_for_h_trans')
            char_lstm_rev_h_trans = tf.transpose(char_lstm_rev_states[1],
                                                 (1, 0, 2),
                                                 name='char_lstm_rev_h_trans')
            char_for_output = tf.gather_nd(char_lstm_for_h_trans,
                                           self.char_pos_ids,
                                           name='char_for_output')
            char_rev_output = tf.gather_nd(char_lstm_rev_h_trans,
                                           self.char_pos_ids,
                                           name='char_rev_output')
            char_for_output_batch = tf.reshape(
                char_for_output, (shape_for[0], shape_for[1], char_lstm_dim))
            char_rev_output_batch = tf.reshape(
                char_rev_output, (shape_rev[0], shape_rev[1], char_lstm_dim))
            inputs.append(char_for_output_batch)
            if char_bidirect:
                inputs.append(char_rev_output_batch)
                input_dim += char_lstm_dim
        inputs = tf.concat(inputs, axis=-1)
        # Dropout on final input
        assert dropout < 1 and 0.0 <= dropout
        if dropout:
            input_train = tf.nn.dropout(inputs, 1 - dropout)
            if is_train:
                inputs = input_train
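        # Note: unlike the Theano examples above, is_train is a plain Python flag,
        # so dropout (keep_prob = 1 - dropout) is baked into the graph at build time.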
        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=True,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=True,
                             name='word_lstm_rev')
        # forward hidden output
        word_states_for = word_lstm_for.link(inputs)
        word_lstm_for_output = tf.transpose(word_states_for[1], (1, 0, 2),
                                            name='word_lstm_for_h_trans')

        # reverse hidden output
        inputs_rev = tf.reverse_sequence(inputs,
                                         self.word_pos_ids,
                                         seq_dim=1,
                                         batch_dim=0)
        word_states_rev = word_lstm_rev.link(inputs_rev)
        word_lstm_rev_h_trans = tf.transpose(word_states_rev[1], (1, 0, 2),
                                             name='word_lstm_rev_h_trans')
        word_lstm_rev_output = tf.reverse_sequence(word_lstm_rev_h_trans,
                                                   self.word_pos_ids,
                                                   seq_dim=1,
                                                   batch_dim=0)
        if word_bidirect:
            final_output = tf.concat(
                [word_lstm_for_output, word_lstm_rev_output], axis=-1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_lstm_for_output
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer')
        tags_scores = final_layer.link(final_output)
        # No CRF
        if not crf:
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.tag_ids, logits=tags_scores, name='xentropy')
            cost = tf.reduce_mean(cross_entropy, name='xentropy_mean')
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)

            # for batch observation
            #def recurrence(prev, obs):
            #    s_len = tf.shape(obs)[0]
            #    obvs = tf.concat([obs, small * tf.ones((s_len, 2))], axis=1)
            #    observations = tf.concat([b_s, obvs, e_s], axis=0)
            #    return observations
            #tags_scores_shape = tf.shape(tags_scores)
            #obs_initial = tf.ones((tags_scores_shape[1] + 2, n_tags + 2))
            #obs_batch = tf.scan(fn=recurrence, elems=tags_scores, initializer=obs_initial)

            # Score from tags
            def recurrence_real_score(prev, obs):
                tags_score = obs[0]
                tag_id_index_ = obs[1]
                tag_id_trans_ = obs[2]
                word_pos_ = obs[3] + 1
                tags_score_slice = tags_score[0:word_pos_, :]
                tag_id_index_slice = tag_id_index_[0:word_pos_, :]
                tag_id_trans_slice = tag_id_trans_[0:(word_pos_ + 1), :]
                real_path_score = tf.reduce_sum(
                    tf.gather_nd(tags_score_slice, tag_id_index_slice))
                real_path_score += tf.reduce_sum(
                    tf.gather_nd(transitions, tag_id_trans_slice))
                return tf.reshape(real_path_score, [])

            real_path_score_list = tf.scan(fn=recurrence_real_score,
                                           elems=[
                                               tags_scores, self.tag_id_index,
                                               self.tag_id_trans,
                                               self.word_pos_ids
                                           ],
                                           initializer=0.0)
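            # tf.scan applies recurrence_real_score to every sentence in the batch;
            # word_pos_ids appears to hold the index of the last real token, so the
            # scores are sliced to the true sentence length before summing the gold
            # path's unary and transition scores.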

            def recurrence_all_path(prev, obs):
                tags_score = obs[0]
                word_pos_ = obs[1] + 1
                tags_score_slice = tags_score[0:word_pos_, :]
                s_len = tf.shape(tags_score_slice)[0]
                obvs = tf.concat(
                    [tags_score_slice, small * tf.ones((s_len, 2))], axis=1)
                observations = tf.concat([b_s, obvs, e_s], axis=0)
                all_paths_scores = forward(observations, transitions)
                return tf.reshape(all_paths_scores, [])

            all_paths_scores_list = tf.scan(
                fn=recurrence_all_path,
                elems=[tags_scores, self.word_pos_ids],
                initializer=0.0)
            cost = -tf.reduce_mean(real_path_score_list -
                                   all_paths_scores_list)
        # Network parameters
        if not crf:
            f_score = tf.nn.softmax(tags_scores)
        else:

            def recurrence_predict(prev, obs):
                tags_score = obs[0]
                word_pos_ = obs[1] + 1
                tags_score_slice = tags_score[0:word_pos_, :]
                s_len = tf.shape(tags_score_slice)[0]
                obvs = tf.concat(
                    [tags_score_slice, small * tf.ones((s_len, 2))], axis=1)
                observations = tf.concat([b_s, obvs, e_s], axis=0)
                all_paths_scores = forward(observations,
                                           transitions,
                                           viterbi=True,
                                           return_alpha=False,
                                           return_best_sequence=True)
                all_paths_scores = tf.concat([
                    all_paths_scores,
                    tf.zeros([tf.shape(tags_score)[0] - s_len], tf.int32)
                ],
                                             axis=0)
                return all_paths_scores

            f_score = tf.scan(fn=recurrence_predict,
                              elems=[tags_scores, self.word_pos_ids],
                              initializer=tf.zeros(
                                  [tf.shape(tags_scores)[1] + 2], tf.int32))
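            # Each sentence is decoded with Viterbi on its true length and the result
            # is zero-padded to a fixed width (max_word_len + 2, matching the
            # initializer) so that tf.scan can stack the per-sentence outputs.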
        # Optimization
        tvars = tf.trainable_variables()
        grads = tf.gradients(cost, tvars)
        if clip_norm > 0:
            grads, _ = tf.clip_by_global_norm(grads, clip_norm)

        if lr_method == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(lr_rate)
        elif lr_method == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(lr_rate)
        elif lr_method == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(lr_rate)
        elif lr_method == 'adam':
            optimizer = tf.train.AdamOptimizer(lr_rate)
        elif lr_method == 'rmsprop':
            optimizer = tf.train.RMSPropOptimizer(lr_rate)
        else:
            raise Exception("Not implemented learning method: %s" % lr_method)

        train_op = optimizer.apply_gradients(zip(grads, tvars))

        return cost, f_score, train_op
Beispiel #13
    def build(self, parameters):
        #{{{
        """
        Build the network.
        """
        #some parameters
        dropout = parameters['dropout']
        char_dim = parameters['char_dim']
        char_lstm_dim = parameters['char_lstm_dim']
        char_bidirect = parameters['char_bidirect']
        word_dim = parameters['word_dim']
        word_lstm_dim = parameters['word_lstm_dim']
        word_bidirect = parameters['word_bidirect']
        lr_method = parameters['lr_method']
        pre_emb = parameters['pre_emb']
        crf = parameters['crf']
        cap_dim = parameters['cap_dim']
        training = parameters['training']
        features = parameters['features']

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        self.output_dim = len(self.id_to_tag)
        self.transitions = shared((self.output_dim + 1, self.output_dim),
                                  'transitions')

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        if features is not None and features['lemma']['isUsed']:
            lemma_ids = T.ivector(name='lemma_ids')
        if features is not None and features['pos']['isUsed']:
            pos_ids = T.ivector(name='pos_ids')
        if features is not None and features['chunk']['isUsed']:
            chunk_ids = T.ivector(name='chunk_ids')
        if features is not None and features['NER']['isUsed']:
            dic_ids = T.ivector(name='dic_ids')

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        # Word inputs
        #{{{
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            #for attention
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (
                        c_found, c_lower, c_zeros)  #}}}

        # Chars inputs
#{{{
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim
#}}}

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:

            #all_paths_scores = forward(observations, self.transitions)
            #cost = - (self.modelScore(tag_ids,tags_scores,s_len) - all_paths_scores)
            #real_path_score=self.modelScore(tag_ids,tags_scores,tag_ids.shape[0]) ;
            #error=real_path_score+self.noiseLoss(tags_scores,tag_ids,0.5);
            #cost=-error;
            #cost=self.likehoodLoss(tags_scores,tag_ids,observations,2)

            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0)
            real_path_score += self.transitions[
                padded_tags_ids[T.arange(s_len)],
                padded_tags_ids[T.arange(s_len) + 1]].sum()
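            # Note: this variant uses a (n_tags + 1, n_tags) transition matrix with a
            # synthetic begin tag only (row n_tags) and no end tag, and runs forward()
            # on the raw tags_scores instead of a padded observation matrix.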

            all_paths_scores = forward(tags_scores, self.transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(self.transitions)
            params.append(self.transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            import optimizers
            self.optimizer = optimizers.RMSprop(lr=0.001)
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            self.constraints = {}
            #updates = self.optimizer.get_updates(params,self.constraints,cost);
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
            #for debug
            #f_Debug = theano.function(
            #    inputs=train_inputs,
            #    outputs=cost,
            #    updates=self.update,
            #    givens=({is_train: np.cast['int32'](1)} if dropout else {})
            #)
            #debug end
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         tags_scores,
                                         self.transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
Beispiel #14
    def build4(self, parameters):
        #{{{
        """
        Build the network.
        """
        #some parameters
        dropout = parameters['dropout']
        char_dim = parameters['char_dim']
        char_lstm_dim = parameters['char_lstm_dim']
        char_bidirect = parameters['char_bidirect']
        word_dim = parameters['word_dim']
        word_lstm_dim = parameters['word_lstm_dim']
        word_bidirect = parameters['word_bidirect']
        lr_method = parameters['lr_method']
        pre_emb = parameters['pre_emb']
        crf = parameters['crf']
        cap_dim = parameters['cap_dim']
        training = parameters['training']
        features = parameters['features']
        useAttend = parameters['useAttend']
        if useAttend:
            reloadParam = parameters['loading']
        else:
            reloadParam = None
        if reloadParam is not None:
            reloadPath = parameters['loading_path']
        sentencesLevelLoss = parameters['sentencesLevelLoss']

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        self.output_dim = len(self.id_to_tag)
        self.transitions = shared((self.output_dim + 1, self.output_dim),
                                  'transitions')

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        wordTrue_ids = T.ivector(name='wordTrue_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        docLen = T.ivector(name='docLen')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        #some features
        if features is not None and features['lemma']['isUsed']:
            lemma_ids = T.ivector(name='lemma_ids')
        if features is not None and features['pos']['isUsed']:
            pos_ids = T.ivector(name='pos_ids')
        if features is not None and features['chunk']['isUsed']:
            chunk_ids = T.ivector(name='chunk_ids')
        if features is not None and features['dic']['isUsed']:
            dic_ids = T.ivector(name='dic_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        # Word inputs
        #{{{
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            wordTrue_input = word_layer.link(wordTrue_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (
                        c_found, c_lower, c_zeros)  #}}}
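                # Illustrative aside (assumption about the file format, inferred from the
                # length check above): each line of the pre_emb text file is expected to
                # hold word_dim + 1 whitespace-separated fields, e.g. with word_dim = 3:
                #
                #     the 0.418 0.24968 -0.41242
                #
                # Lines with any other field count are skipped and counted as invalid.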

        # Chars inputs
#{{{
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_output = T.concatenate([char_for_output, char_rev_output],
                                        axis=-1)
            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim
#}}}

        # Capitalization feature
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Additional feature embeddings
        #{{{
        if features is not None and features['lemma']['isUsed']:
            lemma_layer = EmbeddingLayer(features['lemma']['num'],
                                         features['lemma']['dim'],
                                         name='lemma_layer')
            if features['lemma']['pre_emb'] is not "":
                new_weights = lemma_layer.embeddings.get_value()
                loadPreEmbFeatures(features['lemma']['pre_emb'],
                                   features['feature_to_id_map']['lemma'],
                                   new_weights,
                                   lower=True)
                lemma_layer.embeddings.set_value(new_weights)
            lemma_output = lemma_layer.link(lemma_ids)
            if features['lemma']['lstm-input']:
                input_dim += features['lemma']['dim']
                inputs.append(lemma_output)
        if features is not None and features['pos']['isUsed']:
            pos_layer = EmbeddingLayer(features['pos']['num'],
                                       features['pos']['dim'],
                                       name='pos_layer')
            if features['pos']['pre_emb'] is not "":
                new_weights = pos_layer.embeddings.get_value()
                loadPreEmbFeatures(features['pos']['pre_emb'],
                                   features['feature_to_id_map']['pos'],
                                   new_weights)
                pos_layer.embeddings.set_value(new_weights)
            pos_output = pos_layer.link(pos_ids)
            if features['pos']['lstm-input']:
                input_dim += features['pos']['dim']
                inputs.append(pos_output)
        if features is not None and features['chunk']['isUsed']:
            chunk_layer = EmbeddingLayer(features['chunk']['num'],
                                         features['chunk']['dim'],
                                         name='chunk_layer')
            chunk_output = chunk_layer.link(chunk_ids)
            if features['chunk']['lstm-input']:
                input_dim += features['chunk']['dim']
                inputs.append(chunk_output)
        if features is not None and features['dic']['isUsed']:
            dic_layer = EmbeddingLayer(features['dic']['num'],
                                       features['dic']['dim'],
                                       name='dic_layer')
            dic_output = dic_layer.link(dic_ids)
            if features['dic']['lstm-input']:
                input_dim += features['dic']['dim']
                inputs.append(dic_output)
#}}}

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)
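        # Note (assumption about DropoutLayer's behaviour, inferred from the scaling
        # above): units are dropped with probability `dropout` at training time without
        # rescaling, so the test-time input is multiplied by (1 - dropout) to keep the
        # expected activation magnitude comparable between training and evaluation.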

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        if sentencesLevelLoss:

            def sentLSTM(i, output, input, lenVec):
                #{{{
                Len = lenVec[i]
                accLen = lenVec[:i].sum()
                currentInput = input[accLen:accLen + Len]
                word_lstm_for.link(currentInput)
                word_lstm_rev.link(currentInput[::-1, :])
                wordForOutput = word_lstm_for.h
                wordRevOutput = word_lstm_rev.h[::-1, :]
                finalOutput = T.concatenate([wordForOutput, wordRevOutput],
                                            axis=-1)
                output = T.set_subtensor(output[accLen:accLen + Len],
                                         finalOutput)
                return output
    #}}}

            result, update = theano.scan(
                fn=sentLSTM,
                outputs_info=T.zeros((inputs.shape[0], word_lstm_dim * 2),
                                     dtype='float32'),
                sequences=[T.arange(docLen.shape[0])],
                non_sequences=[inputs, docLen])
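            # Each scan step runs the shared BiLSTM over one sentence slice
            # [accLen, accLen + Len) and writes its output into the accumulator, so
            # result[-1] holds the document-level matrix with every sentence filled in.
            # The separate whole-document pass below is only used to obtain final_c
            # (the concatenated cell states).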

            word_lstm_for.link(inputs)
            word_lstm_rev.link(inputs[::-1, :])
            word_for_output = word_lstm_for.h
            word_for_c = word_lstm_for.c
            word_rev_output = word_lstm_rev.h[::-1, :]
            word_rev_c = word_lstm_rev.c[::-1, :]

            final_c = T.concatenate([word_for_c, word_rev_c], axis=-1)
            final_output = result[-1]
        else:
            word_lstm_for.link(inputs)
            word_lstm_rev.link(inputs[::-1, :])
            word_for_output = word_lstm_for.h
            word_for_c = word_lstm_for.c
            word_rev_output = word_lstm_rev.h[::-1, :]
            word_rev_c = word_lstm_rev.c[::-1, :]
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=-1)
            final_c = T.concatenate([word_for_c, word_rev_c], axis=-1)

        if useAttend:
            #attention layer
            attended = []
            attendedDim = 0
            if features is not None and features['word']['attended']:
                attended.append(wordTrue_input)
                attendedDim += word_dim
            if features is not None and features['char']['attended']:
                attended.append(char_output)
                attendedDim += char_lstm_dim * 2
            if features is not None and features['lemma']['attended']:
                attended.append(lemma_output)
                attendedDim += features['lemma']['dim']
            if features is not None and features['pos']['attended']:
                attended.append(pos_output)
                attendedDim += features['pos']['dim']
            if features is not None and features['chunk']['attended']:
                attended.append(chunk_output)
                attendedDim += features['chunk']['dim']
            if features is not None and features['dic']['attended']:
                attended.append(dic_output)
                attendedDim += features['dic']['dim']

            attention_layer = AttentionLayer(
                attended_dim=attendedDim,
                state_dim=attendedDim,
                #attention_layer=AttentionLayer(attended_dim=word_lstm_dim*2,
                #                               state_dim=word_lstm_dim*2,
                source_dim=word_lstm_dim * 2,
                scoreFunName=parameters['attenScoreFun'],
                name='attention_layer')

            if len(attended) > 1:
                attendedInput = T.concatenate(attended, axis=-1)
            else:
                attendedInput = attended[0]

            final_output = attention_layer.link(attendedInput, attendedInput,
                                                final_output)
            #using lstm_state to compute attention
            #final_output=attention_layer.link(final_output,final_c,final_output);
            self.energy = attention_layer.energy
        # (without attention, the BiLSTM output is used as-is)

        tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                 word_lstm_dim,
                                 name='tanh_layer',
                                 activation='tanh')
        final_output = tanh_layer.link(final_output)

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            if sentencesLevelLoss:
                # Calculate the loss per sentence instead of over the whole document
                def sentLoss(i, scores, trueIds, transitions, lenVec):
                    #{{{
                    Len = lenVec[i]
                    accLen = lenVec[:i].sum()
                    currentTagsScores = scores[accLen:accLen + Len]
                    currentIds = trueIds[accLen:accLen + Len]
                    real_path_score = currentTagsScores[T.arange(Len),
                                                        currentIds].sum()
                    # Score from transitions
                    padded_tags_ids = T.concatenate([[n_tags], currentIds],
                                                    axis=0)
                    real_path_score += transitions[
                        padded_tags_ids[T.arange(Len)],
                        padded_tags_ids[T.arange(Len) + 1]].sum()

                    all_paths_scores = forward(currentTagsScores, transitions)
                    cost = -(real_path_score - all_paths_scores)
                    return cost

    #}}}

                result, update = theano.scan(
                    fn=sentLoss,
                    outputs_info=None,
                    sequences=[T.arange(docLen.shape[0])],
                    non_sequences=[
                        tags_scores, tag_ids, self.transitions, docLen
                    ])
                cost = result.sum()
            else:
                real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

                # Score from transitions
                padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0)
                real_path_score += self.transitions[
                    padded_tags_ids[T.arange(s_len)],
                    padded_tags_ids[T.arange(s_len) + 1]].sum()

                all_paths_scores = forward(tags_scores, self.transitions)
                cost = -(real_path_score - all_paths_scores)
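                # In log space this is the usual CRF negative log-likelihood:
                #     cost = -(score(x, y) - log sum_{y'} exp(score(x, y')))
                # where `forward` is assumed to return the log partition function
                # (a log-sum-exp over all tag sequences) via the forward algorithm.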

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(self.transitions)
            params.append(self.transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)
        #add feature layer
        if features is not None and features['lemma']['isUsed']:
            self.add_component(lemma_layer)
            params.extend(lemma_layer.params)
        if features is not None and features['pos']['isUsed']:
            self.add_component(pos_layer)
            params.extend(pos_layer.params)
        if features is not None and features['chunk']['isUsed']:
            self.add_component(chunk_layer)
            params.extend(chunk_layer.params)
        if features is not None and features['dic']['isUsed']:
            self.add_component(dic_layer)
            params.extend(dic_layer.params)

        if useAttend and reloadParam:
            #reload pre-train params
            model_path = self.model_path
            self.model_path = reloadPath
            print "loading:", self.model_path
            self.reload(features)
            self.model_path = model_path

        if useAttend:
            #add attention_layer
            self.add_component(attention_layer)
            params.extend(attention_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        if useAttend:
            eval_inputs.append(wordTrue_ids)
            if sentencesLevelLoss:
                eval_inputs.append(docLen)
        #add feature input
        if features is not None and features['lemma']['isUsed']:
            eval_inputs.append(lemma_ids)
        if features is not None and features['pos']['isUsed']:
            eval_inputs.append(pos_ids)
        if features is not None and features['chunk']['isUsed']:
            eval_inputs.append(chunk_ids)
        if features is not None and features['dic']['isUsed']:
            eval_inputs.append(dic_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}
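        # Example (illustrative): lr_method = "sgd-lr_.005" parses to
        #     lr_method_name = 'sgd' and lr_method_parameters = {'lr': 0.005},
        # while a plain "adam" leaves lr_method_parameters empty.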

        # Compile training function
        print 'Compiling...'
        if training:
            #constraints
            if useAttend:
                self.constraints = attention_layer.constraints
            else:
                self.constraints = {}
            # NOTE: the Keras optimizers below appear to be constructed but not used;
            # the parameter updates are taken from Optimization(clip=5.0).get_updates(...)
            # (see also the commented-out self.optimizer.get_updates line further down).
            from keras import optimizers
            self.optimizer = optimizers.SGD(lr=0.001,
                                            momentum=0.9,
                                            decay=0.,
                                            nesterov=True,
                                            clipvalue=5)
            self.optimizer = optimizers.RMSprop()
            #self.optimizer=SGD(lr=lr_method_parameters['lr'],clipvalue=5,gradient_noise=0.01)
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name,
                cost,
                params,
                constraints=self.constraints,
                **lr_method_parameters)
            #updates = self.optimizer.get_updates(params,self.constraints,cost);
            f_train_outputs = [cost]
            if useAttend:
                f_train_outputs.append(self.energy)

            f_train = theano.function(inputs=train_inputs,
                                      outputs=f_train_outputs,
                                      updates=updates,
                                      on_unused_input='ignore',
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))

            f_test = theano.function(inputs=train_inputs,
                                     outputs=cost,
                                     on_unused_input='ignore',
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
            self.f_test = f_test
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            if sentencesLevelLoss:

                def sentViterbi(i, predictTag, scores, transitions, lenVec):
                    #{{{
                    Len = lenVec[i]
                    accLen = lenVec[:i].sum()
                    currentTagsScores = scores[accLen:accLen + Len]
                    currentPredictIds = forward(currentTagsScores,
                                                transitions,
                                                viterbi=True,
                                                return_alpha=False,
                                                return_best_sequence=True)
                    predictTag = T.set_subtensor(
                        predictTag[accLen:accLen + Len], currentPredictIds)
                    return predictTag
                    #}}}

                predictTag, update = theano.scan(
                    fn=sentViterbi,
                    outputs_info=T.zeros((tags_scores.shape[0], ),
                                         dtype='int32'),
                    sequences=[T.arange(docLen.shape[0])],
                    non_sequences=[tags_scores, self.transitions, docLen])
                predictTag = predictTag[-1]
            else:
                predictTag = forward(tags_scores,
                                     self.transitions,
                                     viterbi=True,
                                     return_alpha=False,
                                     return_best_sequence=True)
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=predictTag,
                                     on_unused_input='ignore',
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
            #f_AttenVisual=theano.function(
            #    inputs=eval_inputs,
            #    outputs=[predictTag,self.energy],
            #    on_unused_input='ignore',
            #    givens=({is_train: np.cast['int32'](0)} if dropout else {})
            #    )
            #self.f_AttenVisual=f_AttenVisual;

        return f_train, f_eval
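# Illustrative aside (not part of the original code): a minimal NumPy sketch of the
# log-space forward recursion that the `forward(observations, transitions)` helper used
# above is assumed to compute. It is shown with a square transition matrix for
# simplicity; the model above uses an (n_tags + 1, n_tags) matrix with an extra start
# row, and the real helper is presumably a theano.scan that also supports Viterbi
# decoding (viterbi=True, return_best_sequence=True).
import numpy as np

def log_sum_exp(x, axis):
    # Numerically stable log(sum(exp(x))) along the given axis.
    m = np.max(x, axis=axis, keepdims=True)
    return (m + np.log(np.sum(np.exp(x - m), axis=axis, keepdims=True))).squeeze(axis)

def forward_log_partition(observations, transitions):
    # observations: (seq_len, n_states) per-position emission scores
    # transitions:  (n_states, n_states) score of moving from state i to state j
    alpha = observations[0]
    for obs in observations[1:]:
        # log-sum-exp over the previous state i of alpha[i] + transitions[i, j] + obs[j]
        alpha = log_sum_exp(alpha[:, None] + transitions + obs[None, :], axis=0)
    return log_sum_exp(alpha, axis=0)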
Beispiel #15
0
    def build(self):
#{{{
        self.W_A_X=shared((self.attended_dim,self.attended_dim),
                             name='{}_W_A_X'.format(self.name));
        self.b_A_X=shared((self.attended_dim,),
                            name='{}_W_A_b'.format(self.name));
        self.W_A_h=shared((self.attended_dim,self.attended_dim),
                             name='{}_W_A_h'.format(self.name));
        self.W_A_combine=shared((self.source_dim*2,
                                 self.source_dim),
                               name='{}_W_A_combine'.format(self.name));
        self.b_A_combine=shared((self.source_dim,),
                               name='{}_b_A_combine'.format(self.name))
        #self.W_A_combine=shared((self.source_dim,
        #                         self.source_dim),
        #                         name='{}_W_A_combine'.format(self.name));
        #self.b_A_combine=shared((self.source_dim,),
        #                         name='{}_b_A_combine'.format(self.name))
        #use constraint
        self.constraints={}
        
        self.params=[
                     self.W_A_X,self.b_A_X,
                    # self.W_A_h,
                     self.W_A_combine,self.b_A_combine
                    ];
        
        #for attention weight and score function
        if self.scoreFunName == "Euclidean":
#{{{
            self.W_A=shared((self.state_dim,),
                          name='{}_W_A'.format(self.name));
            self.W_A.set_value(np.ones((self.state_dim,),dtype=theano.config.floatX));
            self.constraints[self.W_A]=self.NonNegConstraint;
            self.scoreFun=self.euclideanScore;
            self.params.append(self.W_A);
#}}}
        elif self.scoreFunName == "Bilinear":
#{{{
            assert self.attended_dim==self.state_dim,"in Bilinear score function,"\
                " attended_dim must be equal to state_dim"
            self.W_A=self.init((self.state_dim,),
                                name="{}_W_A".format(self.name));
            self.scoreFun=self.bilinearScore;
            self.params.append(self.W_A);
#}}}
        elif self.scoreFunName == "forwardNN":
#{{{
            #this is a two-layer NN:
            #first layer: (attended_dim+state_dim, state_dim)
            #second layer: (state_dim, 1)
            self.W_A=shared(((self.attended_dim+self.state_dim)\
                                *self.state_dim+self.state_dim,),
                                name="{}_W_A".format(self.name));
            self.scoreFun=self.forwardNNScore;
            self.params.append(self.W_A);
#}}}
        elif self.scoreFunName == "CNN":
#{{{
            #this is a one-layer CNN followed by a pooling layer;
            nb_filter=(self.attended_dim+self.state_dim)/2;
            filter_length=3;
            input_dim=self.attended_dim+self.state_dim;
            self.CNN1=Convolution1D(nb_filter=nb_filter,
                                   filter_length=filter_length,
                                  input_dim=input_dim,activation='tanh',
                                  border_mode='same');
            self.CNN2=Convolution1D(nb_filter=1,
                                   filter_length=filter_length,
                                  input_dim=nb_filter,activation='tanh',
                                  border_mode='same');
            self.W_A=self.CNN1.W;
            self.scoreFun=self.CNNScore;
            self.params.append(self.W_A);
            self.params.append(self.CNN2.W);
#}}}
        elif self.scoreFunName == "Cosine":
#{{{
            self.scoreFun=self.CosineScore;
            self.W_A=None;
#}}}
        elif self.scoreFunName == "Manhatten":
#{{{
            self.scoreFun=self.manhattenScore;
            self.W_A=self.one_init((self.state_dim,),
                          name='{}_W_A'.format(self.name));
            self.constraints[self.W_A]=self.NonNegConstraint;
            self.params.append(self.W_A);
#}}}
        else:
            assert 0, "we only have Euclidean, Bilinear, forwardNN"\
                    " score function for attention";
Beispiel #16
0
    def __init__(self, input_dim, hidden_dim, with_batch=True, name='LSTM'):
        """
        Initialize neural network.
        """
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.with_batch = with_batch
        self.name = name

        # Input gate weights
        self.w_xi = shared((input_dim, hidden_dim), name + '__w_xi')
        self.w_hi = shared((hidden_dim, hidden_dim), name + '__w_hi')
        self.w_ci = shared((hidden_dim, hidden_dim), name + '__w_ci')

        # Forget gate weights
        # self.w_xf = shared((input_dim, hidden_dim), name + '__w_xf')
        # self.w_hf = shared((hidden_dim, hidden_dim), name + '__w_hf')
        # self.w_cf = shared((hidden_dim, hidden_dim), name + '__w_cf')

        # Output gate weights
        self.w_xo = shared((input_dim, hidden_dim), name + '__w_xo')
        self.w_ho = shared((hidden_dim, hidden_dim), name + '__w_ho')
        self.w_co = shared((hidden_dim, hidden_dim), name + '__w_co')

        # Cell weights
        self.w_xc = shared((input_dim, hidden_dim), name + '__w_xc')
        self.w_hc = shared((hidden_dim, hidden_dim), name + '__w_hc')

        # Initialize the bias vectors, c_0 and h_0 to zero vectors
        self.b_i = shared((hidden_dim,), name + '__b_i')
        # self.b_f = shared((hidden_dim,), name + '__b_f')
        self.b_c = shared((hidden_dim,), name + '__b_c')
        self.b_o = shared((hidden_dim,), name + '__b_o')
        self.c_0 = shared((hidden_dim,), name + '__c_0')
        self.h_0 = shared((hidden_dim,), name + '__h_0')

        # Define parameters
        self.params = [self.w_xi, self.w_hi, self.w_ci,
                       # self.w_xf, self.w_hf, self.w_cf,
                       self.w_xo, self.w_ho, self.w_co,
                       self.w_xc, self.w_hc,
                       self.b_i, self.b_c, self.b_o,  # self.b_f,
                       self.c_0, self.h_0]
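# Note (inferred from the parameter list above; the step function itself is not shown
# here): this looks like a peephole-style LSTM with the forget gate commented out, so a
# single time step is presumably of the form
#     i_t = sigmoid(x_t.w_xi + h_{t-1}.w_hi + c_{t-1}.w_ci + b_i)
#     c_t = i_t * tanh(x_t.w_xc + h_{t-1}.w_hc + b_c)      # no forget-gate term
#     o_t = sigmoid(x_t.w_xo + h_{t-1}.w_ho + c_t.w_co + b_o)
#     h_t = o_t * tanh(c_t)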
Beispiel #17
0
    def build(self,
              dropout,
              char_dim,
              char_hidden_dim,
              char_bidirect,
              layer2_hidden_dim,
              lr_method,
              layer2,
              batch_size,
              pre_emb,
              use_gaze,
              crf,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Network variables
        is_train = T.iscalar('is_train')  # declare an integer scalar variable is_train
        char_ids = T.ivector(name='char_ids')  # declare an integer 1-D vector
        if use_gaze:
            gaze = T.imatrix(name='gaze')
        #hamming_cost = T.matrix('hamming_cost', theano.config.floatX)  # declare a 2-D matrix
        # tag_ids = T.imatrix(name='tag_ids')
        tag_ids = T.ivector(name='tag_ids')
        # Sentence length
        s_len = char_ids.shape[0]  # number of characters in each sentence

        # Final input (all word features)
        #
        # Char inputs
        #
        if char_dim:
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
            char_input = char_layer.link(char_ids)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = char_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(
                        codecs.open(pre_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == char_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_chars):
                    char = self.id_to_char[i]
                    if char in pretrained:
                        new_weights[i] = pretrained[char]
                        c_found += 1
                    elif char.lower() in pretrained:
                        new_weights[i] = pretrained[char.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', char) in pretrained:
                        new_weights[i] = pretrained[re.sub('\d', '0', char)]
                        c_zeros += 1
                char_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) chars have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_chars, 100. *
                        (c_found + c_lower + c_zeros) / n_chars)
                print('%i found directly, %i after lower, %i after zero.') % (
                    c_found, c_lower, c_zeros)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(char_input)
            input_test = (1 - dropout) * char_input
            char_input = T.switch(T.neq(is_train, 0), input_train,
                                  input_test)  # switch between train and test inputs

        # LSTM for chars, first layer
        char_lstm_for1 = LSTM(char_dim,
                              char_hidden_dim,
                              with_batch=False,
                              name='first_char_lstm_for')
        char_lstm_rev1 = LSTM(char_dim,
                              char_hidden_dim,
                              with_batch=False,
                              name='first_char_lstm_rev')
        char_lstm_for1.link(char_input)  # character order: l i k e
        char_lstm_rev1.link(char_input[::-1, :])  # reversed order: e k i l
        char_for_output1 = char_lstm_for1.h
        char_rev_output1 = char_lstm_rev1.h[::-1, :]

        if char_bidirect:
            final_output = T.concatenate([char_for_output1, char_rev_output1],
                                         axis=1)
            tanh_layer1 = HiddenLayer(2 * char_hidden_dim,
                                      char_hidden_dim,
                                      name='tanh_layer1',
                                      activation='tanh')
            final_output = tanh_layer1.link(final_output)
        else:
            final_output = char_for_output1

        if layer2:
            #
            # Dropout on final input
            #
            if dropout:
                dropout_layer = DropoutLayer(p=dropout)
                input_train = dropout_layer.link(final_output)
                input_test = (1 - dropout) * final_output
                final_output = T.switch(T.neq(is_train, 0), input_train,
                                        input_test)  # switch between train and test inputs

            # LSTM for chars, second layer
            char_lstm_for2 = LSTM(char_hidden_dim,
                                  layer2_hidden_dim,
                                  with_batch=False,
                                  name='second_char_lstm_for')
            char_lstm_rev2 = LSTM(char_hidden_dim,
                                  layer2_hidden_dim,
                                  with_batch=False,
                                  name='second_char_lstm_rev')
            char_lstm_for2.link(final_output)
            char_lstm_rev2.link(final_output[::-1, :])
            char_for_output2 = char_lstm_for2.h
            char_rev_output2 = char_lstm_rev2.h[::-1, :]

            if char_bidirect:
                final_output = T.concatenate(
                    [char_for_output2, char_rev_output2], axis=1)
                tanh_layer2 = HiddenLayer(2 * layer2_hidden_dim,
                                          layer2_hidden_dim,
                                          name='tanh_layer2',
                                          activation='tanh')
                final_output = tanh_layer2.link(final_output)
            else:
                final_output = char_for_output2

        if layer2:
            dims = layer2_hidden_dim
        else:
            dims = char_hidden_dim

        if use_gaze:
            final_output = T.concatenate([final_output, gaze], axis=1)
            dims = dims + n_tags

        # final_output = T.reshape(final_output, (-1, input_dim))

        # Sentence to Named Entity tags - Score (hidden layer between the char representation and the CRF)
        final_layer = HiddenLayer(dims,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len),
                                          tag_ids].sum()  # sum of the corresponding elements of P

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]].sum()  # sum of the corresponding elements of A

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)
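            # With the padding above, `observations` has shape (s_len + 2, n_tags + 2):
            # one extra start row (b_s) and end row (e_s), plus two extra columns for the
            # start/end states, matching the (n_tags + 2, n_tags + 2) transition matrix.
            # The cost is again the CRF negative log-likelihood,
            #     -(real_path_score - log sum over all paths).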

        # Network parameters
        params = []
        if char_dim:
            self.add_component(char_layer)
            params.extend(char_layer.params)

        self.add_component(char_lstm_for1)
        params.extend(char_lstm_for1.params)
        if char_bidirect:
            self.add_component(char_lstm_rev1)
            params.extend(char_lstm_rev1.params)

            self.add_component(tanh_layer1)
            params.extend(tanh_layer1.params)

        if layer2:
            self.add_component(char_lstm_for2)
            params.extend(char_lstm_for2.params)
            if char_bidirect:
                self.add_component(char_lstm_rev2)
                params.extend(char_lstm_rev2.params)

                self.add_component(tanh_layer2)
                params.extend(tanh_layer2.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)

        # Prepare train and eval inputs
        eval_inputs = []
        if char_dim:
            eval_inputs.append(char_ids)
        if use_gaze:
            eval_inputs.append(gaze)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
Beispiel #18
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              pre_voc,
              crf,
              pos_dim,
              n_pos,
              training = 1,
              **kwargs
              ):
        """
        Build the network.
        """

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_y)
        n_cap = 2

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        cap_ids = T.ivector(name='cap_ids')
        if pos_dim:
            pos_ids = T.ivector(name='pos_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                emb_matrix = np.load(pre_emb)
                pre_w2idxs = dict([(w,i) for i,w in enumerate(np.load(pre_voc))])
                print pre_w2idxs.items()[:10]
                assert emb_matrix[0].shape[0] == word_dim
                for w in pre_w2idxs:
                    pretrained[w.lower()] = np.array(
                        [float(x) for x in emb_matrix[pre_w2idxs[w]]]).astype(np.float32)
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[
                            re.sub('\d', '0', word.lower())
                        ]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print ('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') % (
                            c_found + c_lower + c_zeros, n_words,
                            100. * (c_found + c_lower + c_zeros) / n_words
                      )
                print ('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') % (
                          c_found, c_lower, c_zeros
                      ) 



        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]
            char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Cue feature
        #

        input_dim += word_dim
        cap_layer = EmbeddingLayer(n_cap, word_dim, name='cap_layer')
        inputs.append(cap_layer.link(cap_ids))

        #
        # POS feature
        #

        if pos_dim:
            input_dim += word_dim
            pos_layer = EmbeddingLayer(n_pos, word_dim, name="pos_layer")
            inputs.append(pos_layer.link(pos_ids))

        # Prepare final input
        # if len(inputs) != 1:
        inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )
            tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))],
                axis=1
            )
            observations = T.concatenate(
                [b_s, observations, e_s],
                axis=0
            )

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()

            all_paths_scores = forward(observations, transitions)
            cost = - (real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        # Add cue layer (cap for the moment)
        self.add_component(cap_layer)
        params.extend(cap_layer.params)
        # Add pos tag layer
        if pos_dim:
            self.add_component(pos_layer)
            params.extend(pos_layer.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        # add cue vector to the inputs
        eval_inputs.append(cap_ids)
        # add pos vector to the inputs
        if pos_dim:
            eval_inputs.append(pos_ids)

        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=({is_train: np.cast['int32'](1)} if dropout else {})
            )
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores,
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward(observations, transitions, viterbi=True,
                                return_alpha=False, return_best_sequence=True),
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )

        return f_train, f_eval
Beispiel #19
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs
              ):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)

            # Initialize with pretrained embeddings
            if pre_emb and training:
                
                # Randomly generates new weights
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                
                # Here is where we substitute in the pyemblib read function.
                # Syntax: get_embedding_dict(emb_path, emb_format, first_n, vocab)
                emb_format = pyemblib2.Format.Word2Vec
                pretrained = get_embedding_dict(pre_emb, emb_format, 0, None)
                ''' 
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]
                        ).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                '''
                
                c_found = 0
                c_lower = 0
                c_zeros = 0

                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[
                            re.sub('\d', '0', word.lower())
                        ]
                        c_zeros += 1
                
                # This is it, this is what needs to be printed.
                # "word_layer.embeddings" is a "theano.shared" object 
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print ('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') % (
                            c_found + c_lower + c_zeros, n_words,
                            100. * (c_found + c_lower + c_zeros) / n_words
                      )
                print ('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') % (
                          c_found, c_lower, c_zeros
                      )

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]
            char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        inputs = T.concatenate(inputs, axis=1) if len(inputs) != 1 else inputs[0]

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )
            tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))],
                axis=1
            )
            observations = T.concatenate(
                [b_s, observations, e_s],
                axis=0
            )

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
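            # The gold tag sequence is padded with the virtual begin (n_tags)
            # and end (n_tags + 1) ids so that the transitions into the first
            # tag and out of the last tag are also scored.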
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()

            all_paths_scores = forward(observations, transitions)
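            # forward() runs the CRF forward algorithm and is expected to
            # return the log-sum-exp of the scores of all possible tag
            # sequences (the log partition), so the cost below is the CRF
            # negative log-likelihood of the gold path.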
            cost = - (real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)

            # Keeping the line below commented out excludes the pretrained
            # word embeddings from "params", so the optimizer never updates
            # them during training.
            # params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
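        # e.g. lr_method = 'sgd-lr_.005' gives lr_method_name = 'sgd' and
        # lr_method_parameters = {'lr': 0.005}.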
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            
            # "params" supposedly contains the pretrained embedding matrix that we are updating. 
            # Find the "get_updates" function and figure out what it does.  
            updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=({is_train: np.cast['int32'](1)} if dropout else {})
            )
            #========================================
            # FUNCTION TO PRINT PRETRAINED EMBEDDINGS
            # f_print takes a single matrix argument and prints it,
            # prefixed by the message passed to Print().
            print_matrix = T.dmatrix()
            print_op = theano.printing.Print('print message')
            printed_x = print_op(print_matrix)
            f_print = theano.function([print_matrix], printed_x)
            #========================================
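            # Illustrative use: f_print(word_layer.embeddings.get_value())
            # would print the current word embedding matrix, prefixed by the
            # message above.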
        else:
            f_train = None
            f_print = None

        # Return the printing function together with the word embedding
        # matrix so the caller can print the (pretrained) embeddings.
        # word_layer only exists when word_dim is set.
        print_tuple = [f_print, word_layer.embeddings if word_dim else None]

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores,
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward(observations, transitions, viterbi=True,
                                return_alpha=False, return_best_sequence=True),
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )

        return f_train, f_eval, print_tuple