Example 1
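These four snippets are __init__ method bodies of BiLSTM-CRF sequence-labelling models built with the Keras functional API; they are not self-contained. A minimal sketch of the imports they appear to rely on, assuming multi-backend Keras 2.x (ChainCRF is a project-specific CRF layer, not part of Keras, so no import path is shown for it):

from keras.models import Model
from keras.layers import (
    Input, Embedding, LSTM, Bidirectional, TimeDistributed,
    Dense, Dropout, Concatenate, Conv1D, GlobalMaxPooling1D
)
# ChainCRF must additionally be imported from the project's own code
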
    def __init__(self, config, ntags=None):

        # word input, fed directly with word embeddings by the data generator
        word_input = Input(shape=(None, config.word_embedding_size), name='word_input')

        # build character-based embedding
        char_input = Input(shape=(None, config.max_char_length), dtype='int32', name='char_input')
        char_embeddings = TimeDistributed(Embedding(input_dim=config.char_vocab_size,
                                    output_dim=config.char_embedding_size,
                                    #mask_zero=True,
                                    #embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5),
                                    name='char_embeddings'
                                    ))(char_input)

        chars = TimeDistributed(Bidirectional(LSTM(config.num_char_lstm_units, return_sequences=False)))(char_embeddings)

        # sequence length is not used by the network for the moment (but is passed through for F1-score evaluation)
        length_input = Input(batch_shape=(None, 1), dtype='int32', name='length_input')

        # combine characters and word embeddings
        x = Concatenate()([word_input, chars])
        x = Dropout(config.dropout)(x)

        x = Bidirectional(LSTM(units=config.num_word_lstm_units, 
                               return_sequences=True, 
                               recurrent_dropout=config.recurrent_dropout))(x)
        x = Dropout(config.dropout)(x)
        x = Dense(config.num_word_lstm_units, activation='tanh')(x)
        x = Dense(ntags)(x)
        self.crf = ChainCRF()
        pred = self.crf(x)

        self.model = Model(inputs=[word_input, char_input, length_input], outputs=[pred])
        self.config = config
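
A minimal usage sketch for this first variant, assuming a simple attribute-style config object and a wrapper class named BidLSTM_CRF around the __init__ above (both hypothetical; only the field names are taken from the snippet):

from types import SimpleNamespace

# hypothetical configuration values, for illustration only
config = SimpleNamespace(
    word_embedding_size=300,
    max_char_length=30,
    char_vocab_size=100,
    char_embedding_size=25,
    num_char_lstm_units=25,
    num_word_lstm_units=100,
    dropout=0.5,
    recurrent_dropout=0.25,
)
model_wrapper = BidLSTM_CRF(config, ntags=10)  # hypothetical wrapper class
model_wrapper.model.summary()
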
Example 2
    def __init__(self, config, ntags=None):

        # word input, fed directly with word embeddings by the data generator
        word_input = Input(shape=(None, config.word_embedding_size), name='word_input')

        # build character-based embedding
        char_input = Input(shape=(None, config.max_char_length), dtype='int32', name='char_input')
        char_embeddings = TimeDistributed(Embedding(input_dim=config.char_vocab_size,
                                    output_dim=config.char_embedding_size,
                                    mask_zero=True,
                                    #embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5),
                                    name='char_embeddings'
                                    ))(char_input)

        chars = TimeDistributed(Bidirectional(LSTM(config.num_char_lstm_units,
                                                   return_sequences=False)))(char_embeddings)

        # layout features input and embeddings
        features_input = Input(shape=(None, len(config.features_indices)), dtype='float32', name='features_input')

        # The embedding input dimension is calculated as
        # features_vocabulary_size (default 12) * number_of_features + 1 (index zero is reserved for masking / padding)
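        # e.g. with 3 features and the default vocabulary size of 12:
        # input_dim = 12 * 3 + 1 = 37 distinct embedding indices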
        features_embedding = TimeDistributed(Embedding(input_dim=config.features_vocabulary_size * len(config.features_indices) + 1,
                                       output_dim=config.features_embedding_size,
                                       # mask_zero=True,
                                       trainable=False,
                                       name='features_embedding'), name="features_embedding_td")(features_input)

        features_embedding_bd = TimeDistributed(Bidirectional(LSTM(config.features_lstm_units, return_sequences=False)),
                                                 name="features_embedding_td_2")(features_embedding)

        features_embedding_out = Dropout(config.dropout)(features_embedding_bd)
        # sequence length is not used by the network for the moment (but is passed through for F1-score evaluation)
        length_input = Input(batch_shape=(None, 1), dtype='int32', name='length_input')

        # combine word, character and feature embeddings
        x = Concatenate()([word_input, chars, features_embedding_out])
        x = Dropout(config.dropout)(x)

        x = Bidirectional(LSTM(units=config.num_word_lstm_units,
                               return_sequences=True,
                               recurrent_dropout=config.recurrent_dropout))(x)
        x = Dropout(config.dropout)(x)
        x = Dense(config.num_word_lstm_units, activation='tanh')(x)
        x = Dense(ntags)(x)
        self.crf = ChainCRF()
        pred = self.crf(x)

        self.model = Model(inputs=[word_input, char_input, features_input, length_input], outputs=[pred])
        self.config = config
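
The embedding sizing above implies that each feature column maps into its own disjoint slice of a shared embedding table. A plausible index encoding consistent with that sizing (an assumption; the snippet does not show how feature indices are prepared):

import numpy as np

def encode_features(feature_values, vocab_size=12):
    # Hypothetical helper: offset the 1-based value index of feature
    # column i by i * vocab_size so that no two feature columns share
    # embedding rows; index 0 stays reserved for masking / padding.
    feature_values = np.asarray(feature_values)  # shape (num_tokens, num_features)
    offsets = np.arange(feature_values.shape[-1]) * vocab_size
    return np.where(feature_values > 0, feature_values + offsets, 0)
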
Example 3
    def __init__(self, config, ntags=None):

        # word input, fed directly with word embeddings by the data generator
        word_input = Input(shape=(None, config.word_embedding_size), name='word_input')

        # build character-based embedding
        char_input = Input(shape=(None, config.max_char_length), dtype='int32', name='char_input')
        char_embeddings = TimeDistributed(
                                Embedding(input_dim=config.char_vocab_size,
                                    output_dim=config.char_embedding_size,
                                    mask_zero=True,
                                    name='char_embeddings'
                                    ))(char_input)

        dropout = Dropout(config.dropout)(char_embeddings)

        conv1d_out = TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same', activation='tanh', strides=1))(dropout)
        maxpool_out = TimeDistributed(GlobalMaxPooling1D())(conv1d_out)
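        # global max pooling over the character axis yields one fixed-size vector per token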
        chars = Dropout(config.dropout)(maxpool_out)

        # custom features input and embeddings
        casing_input = Input(batch_shape=(None, None), dtype='int32', name='casing_input')

        """
        casing_embedding = Embedding(input_dim=config.case_vocab_size, 
                           output_dim=config.case_embedding_size,
                           mask_zero=True,
                           trainable=False,
                           name='casing_embedding')(casing_input)
        casing_embedding = Dropout(config.dropout)(casing_embedding)
        """

        # sequence length is not used by the network for the moment (but is passed through for F1-score evaluation)
        length_input = Input(batch_shape=(None, 1), dtype='int32', name='length_input')

        # combine word and character representations (the casing features are disabled above)
        x = Concatenate(axis=-1)([word_input, chars])
        x = Dropout(config.dropout)(x)

        x = Bidirectional(LSTM(units=config.num_word_lstm_units, 
                               return_sequences=True, 
                               recurrent_dropout=config.recurrent_dropout))(x)
        x = Dropout(config.dropout)(x)
        x = Dense(config.num_word_lstm_units, activation='tanh')(x)
        x = Dense(ntags)(x)
        self.crf = ChainCRF()
        pred = self.crf(x)

        self.model = Model(inputs=[word_input, char_input, casing_input, length_input], outputs=[pred])
        self.config = config
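Example 4

This variant moves the shared setup into a base class and makes stateful RNNs and the feature input configurable.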
    def __init__(self, config: ModelConfig, ntags=None):
        super().__init__(
            config, ntags,
            require_casing=False, use_crf=True, supports_features=True,
            stateful=config.stateful
        )

        stateful = self.stateful
        # stateful RNNs require the batch size to be passed in
        input_batch_size = config.batch_size if stateful else None

        model_inputs = []
        lstm_inputs = []
        # word input, fed directly with word embeddings by the data generator
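        # assumption: with multi-backend Keras 2.x, batch_shape takes
        # precedence over shape when both are given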
        word_input = Input(
            shape=(None, config.word_embedding_size),
            batch_shape=(input_batch_size, None, config.word_embedding_size),
            name='word_input'
        )
        model_inputs.append(word_input)
        lstm_inputs.append(word_input)

        # build character-based embedding
        char_input = Input(
            shape=(None, config.max_char_length),
            batch_shape=(input_batch_size, None, config.max_char_length),
            dtype='int32',
            name='char_input'
        )
        model_inputs.append(char_input)

        if config.char_embedding_size:
            assert config.char_vocab_size, 'config.char_vocab_size required'
            char_embeddings = TimeDistributed(Embedding(
                input_dim=config.char_vocab_size,
                output_dim=config.char_embedding_size,
                mask_zero=config.char_input_mask_zero,
                name='char_embeddings_embedding'
            ), name='char_embeddings')(char_input)

            chars = TimeDistributed(
                Bidirectional(LSTM(
                    config.num_char_lstm_units,
                    dropout=config.char_input_dropout,
                    recurrent_dropout=config.char_lstm_dropout,
                    return_sequences=False
                )),
                name='char_lstm'
            )(char_embeddings)
            lstm_inputs.append(chars)

        # sequence length is not used by the network for the moment (but is passed through for F1-score evaluation)
        length_input = Input(batch_shape=(None, 1), dtype='int32', name='length_input')

        # optionally add features, then combine them with the word and character embeddings
        LOGGER.debug('model, config.use_features: %s', config.use_features)
        if config.use_features:
            LOGGER.info('model using features')
            assert config.max_feature_size > 0
            features_input = Input(
                batch_shape=(input_batch_size, None, config.max_feature_size),
                name='features_input'
            )
            model_inputs.append(features_input)
            features = features_input
            if config.features_embedding_size:
                features = TimeDistributed(Dense(
                    config.features_embedding_size,
                    name='features_embeddings_dense'
                ), name='features_embeddings')(features)
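            # note: 'chars' below is only defined when config.char_embedding_size is set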
            LOGGER.info(
                'word_input=%s, chars=%s, features=%s',
                word_input, chars, features
            )
            lstm_inputs.append(features)

        x = _concatenate_inputs(lstm_inputs, name='word_lstm_input')
        x = Dropout(config.dropout, name='word_lstm_input_dropout')(x)

        x = Bidirectional(LSTM(
            units=config.num_word_lstm_units,
            return_sequences=True,
            recurrent_dropout=config.recurrent_dropout,
            stateful=stateful,
        ), name='word_lstm')(x)
        x = Dropout(config.dropout, name='word_lstm_output_dropout')(x)
        x = Dense(
            config.num_word_lstm_units, name='word_lstm_dense', activation='tanh'
        )(x)
        x = Dense(ntags, name='dense_ntags')(x)
        self.crf = ChainCRF(name='crf')
        pred = self.crf(x)

        model_inputs.append(length_input)

        self.model = Model(inputs=model_inputs, outputs=[pred])
        self.config = config
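
The helper _concatenate_inputs is referenced above but not shown. A plausible implementation, judging from how it is called (an assumption, not the project's confirmed code):

from keras.layers import Concatenate

def _concatenate_inputs(inputs, **kwargs):
    # a single tensor needs no concatenation; otherwise join the
    # tensors along the last axis, forwarding e.g. the layer name
    if len(inputs) == 1:
        return inputs[0]
    return Concatenate(**kwargs)(inputs)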