Example #1
def cnn_rnn_tmp(nb_words, EMBEDDING_DIM, \
               embedding_matrix, MAX_SEQUENCE_LENGTH, \
               num_rnn, num_dense, rate_drop_rnn, \
               rate_drop_dense, act):
    '''
    This is the more complex CNN-RNN model.

    model: input layer; embedding layer; more complex CNN-based attention layer; RNN layer; dense layer; output layer
    '''
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    rnn_layer = Bidirectional(
        GRU(num_rnn, dropout=rate_drop_rnn, recurrent_dropout=rate_drop_rnn))
    cnn_layer = Conv1D(activation="relu",
                       padding="valid",
                       strides=1,
                       filters=32,
                       kernel_size=4)
    conv1 = Conv1D(filters=128,
                   kernel_size=1,
                   padding='same',
                   activation='relu')
    conv2 = Conv1D(filters=128,
                   kernel_size=2,
                   padding='same',
                   activation='relu')
    conv3 = Conv1D(filters=128,
                   kernel_size=3,
                   padding='same',
                   activation='relu')
    conv4 = Conv1D(filters=128,
                   kernel_size=4,
                   padding='same',
                   activation='relu')
    conv5 = Conv1D(filters=32,
                   kernel_size=5,
                   padding='same',
                   activation='relu')
    conv6 = Conv1D(filters=32,
                   kernel_size=6,
                   padding='same',
                   activation='relu')
    pooling_layer = GlobalMaxPooling1D()
    cnn_dense = Dense(300)
    cnn_dropout1 = Dropout(0.2)
    cnn_dropout2 = Dropout(0.2)
    cnn_batchnormalization = BatchNormalization()
    cnn_repeatvector = RepeatVector(EMBEDDING_DIM)
    cnn_dense1 = Dense(300)
    cnn_timedistributed = TimeDistributed(Dense(1))

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)

    conv1a = conv1(embedded_sequences_1)
    glob1a = GlobalAveragePooling1D()(conv1a)
    glob1a = Dropout(0.5)(glob1a)
    glob1a = BatchNormalization()(glob1a)
    conv1b = conv1(embedded_sequences_2)
    glob1b = GlobalAveragePooling1D()(conv1b)
    glob1b = Dropout(0.5)(glob1b)
    glob1b = BatchNormalization()(glob1b)

    conv2a = conv2(embedded_sequences_1)
    glob2a = GlobalAveragePooling1D()(conv2a)
    glob2a = Dropout(0.5)(glob2a)
    glob2a = BatchNormalization()(glob2a)
    conv2b = conv2(embedded_sequences_2)
    glob2b = GlobalAveragePooling1D()(conv2b)
    glob2b = Dropout(0.5)(glob2b)
    glob2b = BatchNormalization()(glob2b)

    conv3a = conv3(embedded_sequences_1)
    glob3a = GlobalAveragePooling1D()(conv3a)
    glob3a = Dropout(0.5)(glob3a)
    glob3a = BatchNormalization()(glob3a)
    conv3b = conv3(embedded_sequences_2)
    glob3b = GlobalAveragePooling1D()(conv3b)
    glob3b = Dropout(0.5)(glob3b)
    glob3b = BatchNormalization()(glob3b)

    conv4a = conv4(embedded_sequences_1)
    glob4a = GlobalAveragePooling1D()(conv4a)
    glob4a = Dropout(0.5)(glob4a)
    glob4a = BatchNormalization()(glob4a)
    conv4b = conv4(embedded_sequences_2)
    glob4b = GlobalAveragePooling1D()(conv4b)
    glob4b = Dropout(0.5)(glob4b)
    glob4b = BatchNormalization()(glob4b)

    conv5a = conv5(embedded_sequences_1)
    glob5a = GlobalAveragePooling1D()(conv5a)
    glob5a = Dropout(0.5)(glob5a)
    glob5a = BatchNormalization()(glob5a)
    conv5b = conv5(embedded_sequences_2)
    glob5b = GlobalAveragePooling1D()(conv5b)
    glob5b = Dropout(0.5)(glob5b)
    glob5b = BatchNormalization()(glob5b)

    conv6a = conv6(embedded_sequences_1)
    glob6a = GlobalAveragePooling1D()(conv6a)
    glob6a = Dropout(0.5)(glob6a)
    glob6a = BatchNormalization()(glob6a)
    conv6b = conv6(embedded_sequences_2)
    glob6b = GlobalAveragePooling1D()(conv6b)
    glob6b = Dropout(0.5)(glob6b)
    glob6b = BatchNormalization()(glob6b)

    cnn_1 = concatenate([glob1a, glob2a, glob3a, glob4a, glob5a, glob6a])
    cnn_2 = concatenate([glob1b, glob2b, glob3b, glob4b, glob5b, glob6b])

    cnn_1_t = cnn_dense1(cnn_1)
    cnn_2_t = cnn_dense1(cnn_2)

    a1 = multiply([cnn_1_t, embedded_sequences_1])
    a2 = multiply([cnn_2_t, embedded_sequences_2])

    a1 = Permute([2, 1])(a1)
    a2 = Permute([2, 1])(a2)

    a1 = Lambda(lambda x: K.sum(x, axis=1))(a1)
    a2 = Lambda(lambda x: K.sum(x, axis=1))(a2)

    a1 = Activation('sigmoid')(a1)
    a2 = Activation('sigmoid')(a2)

    embedded_sequences_1 = Permute([2, 1])(embedded_sequences_1)
    embedded_sequences_2 = Permute([2, 1])(embedded_sequences_2)

    x1 = multiply([a1, embedded_sequences_1])
    x2 = multiply([a2, embedded_sequences_2])

    x1 = Permute([2, 1])(x1)
    x2 = Permute([2, 1])(x2)

    x1 = rnn_layer(x1)
    x2 = rnn_layer(x2)

    merged = multiply([x1, x2])
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(num_dense, activation=act)(merged)
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)

    preds = Dense(3, activation='softmax')(merged)

    ########################################
    ## train the model
    ########################################
    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='nadam',
                  metrics=['acc'])
    model.summary()
    # print(STAMP)
    return model
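For reference, a minimal call sketch for the factory above; the vocabulary size, embedding matrix, and hyperparameter values below are illustrative assumptions, not values from the source:

import numpy as np

# illustrative hyperparameters (assumed, not from the source)
NB_WORDS = 20000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 30
embedding_matrix = np.random.normal(size=(NB_WORDS, EMBEDDING_DIM))  # placeholder weights

model = cnn_rnn_tmp(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH,
                    num_rnn=128, num_dense=128,
                    rate_drop_rnn=0.25, rate_drop_dense=0.25, act='relu')
# model.fit([q1_padded, q2_padded], labels_one_hot, ...) would then train on padded sentence-id pairs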
Example #2
def cnn_rnn(nb_words, EMBEDDING_DIM, \
            embedding_matrix, MAX_SEQUENCE_LENGTH, \
            num_rnn, num_dense, rate_drop_rnn, \
            rate_drop_dense, act):
    '''
    This is the basic CNN-RNN model.

    model: input layer; embedding layer; CNN-based attention layer; RNN layer; dense layer; output layer
    '''

    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    rnn_layer = Bidirectional(
        GRU(num_rnn, dropout=rate_drop_rnn, recurrent_dropout=rate_drop_rnn))
    cnn_layer = Conv1D(activation="relu",
                       padding="valid",
                       strides=1,
                       filters=128,
                       kernel_size=2)
    # cnn_layer1 = Conv1D(activation="relu", padding="valid", strides=1, filters=64, kernel_size=4)
    pooling_layer = GlobalMaxPooling1D()
    cnn_dense = Dense(300)
    cnn_dropout1 = Dropout(0.35)
    cnn_dropout2 = Dropout(0.35)
    cnn_batchnormalization = BatchNormalization()
    cnn_repeatvector = RepeatVector(EMBEDDING_DIM)
    cnn_dense1 = Dense(300)
    cnn_timedistributed = TimeDistributed(Dense(1))

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)

    cnn_1 = cnn_layer(embedded_sequences_1)
    # cnn_1 = cnn_layer1(cnn_1)
    cnn_1 = pooling_layer(cnn_1)
    cnn_1 = cnn_dropout1(cnn_1)
    cnn_1 = cnn_dense(cnn_1)
    cnn_1 = cnn_dropout2(cnn_1)
    cnn_1 = cnn_batchnormalization(cnn_1)

    cnn_2 = cnn_layer(embedded_sequences_2)
    # cnn_2 = cnn_layer1(cnn_2)
    cnn_2 = pooling_layer(cnn_2)
    cnn_2 = cnn_dropout1(cnn_2)
    cnn_2 = cnn_dense(cnn_2)
    cnn_2 = cnn_dropout2(cnn_2)
    cnn_2 = cnn_batchnormalization(cnn_2)

    # cnn_1 = cnn_repeatvector(cnn_1)
    # cnn_2 = cnn_repeatvector(cnn_2)

    cnn_1_t = cnn_dense1(cnn_1)
    cnn_2_t = cnn_dense1(cnn_2)

    # cnn_1_t = cnn_timedistributed(cnn_1)
    # cnn_2_t = cnn_timedistributed(cnn_2)

    # cnn_1_t = Permute([2, 1])(cnn_1_t)
    # cnn_2_t = Permute([2, 1])(cnn_2_t)

    a1 = multiply([cnn_1_t, embedded_sequences_1])
    a2 = multiply([cnn_2_t, embedded_sequences_2])

    a1 = Permute([2, 1])(a1)
    a2 = Permute([2, 1])(a2)

    a1 = Lambda(lambda x: K.sum(x, axis=1))(a1)
    a2 = Lambda(lambda x: K.sum(x, axis=1))(a2)

    a1 = Activation('softmax')(a1)
    a2 = Activation('softmax')(a2)

    embedded_sequences_1 = Permute([2, 1])(embedded_sequences_1)
    embedded_sequences_2 = Permute([2, 1])(embedded_sequences_2)

    x1 = multiply([a1, embedded_sequences_1])
    x2 = multiply([a2, embedded_sequences_2])

    x1 = Permute([2, 1])(x1)
    x2 = Permute([2, 1])(x2)

    x1 = rnn_layer(x1)
    x2 = rnn_layer(x2)

    merged = multiply([x1, x2])
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(num_dense, activation=act)(merged)
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)

    preds = Dense(3, activation='softmax')(merged)

    # x1 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(embedded_sequences_1)
    # x1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(x1)

    # y1 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(embedded_sequences_2)
    # y1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(y1)

    ########################################
    ## train the model
    ########################################
    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='nadam',
                  metrics=['acc'])
    model.summary()
    # print(STAMP)
    return model
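What the CNN-based attention block in both functions computes, for one sentence, is a dot product between the pooled CNN summary and every word embedding, a softmax (a sigmoid in cnn_rnn_tmp) over the resulting per-time-step scores, and a rescaling of the embeddings before the GRU. A toy NumPy sketch, with made-up shapes, of that computation for a single sentence:

import numpy as np

def softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

embeddings = np.random.randn(4, 3)    # (seq_len, embed_dim) for one sentence
cnn_summary = np.random.randn(3)      # output of cnn_dense1 for the same sentence

scores = embeddings @ cnn_summary               # multiply + sum over the embedding axis
weights = softmax(scores)                       # attention weight per time step
reweighted = embeddings * weights[:, None]      # embeddings scaled before the bidirectional GRU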
Example #3
    def __init__(self, title_word_length, content_word_length,
                 title_char_length, content_char_length, fs_btm_tw_cw_length,
                 fs_btm_tc_length, class_num, word_embedding_matrix,
                 char_embedding_matrix, optimizer_name, lr, metrics):
        # set attributes
        self.title_word_length = title_word_length
        self.content_word_length = content_word_length
        self.title_char_length = title_char_length
        self.content_char_length = content_char_length
        self.fs_btm_tw_cw_length = fs_btm_tw_cw_length
        self.fs_btm_tc_length = fs_btm_tc_length
        self.class_num = class_num
        self.word_embedding_matrix = word_embedding_matrix
        self.char_embedding_matrix = char_embedding_matrix
        self.optimizer_name = optimizer_name
        self.lr = lr
        self.metrics = metrics
        # Placeholder for input (title and content)
        title_word_input = Input(shape=(title_word_length, ),
                                 dtype='int32',
                                 name="title_word_input")
        cont_word_input = Input(shape=(content_word_length, ),
                                dtype='int32',
                                name="content_word_input")

        title_char_input = Input(shape=(title_char_length, ),
                                 dtype='int32',
                                 name="title_char_input")
        cont_char_input = Input(shape=(content_char_length, ),
                                dtype='int32',
                                name="content_char_input")

        # Embedding layer
        with K.tf.device("/cpu:0"):
            word_embedding_layer = Embedding(len(word_embedding_matrix),
                                             256,
                                             weights=[word_embedding_matrix],
                                             trainable=True,
                                             name='word_embedding')
            title_word_emb = word_embedding_layer(title_word_input)
            cont_word_emb = word_embedding_layer(cont_word_input)

            char_embedding_layer = Embedding(len(char_embedding_matrix),
                                             256,
                                             weights=[char_embedding_matrix],
                                             trainable=True,
                                             name='char_embedding')
            title_char_emb = char_embedding_layer(title_char_input)
            cont_char_emb = char_embedding_layer(cont_char_input)

        # Create a convolution + max pooling layer
        title_content_features = list()
        for win_size in range(1, 8):
            # batch_size x doc_len x embed_size
            title_content_features.append(
                GlobalMaxPooling1D()(Conv1D(100,
                                            win_size,
                                            activation='relu',
                                            padding='same')(title_word_emb)))
            title_content_features.append(
                GlobalMaxPooling1D()(Conv1D(100,
                                            win_size,
                                            activation='relu',
                                            padding='same')(cont_word_emb)))
            title_content_features.append(
                GlobalMaxPooling1D()(Conv1D(100,
                                            win_size,
                                            activation='relu',
                                            padding='same')(title_char_emb)))
            title_content_features.append(
                GlobalMaxPooling1D()(Conv1D(100,
                                            win_size,
                                            activation='relu',
                                            padding='same')(cont_char_emb)))

        # add btm_tw_cw features + btm_tc features
        fs_btm_tw_cw_input = Input(shape=(fs_btm_tw_cw_length, ),
                                   dtype='float32',
                                   name="fs_btm_tw_cw_input")
        fs_btm_tc_input = Input(shape=(fs_btm_tc_length, ),
                                dtype='float32',
                                name="fs_btm_tc_input")
        fs_btm_raw_features = concatenate(
            [fs_btm_tw_cw_input, fs_btm_tc_input])
        fs_btm_emb_features = Dense(
            1024, activation='relu',
            name='fs_btm_embedding')(fs_btm_raw_features)
        fs_btm_emb_features = Dropout(
            0.5, name='fs_btm_embedding_dropout')(fs_btm_emb_features)
        title_content_features.append(fs_btm_emb_features)

        title_content_features = concatenate(title_content_features)

        # Full connection
        title_content_features = Dense(
            3600, activation='relu',
            name='fs_embedding')(title_content_features)
        title_content_features = Dropout(
            0.5, name='fs_embedding_dropout')(title_content_features)

        # Prediction
        preds = Dense(class_num, activation='sigmoid',
                      name='prediction')(title_content_features)

        self._model = Model([
            title_word_input, cont_word_input, title_char_input,
            cont_char_input, fs_btm_tw_cw_input, fs_btm_tc_input
        ], preds)
        if 'rmsprop' == optimizer_name:
            optimizer = optimizers.RMSprop(lr=lr)
        elif 'adam' == optimizer_name:
            optimizer = optimizers.Adam(lr=lr,
                                        beta_1=0.9,
                                        beta_2=0.999,
                                        epsilon=1e-08)
        else:
            optimizer = None
        self._model.compile(loss=binary_crossentropy_sum,
                            optimizer=optimizer,
                            metrics=metrics)
        self._model.summary()
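For bookkeeping, the width of the concatenated feature vector feeding the fs_embedding layer follows directly from the loop above:

cnn_feats = 7 * 4 * 100               # window sizes 1..7, four text inputs, 100 filters each -> 2800
btm_feats = 1024                      # fs_btm_embedding output
total_feats = cnn_feats + btm_feats   # 3824 features enter the Dense(3600, ...) 'fs_embedding' layer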
Example #4
                                           batch_size,
                                           vocabulary,
                                           skip_step=num_steps)

hidden_size = 500
use_dropout = True

#################################
# showing the difference between a TimeDistributed and a non-TimeDistributed output layer,
# i.e. return_sequences=True vs. return_sequences=False.
# this site is key:
# https://machinelearningmastery.com/timedistributed-layer-for-long-short-term-memory-networks-in-python/

#'''
model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(LSTM(hidden_size, return_sequences=False))
if use_dropout:
    model.add(Dropout(0.5))
model.add(Dense(vocabulary))
model.add(Activation('softmax'))
#'''
'''
model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
    model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(vocabulary)))
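A small runnable sketch of the shape difference the comment above points at; the vocabulary, hidden_size and num_steps values are illustrative:

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed

vocabulary, hidden_size, num_steps = 10000, 500, 30   # illustrative values

seq_model = Sequential()
seq_model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
seq_model.add(LSTM(hidden_size, return_sequences=True))
seq_model.add(TimeDistributed(Dense(vocabulary)))
print(seq_model.output_shape)   # (None, num_steps, vocabulary): one prediction per time step

vec_model = Sequential()
vec_model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
vec_model.add(LSTM(hidden_size, return_sequences=False))
vec_model.add(Dense(vocabulary))
print(vec_model.output_shape)   # (None, vocabulary): a single prediction per sequence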
Example #5
y_one_hot_labels = np.asarray(y_one_hot)

# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_seq,
                                                    y_one_hot_labels,
                                                    test_size=0.2)

num_words = 2000
vec_size = 128
output_shape = 2

# build the model
data_input = Input(shape=[maxlen])
word_vec = Embedding(input_dim=num_words + 1,
                     input_length=maxlen,
                     output_dim=vec_size,
                     mask_zero=False,
                     name='Embedding')(data_input)
x = Conv1D(filters=128,
           kernel_size=[3],
           strides=1,
           padding='same',
           activation='relu')(word_vec)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(500, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(output_shape, activation='softmax')(x)
model = Model(inputs=data_input, outputs=x)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
Example #6

'''
You've already prepared your sequences of text, with each of the sequences consisting of four words. It's time to build your LSTM model!

Your model will be trained on the first three words of each sequence, predicting the 4th one. You are going to use an Embedding layer that will essentially learn to turn words into vectors. These vectors will then be passed to a simple LSTM layer. Our output is a Dense layer with as many neurons as there are words in the vocabulary and a softmax activation, because we want to obtain the most probable next word out of all possible words.

The size of the vocabulary of words (the unique number of words) is stored in vocab_size.

'''


# Import the Embedding, LSTM and Dense layer
from keras.layers import Embedding, LSTM, Dense 

model = Sequential()

# Add an Embedding layer with the right parameters
model.add(Embedding(input_dim=vocab_size, output_dim=8, input_length=3))

# Add a 32 unit LSTM layer
model.add(LSTM(32))

# Add a hidden Dense layer of 32 units and an output layer of vocab_size with softmax
model.add(Dense(32, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
model.summary()
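As described above, each 4-word sequence is split into a 3-word input and a 1-word target; a hedged sketch of that preparation and of querying a fitted model (the word ids here are made up):

import numpy as np

sequences = np.array([[2, 15, 7, 42], [9, 3, 28, 5]])   # made-up 4-word id sequences
X = sequences[:, :3]    # first three words are the input
y = sequences[:, 3]     # the fourth word is the target (one-hot encode it for categorical_crossentropy)

# after compiling and fitting, the most probable next word for a new 3-word context:
# probs = model.predict(np.array([[2, 15, 7]]))
# next_word_id = probs.argmax(axis=-1)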
Example #7
# Use the same mean and stdev as the GloVe embeddings when generating the random initialization.
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

# CNN model
model = Sequential()
model.add(
    Embedding(max_features,
              embed_size,
              weights=[embedding_matrix],
              trainable=False))
model.add(Conv1D(128, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(dropout_rate))
model.add(Dense(32, activation='relu'))
model.add(Dense(6, activation='sigmoid'))  #multi-label (k-hot encoding)

adam = optimizers.Adam(lr=0.001,
                       beta_1=0.9,
                       beta_2=0.999,
                       epsilon=1e-08,
                       decay=0.0)
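The snippet ends after building the optimizer; a hedged continuation, assuming the usual multi-label setup for the six sigmoid outputs above (the original continuation is not shown):

model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
# model.fit(X_train, y_train, batch_size=32, epochs=2, validation_split=0.1)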
Example #8
    def build(self):
        '''
        1. Build Code Representation Model
        '''
        logger.debug('Building Code Representation Model')
        methname = Input(shape=(self.data_params['methname_len'], ),
                         dtype='int32',
                         name='methname')
        apiseq = Input(shape=(self.data_params['apiseq_len'], ),
                       dtype='int32',
                       name='apiseq')
        tokens = Input(shape=(self.data_params['tokens_len'], ),
                       dtype='int32',
                       name='tokens')

        ## method name representation ##
        #1.embedding
        init_emb_weights = np.load(
            self.config['workdir'] +
            self.model_params['init_embed_weights_methname']
        ) if self.model_params[
            'init_embed_weights_methname'] is not None else None
        init_emb_weights = init_emb_weights if init_emb_weights is None else [
            init_emb_weights
        ]
        embedding = Embedding(
            input_dim=self.data_params['n_words'],
            output_dim=self.model_params.get('n_embed_dims', 100),
            weights=init_emb_weights,
            # mask_zero: whether 0 in the input is a special "padding" value that should be masked out.
            # If set True, all subsequent layers in the model must support masking, otherwise an exception will be raised.
            mask_zero=False,
            name='embedding_methname')
        methname_embedding = embedding(methname)
        dropout = Dropout(0.25, name='dropout_methname_embed')
        methname_dropout = dropout(methname_embedding)
        #2.rnn
        f_rnn = LSTM(self.model_params.get('n_lstm_dims', 128),
                     recurrent_dropout=0.2,
                     return_sequences=True,
                     name='lstm_methname_f')

        b_rnn = LSTM(self.model_params.get('n_lstm_dims', 128),
                     return_sequences=True,
                     recurrent_dropout=0.2,
                     name='lstm_methname_b',
                     go_backwards=True)
        methname_f_rnn = f_rnn(methname_dropout)
        methname_b_rnn = b_rnn(methname_dropout)
        dropout = Dropout(0.25, name='dropout_methname_rnn')
        methname_f_dropout = dropout(methname_f_rnn)
        methname_b_dropout = dropout(methname_b_rnn)
        #3.maxpooling
        maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False),
                         output_shape=lambda x: (x[0], x[2]),
                         name='maxpool_methname')
        methname_pool = Concatenate(name='concat_methname_lstms')(
            [maxpool(methname_f_dropout),
             maxpool(methname_b_dropout)])
        activation = Activation('tanh', name='active_methname')
        methname_repr = activation(methname_pool)

        ## API Sequence Representation ##
        #1.embedding
        embedding = Embedding(
            input_dim=self.data_params['n_words'],
            output_dim=self.model_params.get('n_embed_dims', 100),
            #weights=weights,
            # mask_zero: whether 0 in the input is a special "padding" value that should be masked out.
            # If set True, all subsequent layers must support masking, otherwise an exception will be raised.
            mask_zero=False,
            name='embedding_apiseq')
        apiseq_embedding = embedding(apiseq)
        dropout = Dropout(0.25, name='dropout_apiseq_embed')
        apiseq_dropout = dropout(apiseq_embedding)
        #2.rnn
        f_rnn = LSTM(self.model_params.get('n_lstm_dims', 100),
                     return_sequences=True,
                     recurrent_dropout=0.2,
                     name='lstm_apiseq_f')
        b_rnn = LSTM(self.model_params.get('n_lstm_dims', 100),
                     return_sequences=True,
                     recurrent_dropout=0.2,
                     name='lstm_apiseq_b',
                     go_backwards=True)
        apiseq_f_rnn = f_rnn(apiseq_dropout)
        apiseq_b_rnn = b_rnn(apiseq_dropout)
        dropout = Dropout(0.25, name='dropout_apiseq_rnn')
        apiseq_f_dropout = dropout(apiseq_f_rnn)
        apiseq_b_dropout = dropout(apiseq_b_rnn)
        #3.maxpooling
        maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False),
                         output_shape=lambda x: (x[0], x[2]),
                         name='maxpool_apiseq')
        apiseq_pool = Concatenate(name='concat_apiseq_lstms')(
            [maxpool(apiseq_f_dropout),
             maxpool(apiseq_b_dropout)])
        activation = Activation('tanh', name='active_apiseq')
        apiseq_repr = activation(apiseq_pool)

        ## Tokens Representation ##
        #1.embedding
        init_emb_weights = np.load(
            self.config['workdir'] +
            self.model_params['init_embed_weights_tokens']
        ) if self.model_params[
            'init_embed_weights_tokens'] is not None else None
        init_emb_weights = init_emb_weights if init_emb_weights is None else [
            init_emb_weights
        ]
        embedding = Embedding(
            input_dim=self.data_params['n_words'],
            output_dim=self.model_params.get('n_embed_dims', 100),
            weights=init_emb_weights,
            # mask_zero=True,  # whether 0 in the input is a special "padding" value that should be masked out.
            # If set True, all subsequent layers must support masking, otherwise an exception will be raised.
            name='embedding_tokens')
        tokens_embedding = embedding(tokens)
        dropout = Dropout(0.25, name='dropout_tokens_embed')
        tokens_dropout = dropout(tokens_embedding)

        #4.maxpooling
        maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False),
                         output_shape=lambda x: (x[0], x[2]),
                         name='maxpool_tokens')
        tokens_pool = maxpool(tokens_dropout)
        activation = Activation('tanh', name='active_tokens')
        tokens_repr = activation(tokens_pool)

        ## concatenate the representation of code ##
        merged_methname_api = Concatenate(name='merge_methname_api')(
            [methname_repr, apiseq_repr])
        merged_code_repr = Concatenate(name='merge_coderepr')(
            [merged_methname_api, tokens_repr])
        code_repr = Dense(self.model_params.get('n_hidden', 400),
                          activation='tanh',
                          name='dense_coderepr')(merged_code_repr)

        self._code_repr_model = Model(inputs=[methname, apiseq, tokens],
                                      outputs=[code_repr],
                                      name='code_repr_model')
        print('\nsummary of code representation model')
        self._code_repr_model.summary()
        fname = self.config['workdir'] + 'models/' + self.model_params[
            'model_name'] + '/_code_repr_model.png'
        #plot_model(self._code_repr_model, show_shapes=True, to_file=fname)
        '''
        2. Build Desc Representation Model
        '''
        ## Desc Representation ##
        logger.debug('Building Desc Representation Model')
        desc = Input(shape=(self.data_params['desc_len'], ),
                     dtype='int32',
                     name='desc')
        #1.embedding
        init_emb_weights = np.load(
            self.config['workdir'] +
            self.model_params['init_embed_weights_desc']
        ) if self.model_params['init_embed_weights_desc'] is not None else None
        init_emb_weights = init_emb_weights if init_emb_weights is None else [
            init_emb_weights
        ]
        embedding = Embedding(
            input_dim=self.data_params['n_words'],
            output_dim=self.model_params.get('n_embed_dims', 100),
            weights=init_emb_weights,
            # mask_zero: whether 0 in the input is a special "padding" value that should be masked out.
            # If set True, all subsequent layers must support masking, otherwise an exception will be raised.
            mask_zero=True,
            name='embedding_desc')
        desc_embedding = embedding(desc)
        dropout = Dropout(0.25, name='dropout_desc_embed')
        desc_dropout = dropout(desc_embedding)
        #2. rnn
        f_rnn = LSTM(self.model_params.get('n_lstm_dims', 100),
                     return_sequences=True,
                     recurrent_dropout=0.2,
                     name='lstm_desc_f')
        b_rnn = LSTM(self.model_params.get('n_lstm_dims', 100),
                     return_sequences=True,
                     recurrent_dropout=0.2,
                     name='lstm_desc_b',
                     go_backwards=True)
        desc_f_rnn = f_rnn(desc_dropout)
        desc_b_rnn = b_rnn(desc_dropout)
        dropout = Dropout(0.25, name='dropout_desc_rnn')
        desc_f_dropout = dropout(desc_f_rnn)
        desc_b_dropout = dropout(desc_b_rnn)
        #3. maxpooling
        maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False),
                         output_shape=lambda x: (x[0], x[2]),
                         name='maxpool_desc')
        desc_pool = Concatenate(name='concat_desc_rnns')(
            [maxpool(desc_f_dropout),
             maxpool(desc_b_dropout)])
        activation = Activation('tanh', name='active_desc')
        desc_repr = activation(desc_pool)

        self._desc_repr_model = Model(inputs=[desc],
                                      outputs=[desc_repr],
                                      name='desc_repr_model')
        print('\nsummary of desc representation model')
        self._desc_repr_model.summary()
        fname = self.config['workdir'] + 'models/' + self.model_params[
            'model_name'] + '/_desc_repr_model.png'
        #plot_model(self._desc_repr_model, show_shapes=True, to_file=fname)
        """
        3: calculate the cosine similarity between code and desc
        """
        logger.debug('Building similarity model')
        code_repr = self._code_repr_model([methname, apiseq, tokens])
        desc_repr = self._desc_repr_model([desc])
        cos_sim = Dot(axes=1, normalize=True,
                      name='cos_sim')([code_repr, desc_repr])

        sim_model = Model(inputs=[methname, apiseq, tokens, desc],
                          outputs=[cos_sim],
                          name='sim_model')
        self._sim_model = sim_model  #for model evaluation
        print("\nsummary of similarity model")
        self._sim_model.summary()
        fname = self.config['workdir'] + 'models/' + self.model_params[
            'model_name'] + '/_sim_model.png'
        #plot_model(self._sim_model, show_shapes=True, to_file=fname)
        '''
        4:Build training model
        '''
        good_sim = sim_model(
            [self.methname, self.apiseq, self.tokens,
             self.desc_good])  # similarity of good output
        bad_sim = sim_model(
            [self.methname, self.apiseq, self.tokens,
             self.desc_bad])  #similarity of bad output
        loss = Lambda(lambda x: K.maximum(
            1e-6, self.model_params['margin'] - x[0] + x[1]),
                      output_shape=lambda x: x[0],
                      name='loss')([good_sim, bad_sim])

        logger.debug('Building training model')
        self._training_model = Model(inputs=[
            self.methname, self.apiseq, self.tokens, self.desc_good,
            self.desc_bad
        ],
                                     outputs=[loss],
                                     name='training_model')
        print('\nsummary of training model')
        self._training_model.summary()
        fname = self.config['workdir'] + 'models/' + self.model_params[
            'model_name'] + '/_training_model.png'
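The training model above minimizes a pairwise margin ranking loss, max(1e-6, margin - good_sim + bad_sim), which pushes the cosine similarity of the correct description above that of a random one by at least the margin. A tiny NumPy illustration with made-up similarity values:

import numpy as np

margin = 0.05                                           # illustrative value for model_params['margin']
good_sim, bad_sim = 0.62, 0.40                          # cosine similarities from sim_model
print(np.maximum(1e-6, margin - good_sim + bad_sim))    # 1e-06: already separated by more than the margin

good_sim, bad_sim = 0.45, 0.50                          # the wrong description scores higher
print(np.maximum(1e-6, margin - good_sim + bad_sim))    # ~0.1: positive loss pushes the pair apart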
Example #9
##====================
# helper functions for building Keras layers
# define the slice operation
def slice(x, index):
    return x[:, :, index]


##===========================

# build the base model
# use max pooling instead of k-max pooling; experiments showed max pooling works slightly better than k-max pooling
print('Build model...')

main_input = Input(shape=(maxlen, ), dtype='int32')
embedding_map = Embedding(output_dim=embedding_dims,
                          input_dim=max_features,
                          input_length=maxlen,
                          W_regularizer=l2(reg_conf[0]))(main_input)

##
convs = []
for index in range(embedding_dims):
    #print ("i:",index)
    t = Lambda(slice,
               output_shape=(maxlen, 1),
               arguments={'index': index},
               name='slice_' + str(index + 1))(embedding_map)
    x = Reshape((maxlen, 1, 1))(t)  #(batch, height, width, channels)

    # first conv and pooling layer
    x = Convolution2D(m1,
                      w1,
Example #10

print(len(input_test), 'test sequences')
print(input_train[0])

# Reverse sequences
# input_train = [x[::-1] for x in input_train]
# input_test = [x[::-1] for x in input_test]

print('Pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
print(input_train[0])
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

model = Sequential()
model.add(Embedding(max_features, 32))
model.add(Bidirectional(LSTM(32)))
# model.add(LSTM(32))
# model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(input_train,
                    y_train,
                    epochs=10,
                    batch_size=128,
                    validation_split=0.2)

print(history.history)

acc = history.history['acc']
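A plotting sketch for the extracted history, assuming matplotlib is available (val_acc exists because validation_split=0.2 was used):

import matplotlib.pyplot as plt

val_acc = history.history['val_acc']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.legend()
plt.show()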
Example #11
def build_rnn_search_model(source_vacabuary_size,
                           source_embedding_dim,
                           source_initia_embedding,
                           encoder_rnn_output_dim_list,
                           attention_context_dim,
                           decoder_rnn_output_dim,
                           decoder_rnn_output_dropout_rate,
                           target_vacabuary_size,
                           target_embedding_dim,
                           target_initia_embedding,
                           decoder_hidden_unit_numbers,
                           decoder_hidden_unit_activation_functions,
                           beam_search_max_output_length,
                           beam_size,
                           optimizer='rmsprop',
                           weight_regularizer=None,
                           devices=None):
    # TODO: apply constraints

    source_word = Input((None, ), dtype='int32')
    source_word_mask = Input((None, ), dtype='int32')

    # source_word = trim_right_padding(source_word)
    source_embedding = Embedding(source_vacabuary_size,
                                 source_embedding_dim,
                                 weights=[source_initia_embedding],
                                 W_regularizer=weight_regularizer)

    encoder_output = source_embedding(source_word)

    # multiple bi-directional rnn layers
    for encoder_rnn_output_dim in encoder_rnn_output_dim_list:
        recurrent_left_to_right = GRU(encoder_rnn_output_dim,
                                      return_sequences=True,
                                      W_regularizer=weight_regularizer,
                                      U_regularizer=weight_regularizer,
                                      b_regularizer=weight_regularizer)
        recurrent_right_to_left = GRU(encoder_rnn_output_dim,
                                      return_sequences=True,
                                      go_backwards=True,
                                      W_regularizer=weight_regularizer,
                                      U_regularizer=weight_regularizer,
                                      b_regularizer=weight_regularizer)
        h1 = recurrent_left_to_right(encoder_output, source_word_mask)
        h2 = recurrent_right_to_left(encoder_output, source_word_mask)
        encoder_output = BiDirectionalLayer()([h1, h2])

    # the output of the last bi-directional RNN layer is the source context
    source_context = encoder_output
    # attention
    attention = AttentionLayer(attention_context_dim=attention_context_dim,
                               W_a_regularizer=weight_regularizer,
                               U_a_regularizer=weight_regularizer,
                               v_a_regularizer=weight_regularizer)

    # decoder
    decoder_input_sequence = Input((None, ),
                                   dtype='int32')  # starting with bos
    decoder_input_sequence_mask = Input((None, ), dtype='int32')

    decoder_rnn_cell = GRU(decoder_rnn_output_dim,
                           return_sequences=True,
                           W_regularizer=weight_regularizer,
                           U_regularizer=weight_regularizer,
                           b_regularizer=weight_regularizer)

    target_embedding = Embedding(target_vacabuary_size,
                                 target_embedding_dim,
                                 weights=[target_initia_embedding],
                                 W_regularizer=weight_regularizer)

    rnn_decoder = RNNDecoderLayer(decoder_rnn_cell, attention,
                                  target_embedding)

    rnn_decoder_output = rnn_decoder(
        [decoder_input_sequence, source_context],
        [decoder_input_sequence_mask, source_word_mask])
    rnn_decoder_output_dropout = Dropout(decoder_rnn_output_dropout_rate)
    rnn_decoder_output = rnn_decoder_output_dropout(rnn_decoder_output)

    mlp_classifier_hidden_layers = []
    for decoder_hidden_unit_number, decoder_hidden_unit_activation_function in zip(
            decoder_hidden_unit_numbers,
            decoder_hidden_unit_activation_functions):
        layer = Dense(decoder_hidden_unit_number,
                      activation=decoder_hidden_unit_activation_function,
                      W_regularizer=weight_regularizer,
                      b_regularizer=weight_regularizer)
        mlp_classifier_hidden_layers.append(layer)

    mlp_classifier_output_layer = Dense(output_dim=target_vacabuary_size,
                                        activation='softmax',
                                        W_regularizer=weight_regularizer,
                                        b_regularizer=weight_regularizer)

    mlp_classifier = MLPClassifierLayer(mlp_classifier_hidden_layers,
                                        mlp_classifier_output_layer)

    time_distributed_mlp_classifier = TimeDistributed(mlp_classifier)
    time_distributed_mlp_classifier_output = time_distributed_mlp_classifier(
        rnn_decoder_output, mask=decoder_input_sequence_mask)
    # the output and its mask will be used by the optimizer to build the proper loss function
    rnn_search_model = Model(input=[
        source_word, source_word_mask, decoder_input_sequence,
        decoder_input_sequence_mask
    ],
                             output=time_distributed_mlp_classifier_output)
    # training with multiple devices
    if devices:
        rnn_search_model = convert_to_model_with_parallel_training(
            rnn_search_model, devices)

    # TODO: try other losses, such as an importance-sampling based loss, e.g. sampled_softmax_loss (this would require extending the Keras model, which assumes the loss function holds no trainable parameters).
    rnn_search_model.compile(optimizer=optimizer,
                             loss='categorical_crossentropy',
                             metrics=['accuracy'])

    beam_search_initial_input = Input(shape=(1, ))
    rnn_decoder_with_beam_search = RNNDecoderLayerWithBeamSearch(
        beam_search_max_output_length, beam_size, decoder_rnn_cell, attention,
        target_embedding, mlp_classifier)

    beam_search_output_lattice = rnn_decoder_with_beam_search(
        [beam_search_initial_input, source_context])
    rnn_search_runtime_model = Model(
        input=[source_word, source_word_mask, beam_search_initial_input],
        output=beam_search_output_lattice)

    return (rnn_search_model, rnn_search_runtime_model)
Example #12
# convert text to int sequence
x_train1 = tokenizer.texts_to_sequences(x_train1)
x_train2 = tokenizer.texts_to_sequences(x_train2)

# max_sequence_len = 50 , as it gives a good measure
max_sequence_len = 50

# pad the sequences
x_train1 = pad_sequences(x_train1, maxlen=max_sequence_len, padding='pre')
x_train2 = pad_sequences(x_train2, maxlen=max_sequence_len, padding='pre')

# model - siamese lstm
inp1 = Input(shape=(max_sequence_len, ), name='sentence_1')
inp2 = Input(shape=(max_sequence_len, ), name='sentence_2')
emb = Embedding(output_dim=40,
                input_dim=vocab_len,
                input_length=max_sequence_len)
encoder = LSTM(80)
e1 = encoder(emb(inp1))
e2 = encoder(emb(inp2))
x = concatenate([e1, e2])
x = Dense(20, activation='relu')(x)
out = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[inp1, inp2], outputs=out)
model.summary()

# compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
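Training this siamese model feeds both padded inputs as a list; a hedged sketch (the binary 'labels' array is assumed here, it does not appear in the snippet):

# 'labels' = 0/1 duplicate flags aligned with x_train1 / x_train2 (assumed)
model.fit([x_train1, x_train2], labels, batch_size=64, epochs=5, validation_split=0.1)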
Example #13

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras import optimizers


max_features = 26
embedding_size = 256
kernel_size = 5
filters = 250
pool_size = 2
lstm_output_size = 64



#print('Building model...')
model = Sequential()
model.add(Embedding(max_features, embedding_size))
model.add(Dropout(0.2))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(lstm_output_size))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(), metrics=['acc'])


Example #14
def basic_cnn(nb_words, EMBEDDING_DIM, \
              embedding_matrix, MAX_SEQUENCE_LENGTH, \
              num_rnn, num_dense, rate_drop_rnn, \
              rate_drop_dense, act):
    '''
    This is the basic CNN model.

    model: input layer; embedding layer; several CNN layers; dense layer; output layer
    '''
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    conv1 = Conv1D(filters=128,
                   kernel_size=1,
                   padding='same',
                   activation='relu')
    conv2 = Conv1D(filters=128,
                   kernel_size=2,
                   padding='same',
                   activation='relu')
    conv3 = Conv1D(filters=128,
                   kernel_size=3,
                   padding='same',
                   activation='relu')
    conv4 = Conv1D(filters=128,
                   kernel_size=4,
                   padding='same',
                   activation='relu')
    conv5 = Conv1D(filters=32,
                   kernel_size=5,
                   padding='same',
                   activation='relu')
    conv6 = Conv1D(filters=32,
                   kernel_size=6,
                   padding='same',
                   activation='relu')

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)

    conv1a = conv1(embedded_sequences_1)
    glob1a = GlobalAveragePooling1D()(conv1a)
    conv1b = conv1(embedded_sequences_2)
    glob1b = GlobalAveragePooling1D()(conv1b)

    conv2a = conv2(embedded_sequences_1)
    glob2a = GlobalAveragePooling1D()(conv2a)
    conv2b = conv2(embedded_sequences_2)
    glob2b = GlobalAveragePooling1D()(conv2b)

    conv3a = conv3(embedded_sequences_1)
    glob3a = GlobalAveragePooling1D()(conv3a)
    conv3b = conv3(embedded_sequences_2)
    glob3b = GlobalAveragePooling1D()(conv3b)

    conv4a = conv4(embedded_sequences_1)
    glob4a = GlobalAveragePooling1D()(conv4a)
    conv4b = conv4(embedded_sequences_2)
    glob4b = GlobalAveragePooling1D()(conv4b)

    conv5a = conv5(embedded_sequences_1)
    glob5a = GlobalAveragePooling1D()(conv5a)
    conv5b = conv5(embedded_sequences_2)
    glob5b = GlobalAveragePooling1D()(conv5b)

    conv6a = conv6(embedded_sequences_1)
    glob6a = GlobalAveragePooling1D()(conv6a)
    conv6b = conv6(embedded_sequences_2)
    glob6b = GlobalAveragePooling1D()(conv6b)

    mergea = concatenate([glob1a, glob2a, glob3a, glob4a, glob5a, glob6a])
    mergeb = concatenate([glob1b, glob2b, glob3b, glob4b, glob5b, glob6b])

    # We take the explicit absolute difference between the two sentence representations.
    # Furthermore, we multiply the entries element-wise to get a complementary measure of similarity.
    diff = Lambda(lambda x: K.abs(x[0] - x[1]),
                  output_shape=(4 * 128 + 2 * 32, ))([mergea, mergeb])
    mul = Lambda(lambda x: x[0] * x[1],
                 output_shape=(4 * 128 + 2 * 32, ))([mergea, mergeb])

    merge = concatenate([diff, mul])

    # The MLP that determines the outcome
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu')(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    preds = Dense(3, activation='softmax')(x)

    ########################################
    ## train the model
    ########################################
    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='nadam',
                  metrics=['acc'])
    model.summary()
    # print(STAMP)
    return model
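The absolute difference and element-wise product above are two complementary 576-dimensional (4*128 + 2*32) similarity signals; a toy NumPy illustration with 3-dimensional vectors:

import numpy as np

a = np.array([0.2, -0.5, 1.0])
b = np.array([0.1, -0.5, 0.4])
diff = np.abs(a - b)                      # [0.1, 0.0, 0.6]: where the two sentences disagree
mul = a * b                               # [0.02, 0.25, 0.4]: agreement with matching sign
features = np.concatenate([diff, mul])    # the input to the MLP head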
Example #15

                                                    test_size=0.05,
                                                    random_state=1)
# print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# w2v_weight.shape, embedding_size, max_sentence_length

# Metric
from modules import recall, precision, f1score

# Building Network

# CNN
model = Sequential()
model.add(
    Embedding(vocab_size,
              embedding_size,
              input_length=max_sentence_length,
              weights=[w2v_weight]))
model.add(Conv1D(filters=256, kernel_size=5, activation='relu'))
model.add(Dropout(0.2))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(98))
model.add(BatchNormalization())
model.add(ReLU())
# model.add(GlobalMaxPooling1D())
model.add(Dense(98, activation="softmax"))
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["acc", precision, recall, f1score])
model.summary()
Example #16
def basic_attention(nb_words, EMBEDDING_DIM, \
                    embedding_matrix, MAX_SEQUENCE_LENGTH, \
                    num_rnn, num_dense, rate_drop_rnn, \
                    rate_drop_dense, act):
    '''
    This is the basic attention model.

    model: input layer; embedding layer; RNN layer; attention layer; dense layer; output layer
    '''
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    rnn_layer = Bidirectional(
        GRU(num_rnn,
            dropout=rate_drop_rnn,
            recurrent_dropout=rate_drop_rnn,
            return_sequences=True))
    attention_W = TimeDistributed(Dense(350, activation='tanh'))
    attention_w = TimeDistributed(Dense(1))
    attention_softmax = Activation('softmax')
    attention_sum = Lambda(lambda x: K.sum(x, axis=1))

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = rnn_layer(embedded_sequences_1)

    attention1 = attention_W(x1)
    attention1 = attention_w(attention1)
    attention1 = attention_softmax(attention1)
    attention1 = Permute([2, 1])(attention1)
    x1 = Permute([2, 1])(x1)
    x1 = multiply([attention1, x1])
    x1 = Permute([2, 1])(x1)
    x1 = attention_sum(x1)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    x2 = rnn_layer(embedded_sequences_2)

    attention2 = attention_W(x2)
    attention2 = attention_w(attention2)
    attention2 = attention_softmax(attention2)
    attention2 = Permute([2, 1])(attention2)
    x2 = Permute([2, 1])(x2)
    x2 = multiply([attention2, x2])
    x2 = Permute([2, 1])(x2)
    x2 = attention_sum(x2)

    merged = multiply([x1, x2])
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(num_dense, activation=act)(merged)
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)

    preds = Dense(3, activation='softmax')(merged)

    ########################################
    ## train the model
    ########################################
    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='nadam',
                  metrics=['acc'])
    model.summary()
    # print(STAMP)
    return model
X = numpy.array(id_data)
Y = numpy.array(labels)

histories = []
accu = []

for trainidx, testidx in kf.split(X):
    print(trainidx)
    print(testidx)
    train_data, train_labels = X[trainidx], Y[trainidx]
    test_data, test_labels = X[testidx], Y[testidx]
    train_data = sequence.pad_sequences(train_data, maxlen=param["max_len"])
    test_data = sequence.pad_sequences(test_data, maxlen=param["max_len"])

    embedding_layer = Embedding(output_dim=vocab_dim,
                                input_dim=n_symbols,
                                trainable=False)
    embedding_layer.build(
        (None, ))  # if you don't do this, the next step won't work
    embedding_layer.set_weights([embedding_weights])

    param = {
        "max_len": 64,
        "batch_size": 32,  #16?
        "embed_dims": 128,
        "filters": 16,
        "filter_size": 4,
        "hidden_dims": 64,
        "epochs": 10
    }