Example 1
def convertCharText2Int(dataload):
    # Map each word in every sentence to a sequence of character indices,
    # tracking the longest word seen so far in the global char_padsize.
    x_tmp1 = []
    global char_padsize
    for sent in dataload.words:
        x_map = DM(sent, char.index, False)
        if x_map.padsize > char_padsize:
            char_padsize = x_map.padsize
        x_tmp1.append(x_map)

    # Pad every word's character sequence to the longest word length.
    x_tmp2 = []
    for sent in x_tmp1:
        sent.pad(char_padsize)
        x_tmp2.append(sent.padded)
    print('Padded until %s chars.' % char_padsize)
    # Prepend all-zero character rows so every sentence reaches the
    # module-level sentence length (padsize).
    zeroes = [0] * char_padsize
    x_char = []
    for sent in x_tmp2:
        padded_sent = sent
        pad = padsize - len(sent)
        for i in range(pad):
            padded_sent = np.vstack((zeroes, padded_sent))
        x_char.append(padded_sent)
    print('Padded until %s tokens.' % padsize)
    return x_char
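Since the DM helper itself is not part of these excerpts, the following is a minimal sketch of the interface the snippets appear to rely on (padsize, pad(), padded, oov, oov_index); the implementation details, including treating the third constructor argument as a case-folding switch and using index 0 for both padding and unknown tokens, are assumptions.

class DM:
    """Hypothetical mapper: turns token sequences into integer index lists."""

    def __init__(self, sentences, index, lower=True):
        self.oov_index = set()  # distinct out-of-vocabulary tokens (assumed)
        self.oov = 0            # total OOV occurrences (assumed)
        self.mapped = []
        for sent in sentences:
            row = []
            for token in sent:
                key = token.lower() if lower else token
                idx = index.get(key, 0)  # 0 doubles as padding/unknown (assumed)
                if idx == 0:
                    self.oov_index.add(key)
                    self.oov += 1
                row.append(idx)
            self.mapped.append(row)
        # longest sequence seen, used as the default padding target
        self.padsize = max(len(row) for row in self.mapped) if self.mapped else 0

    def pad(self, padsize):
        # left-pad with zeros so every sequence has exactly `padsize` entries
        self.padded = [[0] * (padsize - len(row)) + row for row in self.mapped]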
Example 2
    def predict(self, text):
        self.textinput = text
        self.test = DP(text)
        self.x_test = DM(self.test.words, self.word.index)
        print "Number of OOV:", len(self.x_test.oov_index)
        print "OOV word occurences:", self.x_test.oov

        self.x_test.pad(self.padsize)
        print('Padded until %s tokens.' % self.padsize)

        self.x_test_char = self.convertCharText2Int(self.test)

        self.results = []
        print "Computing..."
        print self.x_test.padded
        print self.x_test_char
        raw_results = self.model.predict(
            [np.array(self.x_test.padded),
             np.array(self.x_test_char)])
        for raw_result in raw_results:
            result = []
            for token in raw_result:
                value = np.argmax(token)
                result.append(value)
            self.results.append(result)

        # Map predicted label indices back to label strings; index 0 is the
        # padding/unknown class and is rendered as '?'.
        temp = self.results[0]
        index_to_label = {v: k for k, v in self.label.index.items()}
        self.results = []
        start = False
        for token in temp:
            if token != 0:
                start = True
            if start:
                if token == 0:
                    self.results.append('?')
                else:
                    self.results.append(index_to_label[token])

        print(self.test.words[0])
        print(self.results)

        self.data = {'words': self.test.words[0], 'labels': self.results}
        self.json_data = json.dumps(self.data)
        return self.json_data
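For context, a hypothetical caller could parse the JSON returned by predict() back into parallel word and label lists; the EntityTagger class name, its constructor, and the sample sentence are assumptions, only the 'words'/'labels' keys come from the snippet above.

import json

tagger = EntityTagger()  # assumed wrapper class exposing predict()
output = json.loads(tagger.predict('Budi membeli buku di Jakarta'))
for word, label in zip(output['words'], output['labels']):
    print(word, label)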
Example 3
for chars, i in char.index.items():
    embedding_vector = char_embeddings_index.get(chars)
    if embedding_vector is not None:
        char_embedding_matrix[i] = embedding_vector
    else:
        char_notfound.append(chars)

print('%s unique chars not found in embedding.' % len(char_notfound))
"""
Converting word text data to int using index
"""
trimlen = input('Enter trimming length (default 63.5): ')
train.trim(trimlen)
test.trim(trimlen)
# val.trim(trimlen)

x_train = DM(train.words, word.index)
x_test = DM(test.words, word.index)
# x_val = DM(val.words, word.index)
print "Number of OOV:", len(x_test.oov_index)
print "OOV word occurences:", x_test.oov
# print "Number of OOV (val):", len(x_val.oov_index)
# print "OOV word occurences (val):", x_val.oov
padsize = max([x_train.padsize, x_test.padsize])
x_train.pad(padsize)
x_test.pad(padsize)
# x_val.pad(padsize)
print('Padded until %s tokens.' % padsize)

y_train = DM(train.labels, label.index)
y_test = DM(test.labels, label.index)
# y_val = DM(val.labels, label.index)
Example 4
char_notfound = []  # chars that are not present in the embedding
char_embedding_matrix = np.zeros(
    (len(char.index) + 1, int(CHAR_EMBEDDING_DIM)))
for chars, i in char.index.items():
    embedding_vector = char_embeddings_index.get(chars)
    if embedding_vector is not None:
        char_embedding_matrix[i] = embedding_vector
    else:
        char_notfound.append(chars)

print('%s unique chars not found in embedding.' % len(char_notfound))
"""
Converting word text data to int using index
"""

x_train = DM(train.words, word.index)
x_test = DM(test.words, word.index)

padsize = max([x_train.padsize, x_test.padsize])
x_train.pad(padsize)
print('Padded until %s tokens.' % padsize)

y_train = DM(train.labels, label.index)
y_test = DM(test.labels, label.index)

y_train.pad(padsize)
y_encoded = to_categorical(y_train.padded)
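# Note: to_categorical turns each padded integer label into a one-hot row,
# e.g. [[0, 0, 1, 2]] -> [[[1,0,0], [1,0,0], [0,1,0], [0,0,1]]], so the
# padding index 0 gets a class of its own.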
"""
Converting char text data to int using index
"""
Example 5
    def createModel(self, traindata, valdata, testdata, wordemb, charemb):
        self.train = DL(traindata)
        self.val = DL(valdata)
        self.test = DL(testdata)

        # Load pre-trained embedding
        embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb)
        char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader(
            charemb)

        # Create Word & Label Index
        self.char = DI(self.train.words + ce_words)
        self.word = DI([self.train.words, [we_words]])
        self.label = DI([self.train.labels])
        print('Found', self.word.cnt - 1, 'unique words.')
        print('Found', self.char.cnt - 1, 'unique chars.')
        print('Found', self.label.cnt - 1, 'unique labels.')

        # Create word embedding matrix
        self.EMBEDDING_DIM = len(self.coefs)
        embedding_matrix = np.zeros(
            (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
        for wrd, i in self.word.index.items():
            embedding_vector = embeddings_index.get(wrd)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        # Create char embedding matrix
        char_embedding_matrix = np.zeros(
            (len(self.char.index) + 1, int(self.EMBEDDING_DIM)))
        for chars, i in self.char.index.items():
            embedding_vector = char_embeddings_index.get(chars)
            if embedding_vector is not None:
                char_embedding_matrix[i] = embedding_vector

        trimlen = self.padsize
        self.train.trim(trimlen)
        self.test.trim(trimlen)
        self.val.trim(trimlen)

        self.x_train = DM(self.train.words, self.word.index)
        self.x_test = DM(self.test.words, self.word.index)
        self.x_val = DM(self.val.words, self.word.index)
        print "Number of OOV:", len(self.x_test.oov_index)
        print "OOV word occurences:", self.x_test.oov
        print "Number of OOV (val):", len(self.x_val.oov_index)
        print "OOV word occurences (val):", self.x_val.oov
        padsize = self.padsize
        self.x_train.pad(padsize)
        self.x_test.pad(padsize)
        self.x_val.pad(padsize)
        print('Padded until %s tokens.' % padsize)

        self.y_train = DM(self.train.labels, self.label.index)
        self.y_test = DM(self.test.labels, self.label.index)
        self.y_val = DM(self.val.labels, self.label.index)

        self.y_train.pad(padsize)
        self.y_test.pad(padsize)
        self.y_val.pad(padsize)
        self.y_encoded = to_categorical(self.y_train.padded)
        self.y_val_enc = to_categorical(self.y_val.padded)

        # Converting char text data to int using index
        self.x_test_char = self.convertCharText2Int(self.test)
        self.x_train_char = self.convertCharText2Int(self.train)
        self.x_val_char = self.convertCharText2Int(self.val)

        # Create keras word model
        MAX_SEQUENCE_LENGTH = self.padsize
        embedding_layer = Embedding(len(self.word.index) + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    mask_zero=self.mask)

        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

        embedded_sequences = embedding_layer(sequence_input)
        drop = self.dropout_embedding
        dropout = Dropout(rate=drop)(embedded_sequences)

        # Create keras char model
        def reshape_one(c):
            return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                                 self.char_padsize, self.CHAR_EMBEDDING_DIM))

        def reshape_two(c):
            # Integer division keeps the reshape dimensions integral.
            if merge_m_c == 'concat':
                return K.reshape(c,
                                 (tf.shape(c)[0] // self.padsize, self.padsize,
                                  self.CHAR_EMBEDDING_DIM * 2))
            else:
                return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                     self.padsize, self.CHAR_EMBEDDING_DIM))

        MAX_WORD_LENGTH = self.char_padsize

        embedding_layer_c = Embedding(len(self.char.index) + 1,
                                      self.EMBEDDING_DIM,
                                      weights=[char_embedding_matrix],
                                      input_length=MAX_WORD_LENGTH,
                                      mask_zero=self.mask)

        sequence_input_c = Input(shape=(self.padsize, MAX_WORD_LENGTH),
                                 dtype='int32')
        embedded_sequences_c = embedding_layer_c(sequence_input_c)
        dropout_c = Dropout(rate=drop)(embedded_sequences_c)

        rone = Lambda(reshape_one)(dropout_c)
        merge_m = 'concat'
        merge_m_c = merge_m
        dropout_gru = self.dropout_gru
        rec_dropout = dropout_gru
        gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                         return_sequences=False,
                                         dropout=dropout_gru,
                                         recurrent_dropout=rec_dropout),
                                     merge_mode=merge_m,
                                     weights=None)(rone)

        rtwo = Lambda(reshape_two)(gru_karakter)

        # Combine word + char model
        merge_m = 'concat'
        merge = Concatenate()([dropout, rtwo])
        gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3,
                                     return_sequences=True,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(merge)

        crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)
        self.model = Model(inputs=[sequence_input, sequence_input_c],
                           outputs=[crf])

        optimizer = self.optimizer
        loss = self.loss
        self.model.summary()
        self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])
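A plausible continuation of createModel, not shown in this excerpt, would fit the compiled model on the padded word and character inputs against the one-hot labels; the epoch count and batch size below are assumptions.

        history = self.model.fit(
            [np.array(self.x_train.padded),
             np.array(self.x_train_char)],
            np.array(self.y_encoded),
            validation_data=([np.array(self.x_val.padded),
                              np.array(self.x_val_char)],
                             np.array(self.y_val_enc)),
            epochs=10,      # assumed value
            batch_size=32)  # assumed value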
Example 6
for chars, i in char.index.items():
    embedding_vector = char_embeddings_index.get(chars)
    if embedding_vector is not None:
        char_embedding_matrix[i] = embedding_vector
    else:
        char_notfound.append(chars)

print('%s unique chars not found in embedding.' % len(char_notfound))
"""
Converting text data to int using index
"""
padsize = 188

x_test_tmp1 = []
char_padsize = 0
for sent in train.words:
    x_map = DM(sent, char.index)
    if x_map.padsize > char_padsize:
        char_padsize = x_map.padsize
    x_test_tmp1.append(x_map)

# x_test = DM(test.words[0], char.index)
# char_padsize = max([x_train.char_padsize, x_test.char_padsize])

x_test_tmp2 = []
for sent in x_test_tmp1:
    sent.pad(char_padsize)
    x_test_tmp2.append(sent.padded)

print('Padded until %s chars.' % char_padsize)

zeroes = []