Example #1
    print "List of Activation Functions\n" \
          "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \
          "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear"
    choice = input('Enter type of activation function for ' + name + ': ')  # Python 2 input() evaluates the entry, so a number arrives as an int
    activations = [
        'softmax', 'elu', 'selu', 'softplus', 'softsign', 'relu', 'tanh',
        'sigmoid', 'hard_sigmoid', 'linear'
    ]
    return activations[choice - 1]  # the menu is 1-based, the list is 0-based


"""
Preparing file
"""

train = DL('id-ud-train.pos')
test = DL('id-ud-dev.pos')
"""
Load pre-trained word embedding
"""

embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
WE_DIR = 'polyglot.vec'

print 'Loading', WE_DIR, '...'
f = open(WE_DIR, 'r')
for line in f:
    values = line.split()
    wrd = values[0]                                  # first field is the word
    coefs = np.asarray(values[1:], dtype='float32')  # remaining fields are its vector
    embeddings_index[wrd] = coefs
f.close()
Example #2
    x_char = []
    for sent in x_tmp2:
        padded_sent = sent
        pad = padsize - len(sent)  # number of zero rows needed to reach padsize
        for i in range(pad):
            # prepend zero vectors so the real tokens stay right-aligned
            padded_sent = np.vstack((zeroes, padded_sent))
        x_char.append(padded_sent)
    print('Padded until %s tokens.' % padsize)
    return x_char
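
The same front-padding for plain integer id sequences is what Keras's built-in helper does by default; a tiny illustration (the values are made up):

from keras.preprocessing.sequence import pad_sequences

x_ids = pad_sequences([[4, 7], [9]], maxlen=5)  # padding='pre' is the default
# -> array([[0, 0, 0, 4, 7],
#           [0, 0, 0, 0, 9]], dtype=int32)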


"""
Preparing file
"""

train = DL('ner_3_train.ner')
test = DL('ner_3_test.ner')
# val = DL('id-ud-dev.pos')
# train.add('id-ud-dev.pos')
"""
Load pre-trained word embedding
"""

embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
WE_DIR = 'polyglot.vec'

print 'Loading', WE_DIR, '...'
f = open(WE_DIR, 'r')
for line in f:
    values = line.split()
Example #3
def activationPrompt(name):
    print "List of Activation Functions\n" \
          "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \
          "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear"
    choice = input('Enter type of activation function for ' + name + ': ')
    activations = ['softmax', 'elu', 'selu', 'softplus', 'softsign',
                   'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
    return activations[choice - 1]
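
A quick usage sketch (the layer name and the Dense layer below are illustrative, not part of the original script):

act = activationPrompt('hidden layer')  # entering 6 selects 'relu'
layer = Dense(64, activation=act)       # assumes keras.layers.Dense is imported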


"""
Preparing file
"""
train_name = sys.argv[1]
train = DL(train_name)
# percentage = input('Enter percentage of data to take: ')
percentage = 0.9
seed = sys.argv[2]
# seed = input('Enter seed for slicing data: ')
train.slice(percentage, seed)     # keep 90% of the sentences for training
test_name = sys.argv[1]           # same file: the test set is the complementary slice
test = DL(test_name)
test.antislice(percentage, seed)  # the remaining 10%, selected with the same seed
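
DL.slice and DL.antislice are not shown in this excerpt; a minimal sketch of how such complementary, seed-driven splits are usually implemented (illustrative only, not the actual DataProcessor code):

import random

def split_indices(n, percentage, seed):
    idx = list(range(n))
    random.Random(seed).shuffle(idx)  # deterministic shuffle: same seed, same order
    cut = int(n * percentage)
    return idx[:cut], idx[cut:]       # "slice" part, "antislice" part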

"""
Load pre-trained word embedding
"""

embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
Example #4
from DataProcessor import DataLoader as DL
from DataProcessor import DataMapper as DM
# DI (the word/label indexer used below) is presumably imported from DataProcessor
# as well; its import line is missing from this excerpt
from keras import Model
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import Input
from keras.utils import plot_model
from keras.utils import to_categorical
from keras_contrib.layers import CRF
"""
Preparing file
"""

train = DL('id-ud-train')
test = DL('id-ud-test')
"""
Create Word & Label Index
"""

word = DI([train.words, test.words])
label = DI([train.labels])  # training and testing labels should be the same

print 'Found', word.cnt - 1, 'unique words.'
print 'Found', label.cnt - 1, 'unique labels.'
"""
Load pre-trained embedding
"""
Example #5
    def createModel(self, traindata, valdata, testdata, wordemb, charemb):
        self.train = DL(traindata)
        self.val = DL(valdata)
        self.test = DL(testdata)

        # Load pre-trained embedding
        embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb)
        char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader(
            charemb)

        # Create Word & Label Index
        self.char = DI(self.train.words + ce_words)
        self.word = DI([self.train.words, [we_words]])
        self.label = DI([self.train.labels])
        print 'Found', self.word.cnt - 1, 'unique words.'
        print 'Found', self.char.cnt - 1, 'unique chars.'
        print 'Found', self.label.cnt - 1, 'unique labels.'

        # Create word embedding matrix
        self.EMBEDDING_DIM = len(self.coefs)  # dimension inferred from a loaded vector
        # +1 because index 0 is reserved for padding; OOV rows stay all-zero
        embedding_matrix = np.zeros(
            (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
        for wrd, i in self.word.index.items():
            embedding_vector = embeddings_index.get(wrd)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        # Create char embedding matrix
        char_embedding_matrix = np.zeros(
            (len(self.char.index) + 1, int(self.EMBEDDING_DIM)))
        for chars, i in self.char.index.items():
            embedding_vector = char_embeddings_index.get(chars)
            if embedding_vector is not None:
                char_embedding_matrix[i] = embedding_vector

        trimlen = self.padsize
        self.train.trim(trimlen)
        self.test.trim(trimlen)
        self.val.trim(trimlen)

        self.x_train = DM(self.train.words, self.word.index)
        self.x_test = DM(self.test.words, self.word.index)
        self.x_val = DM(self.val.words, self.word.index)
        print "Number of OOV:", len(self.x_test.oov_index)
        print "OOV word occurences:", self.x_test.oov
        print "Number of OOV (val):", len(self.x_val.oov_index)
        print "OOV word occurences (val):", self.x_val.oov
        padsize = self.padsize
        self.x_train.pad(padsize)
        self.x_test.pad(padsize)
        self.x_val.pad(padsize)
        print('Padded until %s tokens.' % padsize)

        self.y_train = DM(self.train.labels, self.label.index)
        self.y_test = DM(self.test.labels, self.label.index)
        self.y_val = DM(self.val.labels, self.label.index)

        self.y_train.pad(padsize)
        self.y_test.pad(padsize)
        self.y_val.pad(padsize)
        self.y_encoded = to_categorical(self.y_train.padded)  # one-hot labels for training
        self.y_val_enc = to_categorical(self.y_val.padded)

        # Converting char text data to int using index
        self.x_test_char = self.convertCharText2Int(self.test)
        self.x_train_char = self.convertCharText2Int(self.train)
        self.x_val_char = self.convertCharText2Int(self.val)

        # Create keras word model
        MAX_SEQUENCE_LENGTH = self.padsize
        embedding_layer = Embedding(len(self.word.index) + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    mask_zero=self.mask)

        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

        embedded_sequences = embedding_layer(sequence_input)
        drop = self.dropout_embedding
        dropout = Dropout(rate=drop)(embedded_sequences)

        # Create keras char model
        # Flatten (batch, padsize, char_padsize, dim) so each word's characters
        # form one sequence for the char-level GRU
        def reshape_one(c):
            return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                                 self.char_padsize, self.CHAR_EMBEDDING_DIM))

        # Restore the (batch, padsize, features) shape after the char GRU;
        # merge_m_c is assigned below, before this Lambda is actually called
        def reshape_two(c):
            if merge_m_c == 'concat':  # a bidirectional concat doubles the feature dim
                return K.reshape(c,
                                 (tf.shape(c)[0] // self.padsize, self.padsize,
                                  self.CHAR_EMBEDDING_DIM * 2))
            else:
                return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                     self.padsize, self.CHAR_EMBEDDING_DIM))

        MAX_WORD_LENGTH = self.char_padsize

        embedding_layer_c = Embedding(len(self.char.index) + 1,
                                      self.EMBEDDING_DIM,
                                      weights=[char_embedding_matrix],
                                      input_length=MAX_WORD_LENGTH,
                                      mask_zero=self.mask)

        sequence_input_c = Input(shape=(
            self.padsize,
            MAX_WORD_LENGTH,
        ),
                                 dtype='int32')
        embedded_sequences_c = embedding_layer_c(sequence_input_c)
        dropout_c = Dropout(rate=drop)(embedded_sequences_c)

        rone = Lambda(reshape_one)(dropout_c)  # each word's characters become one sequence
        merge_m = 'concat'
        merge_m_c = merge_m
        dropout_gru = self.dropout_gru
        rec_dropout = dropout_gru
        gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                         return_sequences=False,
                                         dropout=dropout_gru,
                                         recurrent_dropout=rec_dropout),
                                     merge_mode=merge_m,
                                     weights=None)(rone)

        rtwo = Lambda(reshape_two)(gru_karakter)  # back to (batch, padsize, features)

        # Combine word + char model
        merge_m = 'concat'
        merge = Concatenate()([dropout, rtwo])
        gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3,
                                     return_sequences=True,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(merge)

        crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)
        self.model = Model(inputs=[sequence_input, sequence_input_c],
                           outputs=[crf])

        optimizer = self.optimizer
        loss = self.loss
        self.model.summary()
        self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])
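
A plausible follow-up for training the compiled model (not part of this excerpt; the attribute names follow the padding code above, and the char inputs are assumed to need conversion to dense arrays):

        self.model.fit([self.x_train.padded, np.array(self.x_train_char)],
                       self.y_encoded,
                       validation_data=([self.x_val.padded, np.array(self.x_val_char)],
                                        self.y_val_enc),
                       epochs=10, batch_size=32)  # epoch/batch values are placeholders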
Example #6
from DataProcessor import DataLoader as DL
from DataProcessor import DataMapper as DM
from keras import Model
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import Input
from keras.utils import plot_model
from keras.utils import to_categorical
from keras_contrib.layers import CRF
"""
Preparing file
"""

train = DL('id-ud-train.pos')
test = DL('id-ud-test.pos')
"""
Create Word & Label Index
"""

word = DI([train.words, test.words])
label = DI([train.labels])  # training and testing labels should be the same

print 'Found', word.cnt - 1, 'unique words.'
print 'Found', label.cnt - 1, 'unique labels.'
"""
Load pre-trained embedding
"""
Example #7
        padded_sent = sent
        pad = padsize - len(sent)
        for i in range(pad):
            padded_sent = np.vstack((zeroes, padded_sent))
        x_char.append(padded_sent)
    print('Padded until %s tokens.' % padsize)
    return x_char


"""
Preparing file
"""

percentage = 0.9
seed = sys.argv[1]
train = DL('ner_3_train.ner')
train.slice(percentage, seed)     # 90% of the file for training
test = DL('ner_3_train.ner')
test.antislice(percentage, seed)  # the complementary 10%
val = DL('ner_3_train.ner')
val.antislice(percentage, seed)   # same antislice, so val and test hold identical data
# train.add('id-ud-dev.pos')

"""
Load pre-trained word embedding
"""

embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
WE_DIR = 'polyglot.vec'