print "List of Activation Functions\n" \ "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \ "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear" choice = input('Enter type of activation function for ' + name + ': ') activations = [ 'softmax', 'elu', 'selu', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear' ] return activations[choice - 1] """ Preparing file """ train = DL('id-ud-train.pos') test = DL('id-ud-dev.pos') """ Load pre-trained word embedding """ embeddings_index = {} # WE_DIR = raw_input('Enter word embedding file name: ') WE_DIR = 'polyglot.vec' print 'Loading', WE_DIR, '...' f = open(WE_DIR, 'r') for line in f: values = line.split() wrd = values[0] coefs = np.asarray(values[1:], dtype='float32')
x_char = []
for sent in x_tmp2:
    padded_sent = sent
    pad = padsize - len(sent)
    for i in range(pad):
        padded_sent = np.vstack((zeroes, padded_sent))
    x_char.append(padded_sent)
print('Padded until %s tokens.' % padsize)
return x_char


""" Preparing file """
train = DL('ner_3_train.ner')
test = DL('ner_3_test.ner')
# val = DL('id-ud-dev.pos')
# train.add('id-ud-dev.pos')

""" Load pre-trained word embedding """
embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
WE_DIR = 'polyglot.vec'
print 'Loading', WE_DIR, '...'
f = open(WE_DIR, 'r')
for line in f:
    values = line.split()
    wrd = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[wrd] = coefs
f.close()
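# A small illustration of the pre-padding loop above, assuming `zeroes` is a
# zero row with the same width as each token's char-index row: shorter
# sentences get zero rows stacked on top until every array is padsize rows.
import numpy as np

padsize = 4
zeroes = np.zeros((1, 3))                      # width 3 is illustrative
sent = np.array([[1., 2., 3.], [4., 5., 6.]])  # 2 tokens, needs 2 pad rows
for _ in range(padsize - len(sent)):
    sent = np.vstack((zeroes, sent))
print(sent.shape)  # (4, 3): zero rows first, original rows last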
def activationPrompt(name):
    print "List of Activation Functions\n" \
          "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \
          "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear"
    choice = int(input('Enter type of activation function for ' + name + ': '))
    activations = ['softmax', 'elu', 'selu', 'softplus', 'softsign',
                   'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
    return activations[choice - 1]


""" Preparing file """
train_name = sys.argv[1]
train = DL(train_name)
# percentage = input('Enter percentage of data to take: ')
percentage = 0.9
seed = int(sys.argv[2])  # numeric seed, matching the commented prompt below
# seed = input('Enter seed for slicing data: ')
train.slice(percentage, seed)
test_name = sys.argv[1]  # test split comes from the same file, sliced disjointly
test = DL(test_name)
test.antislice(percentage, seed)

""" Load pre-trained word embedding """
embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
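# DataLoader's slice/antislice are not shown in this excerpt. A plausible
# reading, given the (percentage, seed) signature, is a seeded complementary
# split: slice() keeps the first `percentage` of a shuffled corpus and
# antislice() keeps the remainder, so the same seed yields disjoint
# train/test sets. A standalone sketch of that assumed behavior:
import random

def slice_ids(n, percentage, seed):
    ids = list(range(n))
    random.Random(seed).shuffle(ids)
    return ids[:int(n * percentage)]

def antislice_ids(n, percentage, seed):
    ids = list(range(n))
    random.Random(seed).shuffle(ids)
    return ids[int(n * percentage):]

assert set(slice_ids(100, 0.9, 7)).isdisjoint(antislice_ids(100, 0.9, 7))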
from DataProcessor import DataLoader as DL
from DataProcessor import DataMapper as DM
# DI is used below for indexing; assumed to live in DataProcessor as well
from DataProcessor import DataIndexer as DI
import numpy as np

from keras import Model
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import Input
from keras.utils import plot_model
from keras.utils import to_categorical
from keras_contrib.layers import CRF

""" Preparing file """
train = DL('id-ud-train')
test = DL('id-ud-test')

""" Create Word & Label Index """
word = DI([train.words, test.words])
label = DI([train.labels])  # training label and testing label should be the same
print 'Found', word.cnt - 1, 'unique words.'
print 'Found', label.cnt - 1, 'unique labels.'

""" Load pre-trained embedding """
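# The DI indexer itself is not shown. The prints above (`cnt - 1` unique
# items) and the `len(word.index) + 1` embedding rows used elsewhere suggest
# it maps each distinct token to an integer starting at 1, keeping 0 free for
# padding. A minimal sketch of that assumed behavior:
class SimpleIndexer(object):
    def __init__(self, groups):
        self.index = {}
        self.cnt = 1  # 0 is reserved for the padding token
        for group in groups:          # e.g. [train.words, test.words]
            for sent in group:        # each group is a list of sentences
                for token in sent:    # each sentence is a list of tokens
                    if token not in self.index:
                        self.index[token] = self.cnt
                        self.cnt += 1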
def createModel(self, traindata, valdata, testdata, wordemb, charemb):
    self.train = DL(traindata)
    self.val = DL(valdata)
    self.test = DL(testdata)

    # Load pre-trained embedding
    embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb)
    char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader(charemb)

    # Create Word & Label Index
    self.char = DI(self.train.words + ce_words)
    self.word = DI([self.train.words, [we_words]])
    self.label = DI([self.train.labels])
    print 'Found', self.word.cnt - 1, 'unique words.'
    print 'Found', self.char.cnt - 1, 'unique chars.'
    print 'Found', self.label.cnt - 1, 'unique labels.'

    # Create word embedding matrix
    self.EMBEDDING_DIM = len(self.coefs)
    embedding_matrix = np.zeros(
        (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
    for wrd, i in self.word.index.items():
        embedding_vector = embeddings_index.get(wrd)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Create char embedding matrix
    char_embedding_matrix = np.zeros(
        (len(self.char.index) + 1, int(self.EMBEDDING_DIM)))
    for chars, i in self.char.index.items():
        embedding_vector = char_embeddings_index.get(chars)
        if embedding_vector is not None:
            char_embedding_matrix[i] = embedding_vector

    # Trim sentences, then map words to indices
    trimlen = self.padsize
    self.train.trim(trimlen)
    self.test.trim(trimlen)
    self.val.trim(trimlen)
    self.x_train = DM(self.train.words, self.word.index)
    self.x_test = DM(self.test.words, self.word.index)
    self.x_val = DM(self.val.words, self.word.index)
    print "Number of OOV:", len(self.x_test.oov_index)
    print "OOV word occurrences:", self.x_test.oov
    print "Number of OOV (val):", len(self.x_val.oov_index)
    print "OOV word occurrences (val):", self.x_val.oov

    padsize = self.padsize
    self.x_train.pad(padsize)
    self.x_test.pad(padsize)
    self.x_val.pad(padsize)
    print('Padded until %s tokens.' % padsize)
    self.y_train = DM(self.train.labels, self.label.index)
    self.y_test = DM(self.test.labels, self.label.index)
    self.y_val = DM(self.val.labels, self.label.index)
    self.y_train.pad(padsize)
    self.y_test.pad(padsize)
    self.y_val.pad(padsize)
    self.y_encoded = to_categorical(self.y_train.padded)
    self.y_val_enc = to_categorical(self.y_val.padded)

    # Converting char text data to int using index
    self.x_test_char = self.convertCharText2Int(self.test)
    self.x_train_char = self.convertCharText2Int(self.train)
    self.x_val_char = self.convertCharText2Int(self.val)

    # Create keras word model
    MAX_SEQUENCE_LENGTH = self.padsize
    embedding_layer = Embedding(len(self.word.index) + 1,
                                self.EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                mask_zero=self.mask)
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    drop = self.dropout_embedding
    dropout = Dropout(rate=drop)(embedded_sequences)

    # Create keras char model
    merge_m = 'concat'
    merge_m_c = merge_m  # defined before the reshape closures are traced

    def reshape_one(c):
        # flatten (batch, padsize, char_padsize, dim) so the char GRU sees
        # one word per "sequence"
        return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                             self.char_padsize, self.CHAR_EMBEDDING_DIM))

    def reshape_two(c):
        # restore the sentence dimension after the char-level GRU;
        # a bidirectional concat doubles the feature width
        if merge_m_c == 'concat':
            return K.reshape(c, (tf.shape(c)[0] // self.padsize, self.padsize,
                                 self.CHAR_EMBEDDING_DIM * 2))
        else:
            return K.reshape(c, (tf.shape(c)[0] // self.padsize, self.padsize,
                                 self.CHAR_EMBEDDING_DIM))

    MAX_WORD_LENGTH = self.char_padsize
    embedding_layer_c = Embedding(len(self.char.index) + 1,
                                  self.EMBEDDING_DIM,
                                  weights=[char_embedding_matrix],
                                  input_length=MAX_WORD_LENGTH,
                                  mask_zero=self.mask)
    sequence_input_c = Input(shape=(self.padsize, MAX_WORD_LENGTH),
                             dtype='int32')
    embedded_sequences_c = embedding_layer_c(sequence_input_c)
    dropout_c = Dropout(rate=drop)(embedded_sequences_c)
    rone = Lambda(reshape_one)(dropout_c)
    dropout_gru = self.dropout_gru
    rec_dropout = dropout_gru
    gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                     return_sequences=False,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(rone)
    rtwo = Lambda(reshape_two)(gru_karakter)

    # Combine word + char model
    merge = Concatenate()([dropout, rtwo])
    gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3,
                                 return_sequences=True,
                                 dropout=dropout_gru,
                                 recurrent_dropout=rec_dropout),
                             merge_mode=merge_m,
                             weights=None)(merge)
    crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)
    self.model = Model(inputs=[sequence_input, sequence_input_c],
                       outputs=[crf])
    self.model.summary()
    self.model.compile(loss=self.loss,
                       optimizer=self.optimizer,
                       metrics=['acc'])
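# A hedged usage sketch. The wrapper class and file names below are
# assumptions for illustration, not taken from this file. With padsize=P and
# char_padsize=C, the compiled model expects word input of shape (batch, P)
# and char input of shape (batch, P, C), matching x_train.padded and
# x_train_char as built above.
#
# tagger = Tagger()  # hypothetical class holding createModel
# tagger.createModel('train.ner', 'val.ner', 'test.ner',
#                    'polyglot.vec', 'polyglot-char.vec')
# tagger.model.fit([np.array(tagger.x_train.padded),
#                   np.array(tagger.x_train_char)],
#                  [tagger.y_encoded],
#                  validation_data=([np.array(tagger.x_val.padded),
#                                    np.array(tagger.x_val_char)],
#                                   [tagger.y_val_enc]),
#                  epochs=2, batch_size=8)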
from DataProcessor import DataLoader as DL
from DataProcessor import DataMapper as DM
# DI is used below; assumed to come from DataProcessor like DL and DM
from DataProcessor import DataIndexer as DI
import numpy as np

from keras import Model
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import Input
from keras.utils import plot_model
from keras.utils import to_categorical
from keras_contrib.layers import CRF

""" Preparing file """
train = DL('id-ud-train.pos')
test = DL('id-ud-test.pos')

""" Create Word & Label Index """
word = DI([train.words, test.words])
label = DI([train.labels])  # training label and testing label should be the same
print 'Found', word.cnt - 1, 'unique words.'
print 'Found', label.cnt - 1, 'unique labels.'

""" Load pre-trained embedding """
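# The loading step that follows this header mirrors the pattern used in
# createModel above: read the .vec file into embeddings_index, then build a
# (vocab + 1) x dim matrix whose row i holds the vector for the word with
# index i, leaving row 0 (padding) and OOV rows as zeros. A sketch, with
# EMBEDDING_DIM assumed to match the .vec file's dimensionality:
embedding_matrix = np.zeros((len(word.index) + 1, EMBEDDING_DIM))
for wrd, i in word.index.items():
    embedding_vector = embeddings_index.get(wrd)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector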
for sent in x_tmp2:
    padded_sent = sent
    pad = padsize - len(sent)
    for i in range(pad):
        padded_sent = np.vstack((zeroes, padded_sent))
    x_char.append(padded_sent)
print('Padded until %s tokens.' % padsize)
return x_char


""" Preparing file """
percentage = 0.9
seed = int(sys.argv[1])
train = DL('ner_3_train.ner')
train.slice(percentage, seed)
test = DL('ner_3_train.ner')
test.antislice(percentage, seed)
val = DL('ner_3_train.ner')  # val draws the same held-out slice as test here
val.antislice(percentage, seed)
# train.add('id-ud-dev.pos')

""" Load pre-trained word embedding """
embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
WE_DIR = 'polyglot.vec'