    # (continued) body of the loop over lines of the char-embedding file
    chars = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    char_embeddings_index[chars] = coefs
f.close()
print('Found %s char vectors.' % len(char_embeddings_index))

ce_words = []
for c in char_embeddings_index:  # renamed from `chr` to avoid shadowing the builtin
    ce_words.append(c)

""" Create Word & Label Index """
char = DI(train.words + ce_words)
char.save('char')
word = DI([train.words, [we_words]])
word.save('word')
label = DI([train.labels])  # training and testing labels should be the same

print 'Found', word.cnt - 1, 'unique words.'
print 'Found', char.cnt - 1, 'unique chars.'
print 'Found', label.cnt - 1, 'unique labels.'

""" Create word embedding matrix """
EMBEDDING_DIM = len(coefs)
    # (continued) body of the loop over lines of the char-embedding file
    char_embeddings_index[chars] = coefs
f.close()
print('Found %s char vectors.' % len(char_embeddings_index))

ce_words = []
for c in char_embeddings_index:  # renamed from `chr` to avoid shadowing the builtin
    ce_words.append(c)

""" Create Word & Label Index """
# char = DI(train.words + ce_words)
char = DI()
char.load('char')
# word = DI([train.words, [we_words]])
word = DI()
word.load('word')
label = DI([train.labels])  # training and testing labels should be the same

print 'Found', word.cnt - 1, 'unique words.'
print 'Found', char.cnt - 1, 'unique chars.'
print 'Found', label.cnt - 1, 'unique labels.'

word.add([train.words])
print 'Found', word.cnt - 1, 'unique words.'

""" Create word embedding matrix """
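# The matrix construction itself is truncated above; what follows is a
# minimal sketch of the usual fill pattern, assuming a dict
# `word_embeddings_index` (word -> vector; hypothetical name) built the
# same way as `char_embeddings_index`, and EMBEDDING_DIM = len(coefs) as
# in the first variant of this script. Rows are addressed by the
# DataIndexer indices; words without a pretrained vector keep the zero row.
embedding_matrix = np.zeros((len(word.index) + 1, EMBEDDING_DIM))
for w, i in word.index.items():
    vector = word_embeddings_index.get(w)
    if vector is not None:
        embedding_matrix[i] = vector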
import csv
import json
import pickle
from datetime import datetime

import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.layers import Input, Embedding, Dropout, Lambda, Bidirectional, GRU
from keras.models import Model
from keras_contrib.layers import CRF

# DataIndexer is imported as DI elsewhere in this project (see the export
# script below). DP (text preprocessor) and DM (index mapper) are assumed
# to come from the same DataProcessor module, e.g.:
# from DataProcessor import DataPreprocessor as DP, DataMapper as DM
from DataProcessor import DataIndexer as DI


class NERTagger:
    mask = True  # whether to mask padding (zeros)
    EMBEDDING_DIM = 64
    CHAR_EMBEDDING_DIM = 64
    padsize = 188       # maximum sentence length, in tokens
    char_padsize = 25   # maximum word length, in characters

    def __init__(self):
        self.textinput = ''
        self.test = ''
        self.x_test = ''
        self.x_test_char = ''
        self.results = []
        self.data = {}
        self.json_data = {}
        self.char = DI()
        self.char.load('char')
        self.word = DI()
        self.word.load('word.ner')
        self.label = DI()
        self.label.load('label.ner')
        print 'Found', self.word.cnt - 1, 'unique words.'
        print 'Found', self.char.cnt - 1, 'unique chars.'
        print 'Found', self.label.cnt - 1, 'unique labels.'

        # Placeholder embedding matrices; the trained weights are restored
        # from the pickled per-layer files below.
        embedding_matrix = np.zeros(
            (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
        char_embedding_matrix = np.zeros(
            (len(self.char.index) + 1, int(self.CHAR_EMBEDDING_DIM)))

        """ Create keras word model """
        MAX_SEQUENCE_LENGTH = self.padsize
        embedding_layer = Embedding(len(self.word.index) + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    mask_zero=self.mask)
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        drop = 0.4
        dropout = Dropout(rate=drop)(embedded_sequences)

        """ Create keras char model """
        def reshape_one(c):
            # Flatten (batch, padsize, char_padsize, dim) so that every word
            # becomes a separate sequence for the character-level GRU.
            return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                                 self.char_padsize,
                                 self.CHAR_EMBEDDING_DIM))

        def reshape_two(c):
            # Restore the (batch, padsize, dim) shape after the char GRU;
            # 'concat' merge doubles the feature dimension.
            if merge_m_c == 'concat':
                return K.reshape(c, (tf.shape(c)[0] / self.padsize,
                                     self.padsize,
                                     self.CHAR_EMBEDDING_DIM * 2))
            else:
                return K.reshape(c, (tf.shape(c)[0] / self.padsize,
                                     self.padsize,
                                     self.CHAR_EMBEDDING_DIM))

        MAX_WORD_LENGTH = self.char_padsize
        # embeddingPrompt('char')
        embedding_layer_c = Embedding(len(self.char.index) + 1,
                                      self.CHAR_EMBEDDING_DIM,
                                      weights=[char_embedding_matrix],
                                      input_length=MAX_WORD_LENGTH,
                                      mask_zero=self.mask)
        sequence_input_c = Input(shape=(self.padsize, MAX_WORD_LENGTH,),
                                 dtype='int32')
        embedded_sequences_c = embedding_layer_c(sequence_input_c)
        dropout_c = Dropout(rate=drop)(embedded_sequences_c)
        rone = Lambda(reshape_one)(dropout_c)

        merge_m = 'concat'
        merge_m_c = merge_m
        dropout_gru = 0.5
        rec_dropout = dropout_gru
        gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                         return_sequences=False,
                                         dropout=dropout_gru,
                                         recurrent_dropout=rec_dropout),
                                     merge_mode=merge_m,
                                     weights=None)(rone)
        rtwo = Lambda(reshape_two)(gru_karakter)

        """ Combine word + char model """
        merge_m = 'concat'
        gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 2,
                                     return_sequences=True,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(rtwo)
        crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)

        self.model = Model(inputs=[sequence_input, sequence_input_c],
                           outputs=[crf])
        optimizer = 'adagrad'
        loss = 'poisson'
        self.model.summary()
        self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])

        # Restore the trained weights, one pickled file per layer.
        self.w_name = '06-05_17:19_658'
        m_layers_len = len(self.model.layers)
        for i in range(m_layers_len):
            with open(self.w_name + "_" + str(i) + ".wgt", "rb") as fp:
                w = pickle.load(fp)
            self.model.layers[i].set_weights(w)

    def predict(self, text):
        self.textinput = text
        self.test = DP(text)
        self.x_test = DM(self.test.words, self.word.index)
        print "Number of OOV:", len(self.x_test.oov_index)
        print "OOV word occurrences:", self.x_test.oov
        self.x_test.pad(self.padsize)
        print('Padded until %s tokens.' % self.padsize)
        self.x_test_char = self.convertCharText2Int(self.test)
        self.results = []
        print "Computing..."
        print self.x_test.padded
        print self.x_test_char
        raw_results = self.model.predict(
            [np.array(self.x_test.padded), np.array(self.x_test_char)])
        for raw_result in raw_results:
            result = []
            for token in raw_result:
                value = np.argmax(token)
                result.append(value)
            self.results.append(result)

        # Map predicted indices back to label strings, skipping leading pads.
        temp = self.results[0]
        li = self.label.index
        keys = li.keys()
        values = li.values()
        self.results = []
        start = False
        for token in temp:
            if token != 0:
                start = True
            if start:
                if token == 0:
                    self.results.append('?')
                else:
                    self.results.append(keys[values.index(token)])
        print self.test.words[0]
        print self.results
        self.data = {'words': self.test.words[0], 'labels': self.results}
        self.json_data = json.dumps(self.data)
        return self.json_data

    def log(self):
        self.textoutput = ''
        for token in self.results:
            self.textoutput = self.textoutput + token + ' '
        rnow = datetime.now()
        logcsv = open('log.csv', 'a')
        writer = csv.writer(logcsv, delimiter=',')
        writer.writerow([
            'no', str(rnow.date()), str(rnow.time())[:-10], self.w_name,
            self.word.cnt - 1, self.char.cnt - 1, self.textinput,
            len(self.x_test.oov_index), self.textoutput
        ])
        logcsv.close()

    def convertCharText2Int(self, dataload):
        # Map every sentence to per-character indices, then pad twice:
        # each word to char_padsize chars, then each sentence to padsize words.
        x_tmp1 = []
        for sent in dataload.words:
            x_map = DM(sent, self.char.index, False)
            if x_map.padsize > self.char_padsize:
                self.char_padsize = x_map.padsize
            x_tmp1.append(x_map)

        x_tmp2 = []
        for sent in x_tmp1:
            sent.pad(self.char_padsize)
            x_tmp2.append(sent.padded)
        print('Padded until %s chars.' % self.char_padsize)

        zeroes = []
        for i in range(self.char_padsize):
            zeroes.append(0)

        x_char = []
        for sent in x_tmp2:
            padded_sent = sent
            pad = self.padsize - len(sent)
            for i in range(pad):
                padded_sent = np.vstack((zeroes, padded_sent))
            x_char.append(padded_sent)
        print('Padded until %s tokens.' % self.padsize)
        return x_char
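# A minimal usage sketch, assuming the index files ('char', 'word.ner',
# 'label.ner') and the pickled per-layer weight files referenced above are
# present on disk. The input sentence is a hypothetical example.
if __name__ == '__main__':
    tagger = NERTagger()
    out = tagger.predict('Budi pergi ke Jakarta')
    print out  # JSON string: {"words": [...], "labels": [...]}
    tagger.log()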
import pickle
from DataProcessor import DataIndexer as DI

w_name = '05-17_22:39_736'
fp = open(w_name + "_1.wgt", "rb")
fout = open(w_name + "-char.vec", "w")

w = pickle.load(fp)
w = w[0]  # the embedding layer's single weight matrix

idx = DI()
idx.load('char')
ii = idx.index
keys = ii.keys()
values = ii.values()

# Write one line per character: the character followed by its vector
# components, skipping row 0 (the padding entry).
for i, char in enumerate(w):
    if i != 0:
        if i < idx.cnt:
            print i
            c = keys[values.index(i)]
            try:
                c.decode('utf-8')
                fout.write(c)
                for vec in char:
                    fout.write(' ' + str(vec))
                fout.write('\n')
            except UnicodeError:
                print "char is not UTF-8"

fout.close()
fp.close()
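# Sanity check: the exported .vec file uses the same word2vec-style text
# layout that the char-embedding reader at the top of this section consumes
# (token, then space-separated floats), so it can be read straight back.
# A minimal sketch:
import numpy as np

check = {}
f = open(w_name + "-char.vec", "r")
for line in f:
    values = line.split()
    check[values[0]] = np.asarray(values[1:], dtype='float32')
f.close()
print('Found %s char vectors.' % len(check))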