def test_one_layer(self):
    bi_lm = BiLM(token_num=101, rnn_layer_num=1, rnn_units=50, rnn_type='gru')
    input_layer, output_layer = bi_lm.get_feature_layers()
    self.assertEqual((None, None), tuple(input_layer.shape))
    self.assertEqual((None, None, 100), tuple(output_layer.shape))
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.summary()
def test_bidirectional(self):
    bi_lm = BiLM(token_num=102,
                 rnn_layer_num=1,
                 rnn_units=50,
                 rnn_type='gru',
                 use_bidirectional=True)
    input_layer, output_layer = bi_lm.get_feature_layers()
    self.assertEqual((None, None), input_layer._keras_shape)
    self.assertEqual((None, None, 200), output_layer._keras_shape)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.summary()
def test_weighted_sum(self):
    bi_lm = BiLM(token_num=107,
                 embedding_dim=108,
                 rnn_layer_num=6,
                 rnn_keep_num=7,
                 rnn_units=108,
                 rnn_type='lstm',
                 use_normalization=True)
    input_layer, output_layer = bi_lm.get_feature_layers(use_weighted_sum=True)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.summary()
def test_get_batch(self):
    sentences = [
        ['All', 'work', 'and', 'no', 'play'],
        ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
    ]
    token_dict = {
        'all': 3, 'work': 4, 'and': 5, 'no': 6, 'play': 7,
        'makes': 8, 'a': 9, 'dull': 10, 'boy': 11, '.': 12,
    }
    inputs, outputs = BiLM.get_batch(sentences, token_dict, ignore_case=False)
    expect = [
        [1, 4, 5, 6, 7, 0],
        [8, 1, 9, 10, 11, 12],
    ]
    self.assertEqual(expect, inputs.tolist())
    expect = [
        [[4], [5], [6], [7], [2], [0]],
        [[1], [9], [10], [11], [12], [2]],
    ]
    self.assertEqual(expect, outputs[0].tolist())
    expect = [
        [[2], [1], [4], [5], [6], [0]],
        [[2], [8], [1], [9], [10], [11]],
    ]
    self.assertEqual(expect, outputs[1].tolist())
    inputs, outputs = BiLM.get_batch(sentences, token_dict, ignore_case=True)
    expect = [
        [3, 4, 5, 6, 7, 0],
        [8, 1, 9, 10, 11, 12],
    ]
    self.assertEqual(expect, inputs.tolist())
    expect = [
        [[4], [5], [6], [7], [2], [0]],
        [[1], [9], [10], [11], [12], [2]],
    ]
    self.assertEqual(expect, outputs[0].tolist())
    expect = [
        [[2], [3], [4], [5], [6], [0]],
        [[2], [8], [1], [9], [10], [11]],
    ]
    self.assertEqual(expect, outputs[1].tolist())
def test_input_layer(self):
    input_layer = keras.layers.Input((None,), name='New-Input')
    bi_lm = BiLM(token_num=104, rnn_layer_num=6, rnn_keep_num=3, rnn_units=50, rnn_type='gru')
    output_layer = bi_lm.get_feature_layers(input_layer=input_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.summary()
    for layer in bi_lm.model.layers:
        try:
            new_layer = model.get_layer(name=layer.name)
            self.assertEqual(layer.get_weights(), new_layer.get_weights())
        except ValueError:
            pass
def train_batch_generator(batch_size=32, training=True):
    batch_size //= 2
    while True:
        sentences = []
        if training:
            batch_pos = random.sample(train_pos_files, batch_size)
            batch_neg = random.sample(train_neg_files, batch_size)
        else:
            batch_pos = random.sample(val_pos_files, batch_size)
            batch_neg = random.sample(val_neg_files, batch_size)
        for file_name in batch_pos:
            with codecs.open(os.path.join(TRAIN_ROOT, 'pos', file_name), 'r', 'utf8') as reader:
                text = reader.read().strip()
                sentences.append(get_word_list_eng(text))
        for file_name in batch_neg:
            with codecs.open(os.path.join(TRAIN_ROOT, 'neg', file_name), 'r', 'utf8') as reader:
                text = reader.read().strip()
                sentences.append(get_word_list_eng(text))
        word_input, _ = BiLM.get_batch(
            sentences=sentences,
            token_dict=word_dict,
            ignore_case=True,
        )
        yield word_input, keras.utils.to_categorical([1] * batch_size + [0] * batch_size)
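# Hedged usage sketch (not part of the original script): one plausible way to train a
# classifier on top of the BiLM features with the generator above. `bi_lm` is assumed to
# be an already trained BiLM instance; the step and epoch counts are illustrative only.
input_layer, feature_layer = bi_lm.get_feature_layers()
lstm_layer = keras.layers.Bidirectional(keras.layers.LSTM(units=50), name='Bi-LSTM')(feature_layer)
output_layer = keras.layers.Dense(units=2, activation='softmax', name='Softmax')(lstm_layer)
model = keras.models.Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(
    generator=train_batch_generator(batch_size=32, training=True),
    steps_per_epoch=100,
    validation_data=train_batch_generator(batch_size=32, training=False),
    validation_steps=10,
    epochs=10,
)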
def test_multiple_layers(self):
    bi_lm = BiLM(token_num=103, rnn_layer_num=6, rnn_keep_num=3, rnn_units=50, rnn_type='lstm')
    input_layer, output_layer = bi_lm.get_feature_layers()
    self.assertEqual((None, None), tuple(input_layer.shape))
    self.assertEqual((None, None, 300), tuple(output_layer.shape))
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.summary()
    for layer in bi_lm.model.layers:
        try:
            new_layer = model.get_layer(name=layer.name)
            self.assertEqual(layer.get_weights(), new_layer.get_weights())
        except ValueError:
            pass
def test_no_embedding(self):
    input_layer = keras.layers.Input((None, 106), name='New-Input')
    bi_lm = BiLM(token_num=105,
                 has_embedding=False,
                 embedding_dim=106,
                 rnn_layer_num=6,
                 rnn_keep_num=1,
                 rnn_units=50,
                 rnn_type='lstm')
    output_layer = bi_lm.get_feature_layers(input_layer=input_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.summary()
    for layer in bi_lm.model.layers:
        try:
            new_layer = model.get_layer(name=layer.name)
            self.assertEqual(layer.get_weights(), new_layer.get_weights())
        except ValueError:
            pass
def train_lm_generator(batch_size=32):
    while True:
        index = 0
        while index * batch_size < len(sentences):
            batch_sentences = sentences[index * batch_size:(index + 1) * batch_size]
            index += 1
            inputs, outputs = BiLM.get_batch(batch_sentences, token_dict=word_dict, ignore_case=True)
            yield inputs, outputs
def test_bidirectional_overfitting(self):
    sentences = [
        ['All', 'work', 'and', 'no', 'play'],
        ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
    ]
    token_dict = {
        '': 0, '<UNK>': 1, '<EOS>': 2,
        'all': 3, 'work': 4, 'and': 5, 'no': 6, 'play': 7,
        'makes': 8, 'a': 9, 'dull': 10, 'boy': 11, '.': 12,
    }
    token_dict_rev = {v: k for k, v in token_dict.items()}
    inputs, outputs = BiLM.get_batch(sentences,
                                     token_dict,
                                     ignore_case=True,
                                     unk_index=token_dict['<UNK>'],
                                     eos_index=token_dict['<EOS>'])
    bi_lm = BiLM(token_num=len(token_dict), embedding_dim=10, rnn_units=10, use_bidirectional=True)
    bi_lm.model.summary()
    bi_lm.fit(
        np.repeat(inputs, 2 ** 12, axis=0),
        [
            np.repeat(outputs[0], 2 ** 12, axis=0),
            np.repeat(outputs[1], 2 ** 12, axis=0),
        ],
        epochs=5,
    )
    predict = bi_lm.predict(inputs)
    forward = predict[0].argmax(axis=-1)
    backward = predict[1].argmax(axis=-1)
    self.assertEqual(
        'work and no play <EOS>',
        ' '.join(map(lambda x: token_dict_rev[x], forward[0].tolist()[:-1])).strip())
    self.assertEqual(
        '<UNK> a dull boy . <EOS>',
        ' '.join(map(lambda x: token_dict_rev[x], forward[1].tolist())).strip())
    self.assertEqual(
        '<EOS> all work and no',
        ' '.join(map(lambda x: token_dict_rev[x], backward[0].tolist()[:-1])).strip())
    self.assertEqual(
        '<EOS> makes <UNK> a dull boy',
        ' '.join(map(lambda x: token_dict_rev[x], backward[1].tolist())).strip())
def lm_batch_generator(sentences, steps):
    global word_dict, char_dict, max_word_len
    while True:
        for i in range(steps):
            batch_sentences = sentences[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(sentences))]
            inputs, outputs = BiLM.get_batch(
                sentences=batch_sentences,
                token_dict=word_dict,
                ignore_case=True,
                unk_index=word_dict['<UNK>'],
                eos_index=word_dict['<EOS>'],
            )
            yield inputs, outputs
def test_batch_generator(batch_size=32):
    batch_size //= 2
    index = 0
    while index < test_num:
        sentences = []
        batch_pos = test_pos_files[index:min(index + batch_size, test_num)]
        batch_neg = test_neg_files[index:min(index + batch_size, test_num)]
        index += batch_size
        for file_name in batch_pos:
            with codecs.open(os.path.join(TEST_ROOT, 'pos', file_name), 'r', 'utf8') as reader:
                text = reader.read().strip()
                sentences.append(get_word_list_eng(text))
        for file_name in batch_neg:
            with codecs.open(os.path.join(TEST_ROOT, 'neg', file_name), 'r', 'utf8') as reader:
                text = reader.read().strip()
                sentences.append(get_word_list_eng(text))
        word_input, _ = BiLM.get_batch(
            sentences=sentences,
            token_dict=word_dict,
            ignore_case=True,
        )
        yield word_input
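# Hedged usage sketch (not part of the original script): running prediction over the test
# batches produced above. `model` and `test_num` are assumed to be the compiled classifier
# and the number of test files per class from the surrounding code; each yielded batch
# holds 16 positive and 16 negative reviews because batch_size is halved per class.
test_steps = (test_num + 15) // 16
predicted = model.predict_generator(test_batch_generator(batch_size=32), steps=test_steps)
predicted_classes = predicted.argmax(axis=-1)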
def test_embedding_weights(self):
    bi_lm = BiLM(token_num=105,
                 rnn_layer_num=1,
                 embedding_dim=106,
                 embedding_weights=numpy.random.random((105, 106)))
    bi_lm.model.summary()
def test_bidirectional(self):
    bi_lm = BiLM(token_num=104, rnn_layer_num=1, use_bidirectional=True)
    bi_lm.model.summary()
def test_init_multi_keep(self):
    bi_lm = BiLM(token_num=103, rnn_layer_num=6, rnn_keep_num=3)
    bi_lm.model.summary()
# Training LM
def train_lm_generator(batch_size=32):
    while True:
        index = 0
        while index * batch_size < len(sentences):
            batch_sentences = sentences[index * batch_size:(index + 1) * batch_size]
            index += 1
            inputs, outputs = BiLM.get_batch(batch_sentences, token_dict=word_dict, ignore_case=True)
            yield inputs, outputs


print('Fit LM...')
if os.path.exists(LM_MODEL_PATH):
    bi_lm = BiLM(model_path=LM_MODEL_PATH)
else:
    bi_lm = BiLM(token_num=len(word_dict))
    bi_lm.model.fit_generator(
        generator=train_lm_generator(batch_size=batch_size),
        steps_per_epoch=len(sentences) // batch_size,
        epochs=epoch_num,
        verbose=True,
    )
    bi_lm.save_model(LM_MODEL_PATH)

# Build model for classification
input_layer, feature_layer = bi_lm.get_feature_layers()
lstm_layer = keras.layers.Bidirectional(
    keras.layers.LSTM(units=50),
    name='Bi-LSTM',
def test_init_load(self):
    model_path = os.path.join(self.tmp_path, 'save_load.h5')
    model = BiLM(token_num=101)
    model.save_model(model_path)
    BiLM(model_path=model_path)
def test_save_load(self):
    with tempfile.TemporaryDirectory() as temp_path:
        model_path = os.path.join(temp_path, 'save_load.h5')
        model = BiLM(token_num=101)
        model.save_model(model_path)
        model.load_model(model_path)
def test_no_embedding(self):
    bi_lm = BiLM(token_num=107, rnn_layer_num=1, embedding_dim=108, has_embedding=False)
    bi_lm.model.summary()
def test_init_single(self):
    bi_lm = BiLM(token_num=101, rnn_layer_num=1, rnn_type='gru')
    bi_lm.model.summary()
            parts = line.split()
            word = parts[0].lower()
            if word in word_dict:
                word_embd_weights[word_dict[word]] = parts[1:]
    word_embd_weights = numpy.asarray(word_embd_weights)
    print('Dict size: %d Shape of weights: %s' % (len(word_dict), str(word_embd_weights.shape)))
else:
    word_embd_weights = None
    print('Dict size: %d' % len(word_dict))

train_steps = (len(train_sentences) + BATCH_SIZE - 1) // BATCH_SIZE
valid_steps = (len(valid_sentences) + BATCH_SIZE - 1) // BATCH_SIZE

if os.path.exists(MODEL_LM_PATH):
    bi_lm_model = BiLM(model_path=MODEL_LM_PATH)
else:
    bi_lm_model = BiLM(
        token_num=len(word_dict),
        rnn_units=100,
        embedding_weights=word_embd_weights,
        embedding_dim=100,
    )
bi_lm_model.model.summary()


def lm_batch_generator(sentences, steps):
    global word_dict, char_dict, max_word_len
    while True:
        for i in range(steps):
            batch_sentences = sentences[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(sentences))]
            inputs, outputs = BiLM.get_batch(
                sentences=batch_sentences,
                token_dict=word_dict,
                ignore_case=True,
                unk_index=word_dict['<UNK>'],
                eos_index=word_dict['<EOS>'],
            )
            yield inputs, outputs
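# Hedged usage sketch (not part of the original script): one way the language model above
# could be trained and saved with the generator just defined. `train_sentences`,
# `valid_sentences`, `train_steps`, `valid_steps` and MODEL_LM_PATH follow the names in
# the surrounding code; the epoch count is illustrative only.
bi_lm_model.model.fit_generator(
    generator=lm_batch_generator(train_sentences, train_steps),
    steps_per_epoch=train_steps,
    validation_data=lm_batch_generator(valid_sentences, valid_steps),
    validation_steps=valid_steps,
    epochs=5,
)
bi_lm_model.save_model(MODEL_LM_PATH)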