Example #1
 def test_one_layer(self):
     bi_lm = BiLM(token_num=101,
                  rnn_layer_num=1,
                  rnn_units=50,
                  rnn_type='gru')
     input_layer, output_layer = bi_lm.get_feature_layers()
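     # The feature layer concatenates the forward and backward LM outputs,
     # so its last dimension is 2 * rnn_units = 100.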
     self.assertEqual((None, None), tuple(input_layer.shape))
     self.assertEqual((None, None, 100), tuple(output_layer.shape))
     model = keras.models.Model(inputs=input_layer, outputs=output_layer)
     model.summary()
Example #2
 def test_bidirectional(self):
     bi_lm = BiLM(token_num=102,
                  rnn_layer_num=1,
                  rnn_units=50,
                  rnn_type='gru',
                  use_bidirectional=True)
     input_layer, output_layer = bi_lm.get_feature_layers()
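     # Compared with the unidirectional case above, use_bidirectional=True
     # doubles the feature size: 2 * 2 * rnn_units = 200.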
     self.assertEqual((None, None), tuple(input_layer.shape))
     self.assertEqual((None, None, 200), tuple(output_layer.shape))
     model = keras.models.Model(inputs=input_layer, outputs=output_layer)
     model.summary()
Example #3
 def test_weighted_sum(self):
     bi_lm = BiLM(token_num=107,
                  embedding_dim=108,
                  rnn_layer_num=6,
                  rnn_keep_num=7,
                  rnn_units=108,
                  rnn_type='lstm',
                  use_normalization=True)
     input_layer, output_layer = bi_lm.get_feature_layers(
         use_weighted_sum=True)
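     # use_weighted_sum merges the kept layers with a weighted sum instead of
     # concatenating them.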
     model = keras.models.Model(inputs=input_layer, outputs=output_layer)
     model.summary()
Example #4
 def test_get_batch(self):
     sentences = [
         ['All', 'work', 'and', 'no', 'play'],
         ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
     ]
     token_dict = {
         'all': 3,
         'work': 4,
         'and': 5,
         'no': 6,
         'play': 7,
         'makes': 8,
         'a': 9,
         'dull': 10,
         'boy': 11,
         '.': 12,
     }
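     # get_batch reserves index 0 for padding, 1 for unknown tokens and 2 for
     # the end-of-sentence marker by default; 'Jack' is never in token_dict,
     # and 'All' only matches index 3 when ignore_case=True lowercases it.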
     inputs, outputs = BiLM.get_batch(sentences, token_dict, ignore_case=False)
     expect = [
         [1, 4, 5, 6, 7, 0],
         [8, 1, 9, 10, 11, 12],
     ]
     self.assertEqual(expect, inputs.tolist())
     expect = [
         [[4], [5], [6], [7], [2], [0]],
         [[1], [9], [10], [11], [12], [2]],
     ]
     self.assertEqual(expect, outputs[0].tolist())
     expect = [
         [[2], [1], [4], [5], [6], [0]],
         [[2], [8], [1], [9], [10], [11]],
     ]
     self.assertEqual(expect, outputs[1].tolist())
     inputs, outputs = BiLM.get_batch(sentences, token_dict, ignore_case=True)
     expect = [
         [3, 4, 5, 6, 7, 0],
         [8, 1, 9, 10, 11, 12],
     ]
     self.assertEqual(expect, inputs.tolist())
     expect = [
         [[4], [5], [6], [7], [2], [0]],
         [[1], [9], [10], [11], [12], [2]],
     ]
     self.assertEqual(expect, outputs[0].tolist())
     expect = [
         [[2], [3], [4], [5], [6], [0]],
         [[2], [8], [1], [9], [10], [11]],
     ]
     self.assertEqual(expect, outputs[1].tolist())
Example #5
 def test_input_layer(self):
     input_layer = keras.layers.Input((None, ), name='New-Input')
     bi_lm = BiLM(token_num=104,
                  rnn_layer_num=6,
                  rnn_keep_num=3,
                  rnn_units=50,
                  rnn_type='gru')
     output_layer = bi_lm.get_feature_layers(input_layer=input_layer)
     model = keras.models.Model(inputs=input_layer, outputs=output_layer)
     model.summary()
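     # Because an existing input layer was supplied, get_feature_layers
     # returned only the feature tensor. Layers shared with bi_lm.model must
     # keep identical weights; names absent from the new model raise
     # ValueError in get_layer and are skipped.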
     for layer in bi_lm.model.layers:
         try:
             new_layer = model.get_layer(name=layer.name)
             self.assertEqual(layer.get_weights(), new_layer.get_weights())
         except ValueError:
             pass
Example #6
def train_batch_generator(batch_size=32, training=True):
    batch_size //= 2
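    # Half of every batch comes from positive reviews and half from negative
    # ones, labelled 1 and 0 respectively.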
    while True:
        sentences = []
        if training:
            batch_pos = random.sample(train_pos_files, batch_size)
            batch_neg = random.sample(train_neg_files, batch_size)
        else:
            batch_pos = random.sample(val_pos_files, batch_size)
            batch_neg = random.sample(val_neg_files, batch_size)
        for file_name in batch_pos:
            with codecs.open(os.path.join(TRAIN_ROOT, 'pos', file_name), 'r',
                             'utf8') as reader:
                text = reader.read().strip()
                sentences.append(get_word_list_eng(text))
        for file_name in batch_neg:
            with codecs.open(os.path.join(TRAIN_ROOT, 'neg', file_name), 'r',
                             'utf8') as reader:
                text = reader.read().strip()
                sentences.append(get_word_list_eng(text))
        word_input, _ = BiLM.get_batch(
            sentences=sentences,
            token_dict=word_dict,
            ignore_case=True,
        )
        yield word_input, keras.utils.to_categorical([1] * batch_size +
                                                     [0] * batch_size)
Example #7
 def test_multiple_layers(self):
     bi_lm = BiLM(token_num=103,
                  rnn_layer_num=6,
                  rnn_keep_num=3,
                  rnn_units=50,
                  rnn_type='lstm')
     input_layer, output_layer = bi_lm.get_feature_layers()
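     # Only rnn_keep_num=3 of the six RNN layers contribute to the features:
     # 3 * 2 * rnn_units = 300.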
     self.assertEqual((None, None), tuple(input_layer.shape))
     self.assertEqual((None, None, 300), tuple(output_layer.shape))
     model = keras.models.Model(inputs=input_layer, outputs=output_layer)
     model.summary()
     for layer in bi_lm.model.layers:
         try:
             new_layer = model.get_layer(name=layer.name)
             self.assertEqual(layer.get_weights(), new_layer.get_weights())
         except ValueError:
             pass
Example #8
 def test_no_embedding(self):
     input_layer = keras.layers.Input((None, 106), name='New-Input')
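     # With has_embedding=False the model consumes pre-computed embeddings,
     # so the input already carries the embedding_dim (106) axis.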
     bi_lm = BiLM(token_num=105,
                  has_embedding=False,
                  embedding_dim=106,
                  rnn_layer_num=6,
                  rnn_keep_num=1,
                  rnn_units=50,
                  rnn_type='lstm')
     output_layer = bi_lm.get_feature_layers(input_layer=input_layer)
     model = keras.models.Model(inputs=input_layer, outputs=output_layer)
     model.summary()
     for layer in bi_lm.model.layers:
         try:
             new_layer = model.get_layer(name=layer.name)
             self.assertEqual(layer.get_weights(), new_layer.get_weights())
         except ValueError:
             pass
Example #9
def train_lm_generator(batch_size=32):
    while True:
        index = 0
        while index * batch_size < len(sentences):
            batch_sentences = sentences[index * batch_size:(index + 1) *
                                        batch_size]
            inputs, outputs = BiLM.get_batch(batch_sentences,
                                             token_dict=word_dict,
                                             ignore_case=True)
            index += 1  # advance to the next batch
            yield inputs, outputs
Example #10
 def test_bidirectional_overfitting(self):
     sentences = [
         ['All', 'work', 'and', 'no', 'play'],
         ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
     ]
     token_dict = {
         '': 0,
         '<UNK>': 1,
         '<EOS>': 2,
         'all': 3,
         'work': 4,
         'and': 5,
         'no': 6,
         'play': 7,
         'makes': 8,
         'a': 9,
         'dull': 10,
         'boy': 11,
         '.': 12,
     }
     token_dict_rev = {v: k for k, v in token_dict.items()}
     inputs, outputs = BiLM.get_batch(sentences,
                                      token_dict,
                                      ignore_case=True,
                                      unk_index=token_dict['<UNK>'],
                                      eos_index=token_dict['<EOS>'])
     bi_lm = BiLM(token_num=len(token_dict),
                  embedding_dim=10,
                  rnn_units=10,
                  use_bidirectional=True)
     bi_lm.model.summary()
     bi_lm.fit(
         np.repeat(inputs, 2**12, axis=0),
         [
             np.repeat(outputs[0], 2**12, axis=0),
             np.repeat(outputs[1], 2**12, axis=0),
         ],
         epochs=5,
     )
     predict = bi_lm.predict(inputs)
     forward = predict[0].argmax(axis=-1)
     backward = predict[1].argmax(axis=-1)
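     # After overfitting, the forward head should predict each next token and
     # the backward head each previous token, with <EOS> at the sentence
     # boundary; the out-of-vocabulary 'Jack' comes back as <UNK>.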
     self.assertEqual(
         'work and no play <EOS>',
         ' '.join(map(lambda x: token_dict_rev[x],
                      forward[0].tolist()[:-1])).strip())
     self.assertEqual(
         '<UNK> a dull boy . <EOS>',
         ' '.join(map(lambda x: token_dict_rev[x],
                      forward[1].tolist())).strip())
     self.assertEqual(
         '<EOS> all work and no', ' '.join(
             map(lambda x: token_dict_rev[x],
                 backward[0].tolist()[:-1])).strip())
     self.assertEqual(
         '<EOS> makes <UNK> a dull boy',
         ' '.join(map(lambda x: token_dict_rev[x],
                      backward[1].tolist())).strip())
Example #11
def lm_batch_generator(sentences, steps):
    global word_dict, char_dict, max_word_len
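    # Yield (inputs, [forward_targets, backward_targets]) batches endlessly,
    # covering `sentences` once every `steps` iterations.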
    while True:
        for i in range(steps):
            batch_sentences = sentences[BATCH_SIZE *
                                        i:min(BATCH_SIZE *
                                              (i + 1), len(sentences))]
            inputs, outputs = BiLM.get_batch(
                sentences=batch_sentences,
                token_dict=word_dict,
                ignore_case=True,
                unk_index=word_dict['<UNK>'],
                eos_index=word_dict['<EOS>'],
            )
            yield inputs, outputs
Example #12
def test_batch_generator(batch_size=32):
    batch_size //= 2
    index = 0
    while index < test_num:
        sentences = []
        batch_pos = test_pos_files[index:min(index + batch_size, test_num)]
        batch_neg = test_neg_files[index:min(index + batch_size, test_num)]
        index += batch_size
        for file_name in batch_pos:
            with codecs.open(os.path.join(TEST_ROOT, 'pos', file_name), 'r',
                             'utf8') as reader:
                text = reader.read().strip()
                sentences.append(get_word_list_eng(text))
        for file_name in batch_neg:
            with codecs.open(os.path.join(TEST_ROOT, 'neg', file_name), 'r',
                             'utf8') as reader:
                text = reader.read().strip()
                sentences.append(get_word_list_eng(text))
        word_input, _ = BiLM.get_batch(
            sentences=sentences,
            token_dict=word_dict,
            ignore_case=True,
        )
        yield word_input
Example #13
 def test_embedding_weights(self):
     bi_lm = BiLM(token_num=105,
                  rnn_layer_num=1,
                  embedding_dim=106,
                  embedding_weights=numpy.random.random((105, 106)))
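     # The pre-trained embedding matrix must have shape
     # (token_num, embedding_dim) = (105, 106).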
     bi_lm.model.summary()
Example #14
 def test_bidirectional(self):
     bi_lm = BiLM(token_num=104, rnn_layer_num=1, use_bidirectional=True)
     bi_lm.model.summary()
Example #15
 def test_init_multi_keep(self):
     bi_lm = BiLM(token_num=103, rnn_layer_num=6, rnn_keep_num=3)
     bi_lm.model.summary()
Example #16
# Training LM
def train_lm_generator(batch_size=32):
    while True:
        index = 0
        while index * batch_size < len(sentences):
            batch_sentences = sentences[index * batch_size:(index + 1) *
                                        batch_size]
            inputs, outputs = BiLM.get_batch(batch_sentences,
                                             token_dict=word_dict,
                                             ignore_case=True)
            index += 1  # advance to the next batch
            yield inputs, outputs


print('Fit LM...')
if os.path.exists(LM_MODEL_PATH):
    bi_lm = BiLM(model_path=LM_MODEL_PATH)
else:
    bi_lm = BiLM(token_num=len(word_dict))
    bi_lm.model.fit_generator(
        generator=train_lm_generator(batch_size=batch_size),
        steps_per_epoch=len(sentences) // batch_size,
        epochs=epoch_num,
        verbose=True,
    )
    bi_lm.save_model(LM_MODEL_PATH)

# Build model for classification
input_layer, feature_layer = bi_lm.get_feature_layers()
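# get_feature_layers exposes the trained LM's input tensor and contextual
# feature tensor so they can feed the classifier below.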
lstm_layer = keras.layers.Bidirectional(
    keras.layers.LSTM(units=50),
    name='Bi-LSTM',
)(feature_layer)
Example #17
 def test_init_load(self):
     model_path = os.path.join(self.tmp_path, 'save_load.h5')
     model = BiLM(token_num=101)
     model.save_model(model_path)
     BiLM(model_path=model_path)
Example #18
 def test_save_load(self):
     with tempfile.TemporaryDirectory() as temp_path:
         model_path = os.path.join(temp_path, 'save_load.h5')
         model = BiLM(token_num=101)
         model.save_model(model_path)
         model.load_model(model_path)
Example #19
 def test_no_embedding(self):
     bi_lm = BiLM(token_num=107,
                  rnn_layer_num=1,
                  embedding_dim=108,
                  has_embedding=False)
     bi_lm.model.summary()
Example #20
 def test_init_single(self):
     bi_lm = BiLM(token_num=101, rnn_layer_num=1, rnn_type='gru')
     bi_lm.model.summary()
Example #21
            parts = line.split()
            word = parts[0].lower()
            if word in word_dict:
                word_embd_weights[word_dict[word]] = parts[1:]
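    # Stack the collected embedding rows into the weight matrix passed to BiLM.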
    word_embd_weights = numpy.asarray(word_embd_weights)
    print('Dict size: %d  Shape of weights: %s' %
          (len(word_dict), str(word_embd_weights.shape)))
else:
    word_embd_weights = None
    print('Dict size: %d' % len(word_dict))

train_steps = (len(train_sentences) + BATCH_SIZE - 1) // BATCH_SIZE
valid_steps = (len(valid_sentences) + BATCH_SIZE - 1) // BATCH_SIZE

if os.path.exists(MODEL_LM_PATH):
    bi_lm_model = BiLM(model_path=MODEL_LM_PATH)
else:
    bi_lm_model = BiLM(
        token_num=len(word_dict),
        rnn_units=100,
        embedding_weights=word_embd_weights,
        embedding_dim=100,
    )
    bi_lm_model.model.summary()

    def lm_batch_generator(sentences, steps):
        global word_dict, char_dict, max_word_len
        while True:
            for i in range(steps):
                batch_sentences = sentences[BATCH_SIZE *
                                            i:min(BATCH_SIZE *