class TestUnicodeCharsVocabulary(unittest.TestCase):
    def setUp(self):
        words = ['the', '.', chr(256) + 't', '<S>', '</S>', '<UNK>']
        (_, tmp) = tempfile.mkstemp()
        with open(tmp, 'w') as fout:
            fout.write('\n'.join(words))
        self.vocab = UnicodeCharsVocabulary(tmp, 5)
        self._tmp = tmp

    def test_vocab_word_to_char_ids(self):
        char_ids = self.vocab.word_to_char_ids('th')
        expected = np.array([258, 116, 104, 259, 260], dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

        char_ids = self.vocab.word_to_char_ids('thhhhh')
        expected = np.array([258, 116, 104, 104, 259])
        self.assertTrue((char_ids == expected).all())

        char_ids = self.vocab.word_to_char_ids(chr(256) + 't')
        expected = np.array([258, 196, 128, 116, 259], dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

    def test_bos_eos(self):
        bos_ids = self.vocab.word_to_char_ids('<S>')
        self.assertTrue((bos_ids == self.vocab.bos_chars).all())

        bos_ids = self.vocab.word_char_ids[self.vocab.word_to_id('<S>')]
        self.assertTrue((bos_ids == self.vocab.bos_chars).all())

        eos_ids = self.vocab.word_to_char_ids('</S>')
        self.assertTrue((eos_ids == self.vocab.eos_chars).all())

        eos_ids = self.vocab.word_char_ids[self.vocab.word_to_id('</S>')]
        self.assertTrue((eos_ids == self.vocab.eos_chars).all())

    def test_vocab_encode_chars(self):
        sentence = ' '.join(['th', 'thhhhh', chr(256) + 't'])
        char_ids = self.vocab.encode_chars(sentence)
        expected = np.array(
            [[258, 256, 259, 260, 260],
             [258, 116, 104, 259, 260],
             [258, 116, 104, 104, 259],
             [258, 196, 128, 116, 259],
             [258, 257, 259, 260, 260]], dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

    def test_vocab_encode_chars_reverse(self):
        sentence = ' '.join(reversed(['th', 'thhhhh', chr(256) + 't']))
        vocab = UnicodeCharsVocabulary(self._tmp, 5)
        char_ids = vocab.encode_chars(sentence, reverse=True)
        expected = np.array(
            [[258, 256, 259, 260, 260],
             [258, 116, 104, 259, 260],
             [258, 116, 104, 104, 259],
             [258, 196, 128, 116, 259],
             [258, 257, 259, 260, 260]], dtype=np.int32)[::-1, :]
        self.assertTrue((char_ids == expected).all())

    def tearDown(self):
        os.remove(self._tmp)
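# A minimal sketch (not in the original tests) of the char-id layout the
# expected arrays above rely on. Assuming UnicodeCharsVocabulary reserves
# 256 = <S> char, 257 = </S> char, 258 = begin-of-word, 259 = end-of-word and
# 260 = padding, each word is encoded as its UTF-8 bytes bracketed by 258/259
# and padded to max_word_length with 260.
def _demo_char_ids():
    (_, demo_vocab) = tempfile.mkstemp()
    with open(demo_vocab, 'w') as fout:
        fout.write('\n'.join(['the', '<S>', '</S>', '<UNK>']))
    vocab = UnicodeCharsVocabulary(demo_vocab, 5)
    # 't' = 116, 'h' = 104, then end-of-word and one padding slot
    print(vocab.word_to_char_ids('th'))  # -> [258 116 104 259 260]
    os.remove(demo_vocab)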
def load_vocab(vocab_file, max_word_length=None):
    if max_word_length:
        return UnicodeCharsVocabulary(vocab_file, max_word_length,
                                      validate_file=True)
    else:
        return Vocabulary(vocab_file, validate_file=True)
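# A minimal usage sketch (not in the original source): load_vocab returns the
# word-level Vocabulary unless a max_word_length is given, in which case the
# character-aware UnicodeCharsVocabulary is built instead. The temp file and
# token list below are illustrative only; validate_file=True requires the
# vocabulary to contain <S>, </S> and <UNK>.
def _demo_load_vocab():
    import os, tempfile
    (_, demo_vocab) = tempfile.mkstemp()
    with open(demo_vocab, 'w') as fout:
        fout.write('\n'.join(['the', '.', '<S>', '</S>', '<UNK>']))
    word_vocab = load_vocab(demo_vocab)                      # Vocabulary
    char_vocab = load_vocab(demo_vocab, max_word_length=50)  # UnicodeCharsVocabulary
    os.remove(demo_vocab)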
def dump_bilm_embeddings(vocab_file, dataset_file, options_file,
                         weight_file, outfile):
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, \
                h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(
                    ops['lm_embeddings'],
                    feed_dict={ids_placeholder: char_ids}
                )
                ds = fout.create_dataset(
                    '{}'.format(sentence_id),
                    embeddings.shape[1:], dtype='float32',
                    data=embeddings[0, :, :, :]
                )
                sentence_id += 1
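# A usage sketch (file names below are placeholders, not from the original
# source). Each input sentence becomes one HDF5 dataset keyed by its 0-based
# line number, holding the per-layer biLM activations for that sentence.
#
# dump_bilm_embeddings('vocab.txt', 'dataset.txt',
#                      'options.json', 'weights.hdf5', 'elmo_embeddings.hdf5')
# with h5py.File('elmo_embeddings.hdf5', 'r') as fin:
#     first_sentence = fin['0'][...]   # embeddings for the first line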
def _load_data(self, reverse, chars, bidirectional=False):
    if chars:
        vocab = UnicodeCharsVocabulary(self._tmp_vocab, 5)
    else:
        vocab = Vocabulary(self._tmp_vocab)

    if not bidirectional:
        data = LMDataset(self._tmp_train, vocab, reverse=reverse)
    else:
        data = BidirectionalLMDataset(self._tmp_train, vocab)

    return data
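# Sketch of how a loaded dataset is typically consumed (assuming the
# iter_batches API and batch-dict keys of bilm's LMDataset; the batch_size
# and num_steps values here are illustrative only):
#
# data = self._load_data(reverse=False, chars=True)
# for batch in data.iter_batches(batch_size=2, num_steps=5):
#     token_ids = batch['token_ids']          # (batch_size, num_steps)
#     char_ids = batch['tokens_characters']   # (..., max_word_length)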
def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
    '''
    Given an input vocabulary file, dump all the token embeddings to the
    outfile.  The result can be used as the embedding_weight_file when
    constructing a BidirectionalLanguageModel.
    '''
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    embedding_op = model(ids_placeholder)['token_embeddings']

    n_tokens = vocab.size
    embed_dim = int(embedding_op.shape[2])

    embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for k in range(n_tokens):
            # look up the token string for this vocabulary id
            token = vocab.id_to_word(k)
            # batch_sentences adds <S>/</S>, so take position 1 (the token
            # itself) and reshape to a (1, 1, max_word_length) batch
            char_ids = batcher.batch_sentences(
                [[token]])[0, 1, :].reshape(1, 1, -1)
            embeddings[k, :] = sess.run(
                embedding_op, feed_dict={ids_placeholder: char_ids}
            )

    with h5py.File(outfile, 'w') as fout:
        ds = fout.create_dataset(
            'embedding', embeddings.shape, dtype='float32',
            data=embeddings
        )
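# A usage sketch (paths are placeholders). The output file holds a single
# 'embedding' dataset of shape (vocab_size, embed_dim); per the docstring it
# can then be supplied as embedding_weight_file when constructing a
# BidirectionalLanguageModel, so the character CNN need not be re-run for
# every token at inference time.
#
# dump_token_embeddings('vocab.txt', 'options.json',
#                       'weights.hdf5', 'token_embeddings.hdf5')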