import json

import h5py
import numpy as np
import tensorflow as tf

# UnicodeCharsVocabulary, Vocabulary and BidirectionalLanguageModel are assumed
# to be defined/imported elsewhere in this codebase (the bilm package).

DTYPE = 'float32'  # storage dtype for the dumped embeddings


def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
    '''
    Given an input vocabulary file, dump all the token embeddings to outfile.
    The result can be used as the embedding_weight_file when constructing a
    BidirectionalLanguageModel.
    '''
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder(
        'int32', shape=(None, None, max_word_length)
    )
    model = BidirectionalLanguageModel(options_file, weight_file)
    embedding_op = model(ids_placeholder)['token_embeddings']

    n_tokens = vocab.size
    embed_dim = int(embedding_op.shape[2])
    embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for k in range(n_tokens):
            token = vocab.id_to_word(k)
            # batch_sentences adds <S> and </S>, so the token itself sits at
            # position 1 along the time axis.
            char_ids = batcher.batch_sentences(
                [[token]]
            )[0, 1, :].reshape(1, 1, -1)
            embeddings[k, :] = sess.run(
                embedding_op, feed_dict={ids_placeholder: char_ids}
            )

    with h5py.File(outfile, 'w') as fout:
        fout.create_dataset(
            'embedding', embeddings.shape, dtype='float32', data=embeddings
        )
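# Example usage (a sketch only; the file names below are placeholders and the
# options/weight files must come from a pretrained biLM checkpoint):
#
#   dump_token_embeddings(
#       'vocab.txt',               # one token per line, incl. <S>, </S>, <UNK>
#       'options.json',            # architecture options of the checkpoint
#       'lm_weights.hdf5',         # pretrained biLM weights
#       'token_embeddings.hdf5',   # output; reusable as embedding_weight_file
#   )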
def load_vocab(vocab_file, max_word_length=None):
    if max_word_length:
        return UnicodeCharsVocabulary(vocab_file, max_word_length,
                                      validate_file=True)
    else:
        return Vocabulary(vocab_file, validate_file=True)
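# For instance (hypothetical path; 50 is the max_characters_per_token value
# used by the released ELMo options):
#
#   char_vocab = load_vocab('vocab.txt', max_word_length=50)  # UnicodeCharsVocabulary
#   word_vocab = load_vocab('vocab.txt')                      # plain Vocabulary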
def dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                         outfile):
    '''
    Run the pretrained biLM over every sentence in dataset_file and dump the
    activations of all layers to outfile, one HDF5 dataset per sentence keyed
    by the sentence's 0-based line number.
    '''
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder(
        'int32', shape=(None, None, max_word_length)
    )
    model = BidirectionalLanguageModel(options_file, weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(
                    ops['lm_embeddings'],
                    feed_dict={ids_placeholder: char_ids}
                )
                # Drop the batch dimension and store one dataset per sentence.
                fout.create_dataset(
                    str(sentence_id),
                    embeddings.shape[1:], dtype='float32',
                    data=embeddings[0, :, :, :]
                )
                sentence_id += 1
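# Example usage and read-back (a sketch; paths are placeholders).  Each
# sentence of dataset_file is stored under its 0-based line number as an
# array of shape (n_layers, n_tokens, embedding_dim); the released ELMo
# models expose 3 layers:
#
#   dump_bilm_embeddings('vocab.txt', 'dataset.txt', 'options.json',
#                        'lm_weights.hdf5', 'elmo_embeddings.hdf5')
#
#   with h5py.File('elmo_embeddings.hdf5', 'r') as fin:
#       first_sentence_embeddings = fin['0'][...]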
class Batcher(object):
    '''
    Batch sentences of tokenized text into character id arrays.
    '''
    def __init__(self, lm_vocab_file: str, max_token_length: int):
        self._lm_vocab = UnicodeCharsVocabulary(
            lm_vocab_file, max_token_length
        )
        self._max_token_length = max_token_length

    def batch_sentences(self, sentences):
        '''
        Batch the sentences as character ids.  Each sentence is a list of
        tokens without <S> or </S>, e.g. [['The', 'first', 'sentence', '.']].
        '''
        n_sentences = len(sentences)
        max_length = max(len(sentence) for sentence in sentences) + 2

        X_char_ids = np.zeros(
            (n_sentences, max_length, self._max_token_length),
            dtype=np.int64
        )

        for k, sent in enumerate(sentences):
            length = len(sent) + 2
            char_ids_without_mask = self._lm_vocab.encode_chars(
                sent, split=False
            )
            # Add one so that 0 is the mask/padding value.
            X_char_ids[k, :length, :] = char_ids_without_mask + 1

        return X_char_ids
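# A minimal usage sketch (the vocab path is a placeholder):
#
#   batcher = Batcher('vocab.txt', 50)
#   char_ids = batcher.batch_sentences([['Pretrained', 'biLMs', '.'],
#                                       ['Another', 'sentence', '.']])
#   # char_ids.shape == (2, 5, 50): two sentences, longest sentence (3 tokens)
#   # plus <S> and </S>, 50 character ids per token; 0 is the padding/mask id.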