# Imports assumed here: standard json / numpy / h5py / TensorFlow 1.x plus a
# bilm-tf style package layout for the vocabulary and model classes. DTYPE
# matches bilm's float32 constant; adjust module paths to the local project.
import json

import h5py
import numpy as np
import tensorflow as tf

from bilm.data import Batcher, UnicodeCharsVocabulary, Vocabulary
from bilm.model import BidirectionalLanguageModel

DTYPE = 'float32'


def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    options = tf.contrib.training.HParams(**options)
    max_word_length = options.char_cnn['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)
    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    embedding_op = model(ids_placeholder)['token_embeddings']
    n_tokens = vocab.size
    embed_dim = int(embedding_op.shape[2])

    embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for k in range(n_tokens):
            token = vocab.id_to_word(k)
            # batch_sentences returns (1, 3, max_word_length) for
            # [<S>, token, </S>]; index 1 picks the token itself, reshaped
            # back into a batch of one.
            char_ids = batcher.batch_sentences(
                [[token]])[0, 1, :].reshape(1, 1, -1)
            embeddings[k, :] = sess.run(embedding_op,
                                        feed_dict={ids_placeholder: char_ids})

    with h5py.File(outfile, 'w') as fout:
        fout.create_dataset('embedding',
                            embeddings.shape,
                            dtype='float32',
                            data=embeddings)
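
For orientation, a minimal usage sketch of the function above (not from the original project; the file paths are placeholders):

dump_token_embeddings('vocab.txt', 'options.json', 'lm_weights.hdf5',
                      'token_embeddings.hdf5')

with h5py.File('token_embeddings.hdf5', 'r') as fin:
    token_embeddings = fin['embedding'][...]
# One row per vocabulary token: shape (n_tokens, embed_dim).
print(token_embeddings.shape)
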
Example #2
File: util.py  Project: jxz542189/elmo_demo
def load_vocab(vocab_file, max_word_length=None):
    """Load a character-aware vocabulary if max_word_length is given, else a plain word vocabulary."""
    if max_word_length:
        return UnicodeCharsVocabulary(vocab_file,
                                      max_word_length,
                                      validate_file=True)
    else:
        return Vocabulary(vocab_file, validate_file=True)
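
A short usage sketch, assuming a placeholder vocabulary file 'vocab.txt' with one token per line:

char_vocab = load_vocab('vocab.txt', max_word_length=50)  # UnicodeCharsVocabulary
word_vocab = load_vocab('vocab.txt')                       # plain Vocabulary
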
def dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                         outfile):
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    options = tf.contrib.training.HParams(**options)
    max_word_length = options.char_cnn['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)
    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})
                # One dataset per sentence, keyed by its 0-based index;
                # shape is (n_lm_layers, n_tokens, lm_dim).
                fout.create_dataset('{}'.format(sentence_id),
                                    embeddings.shape[1:],
                                    dtype='float32',
                                    data=embeddings[0, :, :, :])
                sentence_id += 1
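
A hedged sketch of calling the function above and reading one sentence's embeddings back; the paths are placeholders and 'dataset.txt' is assumed to hold one whitespace-tokenized sentence per line:

dump_bilm_embeddings('vocab.txt', 'dataset.txt', 'options.json',
                     'lm_weights.hdf5', 'lm_embeddings.hdf5')

with h5py.File('lm_embeddings.hdf5', 'r') as fin:
    sent0 = fin['0'][...]  # datasets are keyed by 0-based sentence index
# shape (n_lm_layers, n_tokens, lm_dim) for the first sentence
print(sent0.shape)
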
Example #4
class Batcher(object):
    def __init__(self, lm_vocab_file: str, max_token_length: int):
        self._lm_vocab = UnicodeCharsVocabulary(lm_vocab_file,
                                                max_token_length)
        self._max_token_length = max_token_length

    def batch_sentences(self, sentences):
        """Batch tokenized sentences into a padded array of character ids."""
        n_sentences = len(sentences)
        # +2 leaves room for the <S> and </S> positions added by encode_chars.
        max_length = max(len(sentence) for sentence in sentences) + 2

        X_char_ids = np.zeros(
            (n_sentences, max_length, self._max_token_length), dtype=np.int64)

        for k, sent in enumerate(sentences):
            length = len(sent) + 2
            char_ids_without_mask = self._lm_vocab.encode_chars(sent,
                                                                split=False)
            # Shift by one so that 0 can serve as the padding / mask value.
            X_char_ids[k, :length, :] = char_ids_without_mask + 1

        return X_char_ids
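
A minimal sketch of how this Batcher might be called; the vocabulary path and sentences are placeholders:

batcher = Batcher('vocab.txt', 50)
char_ids = batcher.batch_sentences([['Pretrained', 'biLMs', 'work', 'well'],
                                    ['Hello', 'world']])
# (n_sentences, longest sentence + 2, max_token_length) -> (2, 6, 50)
print(char_ids.shape)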