Example #1
def main():
    args = _parse_args()
    # vocab = set(['<S>', '</S>'])
    # braces = re.compile('\(.*\)')
    # preproc = []
    # max_len = 0
    # with open(args.input, 'r') as src:
    #     for line in src:
    #         try:
    #             docid, text = line.split('\t', 1)
    #             text = re.sub(braces, '', text) or text
    #             sentence = sent_tokenize(text)[0]
    #             tokens = word_tokenize(sentence)
    #             max_len = max(len(tokens), max_len)
    #             preproc.append(docid + '\t' + ' '.join(tokens[:20]) + '\n')
    #             for token in tokens:
    #                 vocab.add(token)
    #         except IndexError:
    #             pass
    # print(max_len)
    # with open(args.input + '_prep', 'w+') as dst:
    #     dst.writelines(preproc)
    # with open(args.output, 'w+') as bilm_handle:
    #     bilm_handle.write('\n'.join(vocab))

    options_file = "/Users/asntr/Projects/university/course_work/end2end_neural_el/data/basic_data/elmo/elmo_2x1024_128_2048cnn_1xhighway_options.json"
    weight_file = "/Users/asntr/Projects/university/course_work/end2end_neural_el/data/basic_data/elmo/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
    token_embedding_file = "/Users/asntr/Projects/university/course_work/end2end_neural_el/data/vocabulary/" + 'embeddings.hdf5'
    dump_token_embeddings(args.output, options_file, weight_file,
                          token_embedding_file)
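
The commented-out preprocessing above is what produces the vocabulary file that dump_token_embeddings reads: one token per line, seeded with the <S> and </S> sentence markers (and usually <UNK>). A minimal sketch of that file layout, using a hypothetical toy_vocab.txt path:

# Minimal illustration of the expected vocabulary file layout: one token per
# line, including the special sentence markers. 'toy_vocab.txt' is a
# hypothetical path used only for this sketch.
tokens = ['<S>', '</S>', '<UNK>', 'the', 'cat', 'sat']
with open('toy_vocab.txt', 'w') as fout:
    fout.write('\n'.join(tokens))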
Example #2
    def _genElmoEmbedding(self):
        """
        Calls the dump_token_embeddings method from the ELMo source code to build
        word-level vector representations from the character-based model and saves
        them to an hdf5 file. The value stored under the file's "embedding" key is
        the vector for each word in the vocabulary file; these word vectors are
        later used as the initialization input to the biLM.
        """
        dump_token_embeddings(self._vocabFile, self._optionFile,
                              self._weightFile, self._tokenEmbeddingFile)
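
As the docstring notes, the dumped hdf5 file stores one vector per vocabulary entry under its "embedding" key. A minimal sketch of reading it back with h5py, assuming an illustrative embeddings.hdf5 filename:

import h5py

# The file written by dump_token_embeddings contains a single "embedding"
# dataset of shape (vocab_size, embedding_dim).
with h5py.File('embeddings.hdf5', 'r') as fin:
    token_embeddings = fin['embedding'][...]
print(token_embeddings.shape)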
Example #3
    def __init__(self):
        self.vocab_file = 'vocab_small.txt'
        # Location of the pretrained LM files.
        datadir = 'pretrained'
        options_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
        weight_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

        # Dump the token embeddings to a file. Run this once for your dataset.
        token_embedding_file = 'elmo_token_embeddings.hdf5'
        dump_token_embeddings(self.vocab_file, options_file, weight_file,
                              token_embedding_file)

        self.batcher = TokenBatcher(self.vocab_file)
        # Input placeholders to the biLM.
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file)
        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers('input',
                                                context_embeddings_op,
                                                l2_coef=0.0)
        self.elmo_context_output = weight_layers('output',
                                                 context_embeddings_op,
                                                 l2_coef=0.0)
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab', help='', required=True)
    parser.add_argument('--weight', help='', required=True)
    args = parser.parse_args()

    # Dump the token embeddings to a file. Run this once for your dataset.
    options_file = filename_variation(args.weight,
                                      'options').replace('.hdf5', '.json')
    token_embedding_file = filename_variation(args.weight, 'token_embedding')

    print(f'output file: {token_embedding_file}')

    dump_token_embeddings(args.vocab, options_file, args.weight,
                          token_embedding_file)
Example #5
import os
import h5py
from bilm import TokenBatcher, BidirectionalLanguageModel, weight_layers, dump_token_embeddings

# Location of the pretrained LM files.
datadir = 'kaggle_data'
vocab_file = os.path.join(datadir, 'vocab.txt')
options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir,
                           'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

# Dump the embeddings to a file. Run this once for your dataset.
token_embedding_file = 'kaggle_elmo_token_SMALL.hdf5'
dump_token_embeddings(vocab_file, options_file, weight_file,
                      token_embedding_file)

import tensorflow as tf

tf.reset_default_graph()

import data as dt

# Split into train and validation sets
alen = len(dt.X)
val_ratio = 0.1
val_len = int(alen * val_ratio)

tokenized_sentences = dt.X[:-val_len]
y = dt.y[:-val_len]
Example #6
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input-file')
parser.add_argument('--weights-file')
parser.add_argument('--options-file')
parser.add_argument('-o',
                    '--output-file',
                    default='elmo_token_embeddings.hdf5')

args = parser.parse_args()

vocab_file = 'elmo_vocab.txt'
with open(args.input_file, 'r') as fin:
    with open(vocab_file, 'w') as fout:
        for line in fin:
            token = line.strip()
            if token == '[UNK]':
                token = '<UNK>'
            elif token == '[START]':
                token = '<S>'
            elif token == '[STOP]':
                token = '</S>'
            elif token == '[PAD]':
                token = '<PAD>'

            fout.write(token)
            fout.write('\n')

dump_token_embeddings(vocab_file, args.options_file, args.weights_file,
                      args.output_file)
Example #7
def main():
    parser = ArgumentParser()

    parser.add_argument('--options-file',
                        '-o',
                        type=str,
                        default="",
                        help="elmo option file")
    parser.add_argument('--weight-file',
                        '-w',
                        type=str,
                        default="",
                        help="elmo weight file")
    parser.add_argument('--train-file',
                        '-t',
                        type=str,
                        default="",
                        help="training data")
    parser.add_argument('--dev-file',
                        '-d',
                        type=str,
                        default="",
                        help="dev data")
    parser.add_argument('--gpu', '-g', type=int, default=-1, help="gpu")
    parser.add_argument('--vocab-file',
                        '-v',
                        type=str,
                        default="",
                        help="vocab file")
    parser.add_argument('--token-embedding-file',
                        '-e',
                        type=str,
                        default="",
                        help="embedding file")

    args = parser.parse_args()

    # Example invocation:
    #   -o ../../../test_elmo/src/elmo-chainer/elmo_2x4096_512_2048cnn_2xhighway_options.json
    #   -w ../../../test_elmo/src/elmo-chainer/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5
    #   -t ../../data/datasets/SQuAD-train-v1.1-processed-spacy.txt
    #   -d ../../data/datasets/SQuAD-dev-v1.1-processed-spacy.txt
    #   -g=5
    #   -v ../../data/embeddings/elmo/vocab_squad_1_1.txt
    #   -e ../../data/embeddings/elmo/token_embedding_squad_1_1.hdf5

    all_tokens = ['<S>', '</S>']

    with open(args.train_file) as f:

        json_list = [json.loads(line) for line in f]

        pbar = ProgressBar()

        for json_item in pbar(json_list):

            for token in json_item["document"]:
                if token not in all_tokens:
                    all_tokens.append(token)

            for token in json_item["question"]:
                if token not in all_tokens:
                    all_tokens.append(token)

    with open(args.dev_file) as f:

        json_list = [json.loads(line) for line in f]

        pbar = ProgressBar()

        for json_item in pbar(json_list):

            for token in json_item["document"]:
                if token not in all_tokens:
                    all_tokens.append(token)

            for token in json_item["question"]:
                if token not in all_tokens:
                    all_tokens.append(token)

    # vocab_file = 'vocab_squad1_1.txt'

    with open(args.vocab_file, 'w') as fout:
        fout.write('\n'.join(all_tokens))

    # Location of pretrained LM.
    # options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
    # weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

    # Dump the token embeddings to a file. Run this once for your dataset.
    # token_embedding_file = 'elmo_token_embeddings_squad1_1.hdf5'

    # gpu id
    # if you want to use cpu, set gpu=-1
    # gpu = -1
    # batch size: encoding one token at a time is inefficient,
    # while encoding too many at once can run out of memory
    batchsize = 64

    dump_token_embeddings(args.vocab_file,
                          args.options_file,
                          args.weight_file,
                          args.token_embedding_file,
                          gpu=args.gpu,
                          batchsize=batchsize)
Example #8
    def getElmoEmbedding(self):
        dump_token_embeddings(self.vocab_file, self.option_file,
                              self.weight_file, self.tokenEmbeddingFile)
Example #9
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

# Dump the token embeddings to a file. Run this once for your dataset.
token_embedding_file = 'elmo_token_embeddings.hdf5'

# gpu id
# if you want to use cpu, set gpu=-1
gpu = -1
# batch size: encoding one token at a time is inefficient,
# while encoding too many at once can run out of memory
batchsize = 64

dump_token_embeddings(vocab_file,
                      options_file,
                      weight_file,
                      token_embedding_file,
                      gpu=gpu,
                      batchsize=batchsize)

###########################################
"""
Differences from usage of character-elmo are only simple two points:
1. use TokenBatcher(vocab_file) instead of Batcher(vocab_file)
2. add token_embedding_file and token batcher for Elmo instantiation
"""

# Create a TokenBatcher to map text to token ids.
batcher = TokenBatcher(vocab_file)  # REQUIRED

# Build the Elmo with biLM and weight layers.
elmo = Elmo(
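
The first point above, TokenBatcher, maps pre-tokenized sentences to rows of token ids drawn from the vocabulary file. A minimal sketch of that mapping, assuming the vocab_file defined earlier in this example (the sample sentences are arbitrary):

from bilm import TokenBatcher

# batch_sentences takes lists of tokens and returns an int array of shape
# (n_sentences, max_length + 2); the extra two positions are <S> and </S>.
batcher = TokenBatcher(vocab_file)
token_ids = batcher.batch_sentences([['Pretrained', 'biLMs', 'are', 'useful'],
                                     ['ELMo', 'embeddings']])
print(token_ids.shape)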
Example #10
def get_vocab(config):
    print("Get the vocabulary...")
    word_counter, char_counter = Counter(), Counter()
    pos_counter, ner_counter, label_counter = Counter(), Counter(), Counter()
    files = [(config.train_para_file, config.train_question_file),
             (config.dev_para_file, config.dev_question_file)]
    for para_file, que_file in files:
        with open("{}.tok".format(para_file), 'r') as fp, open("{}.tok".format(que_file), 'r') as fq, \
                open("{}.pos".format(para_file), 'r') as fpp, open("{}.pos".format(que_file), 'r') as fqp, \
                open("{}.ner".format(para_file), 'r') as fpn, open("{}.ner".format(que_file), 'r') as fqn, \
                open("{}.label".format(para_file), 'r') as fpl:
            while True:
                para, question = fp.readline(), fq.readline()
                pos, que_pos = fpp.readline(), fqp.readline()
                ner, que_ner = fpn.readline(), fqn.readline()
                label = fpl.readline()
                if not question or not para:
                    break
                if config.lower_word:
                    para = para.lower()
                    question = question.lower()
                para_tokens = para.strip().split(' ')
                que_tokens = question.strip().split(' ')
                pos_tags = pos.strip().split(' ')
                ner_tags = ner.strip().split(' ')
                que_pos_tags = que_pos.strip().split(' ')
                que_ner_tags = que_ner.strip().split(' ')
                labels = label.strip().split(' ')
                for token in para_tokens + que_tokens:
                    word_counter[token] += 1
                    for char in list(token):
                        char_counter[char] += 1
                for pos_tag in pos_tags + que_pos_tags:
                    pos_counter[pos_tag] += 1
                for ner_tag in ner_tags + que_ner_tags:
                    ner_counter[ner_tag] += 1
                for label in labels:
                    label_counter[label] += 1
    word_emb_mat, word2idx_dict, unk_num = get_word_embedding(
        word_counter,
        emb_file=config.glove_word_file,
        emb_size=config.glove_word_size,
        vocab_size=config.vocab_size_limit,
        vec_size=config.glove_dim,
        vocab_file=config.vocab_file)
    char_emb_mat, char2idx_dict = get_tag_embedding(char_counter,
                                                    "char",
                                                    vec_size=config.char_dim)
    pos_emb_mat, pos2idx_dict = get_tag_embedding(pos_counter,
                                                  "pos",
                                                  vec_size=config.pos_dim)
    ner_emb_mat, ner2idx_dict = get_tag_embedding(ner_counter,
                                                  "ner",
                                                  vec_size=config.ner_dim)
    label_emb_mat, label2idx_dict = get_tag_embedding(
        label_counter, "label", vec_size=config.label_dim)
    print("{} out of {} are not in glove".format(unk_num, len(word2idx_dict)))
    print("{} chars".format(char_emb_mat.shape[0]))
    print("{} pos tags, {} ner tags, {} answer labels, {} chars".format(
        pos_emb_mat.shape[0], ner_emb_mat.shape[0], label_emb_mat.shape[0],
        char_emb_mat.shape[0]))
    save(config.word_emb_file, word_emb_mat, message="word embedding")
    save(config.char_emb_file, char_emb_mat, message="char embedding")
    save(config.pos_emb_file, pos_emb_mat, message="pos embedding")
    save(config.ner_emb_file, ner_emb_mat, message="ner embedding")
    save(config.label_emb_file, label_emb_mat, message="label embedding")
    save(config.word_dictionary, word2idx_dict, message="word dictionary")
    save(config.char_dictionary, char2idx_dict, message="char dictionary")
    save(config.pos_dictionary, pos2idx_dict, message="pos dictionary")
    save(config.ner_dictionary, ner2idx_dict, message="ner dictionary")
    save(config.label_dictionary, label2idx_dict, message="label dictionary")
    print("Dump elmo word embedding...")
    token_embedding_file = config.embedding_file
    dump_token_embeddings(config.vocab_file, config.elmo_options_file,
                          config.elmo_weight_file, token_embedding_file)
Example #11
    def _make_dump_token_embeddings(self):

        dump_token_embeddings(self.vocab_path, self.elmo_options_file,
                              self.elmo_weight_file, self.token_embedding_file)
Example #12
options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

# Dump the token embeddings to a file. Run this once for your dataset.
token_embedding_file = 'elmo_token_embeddings.hdf5'

# gpu id
# if you want to use cpu, set gpu=-1
gpu = -1
# batch size: encoding one token at a time is inefficient,
# while encoding too many at once can run out of memory
batchsize = 64

dump_token_embeddings(
    vocab_file, options_file, weight_file, token_embedding_file,
    gpu=gpu, batchsize=batchsize
)

###########################################
"""
Differences from usage of character-elmo are only simple two points:
1. use TokenBatcher(vocab_file) instead of Batcher(vocab_file)
2. add token_embedding_file and token batcher for Elmo instantiation
"""

# Create a TokenBatcher to map text to token ids.
batcher = TokenBatcher(vocab_file)  # REQUIRED

# Build the Elmo with biLM and weight layers.
elmo = Elmo(
    options_file,
Example #13
# Collect the vocabulary, starting with the sentence boundary markers.
all_tokens = set(['<S>', '</S>'])
for context_sentence in tokenized_context:
    for token in context_sentence:
        all_tokens.add(token)
vocab_file = 'vocab_small.txt'
with open(vocab_file, 'w') as fout:
    fout.write('\n'.join(all_tokens))

# Location of pretrained LM.  Here we use the test fixtures.
datadir = os.path.join('tests', 'fixtures', 'model')
options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir, 'lm_weights.hdf5')

# Dump the token embeddings to a file. Run this once for your dataset.
token_embedding_file = 'elmo_token_embeddings.hdf5'
dump_token_embeddings(
    vocab_file, options_file, weight_file, token_embedding_file
)
tf.reset_default_graph()



## Now we can do inference.
# Create a TokenBatcher to map text to token ids.
batcher = TokenBatcher(vocab_file)

# Input placeholders to the biLM.
context_token_ids = tf.placeholder('int32', shape=(None, None))
question_token_ids = tf.placeholder('int32', shape=(None, None))

# Build the biLM graph.
bilm = BidirectionalLanguageModel(
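
The snippet is cut off here; following the same pattern as Example #3 above, the graph would typically be completed and run roughly as follows (a hedged sketch, not the original file's exact continuation):

# Sketch only: finish building the graph with token inputs, add a learned
# weighting of the biLM layers, and run one forward pass.
bilm = BidirectionalLanguageModel(
    options_file,
    weight_file,
    use_character_inputs=False,
    embedding_weight_file=token_embedding_file)

context_embeddings_op = bilm(context_token_ids)
elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # batch_sentences expects pre-tokenized sentences (lists of strings).
    context_ids = batcher.batch_sentences([['Pretrained', 'biLMs', 'are', 'useful']])
    elmo_vectors = sess.run(elmo_context_input['weighted_op'],
                            feed_dict={context_token_ids: context_ids})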