Example #1
import os

from bilm import dump_bilm_embeddings


def prepare_elmo_features(path, dataset, vocab_file, options_file,
                          weight_file):
    """Dump ELMo embeddings for a dataset split to an HDF5 file.

    Parameters
    ----------
    path: str
        Directory that holds the dataset split and receives the output.
    dataset: str
        Name of the split, used to build the file names.
    vocab_file: str
    options_file: str
    weight_file: str
    """
    # Assumes the tokenized sentences for the split live in "<dataset>.txt"
    # inside `path`; adjust if your layout differs.
    dataset_file = os.path.join(path, dataset + ".txt")
    embedding_file = os.path.join(path, "X_elmo_" + dataset + ".hdf5")
    dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                         embedding_file)
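A minimal call sketch for the helper above; the directory and split name are illustrative assumptions (as is the "<dataset>.txt" input layout), while the model file names match the ones used later in this listing.

prepare_elmo_features(path='data/my_corpus',  # hypothetical directory
                      dataset='train',        # hypothetical split name
                      vocab_file='vocab-2016-09-10.txt',
                      options_file='elmo_2x4096_512_2048cnn_2xhighway_options.json',
                      weight_file='elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5')
# Reads data/my_corpus/train.txt and writes data/my_corpus/X_elmo_train.hdf5.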
Example #2
    def process_batch(self, sentences):
        tokenized_context = [
            sentence.strip().split() for sentence in sentences
        ]

        # Count how often each token occurs in the batch so the summed
        # vectors can be averaged at the end.
        freq_map = {}
        for tokens in tokenized_context:
            for token in tokens:
                freq_map[token] = freq_map.get(token, 0.0) + 1.0

        # Here dump_bilm_embeddings is expected to return an in-memory map
        # from sentence index to per-layer embeddings (unlike the variant in
        # the other examples, which writes an HDF5 file).
        embedding_map = dump_bilm_embeddings(self.vocab_file, sentences,
                                             self.options_file,
                                             self.weight_file)

        # Sum the layer-concatenated vector for every occurrence of a token.
        ret_map = {}
        for sent_id, tokens in enumerate(tokenized_context):
            sent_embedding = embedding_map[sent_id]
            for i, token in enumerate(tokens):
                concat = np.concatenate([
                    sent_embedding[0][i], sent_embedding[1][i],
                    sent_embedding[2][i]
                ])
                if token in ret_map:
                    ret_map[token] = ret_map[token] + concat
                else:
                    ret_map[token] = concat
                assert len(ret_map[token]) == 3 * 1024

        # Average each token's summed vector over its number of occurrences.
        ret_map_avg = {}
        for token, summed in ret_map.items():
            ret_map_avg[token] = list(summed / freq_map[token])

        tf.reset_default_graph()
        return ret_map_avg
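A consumption sketch for the averaged map returned above; `embedder` (an instance of the class defining process_batch) and the batch contents are illustrative assumptions.

# embedder is a hypothetical instance of the class that defines process_batch.
token_vectors = embedder.process_batch([
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .',
])
# Each value is a 3 * 1024 = 3072-dimensional list: the three biLM layers
# concatenated per token, averaged over the token's occurrences in the batch.
print(len(token_vectors['biLMs']))  # 3072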
Example #3
import argparse
import json

from bilm import dump_bilm_embeddings

parser = argparse.ArgumentParser()
# Flags assumed from their use below: --gpu selects the device (-1 = CPU)
# and --batchsize bounds memory per forward pass.
parser.add_argument('--gpu',
                    type=int,
                    default=-1,
                    help='GPU id (-1 to run on CPU)')
parser.add_argument('--batchsize',
                    type=int,
                    default=32,
                    help='Minibatch size of computation')
parser.add_argument('--input',
                    '-in',
                    '-i',
                    required=True,
                    help='Path of input text file')
parser.add_argument('--output',
                    '-out',
                    '-o',
                    required=True,
                    help='Path of output file to be written')
args = parser.parse_args()
print(json.dumps(args.__dict__, indent=2))

# Location of pretrained LM.
vocab_file = 'vocab-2016-09-10.txt'
options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

dataset_file = args.input
embedding_file = args.output
assert args.input != args.output

dump_bilm_embeddings(vocab_file,
                     dataset_file,
                     options_file,
                     weight_file,
                     embedding_file,
                     gpu=args.gpu,
                     batchsize=args.batchsize)
Example #4
import os

import h5py
from bilm import dump_bilm_embeddings

# Our small dataset.
raw_context = [
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .'
]
tokenized_context = [sentence.split() for sentence in raw_context]
tokenized_question = [
    ['What', 'are', 'biLMs', 'useful', 'for', '?'],
]

# Create the dataset file.
dataset_file = 'dataset_file.txt'
with open(dataset_file, 'w') as fout:
    for sentence in tokenized_context + tokenized_question:
        fout.write(' '.join(sentence) + '\n')

# Location of pretrained LM.  Here we use the test fixtures.
datadir = os.path.join('tests', 'fixtures', 'model')
vocab_file = os.path.join(datadir, 'vocab_test.txt')
options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir, 'lm_weights.hdf5')

# Dump the embeddings to a file. Run this once for your dataset.
embedding_file = 'elmo_embeddings.hdf5'
dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                     embedding_file)

# Load the embeddings from the file -- here the 2nd sentence.
with h5py.File(embedding_file, 'r') as fin:
    second_sentence_embeddings = fin['1'][...]
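The loaded array is laid out as (n_layers=3, sequence_length, embedding_dim), as the shape printed in Example #6 shows, so a natural follow-up is to collapse the layer axis; the mixing weights below are arbitrary placeholders, not learned ELMo scalars.

import numpy as np

layer_weights = np.array([0.2, 0.3, 0.5])  # placeholder weights, one per layer
mixed = np.tensordot(layer_weights, second_sentence_embeddings, axes=1)
print(mixed.shape)  # (sequence_length, embedding_dim): one vector per token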
Example #6
import h5py
from bilm import dump_bilm_embeddings

# Our small dataset, tokenized as in Example #4.
raw_context = [
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .'
]
tokenized_context = [sentence.split() for sentence in raw_context]
tokenized_question = [
    ['What', 'are', 'biLMs', 'useful', 'for', '?'],
]

# Create the dataset file.
dataset_file = 'dataset_file.txt'
with open(dataset_file, 'w') as fout:
    for sentence in tokenized_context + tokenized_question:
        fout.write(' '.join(sentence) + '\n')

# Location of pretrained LM.
vocab_file = 'vocab-2016-09-10.txt'
options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

# Dump the embeddings to a file. Run this once for your dataset.
embedding_file = 'elmo_embeddings.hdf5'

# GPU id; set gpu=-1 to run on the CPU.
gpu = -1
# Batch size: encoding a few tokens at a time is inefficient, while
# encoding too many at once can exhaust memory.
batchsize = 32

dump_bilm_embeddings(
    vocab_file, dataset_file, options_file, weight_file, embedding_file,
    gpu=gpu, batchsize=batchsize
)

# Load the embeddings from the file -- here the 2nd sentence.
with h5py.File(embedding_file, 'r') as fin:
    second_sentence_embeddings = fin['1'][...]
    print(second_sentence_embeddings.shape)
    # (n_layers=3, sequence_length, embedding_dim)
    print(second_sentence_embeddings)
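The HDF5 file keys each sentence's array by its line index as a string (hence the '1' lookup above); a small sketch for walking every sentence, assuming only that layout.

with h5py.File(embedding_file, 'r') as fin:
    for sent_id in sorted(fin.keys(), key=int):
        emb = fin[sent_id][...]
        # emb: (n_layers=3, sequence_length, embedding_dim) for this sentence
        print(sent_id, emb.shape)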
Example #7
import os

from bilm import dump_bilm_embeddings

#         for sentence in data:
#             fout.write(sentence + '\n')

# Location of pretrained LM: the official small ELMo model.
model_dir = '/crs_elmo/bilm-tf/model/official/small'
vocab_file = os.path.join(model_dir, 'vocab-2016-09-10.txt')
elmo_options_file = os.path.join(
    model_dir, 'elmo_2x1024_128_2048cnn_1xhighway_options.json')
elmo_weight_file = os.path.join(
    model_dir, 'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')
data_dir = '/crs_elmo/downstream_data/XNLI'
max_seq_len = 50

# Dump the embeddings to a file. Run this once for your dataset.
embedding_files = [
    'train_elmo_a.hdf5', 'train_elmo_b.hdf5', 'dev_elmo_a.hdf5',
    'dev_elmo_b.hdf5'
]
# dataset_files holds the tokenized XNLI input files matching
# embedding_files (defined above, not shown).
# for dataset_file, embedding_file in zip(dataset_files, embedding_files):
dataset_file = dataset_files[1]
embedding_file = embedding_files[1]
print(dataset_file, embedding_file)
dataset_file = os.path.join(data_dir, dataset_file)
dump_bilm_embeddings(vocab_file, dataset_file, elmo_options_file,
                     elmo_weight_file, embedding_file, max_seq_len)

# Load the embeddings from the file -- here the 1st sentence.
# with h5py.File(os.path.join(data_dir, embedding_files[0]), 'r') as fin:
#     print("shape: ", fin['0'].shape)
#     print(fin['0'])
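A hedged sanity check of the dumped file, reusing embedding_file and max_seq_len from above and assuming both the string-index layout seen in the other examples and that this fork's max_seq_len argument caps each sentence's length.

import h5py

with h5py.File(embedding_file, 'r') as fin:
    for sent_id in sorted(fin.keys(), key=int):
        n_layers, seq_len, dim = fin[sent_id].shape
        # Assumption: max_seq_len truncates or limits each sentence.
        assert seq_len <= max_seq_len
        print(sent_id, n_layers, seq_len, dim)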