def ee(model, data):
    """Embed all sentences in a CoNLL-style file with an ELMo model.

    Parameters:
        model: path to the pretrained ELMo model directory (passed to Embedder).
        data: path to the column-format data file read by cd.read_col_data.

    Returns:
        (elmos, sids): the list of per-sentence embedding arrays from
        sents2elmo, and a tuple of the matching sentence ids.
    """
    embedder = Embedder(model)
    sids = []
    token_seqs = []
    # Explicit loop instead of zip(*[...]): the original raised ValueError
    # on an empty file; this returns ([], ()) instead.
    for sent in cd.read_col_data(data):
        sids.append(sent.id)
        token_seqs.append([tok.form for tok in sent])
    print("Embedding...")
    elmos = embedder.sents2elmo(token_seqs)
    return elmos, tuple(sids)
# Example #2 (vote count: 0) — snippet-dump separator, not executable code
def create_weight_matrix_elmo(word2idx, n_words, vector_dimension=1024,
                              elmo_path='./elmo'):
    """Build a weight matrix mapping vocabulary indices to ELMo vectors.

    Parameters:
        word2idx: dict mapping word -> row index in the matrix.
        n_words: vocabulary size; the matrix has n_words + 1 rows so that
            an extra row (typically index 0, used for padding) stays zero.
        vector_dimension: length of each ELMo vector (default 1024).
        elmo_path: directory of the pretrained ELMo model (default './elmo',
            matching the original hard-coded path).

    Returns:
        numpy array of shape (n_words + 1, vector_dimension); rows for
        words without an embedding remain zero.
    """
    elmo_embedder = Embedder(elmo_path)

    not_found = 0
    embedding_matrix = np.zeros((n_words + 1, vector_dimension))

    for word, i in word2idx.items():
        # Embed the word as a single-token sentence and take the vector
        # of that one token.
        embedded = elmo_embedder.sents2elmo([[word]])
        embedding_vector = embedded[0][0]

        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            not_found += 1

    # print('%s words could not be found.' % not_found)
    return embedding_matrix
# Example #3 (vote count: 0) — snippet-dump separator, not executable code
#TODO support batches

import sys
import os
sys.path.append('.')
from ELMoForManyLangs.elmoformanylangs import Embedder

if len(sys.argv) < 3:
    print('please provide embeddings and conl file')
    exit(0)

converter = Embedder(sys.argv[1])


def _write_sent(sent_toks, out):
    """Embed one sentence (list of token column-lists) and write its CoNLL
    lines with an appended 'emb=' column, followed by a blank line."""
    # Column 0 of each token row is the word form.
    emb = converter.sents2elmo([[tok[0] for tok in sent_toks]])[0]
    for idx, tok in enumerate(sent_toks):
        emb_str = 'emb=' + ','.join(str(v) for v in emb[idx])
        out.write('\t'.join(tok + [emb_str]) + '\n')
    out.write('\n')


curSent = []
# `with` guarantees both files are closed (the original leaked both handles).
with open(sys.argv[2]) as inFile, open(sys.argv[2] + '.elmo', 'w') as outFile:
    for line in inFile:
        if len(line) < 2:  # blank line terminates a sentence
            if curSent:  # skip spurious consecutive blank lines
                _write_sent(curSent, outFile)
                curSent = []
        else:
            curSent.append(line.strip().split('\t'))
    # The original dropped a final sentence that lacked a trailing blank line.
    if curSent:
        _write_sent(curSent, outFile)