def ee(model, data):
    """Embed every sentence of a column-formatted file with ELMo.

    `Embedder` comes from elmoformanylangs; `cd.read_col_data` is a
    project-local reader for column (CoNLL-style) data.
    """
    e = Embedder(model)
    sentences = cd.read_col_data(data)
    # Pair each sentence id with its list of token forms.
    sids, sents = zip(*[(sent.id, [t.form for t in sent])
                        for sent in sentences])
    print("Embedding...")
    elmos = e.sents2elmo(sents)  # one vector matrix per sentence
    return elmos, sids
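For orientation, `sents2elmo` returns one NumPy array per input sentence. A minimal usage sketch; the model directory and input path are placeholders:

# Hypothetical model directory and input file.
elmos, sids = ee('models/en', 'dev.conllu')
for sid, mat in zip(sids, elmos):
    # By default each matrix is (sentence_length, 1024): the averaged
    # ELMo layers for every token.
    print(sid, mat.shape)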
Example #2
import numpy as np
from elmoformanylangs import Embedder


def create_weight_matrix_elmo(word2idx, n_words, vector_dimension=1024):
    elmoEmbedder = Embedder('./elmo')

    notFound = 0
    embedding_matrix = np.zeros((n_words + 1, vector_dimension))

    for word, i in word2idx.items():
        # Embed the word as a one-token sentence; sents2elmo returns a
        # list containing one (1, 1024) array.
        result = elmoEmbedder.sents2elmo([[word]])
        if result:
            embedding_matrix[i] = result[0][0]
        else:
            notFound += 1

    # print('%s words could not be found.' % notFound)
    return embedding_matrix
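A common next step is to freeze this matrix inside an embedding layer. A minimal sketch assuming Keras; `word2idx` and `n_words` come from the caller's vocabulary:

from tensorflow.keras.layers import Embedding

# Load the precomputed ELMo vectors into a frozen embedding layer.
embedding_matrix = create_weight_matrix_elmo(word2idx, n_words)
emb_layer = Embedding(input_dim=n_words + 1,
                      output_dim=1024,
                      weights=[embedding_matrix],
                      trainable=False)  # keep the pretrained vectors fixed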
Example #3
import json
import os
from pathlib import Path

from elmoformanylangs import Embedder
# convert_config and create_elmo_h5_from_embedder are project-local helpers.


def convert(args):
    model_path = args.model_path
    output_path = args.output_path
    config_path = Path(model_path, 'configs')
    config_files = os.listdir(config_path)
    if len(config_files) != 1 or not config_files[0].endswith('.json'):
        raise ValueError('Exactly one .json config file is expected.')
    config_file = config_files[0]

    with (config_path / config_file).open() as f:
        config = json.load(f)

    # Convert config json
    allennlp_config = convert_config(config)
    config_output_name = 'allennlp_config.json'
    with Path(output_path, config_output_name).open(mode='w') as f:
        json.dump(allennlp_config, f, indent=2)

    # Load char.dic
    with Path(model_path, 'char.dic').open() as f:
        char_dic = {
            line.split('\t')[0]: int(line.split('\t')[1].strip('\n'))
            for line in f
        }

    # Convert ELMo
    embedder = Embedder(model_path)
    model_output_name = 'allennlp_elmo.hdf5'
    output_file = os.path.join(output_path, model_output_name)
    create_elmo_h5_from_embedder(embedder, output_file, config, char_dic)

    # Swap whichever character currently sits at index 0 with '<pad>',
    # then save the updated char.dic.
    top_char = [k for k, v in char_dic.items() if v == 0][0]
    pad_char = '<pad>'
    char_dic[top_char], char_dic[pad_char] = char_dic[pad_char], char_dic[top_char]
    with Path(output_path, 'char_for_allennlp.dic').open(mode='w') as f:
        for k, v in sorted(char_dic.items(), key=lambda x: x[1]):
            f.write('{}\t{}\n'.format(k, v))
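The two artifacts written here (allennlp_config.json and allennlp_elmo.hdf5) can then be consumed on the AllenNLP side. A minimal sketch, assuming an AllenNLP version that still ships ElmoEmbedder (pre-2.0) and placeholder paths:

from allennlp.commands.elmo import ElmoEmbedder

elmo = ElmoEmbedder(options_file='out/allennlp_config.json',
                    weight_file='out/allennlp_elmo.hdf5')
# Returns a (3 layers, n_tokens, 1024) array for one sentence.
vectors = elmo.embed_sentence(['Hello', 'world'])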
Example #4
# TODO: support batches (see the sketch after this example)

import sys

sys.path.append('.')
from ELMoForManyLangs.elmoformanylangs import Embedder

if len(sys.argv) < 3:
    print('please provide embeddings and conll file')
    exit(1)

converter = Embedder(sys.argv[1])
curSent = []
outFile = open(sys.argv[2] + '.elmo', 'w')
for line in open(sys.argv[2]):
    if len(line) < 2:
        # Blank line marks the end of a sentence: embed its word forms
        # (column 0 of each token row).
        sent = [[x[0] for x in curSent]]
        emb = converter.sents2elmo(sent)[0]
        for itemIdx in range(len(curSent)):
            embStr = 'emb=' + ','.join([str(x) for x in emb[itemIdx]])
            outFile.write('\t'.join(curSent[itemIdx] + [embStr]) + '\n')
        outFile.write('\n')
        curSent = []
    else:
        tok = line.strip().split('\t')
        curSent.append(tok)

# NOTE: assumes the input ends with a blank line; a trailing sentence
# without one would not be written.
outFile.close()
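A minimal sketch of the batching the TODO at the top asks for: buffer finished sentences and embed several per sents2elmo call. `flush`, `buffer`, and `BATCH_SIZE` are hypothetical names; `converter` and `outFile` are the script's own:

BATCH_SIZE = 32
buffer = []  # (token_rows, word_forms) per finished sentence

def flush(buffer):
    # One sents2elmo call for the whole batch instead of one per sentence.
    embs = converter.sents2elmo([forms for _, forms in buffer])
    for (rows, _), emb in zip(buffer, embs):
        for row, vec in zip(rows, emb):
            outFile.write('\t'.join(row + ['emb=' + ','.join(map(str, vec))]) + '\n')
        outFile.write('\n')
    buffer.clear()

# In the main loop: buffer.append((curSent, [x[0] for x in curSent])) at each
# sentence boundary, call flush(buffer) when len(buffer) == BATCH_SIZE,
# and once more at end of file.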
Example #5
import os

from ELMoForManyLangs.elmoformanylangs import Embedder
from hazm import sent_tokenize
from hazm import word_tokenize
import numpy as np
import pandas as pd
from visualize import visual_cluster_by_hashtags
from visualize import visual_cluster_by_keyword
from visualize import create_word_clod
import util

from sklearn.cluster import Birch
from sklearn.cluster import KMeans

e = Embedder('vecs/elmo-intsa/')

X = []
X_Word_embeding = []
X_LSTM_1 = []
X_LSTM_2 = []
X_avg = []


def _save_embeddings_parts(X_Word_embeding, X_LSTM_1, X_LSTM_2, X_avg,
                           postfix):

    store = pd.HDFStore(
        'instaemb_3hmiddle/insta_embedding_{0}.h5'.format(postfix))
    store['X_Word_embeding'] = pd.DataFrame(X_Word_embeding)
    store['X_LSTM_1'] = pd.DataFrame(X_LSTM_1)
    # The remaining parts follow the same pattern.
    store['X_LSTM_2'] = pd.DataFrame(X_LSTM_2)
    store['X_avg'] = pd.DataFrame(X_avg)
    store.close()
Example #6
# os and pandas are imported at the top of this file (see Example #5);
# matplotlib.pyplot is assumed for the histogram below.
import matplotlib.pyplot as plt


def load_data(embedding_type):
    # Reconstructed header: the call site in __main__ passes embedding_type;
    # list_hd5s is assumed to enumerate the saved HDF5 part files.
    list_hd5s = sorted(os.listdir('instaemb4l_shuffle/'))
    dfs = []
    for file in list_hd5s:
        store = pd.HDFStore('instaemb4l_shuffle/' + file)
        dfs.append(store[embedding_type])
        if len(dfs) >= 100:
            break
    return pd.concat(dfs)


if __name__ == '__main__':

    # --------------------------------------------4l experiment----------------------------------------------
    datatype = 'hash_shuffle10h'
    # datatype='hashtag3hlast'
    # datatype='hashtag3hmiddle'
    # datatype = 'justtext'
    e = Embedder('vecs/elmo-insta-4l/')
    problems = calculate_embeding(datatype)
    problems = [1558, 9317]  # hard-coded override of the computed result

    for num_of_cluster in [100, 200, 300, 400]:
        for embtype in [
                'X_LSTM_1', 'X_LSTM_2', 'X_LSTM_3', 'X_avg', 'X_LSTM_4'
        ]:
            which_hashtag = 'fa_hashtag_list'
            # which_hashtag='main_hashtag_list'

            data = load_data(embedding_type=embtype)
            # Quick sanity check: histogram of the column named "0".
            data.hist("0")
            plt.show()

            data = data.iloc[:, 1024:2048]
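The excerpt stops before the clustering step; a minimal sketch of how the sliced embeddings could feed the KMeans imported earlier (variable names reused from the loop above, `labels` is hypothetical):

# Hypothetical continuation: cluster the 1024-dim slice and keep the
# labels for the later hashtag/keyword visualizations.
labels = KMeans(n_clusters=num_of_cluster).fit_predict(data.values)
data = data.assign(cluster=labels)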