import os

import pyemblib


def get_embedding_dict(emb_path, emb_format, first_n, vocab):
    """Read an embedding file into a dict mapping tokens (str) to vectors.

    Note: vocab is accepted for interface parity but unused here; see
    process_embedding for the variant that filters to a vocabulary.
    """
    print("Preprocessing.")
    extension = os.path.basename(emb_path).split('.')[-1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    read_mode = None
    if first_n == 0 or emb_format == pyemblib.Format.Glove:
        print("No value passed for first_n or feature not supported.")
        first_n = None
    if extension == 'bin':
        read_mode = pyemblib.Mode.Binary
        print("binary read.")
    elif extension == 'txt':
        read_mode = pyemblib.Mode.Text
        print("text read.")
    else:
        print("Unsupported embedding file extension.")
        exit()

    # if emb_format == pyemblib.Format.Glove:
    #     embedding = loadGloveModel(emb_path)

    if first_n:
        embedding = pyemblib.read(emb_path,
                                  format=emb_format,
                                  mode=read_mode,
                                  first_n=first_n,
                                  replace_errors=True,
                                  skip_parsing_errors=True)
    else:
        embedding = pyemblib.read(emb_path,
                                  format=emb_format,
                                  mode=read_mode,
                                  replace_errors=True,
                                  skip_parsing_errors=True)
    return embedding
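# A minimal usage sketch for get_embedding_dict. The file name is an
# assumption for illustration; any word2vec-format .txt embedding file
# readable by pyemblib works here.
embedding = get_embedding_dict("glove.840B.300d_Word2Vec_format.txt",
                               emb_format=pyemblib.Format.Word2Vec,
                               first_n=5000,
                               vocab=None)
print("Loaded %d vectors." % len(embedding))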
import numpy as np
from flask import render_template
from sklearn.manifold import MDS

import pyemblib


def loadEmbeddings():
    """Flask view: project the embeddings down to 3-D with MDS for display."""
    # embedding_file is expected to be defined at module level.
    embeddings = pyemblib.read(embedding_file, mode=pyemblib.Mode.Text)
    keys = list(embeddings.keys())
    values = np.array(list(embeddings.values()))

    # Multidimensional scaling down to 3 components for the 3-D viewer.
    mds = MDS(n_components=3)
    values_transformed = mds.fit_transform(values)

    transformed = {}
    for i in range(len(keys)):
        transformed[keys[i]] = values_transformed[i].tolist()
    return render_template('look_at.html', data=transformed)
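# A minimal sketch of how the view above might be wired into a Flask app.
# The route and the embedding_file value are assumptions, not taken from
# the original source (the file name mirrors the one used elsewhere in
# this collection).
from flask import Flask

app = Flask(__name__)

embedding_file = "top_10000_emb.txt"  # hypothetical path; adjust as needed

app.add_url_rule('/embeddings', 'loadEmbeddings', loadEmbeddings)

if __name__ == '__main__':
    app.run(debug=True)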
import pyemblib


def read_embedding(emb_path):
    """Read an embedding file into a dict mapping tokens (str) to vectors."""
    print("READING.")
    # The last character of the path distinguishes ".bin" from ".txt".
    last_char = emb_path[-1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    if last_char == 'n':
        embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Binary)
    elif last_char == 't':
        embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Text)
    else:
        print("Unsupported embedding format.")
        exit()
    return embedding
import pyemblib


def subset_embedding(emb_path, first_n, vocab):
    """Read the first_n vectors from an embedding file and write them out
    as a new text-format embedding file alongside the original.

    Note: vocab is accepted for interface parity but unused here.
    """
    print("Preprocessing.")

    # Make sure the path has a valid file extension *before* reading; the
    # original checked this after the read, by which point a bad path had
    # already been passed to pyemblib.
    extension = emb_path[-4:]
    if extension not in (".txt", ".bin"):
        print("Invalid file path.")
        exit()

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    if extension == ".bin":
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Binary,
                                  first_n=first_n)
    else:
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Text,
                                  first_n=first_n)

    # Get the emb_path without the file extension, and write the subset
    # to a new text embedding file.
    path_no_ext = emb_path[:-4]
    new_path = path_no_ext + "_SUBSET.txt"
    pyemblib.write(embedding, new_path, mode=pyemblib.Mode.Text)
    return
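# A minimal usage sketch for subset_embedding. The file name is an
# assumption; any pyemblib-readable .txt or .bin embedding file works.
# This writes the first 1000 vectors to "glove.6B.300d_SUBSET.txt".
subset_embedding("glove.6B.300d.txt", first_n=1000, vocab=None)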
        help='number of threads to use for parallel calculation (default: %default)',
        type='int', default=1)
    parser.add_option('--batch-size', dest='batch_size',
        help='number of samples to process in each batch (default: %default)',
        type='int', default=25)
    parser.add_option('--keys', dest='keysf',
        help='file listing keys to restrict NN analysis to')
    parser.add_option('-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        exit()
    embf, outf = args
    return (embf, options.mode, options.keysf, outf, options.top_k,
            options.batch_size, options.threads, options.logfile)

embf, embf_mode, keysf, outf, top_k, batch_size, threads, logfile = _cli()

if keysf:
    keys = readKeys(keysf)
    print("Read %d keys to restrict to" % len(keys))
else:
    keys = None

t = log.startTimer('Reading embeddings...', newline=False)
embeds = pyemblib.read(embf, mode=embf_mode, filter_to=keys, lower_keys=True)
log.stopTimer(t, message='Done! Read %d embeddings ({0:.2f}s)' % len(embeds))

nearest_neighbors = calculateNearestNeighbors(embeds, outf, top_k=top_k,
                                              batch_size=batch_size,
                                              threads=threads)
log.writeln('Wrote nearest neighbors to %s.' % outf)
import os
import sys

import numpy as np

import pyemblib


def process_embedding(emb_path, emb_format, first_n, vocab):
    """Read an embedding file and return (vectors_matrix, label_array),
    optionally restricted to the first_n vectors and/or a vocab subset."""
    print("Preprocessing.")
    extension = os.path.basename(emb_path).split('.')[-1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    read_mode = None
    if first_n == 0 or emb_format == pyemblib.Format.Glove:
        print("No value passed for first_n or feature not supported.")
        first_n = None
    if extension == 'bin':
        read_mode = pyemblib.Mode.Binary
        print("binary read.")
    elif extension == 'txt':
        read_mode = pyemblib.Mode.Text
        print("text read.")
    else:
        print("Unsupported embedding file extension.")
        exit()

    # if emb_format == pyemblib.Format.Glove:
    #     embedding = loadGloveModel(emb_path)

    if first_n:
        embedding = pyemblib.read(emb_path,
                                  format=emb_format,
                                  mode=read_mode,
                                  first_n=first_n,
                                  replace_errors=True,
                                  skip_parsing_errors=True)
    else:
        embedding = pyemblib.read(emb_path,
                                  format=emb_format,
                                  mode=read_mode,
                                  replace_errors=True,
                                  skip_parsing_errors=True)

    # Take a subset of the vocab, if one was requested.
    if vocab is not None:
        embedding = {word: embedding[word]
                     for word in vocab if word in embedding}

    # Split the dict into a label array of tokens and a [num_tokens, d]
    # matrix of vectors. (The original went through
    # np.array(embedding.items()), which fails on Python 3 dict views.)
    label_array = np.array(list(embedding.keys()))
    vectors_matrix = np.array(list(embedding.values()))
    sys.stdout.flush()

    # Legacy pandas route, kept for reference; note that
    # DataFrame.as_matrix() was removed from pandas (.to_numpy() replaces it):
    # emb_df = pd.Series(embedding, name="words_with_friends")
    # emb_df = emb_df.reset_index()
    # emb_matrix = emb_df.words_with_friends.values.tolist()
    # vectors_df = pd.DataFrame(emb_matrix, index=emb_df.index)
    # vectors_matrix = vectors_df.as_matrix()

    return vectors_matrix, label_array
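# A minimal usage sketch for process_embedding. The file name is an
# assumption (it mirrors the text embedding used elsewhere in this
# collection).
vectors_matrix, label_array = process_embedding(
    "top_10000_emb.txt",
    emb_format=pyemblib.Format.Word2Vec,
    first_n=10000,
    vocab=None,
)
print(vectors_matrix.shape)   # (num_tokens, embedding_dim)
print(label_array[:5])        # first five tokens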
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(logfile=options.logfile, stdout_also=True)

# Set the random seed here if necessary.
if options.random_seed <= 0:
    options.random_seed = int(time.time())

t_sub = log.startTimer('Reading source embeddings from %s...'
                       % options.src_embf, newline=False)
src_embs = pyemblib.read(options.src_embf,
                         mode=options.src_embf_mode,
                         lower_keys=True)
log.stopTimer(t_sub, message='Read %d embeddings in {0:.2f}s' % len(src_embs))

t_sub = log.startTimer('Reading target embeddings from %s...'
                       % options.trg_embf, newline=False)
trg_embs = pyemblib.read(options.trg_embf,
                         mode=options.trg_embf_mode,
                         lower_keys=True)
log.stopTimer(t_sub, message='Read %d embeddings in {0:.2f}s' % len(trg_embs))

pivots = readPivotsFile(options.pivotf, tolower=True)
log.writeln('Loaded %d pivot terms.' % len(pivots))
    if len(args) != 0 or \
       (not options.src_embf) or \
       (options.frequent and (not options.trg_vocabf)) or \
       ((not options.trg_embf) and (not options.trg_vocabf)) or \
       (not options.outf):
        parser.print_help()
        exit()
    return options

options = _cli()

if options.stopwordf:
    stopwords = readStopwords(options.stopwordf, tolower=True)
else:
    stopwords = set()

src_embs = pyemblib.read(options.src_embf, mode=options.src_embf_mode)
src_vocab = set([k.lower() for k in src_embs.keys()])

if options.trg_vocabf:
    trg_vocab = readVocab(options.trg_vocabf, tolower=True)
    if not options.frequent:
        trg_vocab = set(trg_vocab.keys())
else:
    trg_embs = pyemblib.read(options.trg_embf, mode=options.trg_embf_mode)
    trg_vocab = set([k.lower() for k in trg_embs.keys()])

if options.frequent:
    pivots = frequentPivotTerms(src_vocab, trg_vocab,
                                num_terms=options.num_pivots,
                                stopwords=stopwords)
else:
    pivots = randomPivotTerms(src_vocab, trg_vocab,
                              num_terms=options.num_pivots,
                              stopwords=stopwords)
# Only one valid data mode for Google and BATS datasets.
if options.dataset in [datasets.Google, datasets.BATS] \
        and options.anlg_type != data_mode.String:
    log.writeln(
        '[WARNING] Invalid --analogy-type setting for %s dataset; Overriding to "%s"'
        % (options.dataset, data_mode.String))
    options.anlg_type = data_mode.String

t_sub = log.startTimer('Reading %s embeddings from %s...'
                       % (options.embeddings_mode, options.embeddings))
separator = '\t' if options.tab_sep else ' '
(fmt, mode) = pyemblib.CLI_Formats.parse(options.embeddings_mode)
embeddings = pyemblib.read(options.embeddings,
                           format=fmt,
                           mode=mode,
                           separator=separator,
                           lower_keys=options.to_lower)
log.stopTimer(t_sub, 'Read {0:,} embeddings in {1}s.\n'.format(
    len(embeddings), '{0:.2f}'))

t_sub = log.startTimer('Running analogy task on %s dataset...'
                       % options.dataset)
results = analogyTask(analogy_file,
                      options.dataset,
                      options.setting,
                      options.anlg_type,
                      embeddings,
                      log=log,
                      predictions_file=options.predictions_file,
        exit()
    (mentionf,) = args
    return mentionf, options

mentionf, options = _cli()
log.start(logfile=options.logfile, stdout_also=True)

if options.tab_separated:
    sep = '\t'
else:
    sep = ' '

t_sub = log.startTimer('Reading entity embeddings from %s...'
                       % options.entity_embf, newline=False)
entity_embeds = pyemblib.read(options.entity_embf, separator=sep)
log.stopTimer(t_sub,
              message='Read %d embeddings ({0:.2f}s)' % len(entity_embeds))

t_sub = log.startTimer('Reading context embeddings from %s...'
                       % options.ctx_embf, newline=False)
ctx_embeds = pyemblib.read(options.ctx_embf, separator=sep)
log.stopTimer(t_sub,
              message='Read %d embeddings ({0:.2f}s)' % len(ctx_embeds))

if options.entity_defnf:
    t_sub = log.startTimer('Reading word embeddings from %s...'
                           % options.word_embf, newline=False)
    # word_embeds = pyemblib.read(options.word_embf)
                          get_lemmas=True)
log.writeln('Mapped dataset info for {0:,} mentions.\n'.format(
    len(mention_map)))

if options.wordnet_baseline_eval_predictions:
    wordnetFirstSenseBaseline(mentions, mention_map,
                              options.wordnet_baseline_eval_predictions)

if options.elmo_baseline_eval_predictions:
    log.writeln('Reading set of training lemmas from %s...'
                % options.training_lemmasf)
    training_lemmas = readTrainingLemmas(options.training_lemmasf)
    log.writeln('Read {0:,} lemmas.\n'.format(len(training_lemmas)))

    log.writeln('Reading SemCor sense embeddings from %s...'
                % options.semcor_embf)
    semcor_embeddings = pyemblib.read(options.semcor_embf)
    log.writeln('Read embeddings for {0:,} senses.\n'.format(
        len(semcor_embeddings)))

    log.writeln('Reading backoff predictions from %s...'
                % options.wordnet_baseline_input_predictions)
    wn_first_sense_preds = loadWSDFrameworkPredictions(
        options.wordnet_baseline_input_predictions)
    log.writeln('Read predictions for {0:,} samples.\n'.format(
        len(wn_first_sense_preds)))

    ELMoBaseline(mentions, mention_map, wn_first_sense_preds,
                 training_lemmas, semcor_embeddings,
                 options.elmo_baseline_eval_predictions)

log.stop()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Input embedding file', embf),
    ('Input embedding file mode', options.embedding_mode),
    ('Output neighbor file', options.outputf),
    ('Ordered vocabulary file', options.vocabf),
    ('Number of nearest neighbors', options.k),
    ('Batch size', options.batch_size),
    ('Number of threads', options.threads),
    ('Partial nearest neighbors file for resuming',
     options.partial_neighbors_file),
], 'k Nearest Neighbor calculation with cosine similarity')

t_sub = log.startTimer('Reading embeddings from %s...' % embf)
emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(emb), '{0:.2f}'))

if not os.path.isfile(options.vocabf):
    log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
    writeNodeMap(emb, options.vocabf)
else:
    log.writeln('Reading node ID <-> vocab map from %s...\n' % options.vocabf)
node_map = readNodeMap(options.vocabf)

# Get the vocabulary in node ID order, and map index in emb_arr
# to node IDs.
from sklearn.manifold import MDS
import numpy as np
import pyemblib
import random

embedding_file = "top_10000_emb.txt"
include = set(['red', 'black', 'green', 'orange', 'apple', 'king', 'queen',
               'man', 'woman', 'moscow', 'russia', 'tokyo', 'japan'])

embeddings = pyemblib.read(embedding_file, mode=pyemblib.Mode.Text)
keys = list(embeddings.keys())
values = list(embeddings.values())

# Keep every word in the include set, plus a ~5% random sample of the rest.
new_vals, new_keys = [], []
for i in range(len(keys)):
    if keys[i] in include or random.random() < 0.05:
        new_keys.append(keys[i])
        new_vals.append(values[i])
values = np.array(new_vals)
keys = new_keys

# Project down to 3 dimensions with multidimensional scaling.
mds = MDS(n_components=3)
realspace = mds.fit_transform(values)

# Write out word,x,y,z rows for the 3-D viewer.
f = open('3dembeddings.csv', 'w')
for i in range(len(keys)):
    f.write(keys[i] + ',' + str(realspace[i][0]) + ',' +
            str(realspace[i][1]) + ',' + str(realspace[i][2]) + '\n')
f.close()
        ('Patience', options.patience),
        ('Early stopping criterion', options.early_stopping),
        ('Max training epochs', options.max_epochs),
        ('Checkpoint file', options.checkpoint_path),
        ('Cross validation splits file', options.cross_validation_file),
        ('Number of folds', options.n_folds),
        ('Fraction of training used for dev', options.dev_size),
        ('Writing predictions to', options.predictions_file),
        ('Writing dev results to', options.dev_results_file),
        ('Random seed', options.random_seed),
    ]),
], 'WordNet classification experiment')

t_sub = log.startTimer('Reading word embeddings from %s...'
                       % options.embedding_f)
embeddings = pyemblib.read(options.embedding_f)
log.stopTimer(t_sub, message='Read {0:,} embeddings ({1}s).\n'.format(
    len(embeddings), '{0:.2f}'))

log.writeln('Reading dataset from %s...' % dataset_f)
ds = dataset.load(dataset_f)
log.writeln('Read {0:,} samples.\n'.format(len(ds)))

preprocessed = preprocess(ds, embeddings, options)

if options.predictions_file:
    preds_stream = codecs.open(options.predictions_file, 'w', 'utf-8')

runCrossValidationExperiment(preprocessed, options, preds_stream)
import pyemblib


def testFunc(emb_path):
    # NOTE: emb_path was an implicit global in the original; it is made an
    # explicit parameter here so the function is self-contained.
    print('testFunc')
    wordsim = open("wordsim353_agreed.txt")
    print("Preprocessing.")
    last_char = emb_path[-1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    if last_char == 'n':
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Binary,
                                  replace_errors=True)
    elif last_char == 't':
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Text,
                                  replace_errors=True)
    else:
        print("Unsupported embedding format.")
        exit()
    print("Source: ", emb_path)

    result = []
    humanRank = []
    cosineRank = []
    numMiss = 0
    full_vocab = []
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
import pyemblib
import gensim
import os

parent = os.path.abspath("../../embeddings/")
path = os.path.abspath("../../embeddings/GoogleNews-vectors-negative300.bin")
glove = os.path.abspath("../../embeddings/glove.840B.300d.txt")
wikitext_path = os.path.abspath("../../embeddings/wiki-news-300d-1M-subword.vec")

# gensim working:
# google_news = KeyedVectors.load_word2vec_format(path, binary=True)
glove2word2vec(glove_input_file=glove,
               word2vec_output_file=os.path.join(
                   parent, "glove.840B.300d_Word2Vec_format.txt"))

# pyemblib not working on the binary file:
# embedding = pyemblib.read(path, format='Word2Vec', mode=pyemblib.Mode.Binary)
wikitext = pyemblib.read(wikitext_path,
                         format='Word2Vec',
                         mode=pyemblib.Mode.Text)
print("Successful read.")
        ('Using Action oracle', options.action_oracle),
        ('Input predictions file', options.input_predsf),
        ('Pre-embedded mentions', options.pre_embedded),
    ]),
], title="Entity linking (disambiguation) experiment using scikit-learn "
         "baseline algorithms")

## Data loading/setup

entity_embeds = []
for i in range(len(options.entity_embfs)):
    f = options.entity_embfs[i]
    t_sub = log.startTimer('Reading set %d of entity embeddings from %s...'
                           % (i + 1, f))
    entity_embeds.append(pyemblib.read(f, lower_keys=True))
    log.stopTimer(t_sub, message='Read %s embeddings ({0:.2f}s)\n'
                  % '{0:,}'.format(len(entity_embeds[-1])))

if options.word_vocabf:
    t_sub = log.startTimer('Reading word/context vocabulary from %s...'
                           % options.word_vocabf)
    word_vocab = readVocab(options.word_vocabf)
    log.stopTimer(t_sub, message='Read %s words ({0:.2f}s)\n'
                  % '{0:,}'.format(len(word_vocab)))
else:
    word_vocab = None

if options.use_ctx_embeddings:
def getEmbeddings(options, log=log, separator=' '):
    word_embs, term_embs, ent_embs = None, None, None
    word_ids, term_ids, ent_ids = None, None, None

    # Load in embeddings.
    if options.ent_embf:
        if options.ent_filterf:
            filter_set = readFilterSet(options.ent_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading entity embeddings from %s...'
                               % options.ent_embf, newline=False)
        ent_embs = pyemblib.read(options.ent_embf,
                                 separator=separator,
                                 replace_errors=True,
                                 filter_to=filter_set,
                                 lower_keys=True)
        log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
                      % len(ent_embs))
        ent_ids = ent_embs.keys()

    if options.term_embf:
        if options.term_filterf:
            filter_set = readFilterSet(options.term_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading term embeddings from %s...'
                               % options.term_embf, newline=False)
        term_embs = pyemblib.read(options.term_embf,
                                  separator=separator,
                                  replace_errors=True,
                                  filter_to=filter_set,
                                  lower_keys=True)
        log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
                      % len(term_embs))
        term_ids = term_embs.keys()

    if options.word_embf:
        if options.word_filterf:
            filter_set = readFilterSet(options.word_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading word embeddings from %s...'
                               % options.word_embf, newline=False)
        word_embs = pyemblib.read(options.word_embf,
                                  separator=separator,
                                  replace_errors=True,
                                  filter_to=filter_set,
                                  lower_keys=(not options.keep_word_case))
        log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
                      % len(word_embs))
        word_ids = word_embs.keys()

    # Load in term/string maps.
    if options.termmapf:
        t_sub = log.startTimer(
            'Reading term-entity mappings from %s (separated by "%s")...'
            % (options.termmapf, options.term_map_sep), newline=False)
        term_entity_map = readTermEntityMap(options.termmapf,
                                            entity_ids=ent_ids,
                                            term_ids=term_ids,
                                            map_sep=options.term_map_sep)
        log.stopTimer(t_sub, message='Read mappings for %d terms ({0:.2f}s)'
                      % len(term_entity_map))
    if options.strmapf:
        t_sub = log.startTimer('Reading term-string mappings from %s...'
                               % options.strmapf, newline=False)
        term_string_map = readTermStringMap(options.strmapf, term_ids=term_ids)
        log.stopTimer(t_sub, message='Read mappings for %d terms ({0:.2f}s)'
                      % len(term_string_map))

    # Perform actual approximations.
    if options.repr_method == ENTITY:
        emb_wrapper = EmbeddingWrapper(options.repr_method, ent_embs,
                                       indexed=True)
    elif options.repr_method == TERM:
        # Rekey term embeddings by their string forms.
        new_term_embs = {}
        for (term_id, term_emb) in term_embs.items():
            term_str = term_string_map.get(term_id, None)
            if term_str:
                new_term_embs[term_str] = term_emb
        emb_wrapper = EmbeddingWrapper(options.repr_method, new_term_embs,
                                       backoff_embeds=word_embs, indexed=True)
    elif options.repr_method == WORD:
        if options.term_embf:
            raise Exception("Honestly, I don't think this setting is used.")
        else:
            emb_wrapper = EmbeddingWrapper(options.repr_method, word_embs,
                                           indexed=True)
    else:
        raise Exception("Huh? %s" % options.repr_method)

    return emb_wrapper
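# A minimal usage sketch for getEmbeddings. The attribute names mirror the
# options the function reads; the file path is an assumption, and ENTITY is
# assumed to be defined in the host module. Unused inputs are disabled by
# setting their options to None/False.
from types import SimpleNamespace

opts = SimpleNamespace(
    ent_embf='entities.txt',    # hypothetical entity embedding file
    ent_filterf=None,
    term_embf=None, term_filterf=None,
    word_embf=None, word_filterf=None,
    keep_word_case=False,
    termmapf=None, strmapf=None,
    repr_method=ENTITY,
)
emb_wrapper = getEmbeddings(opts)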
        exit()
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Input embeddings', options.inputf),
    ('Vocabulary file', options.vocabf),
    ('Output embeddings', options.outputf),
    ('Output embeddings format', options.output_format),
])

log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
e = pyemblib.read(options.inputf,
                  format=pyemblib.Format.Word2Vec,
                  mode=pyemblib.Mode.Text)
log.stopTimer(
    message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
vocab = readVocab(options.vocabf)
log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

# Remap integer node IDs back to their vocabulary strings.
e = {vocab[int(k)]: v for (k, v) in e.items()}

log.writeln('Writing remapped embeddings to %s...' % options.outputf)
(fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
log.writeln('Done!')
log.stop()
        default=None)
    (options, args) = parser.parse_args()
    if (not options.inputf) or (not options.outputf) or (not options.datasetf):
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Input embeddings file', options.inputf),
    ('Output embeddings file', options.outputf),
    ('Dataset file', options.datasetf),
], 'Embedding filtering for WordNet classification experiments')

t_sub = log.startTimer('Reading input embeddings from %s...' % options.inputf)
embeddings = pyemblib.read(options.inputf)
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(embeddings), '{0:.2f}'))

log.writeln('Reading vocabulary from dataset in %s...' % options.datasetf)
ds = dataset.load(options.datasetf)
vocab = set()
for (_, src, snk, _) in ds:
    vocab.add(src)
    vocab.add(snk)
log.writeln('Found {0:,} unique words in {1:,} samples.\n'.format(
    len(vocab), len(ds)))

log.writeln('Filtering embeddings...')