import os

import pyemblib


def get_embedding_dict(emb_path, emb_format, first_n, vocab):
    """Read an embedding file into a dict mapping tokens (str) to vectors.

    Note: vocab is accepted for interface parity but unused here; see
    process_embedding for the variant that filters to a vocabulary.
    """
    print("Preprocessing.")
    extension = os.path.basename(emb_path).split('.')[-1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    read_mode = None
    if first_n == 0 or emb_format == pyemblib.Format.Glove:
        print("No value passed for first_n or feature not supported.")
        first_n = None
    if extension == 'bin':
        read_mode = pyemblib.Mode.Binary
        print("binary read.")
    elif extension == 'txt':
        read_mode = pyemblib.Mode.Text
        print("text read.")
    else:
        print("Unsupported embedding file extension.")
        exit()

    # if emb_format == pyemblib.Format.Glove:
    #     embedding = loadGloveModel(emb_path)

    if first_n:
        embedding = pyemblib.read(emb_path,
                                  format=emb_format,
                                  mode=read_mode,
                                  first_n=first_n,
                                  replace_errors=True,
                                  skip_parsing_errors=True)
    else:
        embedding = pyemblib.read(emb_path,
                                  format=emb_format,
                                  mode=read_mode,
                                  replace_errors=True,
                                  skip_parsing_errors=True)
    return embedding
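# A minimal usage sketch for get_embedding_dict. The file name is an
# assumption for illustration; any word2vec-format .txt embedding file
# readable by pyemblib works here.
embedding = get_embedding_dict("glove.840B.300d_Word2Vec_format.txt",
                               emb_format=pyemblib.Format.Word2Vec,
                               first_n=5000,
                               vocab=None)
print("Loaded %d vectors." % len(embedding))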
import numpy as np
from flask import render_template
from sklearn.manifold import MDS

import pyemblib


def loadEmbeddings():
    """Flask view: project the embeddings down to 3-D with MDS for display."""
    # embedding_file is expected to be defined at module level.
    embeddings = pyemblib.read(embedding_file, mode=pyemblib.Mode.Text)
    keys = list(embeddings.keys())
    values = np.array(list(embeddings.values()))

    # Multidimensional scaling down to 3 components for the 3-D viewer.
    mds = MDS(n_components=3)
    values_transformed = mds.fit_transform(values)

    transformed = {}
    for i in range(len(keys)):
        transformed[keys[i]] = values_transformed[i].tolist()
    return render_template('look_at.html', data=transformed)
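# A minimal sketch of how the view above might be wired into a Flask app.
# The route and the embedding_file value are assumptions, not taken from
# the original source (the file name mirrors the one used elsewhere in
# this collection).
from flask import Flask

app = Flask(__name__)

embedding_file = "top_10000_emb.txt"  # hypothetical path; adjust as needed

app.add_url_rule('/embeddings', 'loadEmbeddings', loadEmbeddings)

if __name__ == '__main__':
    app.run(debug=True)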
import pyemblib


def read_embedding(emb_path):
    """Read an embedding file into a dict mapping tokens (str) to vectors."""
    print("READING.")
    # The last character of the path distinguishes ".bin" from ".txt".
    last_char = emb_path[-1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    if last_char == 'n':
        embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Binary)
    elif last_char == 't':
        embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Text)
    else:
        print("Unsupported embedding format.")
        exit()
    return embedding
import pyemblib


def subset_embedding(emb_path, first_n, vocab):
    """Read the first_n vectors from an embedding file and write them out
    as a new text-format embedding file alongside the original.

    Note: vocab is accepted for interface parity but unused here.
    """
    print("Preprocessing.")

    # Make sure the path has a valid file extension *before* reading; the
    # original checked this after the read, by which point a bad path had
    # already been passed to pyemblib.
    extension = emb_path[-4:]
    if extension not in (".txt", ".bin"):
        print("Invalid file path.")
        exit()

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    if extension == ".bin":
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Binary,
                                  first_n=first_n)
    else:
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Text,
                                  first_n=first_n)

    # Get the emb_path without the file extension, and write the subset
    # to a new text embedding file.
    path_no_ext = emb_path[:-4]
    new_path = path_no_ext + "_SUBSET.txt"
    pyemblib.write(embedding, new_path, mode=pyemblib.Mode.Text)
    return
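# A minimal usage sketch for subset_embedding. The file name is an
# assumption; any pyemblib-readable .txt or .bin embedding file works.
# This writes the first 1000 vectors to "glove.6B.300d_SUBSET.txt".
subset_embedding("glove.6B.300d.txt", first_n=1000, vocab=None)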
        help='number of threads to use for parallel calculation (default: %default)',
        type='int', default=1)
    parser.add_option('--batch-size', dest='batch_size',
        help='number of samples to process in each batch (default: %default)',
        type='int', default=25)
    parser.add_option('--keys', dest='keysf',
        help='file listing keys to restrict NN analysis to')
    parser.add_option('-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        exit()
    embf, outf = args
    return (embf, options.mode, options.keysf, outf, options.top_k,
            options.batch_size, options.threads, options.logfile)

embf, embf_mode, keysf, outf, top_k, batch_size, threads, logfile = _cli()

if keysf:
    keys = readKeys(keysf)
    print("Read %d keys to restrict to" % len(keys))
else:
    keys = None

t = log.startTimer('Reading embeddings...', newline=False)
embeds = pyemblib.read(embf, mode=embf_mode, filter_to=keys, lower_keys=True)
log.stopTimer(t, message='Done! Read %d embeddings ({0:.2f}s)' % len(embeds))

nearest_neighbors = calculateNearestNeighbors(embeds, outf, top_k=top_k,
                                              batch_size=batch_size,
                                              threads=threads)
log.writeln('Wrote nearest neighbors to %s.' % outf)
import os
import sys

import numpy as np

import pyemblib


def process_embedding(emb_path, emb_format, first_n, vocab):
    """Read an embedding file and return (vectors_matrix, label_array),
    optionally restricted to the first_n vectors and/or a vocab subset."""
    print("Preprocessing.")
    extension = os.path.basename(emb_path).split('.')[-1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    read_mode = None
    if first_n == 0 or emb_format == pyemblib.Format.Glove:
        print("No value passed for first_n or feature not supported.")
        first_n = None
    if extension == 'bin':
        read_mode = pyemblib.Mode.Binary
        print("binary read.")
    elif extension == 'txt':
        read_mode = pyemblib.Mode.Text
        print("text read.")
    else:
        print("Unsupported embedding file extension.")
        exit()

    # if emb_format == pyemblib.Format.Glove:
    #     embedding = loadGloveModel(emb_path)

    if first_n:
        embedding = pyemblib.read(emb_path,
                                  format=emb_format,
                                  mode=read_mode,
                                  first_n=first_n,
                                  replace_errors=True,
                                  skip_parsing_errors=True)
    else:
        embedding = pyemblib.read(emb_path,
                                  format=emb_format,
                                  mode=read_mode,
                                  replace_errors=True,
                                  skip_parsing_errors=True)

    # Take a subset of the vocab, if one was requested.
    if vocab is not None:
        embedding = {word: embedding[word]
                     for word in vocab if word in embedding}

    # Split the dict into a label array of tokens and a [num_tokens, d]
    # matrix of vectors. (The original went through
    # np.array(embedding.items()), which fails on Python 3 dict views.)
    label_array = np.array(list(embedding.keys()))
    vectors_matrix = np.array(list(embedding.values()))
    sys.stdout.flush()

    # Legacy pandas route, kept for reference; note that
    # DataFrame.as_matrix() was removed from pandas (.to_numpy() replaces it):
    # emb_df = pd.Series(embedding, name="words_with_friends")
    # emb_df = emb_df.reset_index()
    # emb_matrix = emb_df.words_with_friends.values.tolist()
    # vectors_df = pd.DataFrame(emb_matrix, index=emb_df.index)
    # vectors_matrix = vectors_df.as_matrix()

    return vectors_matrix, label_array
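# A minimal usage sketch for process_embedding. The file name is an
# assumption (it mirrors the text embedding used elsewhere in this
# collection).
vectors_matrix, label_array = process_embedding(
    "top_10000_emb.txt",
    emb_format=pyemblib.Format.Word2Vec,
    first_n=10000,
    vocab=None,
)
print(vectors_matrix.shape)   # (num_tokens, embedding_dim)
print(label_array[:5])        # first five tokens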
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(logfile=options.logfile, stdout_also=True)

# Set the random seed here if necessary.
if options.random_seed <= 0:
    options.random_seed = int(time.time())

t_sub = log.startTimer('Reading source embeddings from %s...'
                       % options.src_embf, newline=False)
src_embs = pyemblib.read(options.src_embf,
                         mode=options.src_embf_mode,
                         lower_keys=True)
log.stopTimer(t_sub, message='Read %d embeddings in {0:.2f}s' % len(src_embs))

t_sub = log.startTimer('Reading target embeddings from %s...'
                       % options.trg_embf, newline=False)
trg_embs = pyemblib.read(options.trg_embf,
                         mode=options.trg_embf_mode,
                         lower_keys=True)
log.stopTimer(t_sub, message='Read %d embeddings in {0:.2f}s' % len(trg_embs))

pivots = readPivotsFile(options.pivotf, tolower=True)
log.writeln('Loaded %d pivot terms.' % len(pivots))
    if len(args) != 0 or \
       (not options.src_embf) or \
       (options.frequent and (not options.trg_vocabf)) or \
       ((not options.trg_embf) and (not options.trg_vocabf)) or \
       (not options.outf):
        parser.print_help()
        exit()
    return options

options = _cli()

if options.stopwordf:
    stopwords = readStopwords(options.stopwordf, tolower=True)
else:
    stopwords = set()

src_embs = pyemblib.read(options.src_embf, mode=options.src_embf_mode)
src_vocab = set([k.lower() for k in src_embs.keys()])

if options.trg_vocabf:
    trg_vocab = readVocab(options.trg_vocabf, tolower=True)
    if not options.frequent:
        trg_vocab = set(trg_vocab.keys())
else:
    trg_embs = pyemblib.read(options.trg_embf, mode=options.trg_embf_mode)
    trg_vocab = set([k.lower() for k in trg_embs.keys()])

if options.frequent:
    pivots = frequentPivotTerms(src_vocab, trg_vocab,
                                num_terms=options.num_pivots,
                                stopwords=stopwords)
else:
    pivots = randomPivotTerms(src_vocab, trg_vocab,
                              num_terms=options.num_pivots,
                              stopwords=stopwords)
# Only one valid data mode for Google and BATS datasets.
if options.dataset in [datasets.Google, datasets.BATS] \
        and options.anlg_type != data_mode.String:
    log.writeln(
        '[WARNING] Invalid --analogy-type setting for %s dataset; Overriding to "%s"'
        % (options.dataset, data_mode.String))
    options.anlg_type = data_mode.String

t_sub = log.startTimer('Reading %s embeddings from %s...'
                       % (options.embeddings_mode, options.embeddings))
separator = '\t' if options.tab_sep else ' '
(fmt, mode) = pyemblib.CLI_Formats.parse(options.embeddings_mode)
embeddings = pyemblib.read(options.embeddings,
                           format=fmt,
                           mode=mode,
                           separator=separator,
                           lower_keys=options.to_lower)
log.stopTimer(t_sub, 'Read {0:,} embeddings in {1}s.\n'.format(
    len(embeddings), '{0:.2f}'))

t_sub = log.startTimer('Running analogy task on %s dataset...'
                       % options.dataset)
results = analogyTask(analogy_file,
                      options.dataset,
                      options.setting,
                      options.anlg_type,
                      embeddings,
                      log=log,
                      predictions_file=options.predictions_file,
        exit()
    (mentionf,) = args
    return mentionf, options

mentionf, options = _cli()
log.start(logfile=options.logfile, stdout_also=True)

if options.tab_separated:
    sep = '\t'
else:
    sep = ' '

t_sub = log.startTimer('Reading entity embeddings from %s...'
                       % options.entity_embf, newline=False)
entity_embeds = pyemblib.read(options.entity_embf, separator=sep)
log.stopTimer(t_sub,
              message='Read %d embeddings ({0:.2f}s)' % len(entity_embeds))

t_sub = log.startTimer('Reading context embeddings from %s...'
                       % options.ctx_embf, newline=False)
ctx_embeds = pyemblib.read(options.ctx_embf, separator=sep)
log.stopTimer(t_sub,
              message='Read %d embeddings ({0:.2f}s)' % len(ctx_embeds))

if options.entity_defnf:
    t_sub = log.startTimer('Reading word embeddings from %s...'
                           % options.word_embf, newline=False)
    # word_embeds = pyemblib.read(options.word_embf)
                          get_lemmas=True)
log.writeln('Mapped dataset info for {0:,} mentions.\n'.format(
    len(mention_map)))

if options.wordnet_baseline_eval_predictions:
    wordnetFirstSenseBaseline(mentions, mention_map,
                              options.wordnet_baseline_eval_predictions)

if options.elmo_baseline_eval_predictions:
    log.writeln('Reading set of training lemmas from %s...'
                % options.training_lemmasf)
    training_lemmas = readTrainingLemmas(options.training_lemmasf)
    log.writeln('Read {0:,} lemmas.\n'.format(len(training_lemmas)))

    log.writeln('Reading SemCor sense embeddings from %s...'
                % options.semcor_embf)
    semcor_embeddings = pyemblib.read(options.semcor_embf)
    log.writeln('Read embeddings for {0:,} senses.\n'.format(
        len(semcor_embeddings)))

    log.writeln('Reading backoff predictions from %s...'
                % options.wordnet_baseline_input_predictions)
    wn_first_sense_preds = loadWSDFrameworkPredictions(
        options.wordnet_baseline_input_predictions)
    log.writeln('Read predictions for {0:,} samples.\n'.format(
        len(wn_first_sense_preds)))

    ELMoBaseline(mentions, mention_map, wn_first_sense_preds,
                 training_lemmas, semcor_embeddings,
                 options.elmo_baseline_eval_predictions)

log.stop()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Input embedding file', embf),
    ('Input embedding file mode', options.embedding_mode),
    ('Output neighbor file', options.outputf),
    ('Ordered vocabulary file', options.vocabf),
    ('Number of nearest neighbors', options.k),
    ('Batch size', options.batch_size),
    ('Number of threads', options.threads),
    ('Partial nearest neighbors file for resuming',
     options.partial_neighbors_file),
], 'k Nearest Neighbor calculation with cosine similarity')

t_sub = log.startTimer('Reading embeddings from %s...' % embf)
emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(emb), '{0:.2f}'))

if not os.path.isfile(options.vocabf):
    log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
    writeNodeMap(emb, options.vocabf)
else:
    log.writeln('Reading node ID <-> vocab map from %s...\n' % options.vocabf)
node_map = readNodeMap(options.vocabf)

# Get the vocabulary in node ID order, and map index in emb_arr
# to node IDs.
from sklearn.manifold import MDS
import numpy as np
import pyemblib
import random

embedding_file = "top_10000_emb.txt"
include = set(['red', 'black', 'green', 'orange', 'apple', 'king', 'queen',
               'man', 'woman', 'moscow', 'russia', 'tokyo', 'japan'])

embeddings = pyemblib.read(embedding_file, mode=pyemblib.Mode.Text)
keys = list(embeddings.keys())
values = list(embeddings.values())

# Keep every word in the include set, plus a ~5% random sample of the rest.
new_vals, new_keys = [], []
for i in range(len(keys)):
    if keys[i] in include or random.random() < 0.05:
        new_keys.append(keys[i])
        new_vals.append(values[i])
values = np.array(new_vals)
keys = new_keys

# Project down to 3 dimensions with multidimensional scaling.
mds = MDS(n_components=3)
realspace = mds.fit_transform(values)

# Write out word,x,y,z rows for the 3-D viewer.
f = open('3dembeddings.csv', 'w')
for i in range(len(keys)):
    f.write(keys[i] + ',' + str(realspace[i][0]) + ',' +
            str(realspace[i][1]) + ',' + str(realspace[i][2]) + '\n')
f.close()
        ('Patience', options.patience),
        ('Early stopping criterion', options.early_stopping),
        ('Max training epochs', options.max_epochs),
        ('Checkpoint file', options.checkpoint_path),
        ('Cross validation splits file', options.cross_validation_file),
        ('Number of folds', options.n_folds),
        ('Fraction of training used for dev', options.dev_size),
        ('Writing predictions to', options.predictions_file),
        ('Writing dev results to', options.dev_results_file),
        ('Random seed', options.random_seed),
    ]),
], 'WordNet classification experiment')

t_sub = log.startTimer('Reading word embeddings from %s...'
                       % options.embedding_f)
embeddings = pyemblib.read(options.embedding_f)
log.stopTimer(t_sub, message='Read {0:,} embeddings ({1}s).\n'.format(
    len(embeddings), '{0:.2f}'))

log.writeln('Reading dataset from %s...' % dataset_f)
ds = dataset.load(dataset_f)
log.writeln('Read {0:,} samples.\n'.format(len(ds)))

preprocessed = preprocess(ds, embeddings, options)

if options.predictions_file:
    preds_stream = codecs.open(options.predictions_file, 'w', 'utf-8')

runCrossValidationExperiment(preprocessed, options, preds_stream)
import pyemblib


def testFunc(emb_path):
    # NOTE: emb_path was an implicit global in the original; it is made an
    # explicit parameter here so the function is self-contained.
    print('testFunc')
    wordsim = open("wordsim353_agreed.txt")
    print("Preprocessing.")
    last_char = emb_path[-1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    if last_char == 'n':
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Binary,
                                  replace_errors=True)
    elif last_char == 't':
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Text,
                                  replace_errors=True)
    else:
        print("Unsupported embedding format.")
        exit()
    print("Source: ", emb_path)

    result = []
    humanRank = []
    cosineRank = []
    numMiss = 0
    full_vocab = []
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
import pyemblib
import gensim
import os

parent = os.path.abspath("../../embeddings/")
path = os.path.abspath("../../embeddings/GoogleNews-vectors-negative300.bin")
glove = os.path.abspath("../../embeddings/glove.840B.300d.txt")
wikitext_path = os.path.abspath("../../embeddings/wiki-news-300d-1M-subword.vec")

# gensim working:
# google_news = KeyedVectors.load_word2vec_format(path, binary=True)
glove2word2vec(glove_input_file=glove,
               word2vec_output_file=os.path.join(
                   parent, "glove.840B.300d_Word2Vec_format.txt"))

# pyemblib not working on the binary file:
# embedding = pyemblib.read(path, format='Word2Vec', mode=pyemblib.Mode.Binary)
wikitext = pyemblib.read(wikitext_path,
                         format='Word2Vec',
                         mode=pyemblib.Mode.Text)
print("Successful read.")
        ('Using Action oracle', options.action_oracle),
        ('Input predictions file', options.input_predsf),
        ('Pre-embedded mentions', options.pre_embedded),
    ]),
], title="Entity linking (disambiguation) experiment using scikit-learn "
         "baseline algorithms")

## Data loading/setup

entity_embeds = []
for i in range(len(options.entity_embfs)):
    f = options.entity_embfs[i]
    t_sub = log.startTimer('Reading set %d of entity embeddings from %s...'
                           % (i + 1, f))
    entity_embeds.append(pyemblib.read(f, lower_keys=True))
    log.stopTimer(t_sub, message='Read %s embeddings ({0:.2f}s)\n'
                  % '{0:,}'.format(len(entity_embeds[-1])))

if options.word_vocabf:
    t_sub = log.startTimer('Reading word/context vocabulary from %s...'
                           % options.word_vocabf)
    word_vocab = readVocab(options.word_vocabf)
    log.stopTimer(t_sub, message='Read %s words ({0:.2f}s)\n'
                  % '{0:,}'.format(len(word_vocab)))
else:
    word_vocab = None

if options.use_ctx_embeddings:
def getEmbeddings(options, log=log, separator=' '):
    word_embs, term_embs, ent_embs = None, None, None
    word_ids, term_ids, ent_ids = None, None, None

    # Load in embeddings.
    if options.ent_embf:
        if options.ent_filterf:
            filter_set = readFilterSet(options.ent_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading entity embeddings from %s...'
                               % options.ent_embf, newline=False)
        ent_embs = pyemblib.read(options.ent_embf,
                                 separator=separator,
                                 replace_errors=True,
                                 filter_to=filter_set,
                                 lower_keys=True)
        log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
                      % len(ent_embs))
        ent_ids = ent_embs.keys()

    if options.term_embf:
        if options.term_filterf:
            filter_set = readFilterSet(options.term_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading term embeddings from %s...'
                               % options.term_embf, newline=False)
        term_embs = pyemblib.read(options.term_embf,
                                  separator=separator,
                                  replace_errors=True,
                                  filter_to=filter_set,
                                  lower_keys=True)
        log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
                      % len(term_embs))
        term_ids = term_embs.keys()

    if options.word_embf:
        if options.word_filterf:
            filter_set = readFilterSet(options.word_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading word embeddings from %s...'
                               % options.word_embf, newline=False)
        word_embs = pyemblib.read(options.word_embf,
                                  separator=separator,
                                  replace_errors=True,
                                  filter_to=filter_set,
                                  lower_keys=(not options.keep_word_case))
        log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
                      % len(word_embs))
        word_ids = word_embs.keys()

    # Load in term/string maps.
    if options.termmapf:
        t_sub = log.startTimer(
            'Reading term-entity mappings from %s (separated by "%s")...'
            % (options.termmapf, options.term_map_sep), newline=False)
        term_entity_map = readTermEntityMap(options.termmapf,
                                            entity_ids=ent_ids,
                                            term_ids=term_ids,
                                            map_sep=options.term_map_sep)
        log.stopTimer(t_sub, message='Read mappings for %d terms ({0:.2f}s)'
                      % len(term_entity_map))
    if options.strmapf:
        t_sub = log.startTimer('Reading term-string mappings from %s...'
                               % options.strmapf, newline=False)
        term_string_map = readTermStringMap(options.strmapf, term_ids=term_ids)
        log.stopTimer(t_sub, message='Read mappings for %d terms ({0:.2f}s)'
                      % len(term_string_map))

    # Perform actual approximations.
    if options.repr_method == ENTITY:
        emb_wrapper = EmbeddingWrapper(options.repr_method, ent_embs,
                                       indexed=True)
    elif options.repr_method == TERM:
        # Rekey term embeddings by their string forms.
        new_term_embs = {}
        for (term_id, term_emb) in term_embs.items():
            term_str = term_string_map.get(term_id, None)
            if term_str:
                new_term_embs[term_str] = term_emb
        emb_wrapper = EmbeddingWrapper(options.repr_method, new_term_embs,
                                       backoff_embeds=word_embs, indexed=True)
    elif options.repr_method == WORD:
        if options.term_embf:
            raise Exception("Honestly, I don't think this setting is used.")
        else:
            emb_wrapper = EmbeddingWrapper(options.repr_method, word_embs,
                                           indexed=True)
    else:
        raise Exception("Huh? %s" % options.repr_method)

    return emb_wrapper
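# A minimal usage sketch for getEmbeddings. The attribute names mirror the
# options the function reads; the file path is an assumption, and ENTITY is
# assumed to be defined in the host module. Unused inputs are disabled by
# setting their options to None/False.
from types import SimpleNamespace

opts = SimpleNamespace(
    ent_embf='entities.txt',    # hypothetical entity embedding file
    ent_filterf=None,
    term_embf=None, term_filterf=None,
    word_embf=None, word_filterf=None,
    keep_word_case=False,
    termmapf=None, strmapf=None,
    repr_method=ENTITY,
)
emb_wrapper = getEmbeddings(opts)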
        exit()
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Input embeddings', options.inputf),
    ('Vocabulary file', options.vocabf),
    ('Output embeddings', options.outputf),
    ('Output embeddings format', options.output_format),
])

log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
e = pyemblib.read(options.inputf,
                  format=pyemblib.Format.Word2Vec,
                  mode=pyemblib.Mode.Text)
log.stopTimer(
    message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
vocab = readVocab(options.vocabf)
log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

# Remap integer node IDs back to their vocabulary strings.
e = {vocab[int(k)]: v for (k, v) in e.items()}

log.writeln('Writing remapped embeddings to %s...' % options.outputf)
(fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
log.writeln('Done!')
log.stop()
        default=None)
    (options, args) = parser.parse_args()
    if (not options.inputf) or (not options.outputf) or (not options.datasetf):
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Input embeddings file', options.inputf),
    ('Output embeddings file', options.outputf),
    ('Dataset file', options.datasetf),
], 'Embedding filtering for WordNet classification experiments')

t_sub = log.startTimer('Reading input embeddings from %s...' % options.inputf)
embeddings = pyemblib.read(options.inputf)
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(embeddings), '{0:.2f}'))

log.writeln('Reading vocabulary from dataset in %s...' % options.datasetf)
ds = dataset.load(options.datasetf)
vocab = set()
for (_, src, snk, _) in ds:
    vocab.add(src)
    vocab.add(snk)
log.writeln('Found {0:,} unique words in {1:,} samples.\n'.format(
    len(vocab), len(ds)))

log.writeln('Filtering embeddings...')