def epoch(vectors_matrix, labels_df, new_emb_path):
    name = mp.current_process().name
    print(name, 'Starting')
    sys.stdout.flush()

    # shape [<num_inputs>,<dimensions>]
    rand_emb_array = []
    for i in range(len(vectors_matrix)):
        vec = np.random.rand(len(vectors_matrix[0]))
        vec = vec / np.linalg.norm(vec)
        rand_emb_array.append(vec)

    print("labels shape: ", labels_df.shape)

    # creates the emb dict
    dist_emb_dict = {}
    for i in tqdm(range(len(labels_df))):
        emb_array_row = rand_emb_array[i]
        dist_emb_dict.update({labels_df[i]: emb_array_row})

    # saves the embedding
    pyemblib.write(dist_emb_dict, new_emb_path, mode=pyemblib.Mode.Binary)
    print("Embedding saved to: " + new_emb_path)

    print(name, 'Exiting')
    return
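# Hedged usage sketch for the random-baseline epoch() above; this is not part
# of the original code. It assumes numpy/pandas are imported as np/pd and that
# any routine yielding a vector matrix plus a matching pandas Series of tokens
# (e.g. process_embedding elsewhere in this repo) could supply real inputs.
demo_matrix = np.random.rand(10, 50)                       # 10 tokens, 50 dims
demo_labels = pd.Series(["tok%d" % i for i in range(10)])  # dummy vocabulary
epoch(demo_matrix, demo_labels, "random_baseline_demo.bin")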
def subset_embedding(emb_path, first_n, vocab):
    # Hard coding to save time.
    emb_format = pyemblib.Format.Word2Vec
    embedding = read(emb_path, emb_format, first_n)

    # make sure it has a valid file extension
    extension = os.path.basename(emb_path).split('.')[-1]
    if extension != "txt" and extension != "bin":
        print("Invalid file path. ")
        exit()

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)

    # the name of the embedding to save
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)
    new_emb_path = str(os.path.join(parent,
                                    "first-" + str(first_n)
                                    + "__source--" + source_name + ".txt"))
    print("Writing to: ", new_emb_path)

    # write to text embedding file
    pyemblib.write(embedding, new_emb_path, mode=pyemblib.Mode.Text)
    return
def main(emb_path, dest_path, mode):
    embedding = read_embedding(emb_path)
    if mode == "txt":
        pyemblib.write(embedding, dest_path, mode=pyemblib.Mode.Text)
    elif mode == "bin":
        pyemblib.write(embedding, dest_path, mode=pyemblib.Mode.Binary)
    else:
        print("Mode (third argument) must be \"txt\" or \"bin\".")
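# A minimal sketch of the read_embedding() helper called in main() above; it
# is not defined in this snippet, so the extension-based dispatch below is an
# assumption modeled on how the other snippets here choose a pyemblib mode.
def read_embedding(emb_path):
    # choose text vs. binary pyemblib mode from the file extension
    extension = os.path.splitext(emb_path)[-1]
    if extension == ".bin":
        return pyemblib.read(emb_path, mode=pyemblib.Mode.Binary)
    elif extension == ".txt":
        return pyemblib.read(emb_path, mode=pyemblib.Mode.Text)
    else:
        raise ValueError("Unsupported embedding format: " + emb_path)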
def subset_embedding(emb_path, first_n, vocab):
    print("Preprocessing. ")

    file_name_length = len(emb_path)
    last_char = emb_path[file_name_length - 1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    embedding = {}
    if last_char == 'n':
        embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Binary,
                                  first_n=first_n)
    elif last_char == 't':
        embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Text,
                                  first_n=first_n)
    else:
        print("Unsupported embedding format. ")
        exit()

    # make sure it has a valid file extension
    extension = emb_path[file_name_length - 4:file_name_length]
    if extension != ".txt" and extension != ".bin":
        print("Invalid file path. ")
        exit()

    # get the emb_path without the file extension
    path_no_ext = emb_path[0:file_name_length - 4]
    new_path = path_no_ext + "_SUBSET.txt"

    # write to text embedding file
    pyemblib.write(embedding, new_path, mode=pyemblib.Mode.Text)
    return
def loopflow(target_list_path):
    with open(target_list_path, encoding='utf-8', errors='ignore') as f:
        target_list = f.readlines()

    for i, target in enumerate(target_list):
        # strip the trailing newline before resolving the path
        target = os.path.abspath(target.strip())

        basename = os.path.basename(target)
        parent = os.path.abspath(os.path.join(target, '../'))
        extension = target.split('.')[-1]

        words, vectors = _readBin(target)

        lower_keys = False
        wordmap = Embeddings()
        for j in range(len(words)):
            key = words[j].lower() if lower_keys else words[j]
            wordmap[key] = vectors[j]

        save_name = os.path.join(parent, 'parse-error-fix_' + basename)
        pyemblib.write(wordmap, save_name, mode=pyemblib.Mode.Binary)
def nn(embedding_tensor, num_batches, step, batch_queue, train, loss,
       loss_vectors, hidden_layer, X, init, saver, model_path,
       new_emb_path, retrain):
    name = mp.current_process().name
    print(name, 'Starting')
    sys.stdout.flush()

    with tf.Session() as sess:
        # initializes all the variables that have been created
        sess.run(init)

        # list of slices which compose the new embedding
        embedding_slices = []
        label_slices = []

        # just can't be -1
        batch = np.zeros((5, 5))
        total_error = 0
        batches_completed = 0
        print("number of batches: ", num_batches)

        while True:
            batch_loss = 0
            batch, slice_df = batch_queue.get()

            # break for halt batch
            # be careful not to check for np.array but for np.ndarray!
            if not isinstance(batch, np.ndarray):
                print("Found the halt batch. ")
                batch, slice_df = batch_queue.get()
                batch, slice_df = batch_queue.get()
                break

            print("Batches completed: ", batches_completed)
            batches_completed = batches_completed + 1
            sys.stdout.flush()

            if retrain:
                sess.run(train, feed_dict={X: batch})
                err_vectors = loss_vectors.eval(feed_dict={X: batch})
                for j in range(len(err_vectors)):
                    # get the loss value for the jth distance vector
                    # in the batch
                    err_vector = err_vectors[j]
                    # print("errvector shape,", err_vector.shape)
                    # convert shape from (n,1) to (1,n)
                    err_vector = np.asarray([err_vector])
                    # get the sum of the loss over that distance vector
                    loss_val = np.sum(err_vector)
                    # add to total loss for entire vocab
                    total_error += loss_val
                    batch_loss += loss_val

                # when we put "batch" in the feed dict, it is used
                # wherever there is an "X" in the definition of "loss" OR
                # in the definition of any tf function that "loss" calls.
                # err = loss.eval(feed_dict={X: batch})
                # print("\tLoss:", err)
                with open("loss_log_20K.txt", "a") as f:
                    f.write(str(batch_loss) + "\n")
            else:
                # slice of the output from the hidden layer
                hidden_out_slice = hidden_layer.eval(feed_dict={X: batch})
                embedding_slices.append(hidden_out_slice)
                # add the slice of labels that corresponds to the batch
                label_slices.append(slice_df)

        if retrain:
            '''
            print("Printing total loss. ")
            with open("loss_log_20K.txt", "a") as f:
                f.write("Total Loss for epoch " + str(step) + ": "
                        + str(total_error) + "\n")
            '''
            # save_path = saver.save(sess, "../model_small.ckpt")
            save_path = saver.save(sess, model_path)
            print("Model saved in path: %s" % save_path)
        else:
            # makes dist_emb_array a 3-dimensional array
            dist_emb_array = np.stack(embedding_slices)

            # concatenates the first dimension, so dist_emb_array has
            # shape [<num_inputs>,<dimensions>]
            dist_emb_array = np.concatenate(dist_emb_array)

            # concatenates the list of pandas Series containing the words
            # that correspond to the new vectors in "dist_emb_array"
            labels = pd.concat(label_slices)

            print("labels shape: ", labels.shape)
            print("dist_emb_array shape: ", dist_emb_array.shape)

            # creates the emb dict
            dist_emb_dict = {}
            for i in tqdm(range(len(labels))):
                emb_array_row = dist_emb_array[i]
                dist_emb_dict.update({labels[i]: emb_array_row})

            # saves the embedding
            pyemblib.write(dist_emb_dict, new_emb_path,
                           mode=pyemblib.Mode.Text)

        while not batch_queue.empty():
            try:
                batch_queue.get(timeout=0.001)
            except:
                pass

    print(name, 'Exiting')
    return
        return options

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Input embeddings', options.inputf),
        ('Vocabulary file', options.vocabf),
        ('Output embeddings', options.outputf),
        ('Output embeddings format', options.output_format),
    ])

    log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
    e = pyemblib.read(options.inputf, format=pyemblib.Format.Word2Vec,
                      mode=pyemblib.Mode.Text)
    log.stopTimer(
        message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

    log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
    vocab = readVocab(options.vocabf)
    log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

    e = {vocab[int(k)]: v for (k, v) in e.items()}

    log.writeln('Writing remapped embeddings to %s...' % options.outputf)
    (fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
    pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)

    log.writeln('Done!')
    log.stop()
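# A minimal sketch of the readVocab() helper assumed above: the remapping step
# e = {vocab[int(k)]: v ...} implies a dict from integer node IDs to token
# strings. The one-"id<TAB>token"-per-line file format is an assumption.
def readVocab(vocabf):
    vocab = {}
    with open(vocabf, encoding='utf-8') as stream:
        for line in stream:
            (node_id, token) = line.rstrip('\n').split('\t')
            vocab[int(node_id)] = token
    return vocab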
                len(definitions))

    log.write('Constructing entity definition representations...')
    entity_defn_embeds = embedDefinitions(definitions, word_embeds)
    # del(word_embeds)
    log.writeln('Embedded %d entity definitions.' % len(entity_defn_embeds))

    if options.entity_dualf:
        dual_embeds = pyemblib.Embeddings()
        for (k, v) in entity_defn_embeds.items():
            if k in entity_embeds:
                dual_embeds[k] = np.concatenate([entity_embeds[k], v])
        log.writeln('Writing both versions of entity embeddings to %s...'
                    % options.entity_dualf)
        pyemblib.write(dual_embeds, options.entity_dualf)
        log.writeln('Wrote %d dual embeddings.' % len(dual_embeds))
else:
    entity_defn_embeds = None

if options.stringsf:
    t_sub = log.startTimer('Reading preferred strings from %s...'
                           % options.stringsf)
    preferred_strings = readPreferredStrings(options.stringsf)
    log.stopTimer(t_sub, message='Read %d strings ({0:.2f}s)'
                  % len(preferred_strings))
else:
    preferred_strings = None

if options.polysemyf:
    ('Number of validated pivots', len(validated_pivots)),
    ('Checkpoint file', options.checkpointf),
    ('Model settings', OrderedDict([
        ('Random seed', options.random_seed),
        ('Number of layers', options.num_layers),
        ('Activation', options.activation),
        ('Number of folds', options.num_folds),
        ('Batch size', options.batch_size),
    ]))
])

log.writeln('Training manifold mapper...')
mapped_embs = crossfoldTrain(src_embs, trg_embs, validated_pivots,
                             options.num_folds, options.activation,
                             options.num_layers,
                             batch_size=options.batch_size,
                             checkpoint_file=options.checkpointf,
                             random_seed=options.random_seed)

if options.outf:
    log.writeln('Writing mapped embeddings to %s' % options.outf)
    pyemblib.write(mapped_embs, options.outf, verbose=True,
                   mode=options.out_embf_mode)
print('testFunc')
print("Preprocessing. ")

file_name_length = len(emb_path)
last_char = emb_path[file_name_length - 1]

# Decide if it's a binary or text embedding file, and read in
# the embedding as a dict object, where the keys are the tokens
# (strings), and the values are the components of the corresponding
# vectors (floats).
embedding = {}
if last_char == 'n':
    embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Binary,
                              replace_errors=True)
elif last_char == 't':
    embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Text,
                              replace_errors=True)
else:
    print("Unsupported embedding format. ")
    exit()

print("Source: ", emb_path)
parent = os.path.abspath(emb_path + "/../")
source_name = os.path.splitext(os.path.basename(emb_path))[0]
dest_path = os.path.join(parent, source_name + "_clean.bin")
pyemblib.write(embedding, dest_path, mode=pyemblib.Mode.Binary)
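# Optional sanity check (an assumption, not part of the original snippet):
# read the cleaned binary back and confirm the vocabulary survived intact.
# reread = pyemblib.read(dest_path, mode=pyemblib.Mode.Binary)
# assert len(reread) == len(embedding)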
def epoch(embedding_tensor, num_batches, step, batch_queue, train, loss,
          loss_vectors, hidden_layer, X, init, saver, model_path,
          new_emb_path, retrain, num_processes):
    name = mp.current_process().name
    print(name, 'Starting')
    sys.stdout.flush()

    with tf.Session() as sess:
        # initializes all the variables that have been created
        sess.run(init)

        # list of slices which compose the new embedding
        embedding_slices = []
        label_slices = []

        # just can't be -1
        batch = np.zeros((5, 5))
        total_error = 0
        batches_completed = 0
        print("number of batches: ", num_batches)

        halts = 0
        while True:
            batch_loss = 0
            print("about to try to grab")
            sys.stdout.flush()
            batch, slice_df = batch_queue.get()

            # break for halt batch
            # be careful not to check for np.array but for np.ndarray!
            if not isinstance(batch, np.ndarray):
                print("Found a halt batch. ")
                halts += 1
                if halts >= num_processes:
                    break
                else:
                    # skip to next iteration of while loop
                    continue

            print("Batches grabbed: ", batches_completed)
            batches_completed = batches_completed + 1
            sys.stdout.flush()

            embedding_slices.append(batch)
            # add the slice of labels that corresponds to the batch
            label_slices.append(slice_df)

        # makes dist_emb_array a 3-dimensional array
        dist_emb_array = np.stack(embedding_slices)

        # concatenates the first dimension, so dist_emb_array has
        # shape [<num_inputs>,<dimensions>]
        dist_emb_array = np.concatenate(dist_emb_array)

        # concatenates the list of pandas Series containing the words
        # that correspond to the new vectors in "dist_emb_array"
        labels = pd.concat(label_slices)

        print("labels shape: ", labels.shape)
        print("dist_emb_array shape: ", dist_emb_array.shape)

        # creates the emb dict
        dist_emb_dict = {}
        for i in tqdm(range(len(labels))):
            emb_array_row = dist_emb_array[i]
            dist_emb_dict.update({labels[i]: emb_array_row})

        # saves the embedding
        pyemblib.write(dist_emb_dict, new_emb_path, mode=pyemblib.Mode.Text)

        while not batch_queue.empty():
            try:
                batch_queue.get(timeout=0.001)
            except:
                pass

    print(name, 'Exiting')
    return
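# A minimal sketch of a producer feeding batch_queue in the format epoch()
# above expects: (ndarray batch, pandas Series of labels) tuples, followed by
# one non-ndarray "halt batch" per producer process. The sentinel value and
# the batch size are assumptions, not taken from the original code.
def produce_batches(batch_queue, vectors_matrix, labels_series, batch_size=1024):
    for start in range(0, len(vectors_matrix), batch_size):
        batch = vectors_matrix[start:start + batch_size]
        slice_df = labels_series[start:start + batch_size]
        batch_queue.put((batch, slice_df))
    # anything that is not an np.ndarray signals "halt" to the consumer
    batch_queue.put((-1, -1))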
def genflow(emb_path, emb_format, first_n):
    print_sleep_interval = 1
    print("checkpoint 1")
    check_valid_file(emb_path)
    sys.stdout.flush()

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)
    sys.stdout.flush()

    # take the first n most frequent word vectors for a subset
    # set to 0 to take entire embedding
    first_n = 0

    # Preprocess.
    print("About to preprocess. ")
    sys.stdout.flush()
    vectors_matrix, label_df = process_embedding(emb_path, emb_format,
                                                 first_n, None)
    print("Done preprocessing. ")
    sys.stdout.flush()

    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # number of rows in the embedding
    num_inputs = shape[0]
    num_outputs = num_inputs

    # dimensionality of the embedding file
    dim = shape[1]

    #===================================================================

    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%d-%H%M")

    # The name of the embedding to save.
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)

    print("Is anything happening here?")
    sys.stdout.flush()

    transforms = get_config(dim)
    print("Got transforms. ")
    sys.stdout.flush()

    output_embedding_paths = []
    for i, transform in tqdm(enumerate(transforms)):
        func = transform[0]
        arglist = transform[1]

        new_emb_path = str(os.path.join(
            parent,
            "affine-" + str(i) + "__source--" + source_name
            + "__" + "time--" + timestamp + ".bin"))
        sys.stdout.flush()
        output_embedding_paths.append(new_emb_path)

        print("About to start generation.")
        sys.stdout.flush()

        transformed_vectors = func(vectors_matrix, arglist)

        # shape [<num_inputs>,<dimensions>]
        print("labels shape: ", label_df.shape)
        sys.stdout.flush()

        # creates the emb dict
        dist_emb_dict = {}
        for j in tqdm(range(len(label_df))):
            emb_array_row = transformed_vectors[j]
            dist_emb_dict.update({label_df[j]: emb_array_row})
        sys.stdout.flush()

        print("Embedding dict created. ")
        sys.stdout.flush()

        # saves the embedding
        pyemblib.write(dist_emb_dict, new_emb_path,
                       mode=pyemblib.Mode.Binary)
        print("Embedding saved to: " + new_emb_path)

    # Write the output embedding names to a text file.
    outputlist_name = ("affine-outputlist__source--" + source_name
                       + "__time--" + timestamp + ".txt")
    outputlist_path = os.path.join(parent, outputlist_name)
    with open(outputlist_path, 'w') as f:
        for path in output_embedding_paths:
            f.write(path + "\n")
    return
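# A minimal sketch of the transform list genflow() expects from get_config():
# a list of (function, arglist) pairs where each function maps the whole
# [num_inputs, dim] matrix to a transformed matrix. The scaling transform and
# its parameters below are illustrative assumptions, not the original config.
def _scale_transform(vectors_matrix, arglist):
    # arglist[0] is assumed to hold a scalar multiplier
    return vectors_matrix * arglist[0]

def example_get_config(dim):
    return [(_scale_transform, [0.5]),
            (_scale_transform, [2.0])]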
log.writeln('Wrote vocabulary to {0}.\nMax character length: {1:,}\n'.format(
    config['SemCor']['Vocab'], max_char_len
))

log.writeln('OVERRIDING max_char_len to 50!\n')
max_char_len = 50

bilm_params = ELMoParams()
bilm_params.options_file = config['ELMo']['Options']
bilm_params.weights_file = config['ELMo']['Weights']
bilm_params.vocab_file = config['SemCor']['Vocab']
bilm_params.max_char_len = max_char_len

t_sub = log.startTimer('Getting ELMo representations for SemCor senses...')
sense_embeddings = getELMoRepresentations(
    sentences_words,
    sentences_instances,
    semcor_labels,
    unique_sense_IDs,
    bilm_params
)
log.stopTimer(t_sub,
              message='Calculated embeddings for {0:,} senses in {1}s.\n'.format(
                  len(sense_embeddings), '{0:.2f}'
              ))

t_sub = log.startTimer('Writing sense embeddings to %s...'
                       % config['SemCor']['Embeddings'])
pyemblib.write(sense_embeddings, config['SemCor']['Embeddings'])
log.stopTimer(t_sub, message='Completed writing embeddings in {0:.2f}s.\n')

log.stop()
], 'Embedding filtering for WordNet classification experiments')

t_sub = log.startTimer('Reading input embeddings from %s...' % options.inputf)
embeddings = pyemblib.read(options.inputf)
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(embeddings), '{0:.2f}'
))

log.writeln('Reading vocabulary from dataset in %s...' % options.datasetf)
ds = dataset.load(options.datasetf)
vocab = set()
for (_, src, snk, _) in ds:
    vocab.add(src)
    vocab.add(snk)
log.writeln('Found {0:,} unique words in {1:,} samples.\n'.format(
    len(vocab), len(ds)
))

log.writeln('Filtering embeddings...')
filtered = pyemblib.Embeddings()
for (k, v) in embeddings.items():
    if k in vocab:
        filtered[k] = v
log.writeln('Reduced to {0:,} embeddings.\n'.format(len(filtered)))

log.writeln('Writing filtered embeddings to %s...' % options.outputf)
pyemblib.write(filtered, options.outputf, verbose=True)

log.writeln('Done.\n')
log.stop()