Example #1
def epoch(vectors_matrix, labels_df, new_emb_path):

    name = mp.current_process().name
    print(name, 'Starting')
    sys.stdout.flush()

    # shape [<num_inputs>,<dimensions>]
    rand_emb_array = []

    for i in range(len(vectors_matrix)):
        vec = np.random.rand(len(vectors_matrix[0]))
        vec = vec / np.linalg.norm(vec)
        rand_emb_array.append(vec)

    print("labels shape: ", labels_df.shape)
    
    # creates the emb dict
    dist_emb_dict = {}
    for i in tqdm(range(len(labels_df))):
        emb_array_row = rand_emb_array[i]
        dist_emb_dict.update({labels_df[i]:emb_array_row})

    # saves the embedding
    pyemblib.write(dist_emb_dict, 
                   new_emb_path, 
                   mode=pyemblib.Mode.Binary)

    print("Embedding saved to: " + new_emb_path)
 
    print(name, 'Exiting')
    return
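A minimal sketch of how this epoch function might be driven, assuming it lives in a module whose own imports (mp, sys, np, tqdm, pyemblib) are already in place; the matrix, labels, and output path below are placeholders, not values from the original code:

import multiprocessing as mp
import numpy as np
import pandas as pd

if __name__ == "__main__":
    vectors_matrix = np.random.rand(100, 50)                  # stand-in embedding matrix
    labels = pd.Series(["word%d" % k for k in range(100)])    # stand-in vocabulary
    p = mp.Process(target=epoch,
                   name="random-baseline",
                   args=(vectors_matrix, labels, "random_baseline.bin"))
    p.start()
    p.join()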
Example #2
def subset_embedding(emb_path, first_n, vocab):
    
    # Hard coding to save time. 
    emb_format = pyemblib.Format.Word2Vec
    embedding = read(emb_path, emb_format, first_n)        

    # make sure it has a valid file extension
    extension = os.path.basename(emb_path).split('.')[-1]
    if extension != "txt" and extension != "bin":
        print("Invalid file path. ")
        exit()
  
    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)
 
    # the name of the embedding to save
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)
    new_emb_path =  str(os.path.join(parent, "first-" + str(first_n) + "__source--" + source_name + ".txt"))
    print("Writing to: ", new_emb_path)
 
    # write to text embedding file
    pyemblib.write(embedding, 
                   new_emb_path, 
                   mode=pyemblib.Mode.Text)
    
    return 
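A hedged usage sketch for the function above; the path is a placeholder, and the read() and check_valid_dir() helpers it calls are assumed to be defined elsewhere in the same module:

# Hypothetical call: keep the 10,000 most frequent vectors of a Word2Vec text file.
subset_embedding("/data/embeddings/glove.txt", 10000, None)
# Based on the naming scheme above, the subset is written to:
#   /data/embeddings/first-10000__source--glove.txt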
Example #3
def main(emb_path, dest_path, mode):

    embedding = read_embedding(emb_path)
    if mode == "txt":
        pyemblib.write(embedding, dest_path, mode=pyemblib.Mode.Text)
    elif mode == "bin":
        pyemblib.write(embedding, dest_path, mode=pyemblib.Mode.Binary)
    else:
        print("Mode (third argument) must be \"txt\" or \"bin\".")
Example #4
def subset_embedding(emb_path, first_n, vocab):

    print("Preprocessing. ")
    file_name_length = len(emb_path)
    last_char = emb_path[file_name_length - 1]

    # Decide if it's a binary or text embedding file, and read in
    # the embedding as a dict object, where the keys are the tokens
    # (strings), and the values are the components of the corresponding
    # vectors (floats).
    embedding = {}
    if (last_char == 'n'):
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Binary,
                                  first_n=first_n)
    elif (last_char == 't'):
        embedding = pyemblib.read(emb_path,
                                  mode=pyemblib.Mode.Text,
                                  first_n=first_n)
    else:
        print("Unsupported embedding format. ")
        exit()

    # make sure it has a valid file extension
    extension = emb_path[file_name_length - 4:file_name_length]
    if extension != ".txt" and extension != ".bin":
        print("Invalid file path. ")
        exit()

    # get the emb_path without the file extension
    path_no_ext = emb_path[0:file_name_length - 4]
    new_path = path_no_ext + "_SUBSET.txt"

    # write to text embedding file
    pyemblib.write(embedding, new_path, mode=pyemblib.Mode.Text)

    return
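The last-character check above works for paths ending in .bin or .txt; a slightly more explicit variant based on the full extension, shown here only as a sketch:

import os
import pyemblib

def detect_mode(emb_path):
    # Map the file extension to a pyemblib read mode.
    ext = os.path.splitext(emb_path)[1].lower()
    if ext == ".bin":
        return pyemblib.Mode.Binary
    if ext == ".txt":
        return pyemblib.Mode.Text
    raise ValueError("Unsupported embedding format: " + emb_path)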
Example #5
def loopflow(target_list_path):
    with open(target_list_path, encoding='utf-8', errors='ignore') as f:
        target_list = f.readlines()

    for i, target in enumerate(target_list):
        # strip the trailing newline/whitespace before resolving the path
        target = os.path.abspath(target.strip())
        basename = os.path.basename(target)
        parent = os.path.abspath(os.path.join(target, '../'))
        extension = target.split('.')[-1]

        words, vectors = _readBin(target)

        lower_keys = False
        wordmap = Embeddings()
        for j in range(len(words)):
            if lower_keys: key = words[j].lower()
            else: key = words[j]
            wordmap[key] = vectors[j]

        save_name = os.path.join(parent, 'parse-error-fix_' + basename)
        pyemblib.write(wordmap, save_name, mode=pyemblib.Mode.Binary)
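loopflow expects a plain-text file with one embedding path per line; a minimal sketch of preparing such a list and running it (the paths are placeholders, and the _readBin and Embeddings helpers are assumed to be defined in the same module):

# Hypothetical target list: one binary embedding path per line.
with open("targets.txt", "w", encoding="utf-8") as f:
    f.write("/data/embeddings/broken_a.bin\n")
    f.write("/data/embeddings/broken_b.bin\n")

loopflow("targets.txt")
# Each input produces a sibling file named parse-error-fix_<basename>.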
Example #6
def nn(embedding_tensor, num_batches, step, batch_queue, train, loss,
       loss_vectors, hidden_layer, X, init, saver, model_path, new_emb_path,
       retrain):

    name = mp.current_process().name
    print(name, 'Starting')
    sys.stdout.flush()
    with tf.Session() as sess:

        # initializes all the variables that have been created
        sess.run(init)

        # list of slices which compose the new embedding
        embedding_slices = []
        label_slices = []

        # just can't be -1
        batch = np.zeros((5, 5))
        total_error = 0
        batches_completed = 0
        print("number of batches: ", num_batches)

        while True:

            batch_loss = 0
            batch, slice_df = batch_queue.get()

            # break for halt batch
            # be careful not to check for np.array but for np.ndarray!
            if not isinstance(batch, np.ndarray):
                print("Found the halt batch. ")
                batch, slice_df = batch_queue.get()
                batch, slice_df = batch_queue.get()
                break
            print("Batches completed: ", batches_completed)
            batches_completed = batches_completed + 1
            sys.stdout.flush()

            if retrain:
                sess.run(train, feed_dict={X: batch})
                err_vectors = loss_vectors.eval(feed_dict={X: batch})
                for j in range(len(err_vectors)):
                    # get the loss value for the jth distance vector
                    # in the batch
                    err_vector = err_vectors[j]
                    # print("errvector shape,",err_vector.shape)

                    # convert shape from (n,1) to (1,n)
                    err_vector = np.asarray([err_vector])

                    # get the sum of the loss over that distance vector
                    loss_val = np.sum(err_vector)

                    # add to total loss for entire vocab
                    total_error += loss_val
                    batch_loss += loss_val

                # when we put "batch" in the feed dict, it uses it
                # wherever there is an "X" in the definition of "loss" OR
                # in the definition of any tf function that "loss" calls.
                # err = loss.eval(feed_dict={X: batch})
                # print("\tLoss:", err)

                with open("loss_log_20K.txt", "a") as f:
                    f.write(str(batch_loss) + "\n")
            else:
                # slice of the output from the hidden layer
                hidden_out_slice = hidden_layer.eval(feed_dict={X: batch})
                embedding_slices.append(hidden_out_slice)

                # add the slice of labels that corresponds to the batch
                label_slices.append(slice_df)

        if retrain:
            ''' 
            print("Printing total loss. ")
            with open("loss_log_20K.txt","a") as f:
                f.write("Total Loss for epoch " 
                        + str(step) + ": " + str(total_error) + "\n")
            '''

            # save_path = saver.save(sess,"../model_small.ckpt")
            save_path = saver.save(sess, model_path)
            print("Model saved in path: %s" % save_path)
        else:

            # makes dist_emb_array a 3-dimensional array
            dist_emb_array = np.stack(embedding_slices)

            # concatenates the first dimension, so dist_emb_array has
            # shape [<num_inputs>,<dimensions>]
            dist_emb_array = np.concatenate(dist_emb_array)

            # concatenates the list of pandas Series containing the words
            # that correspond to the new vectors in "dist_emb_array"
            labels = pd.concat(label_slices)
            print("labels shape: ", labels.shape)
            print("dist_emb_array shape: ", dist_emb_array.shape)

            # creates the emb dict
            dist_emb_dict = {}
            for i in tqdm(range(len(labels))):
                emb_array_row = dist_emb_array[i]
                dist_emb_dict.update({labels[i]: emb_array_row})

            # saves the embedding
            pyemblib.write(dist_emb_dict, new_emb_path, mode=pyemblib.Mode.Text)

    while not batch_queue.empty():
        try:
            batch_queue.get(timeout=0.001)
        except Exception:
            pass

    print(name, 'Exiting')
    return
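The loop above stops once it pulls something that is not an np.ndarray from batch_queue and then drains two more items, which suggests each producer enqueues one non-array sentinel after its real batches. A minimal sketch of that assumed contract on the producer side (batch shapes, labels, and the sentinel count are placeholders):

import multiprocessing as mp
import numpy as np
import pandas as pd

batch_queue = mp.Queue()
for start in range(0, 100, 10):
    batch = np.random.rand(10, 50)                                   # stand-in batch
    labels = pd.Series(["w%d" % k for k in range(start, start + 10)])
    batch_queue.put((batch, labels))
# One sentinel per producer; anything that is not an np.ndarray works.
batch_queue.put((None, None))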
Example #7
        return options

    options = _cli()

    log.start(options.logfile)
    log.writeConfig([
        ('Input embeddings', options.inputf),
        ('Vocabulary file', options.vocabf),
        ('Output embeddings', options.outputf),
        ('Output embeddings format', options.output_format),
    ])

    log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
    e = pyemblib.read(options.inputf,
                      format=pyemblib.Format.Word2Vec,
                      mode=pyemblib.Mode.Text)
    log.stopTimer(
        message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

    log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
    vocab = readVocab(options.vocabf)
    log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

    e = {vocab[int(k)]: v for (k, v) in e.items()}
    log.writeln('Writing remapped embeddings to %s...' % options.outputf)
    (fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
    pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
    log.writeln('Done!')

    log.stop()
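readVocab is not shown in this example; given the remapping e = {vocab[int(k)]: v, ...}, it presumably returns a dict from integer node IDs to token strings. A hypothetical sketch, assuming one whitespace-separated "id token" pair per line:

def readVocab(vocabf):
    # Hypothetical format: "<node_id> <token>" on each line.
    vocab = {}
    with open(vocabf, encoding='utf-8') as f:
        for line in f:
            node_id, token = line.strip().split(None, 1)
            vocab[int(node_id)] = token
    return vocab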
Example #8
                      len(definitions))

        log.write('Constructing entity definition representations...')
        entity_defn_embeds = embedDefinitions(definitions, word_embeds)
        #del(word_embeds)
        log.writeln('Embedded %d entity definitions.' %
                    len(entity_defn_embeds))

        if options.entity_dualf:
            dual_embeds = pyemblib.Embeddings()
            for (k, v) in entity_defn_embeds.items():
                if k in entity_embeds:
                    dual_embeds[k] = np.concatenate([entity_embeds[k], v])
            log.writeln('Writing both versions of entity embeddings to %s...' %
                        options.entity_dualf)
            pyemblib.write(dual_embeds, options.entity_dualf)
            log.writeln('Wrote %d dual embeddings.' % len(dual_embeds))
    else:
        entity_defn_embeds = None

    if options.stringsf:
        t_sub = log.startTimer('Reading preferred strings from %s...' %
                               options.stringsf)
        preferred_strings = readPreferredStrings(options.stringsf)
        log.stopTimer(t_sub,
                      message='Read %d strings ({0:.2f}s)' %
                      len(preferred_strings))
    else:
        preferred_strings = None

    if options.polysemyf:
Example #9
                                 ('Number of validated pivots',
                                  len(validated_pivots)),
                                 ('Checkpoint file', options.checkpointf),
                                 ('Model settings',
                                  OrderedDict([
                                      ('Random seed', options.random_seed),
                                      ('Number of layers', options.num_layers),
                                      ('Activation', options.activation),
                                      ('Number of folds', options.num_folds),
                                      ('Batch size', options.batch_size),
                                  ]))
                             ])

    log.writeln('Training manifold mapper...')
    mapped_embs = crossfoldTrain(src_embs,
                                 trg_embs,
                                 validated_pivots,
                                 options.num_folds,
                                 options.activation,
                                 options.num_layers,
                                 batch_size=options.batch_size,
                                 checkpoint_file=options.checkpointf,
                                 random_seed=options.random_seed)

    if options.outf:
        log.writeln('Writing mapped embeddings to %s' % options.outf)
        pyemblib.write(mapped_embs,
                       options.outf,
                       verbose=True,
                       mode=options.out_embf_mode)
Example #10
def testFunc():
    print('testFunc')


print("Preprocessing. ")
file_name_length = len(emb_path)
last_char = emb_path[file_name_length - 1]

# Decide if it's a binary or text embedding file, and read in
# the embedding as a dict object, where the keys are the tokens
# (strings), and the values are the components of the corresponding
# vectors (floats).
embedding = {}
if (last_char == 'n'):
    embedding = pyemblib.read(emb_path,
                              mode=pyemblib.Mode.Binary,
                              replace_errors=True)
elif (last_char == 't'):
    embedding = pyemblib.read(emb_path,
                              mode=pyemblib.Mode.Text,
                              replace_errors=True)
else:
    print("Unsupported embedding format. ")
    exit()

print("Source: ", emb_path)

parent = os.path.abspath(emb_path + "/../")
source_name = os.path.splitext(os.path.basename(emb_path))[0]
dest_path = os.path.join(parent, source_name + "_clean.bin")
pyemblib.write(embedding, dest_path, mode=pyemblib.Mode.Binary)
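The module-level block above assumes emb_path, os, and pyemblib are already in scope; a hypothetical preamble for it could look like this (taking the path from the command line is an assumption, not part of the original script):

import os
import sys
import pyemblib

emb_path = sys.argv[1]   # e.g. /data/embeddings/noisy.bin
# The block above then writes /data/embeddings/noisy_clean.bin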
Example #11
def epoch(embedding_tensor, num_batches, step, batch_queue, train,
          loss, loss_vectors, hidden_layer, X, init, saver, model_path,
          new_emb_path, retrain, num_processes):
 
    name = mp.current_process().name
    print(name, 'Starting')
    sys.stdout.flush()
    with tf.Session() as sess:
         
        # initializes all the variables that have been created
        sess.run(init)
        
        # list of slices which compose the new embedding
        embedding_slices = []
        label_slices = []

        # just can't be -1
        batch = np.zeros((5,5))
        total_error = 0
        batches_completed = 0
        print("number of batches: ", num_batches)
        halts = 0        


        while True:

            batch_loss = 0
            print("about to try to grab")
            sys.stdout.flush()
            batch,slice_df = batch_queue.get()
            
            # break for halt batch
            # be careful not to check for np.array but for np.ndarray!
            if not isinstance(batch, np.ndarray):
                print("Found a halt batch. ")
                halts += 1
                if halts >= num_processes:
                    break
                else:
                    # skip to next iteration of while loop
                    continue
            
            print("Batches grabbed: ", batches_completed) 
            batches_completed = batches_completed + 1
            sys.stdout.flush()

            embedding_slices.append(batch)

            # add the slice of labels that corresponds to the batch
            label_slices.append(slice_df)

        # makes dist_emb_array a 3-dimensional array 
        dist_emb_array = np.stack(embedding_slices)
        
        # concatenates the first dimension, so dist_emb_array has 
        # shape [<num_inputs>,<dimensions>]
        dist_emb_array = np.concatenate(dist_emb_array)

        # concatenates the list of pandas Series containing the words
        # that correspond to the new vectors in "dist_emb_array"
        labels = pd.concat(label_slices)
        print("labels shape: ", labels.shape)
        print("dist_emb_array shape: ", dist_emb_array.shape)
        
        # creates the emb dict
        dist_emb_dict = {}
        for i in tqdm(range(len(labels))):
            emb_array_row = dist_emb_array[i]
            dist_emb_dict.update({labels[i]:emb_array_row})

        # saves the embedding
        pyemblib.write(dist_emb_dict, 
                       new_emb_path, 
                       mode=pyemblib.Mode.Text)

    while not batch_queue.empty():
        try:
            batch_queue.get(timeout=0.001)
        except Exception:
            pass
 
    print(name, 'Exiting')
    return
Example #12
def genflow(emb_path, emb_format, first_n):

    print_sleep_interval = 1
    print("checkpoint 1")
    check_valid_file(emb_path)
    sys.stdout.flush()

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)
    sys.stdout.flush()

    # take the first n most frequent word vectors for a subset
    # set to 0 to take entire embedding
    first_n = 0

    # Preprocess.
    print("About to preprocess. ")
    sys.stdout.flush()
    vectors_matrix, label_df = process_embedding(emb_path, emb_format, first_n,
                                                 None)
    print("Done preprocessing. ")
    sys.stdout.flush()
    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # number of rows in the embedding
    num_inputs = shape[0]
    num_outputs = num_inputs

    # dimensionality of the embedding file
    dim = shape[1]

    #===================================================================

    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%d-%H%M")

    # The name of the embedding to save.
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)

    print("Is anything happening here?")
    sys.stdout.flush()
    transforms = get_config(dim)
    print("Got transforms. ")
    sys.stdout.flush()

    output_embedding_paths = []

    for i, transform in tqdm(enumerate(transforms)):

        func = transform[0]
        arglist = transform[1]

        new_emb_path = str(
            os.path.join(
                parent, "affine-" + str(i) + "__source--" + source_name +
                "__" + "time--" + timestamp + ".bin"))
        sys.stdout.flush()
        output_embedding_paths.append(new_emb_path)

        print("About to start generation.")
        sys.stdout.flush()
        transformed_vectors = func(vectors_matrix, arglist)

        # shape [<num_inputs>,<dimensions>]
        print("labels shape: ", label_df.shape)
        sys.stdout.flush()

        # creates the emb dict
        dist_emb_dict = {}
        for j in tqdm(range(len(label_df))):
            emb_array_row = transformed_vectors[j]
            dist_emb_dict.update({label_df[j]: emb_array_row})
        sys.stdout.flush()

        print("Embedding dict created. ")
        sys.stdout.flush()

        # saves the embedding
        pyemblib.write(dist_emb_dict, new_emb_path, mode=pyemblib.Mode.Binary)

        print("Embedding saved to: " + new_emb_path)

    # Write the output embedding names to a text file.
    outputlist_name = "affine-outputlist__source--" + source_name + "__time--" + timestamp + ".txt"
    outputlist_path = os.path.join(parent, outputlist_name)
    with open(outputlist_path, 'w') as f:
        for path in output_embedding_paths:
            f.write(path + "\n")

    return
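genflow treats each entry of get_config(dim) as a (function, argument_list) pair and calls func(vectors_matrix, arglist); a hypothetical sketch of such a config, with placeholder transforms rather than the original ones:

import numpy as np

def get_config(dim):
    # Hypothetical: each entry is (transform_function, argument_list).
    def scale(vectors, args):
        return vectors * args[0]
    def translate(vectors, args):
        return vectors + np.full(dim, args[0])
    return [(scale, [2.0]), (translate, [0.5])]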
Example #13
    log.writeln('Wrote vocabulary to {0}.\nMax character length: {1:,}\n'.format(
        config['SemCor']['Vocab'], max_char_len
    ))

    log.writeln('OVERRIDING max_char_len to 50!\n')
    max_char_len = 50

    bilm_params = ELMoParams()
    bilm_params.options_file = config['ELMo']['Options']
    bilm_params.weights_file = config['ELMo']['Weights']
    bilm_params.vocab_file = config['SemCor']['Vocab']
    bilm_params.max_char_len = max_char_len

    t_sub = log.startTimer('Getting ELMo representations for SemCor senses...')
    sense_embeddings = getELMoRepresentations(
        sentences_words,
        sentences_instances,
        semcor_labels,
        unique_sense_IDs,
        bilm_params
    )
    log.stopTimer(t_sub, message='Calculated embeddings for {0:,} senses in {1}s.\n'.format(
        len(sense_embeddings), '{0:.2f}'
    ))

    t_sub = log.startTimer('Writing sense embeddings to %s...' % config['SemCor']['Embeddings'])
    pyemblib.write(sense_embeddings, config['SemCor']['Embeddings'])
    log.stopTimer(t_sub, message='Completed writing embeddings in {0:.2f}s.\n')
    
    log.stop()
Example #14
    ], 'Embedding filtering for WordNet classification experiments')

    t_sub = log.startTimer('Reading input embeddings from %s...' % options.inputf)
    embeddings = pyemblib.read(options.inputf)
    log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
        len(embeddings), '{0:.2f}'
    ))

    log.writeln('Reading vocabulary from dataset in %s...' % options.datasetf)
    ds = dataset.load(options.datasetf)
    vocab = set()
    for (_, src, snk, _) in ds:
        vocab.add(src)
        vocab.add(snk)
    log.writeln('Found {0:,} unique words in {1:,} samples.\n'.format(
        len(vocab), len(ds)
    ))

    log.writeln('Filtering embeddings...')
    filtered = pyemblib.Embeddings()
    for (k,v) in embeddings.items():
        if k in vocab:
            filtered[k] = v
    log.writeln('Reduced to {0:,} embeddings.\n'.format(len(filtered)))

    log.writeln('Writing filtered embeddings to %s...' % options.outputf)
    pyemblib.write(filtered, options.outputf, verbose=True)
    log.writeln('Done.\n')

    log.stop()