Example #1
def genflow(emb_path, emb_format, first_n):

    print_sleep_interval = 1 
    print("checkpoint 1")
    check_valid_file(emb_path)

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)

    # take the first n most frequent word vectors for a subset
    # set to 0 to take entire embedding
    first_n = 0

    # Preprocess. 
    vectors_matrix, label_df = process_embedding(emb_path,
                                                 emb_format,
                                                 first_n,
                                                 None)

    # We get the dimensions of the input dataset. 
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval) 
    sys.stdout.flush()

    # number of rows in the embedding 
    num_inputs = shape[0]
    num_outputs = num_inputs 

    # dimensionality of the embedding file
    num_hidden = shape[1]
 
    #===================================================================

    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%d-%H%M")
    
    # the name of the embedding to save
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)
    new_emb_path = str(
        os.path.join(parent, "random__source--" + source_name + "__" +
                     timestamp + ".bin"))
    print("Writing to: ", new_emb_path)

    # RUN THE EVAL PROCESS (writes the new embedding to new_emb_path)
    eval_process = mp.Process(name="eval",
                               target=epoch,
                               args=(vectors_matrix,
                                     label_df,
                                     new_emb_path))

    eval_process.start()    
    eval_process.join()

    return
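
# ----------------------------------------------------------------------
# A minimal sketch (not project code) of the pattern genflow uses above:
# build a timestamped output path next to the source embedding, then hand
# the heavy lifting to a child process. The names _do_work and
# _genflow_sketch are hypothetical; _do_work only stands in for the
# project's epoch() routine.
def _do_work(out_path):
    # placeholder for the real generation routine
    print("would write to", out_path)

def _genflow_sketch(emb_path):
    import os
    import datetime
    import multiprocessing as mp

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H%M")
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    out_path = os.path.join(
        parent, "random__source--" + source_name + "__" + timestamp + ".bin")

    # Run the generation in its own process and wait for it to finish.
    worker = mp.Process(name="eval", target=_do_work, args=(out_path,))
    worker.start()
    worker.join()
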
def neighborflow(emb_path, model_path, batch_size, epochs, learning_rate,
                 keep_prob, num_processes):

    print_sleep_interval = 1

    model_index_path = model_path + ".index"

    retrain = True

    check_valid_file(emb_path)
    if os.path.isfile(model_index_path):

        print("There is already a model saved with this name. ")
        time.sleep(print_sleep_interval)
        sys.stdout.flush()
        retrain = False

    # take the first $n$ most frequent word vectors for a subset
    # set to 0 to take entire embedding
    first_n = 0

    # neighborflow does not take a format argument, so we assume the
    # Word2Vec format used elsewhere in this module.
    emb_format = pyemblib.Format.Word2Vec
    vectors_matrix, label_df = process_embedding(emb_path, emb_format,
                                                 first_n, None)

    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # number of rows in the embedding
    num_inputs = shape[0]
    num_outputs = num_inputs

    # dimensionality of the embedding file
    num_hidden = shape[1]

    print("Learning rate is: ", learning_rate)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # keep_prob is the probability that a unit's output is kept by the
    # dropout layer; each input to the layer is zeroed with probability
    # 1 - keep_prob.
    print("Dropout layer keep_prob is: ", keep_prob)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # HYPERPARAMETERS
    num_batches = num_inputs // batch_size  # floor division
    print("Defining hyperparameters: ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Epochs: ", epochs)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Batch size: ", batch_size)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Number of batches: ", num_batches)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # clears the default graph stack
    tf.reset_default_graph()

    # PLACEHOLDER
    # "tf.float32" is a 32-bit floating-point dtype. The shape is given
    # as [<rows>, <columns>], and "None" means the row (batch) dimension
    # can be any size, so this placeholder accepts any number of rows,
    # each with "num_inputs" columns.
    print("Initializing placeholder. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    X = tf.placeholder(tf.float32, shape=[None, num_inputs])

    # WEIGHTS
    print("Initializing weights. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    # we use a variance scaling initializer so that it is capable of
    # adapting its scale to the shape of the weight tensors.
    initializer = tf.variance_scaling_initializer()
    input_weights = tf.Variable(initializer([num_inputs, num_hidden]),
                                dtype=tf.float32)
    output_weights = tf.Variable(initializer([num_hidden, num_outputs]),
                                 dtype=tf.float32)

    # BIAS
    input_bias = tf.Variable(tf.zeros(num_hidden))
    output_bias = tf.Variable(tf.zeros(num_outputs))

    # ACTIVATION
    act_func = tf.nn.relu

    print("Initializing layers and defining loss function. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    #===================================================================

    # LAYERS
    # The argument of act_func is a Tensor, and "hidden_layer" itself is
    # also a Tensor. This hidden layer applies the element-wise ReLU to
    # the affine map X @ input_weights + input_bias.

    hidden_layer = act_func(tf.matmul(X, input_weights) + input_bias)

    # With probability keep_prob, outputs the input element scaled up
    # by 1 / keep_prob, otherwise outputs 0. The scaling is so that the
    # expected sum is unchanged.
    dropout_layer = tf.nn.dropout(hidden_layer, keep_prob=keep_prob)
    output_layer = tf.matmul(dropout_layer, output_weights) + output_bias

    # We define our loss function: mean absolute error (L1), not MSE.
    loss_vectors = tf.abs(output_layer - X)
    reduce_mean = tf.reduce_mean(X)  # unused
    loss = tf.reduce_mean(tf.abs(output_layer - X))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train = optimizer.minimize(loss)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    # UNIT NORM THE EMBEDDING
    print("Unit norming the embedding. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    norms_matrix = np.linalg.norm(vectors_matrix, axis=1)
    norms_matrix[norms_matrix == 0] = 1
    vectors_matrix = vectors_matrix / np.expand_dims(norms_matrix, -1)
    print(vectors_matrix.shape)

    # we read the numpy array "vectors_matrix" into tf as a Tensor
    embedding_tensor = tf.constant(vectors_matrix)
    print("shape of emb_tens is: ", embedding_tensor.get_shape().as_list())
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    embedding_unshuffled = embedding_tensor
    emb_transpose_unshuf = tf.transpose(embedding_unshuffled)
    emb_transpose_unshuf = tf.cast(emb_transpose_unshuf, tf.float32)
    emb_transpose = tf.transpose(embedding_tensor)
    emb_transpose = tf.cast(emb_transpose, tf.float32)

    #===================================================================

    with open("loss_log_20K.txt", "a") as f:
        f.write("\n")
        f.write("=====================================================")
        f.write("\n")

    # This is where the dataset shuffler would go. Note that
    # tf.random_shuffle returns a new tensor rather than shuffling in
    # place, so as written this line has no effect on embedding_tensor.
    tf.random_shuffle(embedding_tensor)

    if retrain:

        for step in tqdm(range(epochs)):
            print("this is the ", step, "th epoch.")

            # we instantiate the queues
            seed_queue = mp.Queue()

            manager = mp.Manager()
            batch_queue = manager.Queue()

            # Each batch-generation process reads from the shared input
            # queue and writes to the shared output queue. What it reads
            # is just an integer corresponding to a batch iteration (see
            # the standalone sketch after this function).
            for iteration in tqdm(range(num_batches)):
                seed_queue.put(iteration)

            # put in "p" halt seeds to tell the processes when to end
            for i in range(3):
                seed_queue.put(-1)

            new_emb_path = ""

            # CREATE MATRIXMULT PROCESSES
            batch_args = (embedding_tensor, emb_transpose, label_df,
                          batch_size, seed_queue, batch_queue)
            print("About to start the batch processes. ")
            allprocs = [
                mkproc(next_batch, batch_args) for x in range(num_processes)
            ]

            # RUN THE TRAINING PROCESS
            train_process = mp.Process(
                name="train",
                target=epoch,
                args=(embedding_tensor, num_batches, step, batch_queue, train,
                      loss, loss_vectors, hidden_layer, X, init, saver,
                      model_path, new_emb_path, retrain))
            train_process.start()

            print("queue is full. ")

            # terminate the batch-generation processes
            for process in allprocs:
                process.terminate()

            # then join them so their resources are released
            for process in allprocs:
                process.join()

            print("batch generation functions joined. ")

            train_process.join()

            print("train joined. ")

    #===================================================================

    # program hangs when I try to run from saved model
    ''' 
    # Later, launch the model, use the saver to restore variables from 
    # disk, and do some work with the model.
    with tf.Session() as sess:
      
        # Restore variables from disk.
        saver.restore(sess, model_path)
        print("Model restored.")
        
    # Check the values of the variables
    print(embedding_tensor.shape)

    # hidden_out = hidden_layer.eval(feed_dict={X: })
    # for row in hidden_out:
        # print(row) 
    '''

    eval_batch_size = 100

    # HYPERPARAMETERS
    eval_num_batches = num_inputs // eval_batch_size  # floor division
    print("Defining hyperparameters: ")
    print("Eval batch size: ", eval_batch_size)
    print("Number of batches: ", eval_num_batches)

    # we instantiate the queue
    seed2_queue = mp.Queue()
    batch2_queue = mp.Queue()

    # Each batch-generation process reads from the shared input queue
    # and writes to the shared output queue. What it reads is just an
    # integer corresponding to a batch iteration.
    for iteration in tqdm(range(eval_num_batches)):
        seed2_queue.put(iteration)

    print("seed queue size: ", seed2_queue.qsize())

    # CREATE MATRIXMULT PROCESSES
    batch_args = (embedding_unshuffled, emb_transpose_unshuf, label_df,
                  eval_batch_size, seed2_queue, batch2_queue)
    print("About to start the batch processes. ")
    allprocs = [mkproc(next_batch, batch_args) for x in range(num_processes)]

    # the name of the embedding to save
    # something like "~/<path>/steve.txt"
    new_emb_path = "/homes/3/whitaker.213/eleven_embedding.txt"

    # "step" is only defined inside the retraining loop above; give it a
    # value here so the eval pass does not hit a NameError when that loop
    # was skipped.
    if not retrain:
        step = 0

    retrain = False

    # RUN THE EVALUATION PROCESS
    eval_process = mp.Process(name="eval",
                              target=epoch,
                              args=(embedding_unshuffled, eval_num_batches,
                                    step, batch2_queue, train, loss,
                                    loss_vectors, hidden_layer, X, init, saver,
                                    model_path, new_emb_path, retrain))
    eval_process.start()

    print("queue is full. ")

    # terminate the batch-generation processes
    for process in allprocs:
        process.terminate()

    # then join them so their resources are released
    for process in allprocs:
        process.join()

    eval_process.join()

    return
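
# ----------------------------------------------------------------------
# A minimal, self-contained sketch (not project code) of the seed-queue /
# halt-seed pattern neighborflow relies on: the parent enqueues one
# integer per batch plus one -1 sentinel per worker, and each worker pulls
# indices and pushes results until it sees the sentinel. _batch_worker,
# _queue_sketch, and the seed * seed stand-in for next_batch(...) are all
# illustrative names, not part of the project API.
def _batch_worker(seed_queue, batch_queue):
    # Pull batch indices until the -1 halt seed arrives.
    while True:
        seed = seed_queue.get()
        if seed == -1:
            break
        batch_queue.put(seed * seed)  # stand-in for next_batch(...)

def _queue_sketch(num_batches=8, num_workers=3):
    import multiprocessing as mp

    seed_queue = mp.Queue()
    manager = mp.Manager()
    batch_queue = manager.Queue()

    # One integer seed per batch, plus one halt seed per worker.
    for i in range(num_batches):
        seed_queue.put(i)
    for _ in range(num_workers):
        seed_queue.put(-1)

    workers = [mp.Process(target=_batch_worker,
                          args=(seed_queue, batch_queue))
               for _ in range(num_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()

    return [batch_queue.get() for _ in range(num_batches)]
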
def saveflow(emb_path, first_n):

    check_valid_file(emb_path)

    subset_embedding(emb_path, first_n, None)
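
# ----------------------------------------------------------------------
# saveflow defers to the project's subset_embedding() helper. As a rough
# illustration only (an assumption about what "take the first n" means
# here), this sketch keeps the first first_n entries of an already-loaded
# embedding dict, relying on the file's most-frequent-first ordering.
def _subset_sketch(embedding_dict, first_n):
    import itertools
    if first_n == 0:
        # 0 means "keep the entire embedding"
        return dict(embedding_dict)
    return dict(itertools.islice(embedding_dict.items(), first_n))
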
Example #4
def trainflow(emb_path, batch_size, epochs, learning_rate, keep_prob,
              num_processes):

    # Pandas behaves funny when batch_size of 1 is used.
    assert batch_size > 1

    emb_format = pyemblib.Format.Word2Vec
    print_sleep_interval = 0.5
    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)
    sys.stdout.flush()

    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%d-%H%M")

    # The name of the embedding to save.
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)

    model_path = "../AE_models/" + source_name + ".ckpt"

    # take the first n most frequent word vectors for a subset
    # set to 0 to take entire embedding
    first_n = 0

    model_index_path = model_path + ".index"

    new_emb_path = str(
        os.path.join(
            parent, "distAE-" + "__source--" + source_name + "__" + "time--" +
            timestamp + ".bin"))

    retrain = True

    check_valid_file(emb_path)
    if os.path.isfile(model_index_path):

        print("There is already a model saved with this name. ")
        time.sleep(print_sleep_interval)
        sys.stdout.flush()
        retrain = False

    # Take the first $n$ most frequent word vectors for a subset.
    # Set to 0 to take entire embedding.
    # Set size of distance vector target
    # (i.e. dimensionality of distance vectors).
    first_n = 10000

    dist_target, useless_labels = process_embedding(emb_path, emb_format,
                                                    first_n, None)

    vectors_matrix, label_df = process_embedding(emb_path, emb_format, 0, None)

    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # number of rows in the embedding
    num_inputs = shape[0]
    num_outputs = num_inputs

    # dimensionality of the embedding file
    num_hidden = shape[1]

    print("Learning rate is: ", learning_rate)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # keep_prob is the probability that a unit's output is kept by the
    # dropout layer; each input to the layer is zeroed with probability
    # 1 - keep_prob.
    print("Dropout layer keep_prob is: ", keep_prob)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # HYPERPARAMETERS
    num_batches = num_inputs // batch_size  # floor division
    print("Defining hyperparameters: ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Epochs: ", epochs)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Batch size: ", batch_size)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Number of batches: ", num_batches)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # clears the default graph stack
    tf.reset_default_graph()

    # PLACEHOLDER
    # "tf.float32" is a 32-bit floating-point dtype. The shape is given
    # as [<rows>, <columns>], and "None" means the row (batch) dimension
    # can be any size, so this placeholder accepts any number of rows.
    print("Initializing placeholder. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    # X = tf.placeholder(tf.float32, shape=[None, num_inputs])
    '''
    We used to have the above here, but we changed the dimensionality
    of the distance vectors to first_n (usually 10000) so that things
    run a bit faster. This reduces the target of our distance-vector
    computation to pairwise distances with the first_n most frequent
    words.
    '''
    X = tf.placeholder(tf.float32, shape=[None, first_n])

    # WEIGHTS
    print("Initializing weights. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # we use a variance scaling initializer so that it is capable of
    # adapting its scale to the shape of the weight tensors.
    initializer = tf.variance_scaling_initializer()
    '''
    input_weights = tf.Variable(initializer([num_inputs, num_hidden]), 
                                dtype=tf.float32)
    '''
    input_weights = tf.Variable(initializer([first_n, num_hidden]),
                                dtype=tf.float32)
    '''
    output_weights = tf.Variable(initializer([num_hidden, num_outputs]), 
                                 dtype=tf.float32)
    '''
    output_weights = tf.Variable(initializer([num_hidden, first_n]),
                                 dtype=tf.float32)

    # BIAS
    input_bias = tf.Variable(tf.zeros(num_hidden))

    #output_bias = tf.Variable(tf.zeros(num_outputs))
    output_bias = tf.Variable(tf.zeros(first_n))

    # ACTIVATION
    act_func = tf.nn.relu

    print("Initializing layers and defining loss function. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    #===================================================================

    # LAYERS
    # The argument of act_func is a Tensor, and "hidden_layer" itself is
    # also a Tensor. This hidden layer applies the element-wise ReLU to
    # the affine map X @ input_weights + input_bias.

    hidden_layer = act_func(tf.matmul(X, input_weights) + input_bias)

    # With probability keep_prob, outputs the input element scaled up
    # by 1 / keep_prob, otherwise outputs 0. The scaling is so that the
    # expected sum is unchanged.
    dropout_layer = tf.nn.dropout(hidden_layer, keep_prob=keep_prob)
    output_layer = tf.matmul(dropout_layer, output_weights) + output_bias

    # We define our loss function: mean absolute error (L1), not MSE.
    loss_vectors = tf.abs(output_layer - X)
    reduce_mean = tf.reduce_mean(X)  # unused
    loss = tf.reduce_mean(tf.abs(output_layer - X))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train = optimizer.minimize(loss)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    # UNIT NORM THE EMBEDDING
    print("Unit norming the embedding. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    norms_matrix = np.linalg.norm(vectors_matrix, axis=1)
    norms_matrix[norms_matrix == 0] = 1
    vectors_matrix = vectors_matrix / np.expand_dims(norms_matrix, -1)
    print(vectors_matrix.shape)

    # we read the numpy array "vectors_matrix" into tf as a Tensor
    # embedding_tensor = tf.constant(vectors_matrix)
    dist_target_tensor = tf.constant(dist_target)

    # Not doing this anymore due to memory constraints.
    embedding_tensor = vectors_matrix

    print("shape of emb_tens is: ", embedding_tensor.shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    embedding_unshuffled = np.copy(embedding_tensor)
    # emb_transpose_unshuf = np.transpose(embedding_unshuffled)
    # emb_transpose_unshuf = tf.cast(emb_transpose_unshuf, tf.float32)
    emb_transpose = tf.transpose(dist_target_tensor)
    emb_transpose = tf.cast(emb_transpose, tf.float32)

    #===================================================================

    with open("./logs/loss_log_" + source_name + ".txt", "w") as f:
        f.write("\n")
        f.write("=====================================================")
        f.write("\n")

    # Dataset shuffler.
    np.random.shuffle(embedding_tensor)

    if retrain:

        for step in tqdm(range(epochs)):
            print("this is the ", step, "th epoch.")

            with open("./logs/loss_log_" + source_name + ".txt", "w") as f:
                f.write("\n")
                f.write(
                    "=====================================================")
                f.write("\n")

            # we instantiate the queues
            seed_queue = mp.Queue()

            manager = mp.Manager()
            batch_queue = manager.Queue()

            # Each batch-generation process reads from the shared input
            # queue and writes to the shared output queue. What it reads
            # is just an integer corresponding to a batch iteration.
            for iteration in tqdm(range(num_batches)):
                seed_queue.put(iteration)

            # put in "p" halt seeds to tell the processes when to end
            for i in range(3):
                seed_queue.put(-1)

            # CREATE MATRIXMULT PROCESSES
            batch_args = (embedding_tensor, emb_transpose, label_df,
                          batch_size, seed_queue, batch_queue)
            print("About to start the batch processes. ")
            allprocs = [
                mkproc(next_batch, batch_args) for x in range(num_processes)
            ]

            # RUN THE TRAINING PROCESS
            train_process = mp.Process(
                name="train",
                target=epoch,
                args=(embedding_tensor, num_batches, step, batch_queue, train,
                      loss, loss_vectors, hidden_layer, X, init, saver,
                      model_path, new_emb_path, source_name, retrain))
            train_process.start()

            print("queue is full. ")

            # wait for the batch-generation processes to finish
            for process in allprocs:
                process.join()

            # then make sure they are terminated
            for process in allprocs:
                process.terminate()

            print("batch generation functions joined. ")

            train_process.join()

            print("train joined. ")

    #===================================================================
    # THIS PORTION IS FOR SAVING THE RESULTANT EMBEDDING.
    #===================================================================

    # NOTE: Program hangs when I try to run from saved model.
    ''' 
    # Later, launch the model, use the saver to restore variables from 
    # disk, and do some work with the model.
    with tf.Session() as sess:
      
        # Restore variables from disk.
        saver.restore(sess, model_path)
        print("Model restored.")
        
    # Check the values of the variables
    print(embedding_tensor.shape)

    # hidden_out = hidden_layer.eval(feed_dict={X: })
    # for row in hidden_out:
        # print(row) 
    '''

    eval_batch_size = batch_size

    # HYPERPARAMETERS
    eval_num_batches = num_inputs // eval_batch_size  # floor division
    print("Defining hyperparameters: ")
    print("Eval batch size: ", eval_batch_size)
    print("Number of batches: ", eval_num_batches)

    # we instantiate the queue
    seed2_queue = mp.Queue()
    batch2_queue = mp.Queue()

    # Each batch-generation process reads from the shared input queue
    # and writes to the shared output queue. What it reads is just an
    # integer corresponding to a batch iteration.
    for iteration in tqdm(range(eval_num_batches)):
        seed2_queue.put(iteration)

    # put in "p" halt seeds to tell the processes when to end
    for i in range(3):
        seed2_queue.put(-1)

    print("seed queue size: ", seed2_queue.qsize())

    # CREATE MATRIXMULT PROCESSES
    batch_args = (embedding_unshuffled, emb_transpose, label_df,
                  eval_batch_size, seed2_queue, batch2_queue)
    print("About to start the batch processes. ")
    allprocs = [mkproc(next_batch, batch_args) for x in range(num_processes)]

    # the name of the embedding to save
    # something like "~/<path>/steve.txt"
    # new_emb_path = "/homes/3/whitaker.213/eleven_embedding.txt"

    # Tells the program we want to save embedding vectors instead of
    # retrain model weights.
    retrain = False

    # First and only iteration.
    step = 0

    # RUN THE EVALUATION PROCESS
    eval_process = mp.Process(
        name="eval",
        target=epoch,
        args=(embedding_unshuffled, eval_num_batches, step, batch2_queue,
              train, loss, loss_vectors, hidden_layer, X, init, saver,
              model_path, new_emb_path, source_name, retrain))
    eval_process.start()

    print("queue is full. ")

    # wait for the batch-generation processes to finish
    for process in allprocs:
        process.join()

    # then make sure they are terminated
    for process in allprocs:
        process.terminate()

    eval_process.join()

    return
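
# ----------------------------------------------------------------------
# The unit-norming step used in both flows above, isolated as a small
# numpy sketch (illustrative only): rows with zero norm are left untouched
# instead of producing NaNs from a divide-by-zero.
def _unit_norm_rows(vectors_matrix):
    import numpy as np
    norms = np.linalg.norm(vectors_matrix, axis=1)
    norms[norms == 0] = 1  # avoid division by zero on all-zero rows
    return vectors_matrix / np.expand_dims(norms, -1)
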
def genflow(emb_path, emb_format, first_n):

    print_sleep_interval = 1
    print("checkpoint 1")
    check_valid_file(emb_path)
    sys.stdout.flush()

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)
    sys.stdout.flush()

    # take the first n most frequent word vectors for a subset
    # set to 0 to take entire embedding
    first_n = 0

    # Preprocess.
    print("About to preprocess. ")
    sys.stdout.flush()
    vectors_matrix, label_df = process_embedding(emb_path, emb_format, first_n,
                                                 None)
    print("Done preprocessing. ")
    sys.stdout.flush()
    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # number of rows in the embedding
    num_inputs = shape[0]
    num_outputs = num_inputs

    # dimensionality of the embedding file
    dim = shape[1]

    #===================================================================

    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%d-%H%M")

    # The name of the embedding to save.
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)

    print("Is anything happening here?")
    sys.stdout.flush()
    transforms = get_config(dim)
    print("Got transforms. ")
    sys.stdout.flush()

    output_embedding_paths = []

    for i, transform in tqdm(enumerate(transforms)):

        func = transform[0]
        arglist = transform[1]

        new_emb_path = str(
            os.path.join(
                parent, "affine-" + str(i) + "__source--" + source_name +
                "__" + "time--" + timestamp + ".bin"))
        sys.stdout.flush()
        output_embedding_paths.append(new_emb_path)

        print("About to start generation.")
        sys.stdout.flush()
        transformed_vectors = func(vectors_matrix, arglist)

        # transformed_vectors has shape [<num_inputs>, <dimensions>]
        print("labels shape: ", label_df.shape)
        sys.stdout.flush()

        # creates the emb dict
        dist_emb_dict = {}
        for row in tqdm(range(len(label_df))):
            emb_array_row = transformed_vectors[row]
            dist_emb_dict.update({label_df[row]: emb_array_row})
            sys.stdout.flush()

        print("Embedding dict created. ")
        sys.stdout.flush()

        # saves the embedding
        pyemblib.write(dist_emb_dict, new_emb_path, mode=pyemblib.Mode.Binary)

        print("Embedding saved to: " + new_emb_path)

    # Write the output embedding names to a text file.
    outputlist_name = ("affine-outputlist__source--" + source_name +
                       "__time--" + timestamp + ".txt")
    outputlist_path = os.path.join(parent, outputlist_name)
    with open(outputlist_path, 'w') as f:
        for path in output_embedding_paths:
            f.write(path + "\n")

    return
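
# ----------------------------------------------------------------------
# get_config(dim) and its (func, arglist) transforms are project-specific.
# As a hedged illustration of the kind of transform genflow iterates over,
# here is one possible affine map: multiply by a matrix A and shift by a
# bias b. _affine_transform and its arglist layout are assumptions, not
# the project's actual transforms.
def _affine_transform(vectors_matrix, arglist):
    import numpy as np
    A, b = arglist  # A: (dim, dim) matrix, b: (dim,) offset vector
    return vectors_matrix @ A + b

# Example usage (hypothetical): scale a small random embedding by 2.
# _vecs = np.random.rand(5, 3)
# _out = _affine_transform(_vecs, [2.0 * np.eye(3), np.zeros(3)])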