# Example #1 (score: 0)
def save_trained_embeddings():
    """Run every stored training-image embedding through the prediction model
    and pickle the results, one output file per input file.

    Output files that already exist under
    ./preprocessing/trained_image_embeddings/ are skipped, so an interrupted
    run can be resumed.
    """
    model = get_prediction_model()

    start_time = time.time()
    # Glob once and reuse the list — the original evaluated the same
    # pattern twice (once for len(), once for iteration).
    files = glob.glob("./preprocessing/stored_image_embeddings_train/*.pickle")
    tot = len(files)
    for count, file in enumerate(files):
        # os.path.basename/join are OS-portable, unlike splitting on '/'.
        store_path = os.path.join("./preprocessing/trained_image_embeddings",
                                  os.path.basename(file))
        if not os.path.isfile(store_path):
            image_dict = load_pickle_file(file)
            # Map each image filepath to the model's prediction on its
            # stored embedding.
            trained_image_embeddings = {
                image_filepath: model.predict(image_dict[image_filepath])
                for image_filepath in image_dict
            }
            save_pickle_file(trained_image_embeddings, store_path)
            print_progress(count,
                           tot,
                           prefix="Saving trained image embeddings")
        else:
            print("Skipping already created file", store_path)
    print("Time to save trained_embeddings: ", time.time() - start_time)
def run_word_preprocessing():
    """Ensure the label-embedding pickle exists, then load and return it.

    The expensive sentence conversion runs only when the pickle is missing.
    """
    embedding_path = "preprocessing/labels_embedding.pickle"
    train_labels = generate_dict_from_directory()

    if not os.path.isfile(embedding_path):
        convert_sentences(train_labels, settings.WORD_EMBEDDING_DIMENSION)

    return load_pickle_file(embedding_path)
# Example #3 (score: 0)
def create_cluster(vectors, cluster_path):
    """Return a MiniBatchKMeans clustering of *vectors*.

    A previously fitted cluster pickled at *cluster_path* is reused when
    present; otherwise a fresh one is fitted and saved there.
    """
    cached = load_pickle_file(cluster_path)
    if cached is not None:
        print("Loading saved cluster...")
        return cached

    print("Creating cluster...")
    fitted = MiniBatchKMeans(n_clusters=100, random_state=0, init_size=3000)
    fitted.fit(np.asarray(vectors))
    save_pickle_file(fitted, cluster_path)
    return fitted
# Example #4 (score: 0)
def prepare_training_data(labels_dictionary, location="./train/"):
    """Collect matching image/label vector pairs for one data split.

    :param labels_dictionary: maps image key (filename without .jpg) to its
        300-dim averaged label vector
    :param location: "./train/", "./test/" or "./validation/"
    :return: [image_vectors, label_vectors] — two parallel lists
    """
    image_vectors = []
    label_vectors = []

    # Extract the split name from the location path. The original
    # `location.split("/")[1]` returned "" for inputs lacking the "./"
    # prefix (e.g. "train/"); taking the first non-trivial component is
    # robust while staying identical for the documented inputs.
    parts = [p for p in location.split("/") if p not in ("", ".")]
    data_type = parts[0] if parts else ""
    for pickle_path in glob.glob("./preprocessing/stored_image_embeddings_" +
                                 data_type + "/*.pickle"):
        image_dictionary = load_pickle_file(pickle_path)

        for image in image_dictionary:
            if image in labels_dictionary:
                # [0] — stored embeddings appear to carry a leading batch
                # dimension; TODO confirm against the save-side format.
                image_vectors.append(image_dictionary[image][0])
                label_vectors.append(labels_dictionary[image])
    return [image_vectors, label_vectors]
# Example #5 (score: 0)
def get_cluster():
    """Load the pickled image-vector cluster, or None when it is missing."""
    # load_pickle_file already yields None on a miss, so its result can be
    # returned directly — no need for an explicit None check.
    return load_pickle_file("preprocessing/image_vector_cluster.pickle")
# Example #6 (score: 0)
    # NOTE(review): this fragment belongs to a function whose signature sits
    # above this chunk — presumably test(queries, location); `count`, `tot`
    # and `my_return_dict` must be initialized there. TODO confirm.
    cluster = get_cluster()
    model = get_prediction_model()
    for query in queries:

        # This is the image. Just opening if here for the fun of it; not used later
        # query_image = Image.open(location + '/pics/' + query + '.jpg')
        # query_image.show()

        # Generate a random list of 50 entries
        # cluster = [training_labels[random.randint(0, len(training_labels) - 1)] for idx in range(50)]
        # Embed the query image, run the embedding through the trained model,
        # then collect cluster filenames for the predicted vector —
        # presumably the 50 best matches; verify compare_to_cluster's contract.
        image_embedding = embed_image(location + '/pics/' + query + '.jpg')
        trained_image_embedding = predict_vector_on_model(
            image_embedding, model)
        cluster_filenames = compare_to_cluster(trained_image_embedding,
                                               cluster, 50)
        my_return_dict[query] = cluster_filenames
        print_progress(count, tot, prefix="Predicting images")
        count += 1
    return my_return_dict


if __name__ == "__main__":
    start_time = time.time()
    # train()
    labels_dict = load_pickle_file(
        "./validate/pickle/descriptions000000000.pickle")
    # Build the query ids: the path relative to "pics/" with the ".jpg"
    # suffix removed (e.g. "000000000/abc").
    query_ids = []
    for jpg_path in glob.glob("./validate/pics/000000000/*.jpg"):
        query_ids.append(jpg_path.split("pics/")[-1].split(".jpg")[0])
    predicted_images_dict = test(query_ids, "./validate")
    print("Time: ", time.time() - start_time)