def save_trained_embeddings():
    """Run every stored training-image embedding through the prediction model
    and pickle the resulting "trained" embeddings, one output file per input
    file.  Output files that already exist are skipped so the conversion can
    be resumed cheaply.
    """
    model = get_prediction_model()
    start_time = time.time()
    count = 0
    # Hoist the glob: the original evaluated the same pattern twice,
    # scanning the directory once for the count and once for iteration.
    embedding_files = glob.glob(
        "./preprocessing/stored_image_embeddings_train/*.pickle")
    tot = len(embedding_files)
    for file in embedding_files:
        # os.path.basename is portable; splitting on '/' breaks on Windows.
        store_path = ("./preprocessing/trained_image_embeddings/"
                      + os.path.basename(file))
        trained_image_embeddings = {}
        if not os.path.isfile(store_path):
            image_dict = load_pickle_file(file)
            for image_filepath in image_dict:
                trained_image_embeddings[image_filepath] = model.predict(
                    image_dict[image_filepath])
            save_pickle_file(trained_image_embeddings, store_path)
            print_progress(count, tot, prefix="Saving trained image embeddings")
        else:
            print("Skipping already created file", store_path)
        count += 1
    print("Time to save trained_embeddings: ", time.time() - start_time)
def run_word_preprocessing():
    """Ensure the label word-embedding pickle exists, then load and return it.

    On the first run the embeddings are generated from the training labels;
    subsequent runs simply deserialize the cached pickle.
    """
    embedding_path = "preprocessing/labels_embedding.pickle"
    labels = generate_dict_from_directory()
    # Only pay the conversion cost when the cache file is missing.
    if not os.path.isfile(embedding_path):
        convert_sentences(labels, settings.WORD_EMBEDDING_DIMENSION)
    return load_pickle_file(embedding_path)
def create_cluster(vectors, cluster_path):
    """Return a fitted MiniBatchKMeans clustering of *vectors*.

    A previously fitted cluster pickled at *cluster_path* is reused when
    present; otherwise a new one is fitted and persisted there.
    """
    cached = load_pickle_file(cluster_path)
    if cached is not None:
        print("Loading saved cluster...")
        return cached
    print("Creating cluster...")
    kmeans = MiniBatchKMeans(n_clusters=100, random_state=0, init_size=3000)
    kmeans.fit(np.asarray(vectors))
    # Persist so later runs skip the expensive fit.
    save_pickle_file(kmeans, cluster_path)
    return kmeans
def prepare_training_data(labels_dictionary, location="./train/"):
    """
    Collect parallel lists of image vectors and their label vectors.

    :param labels_dictionary: dictionary of labels (key: filename without
        .jpg, value: 300dim averaged label vector)
    :param location: ./train/ ./test/ ./validation/
    :return: [image_vectors, label_vectors] — two parallel lists containing
        only images that have a matching entry in labels_dictionary
    """
    images = []
    labels = []
    # './train/' -> 'train', selecting the matching embeddings directory.
    data_type = location.split("/")[1]
    pattern = ("./preprocessing/stored_image_embeddings_"
               + data_type + "/*.pickle")
    for pickle_path in glob.glob(pattern):
        embeddings = load_pickle_file(pickle_path)
        for name, vector in embeddings.items():
            if name in labels_dictionary:
                images.append(vector[0])
                labels.append(labels_dictionary[name])
    return [images, labels]
def get_cluster():
    """Return the pickled image-vector cluster, or None when none is saved.

    load_pickle_file already yields None for a missing file, so its result
    can be returned directly without an explicit None check.
    """
    return load_pickle_file("preprocessing/image_vector_cluster.pickle")
    # NOTE(review): this is the tail of a prediction routine whose `def` lies
    # above this chunk; `queries`, `location`, `count`, `tot` and
    # `my_return_dict` are bound earlier in that function — confirm upstream.
    cluster = get_cluster()
    model = get_prediction_model()
    for query in queries:
        # This is the image. Just opening if here for the fun of it; not used later
        # query_image = Image.open(location + '/pics/' + query + '.jpg')
        # query_image.show()
        # Generate a random list of 50 entries
        # cluster = [training_labels[random.randint(0, len(training_labels) - 1)] for idx in range(50)]
        image_embedding = embed_image(location + '/pics/' + query + '.jpg')
        # Map the raw image embedding into the trained embedding space.
        trained_image_embedding = predict_vector_on_model(
            image_embedding, model)
        # Keep the 50 cluster members closest to the trained embedding.
        cluster_filenames = compare_to_cluster(trained_image_embedding,
                                               cluster, 50)
        my_return_dict[query] = cluster_filenames
        print_progress(count, tot, prefix="Predicting images")
        count += 1
    return my_return_dict


if __name__ == "__main__":
    start_time = time.time()
    # train()
    # NOTE(review): labels_dict is loaded but never used below — verify intent.
    labels_dict = load_pickle_file(
        "./validate/pickle/descriptions000000000.pickle")
    # Strip directory prefix and '.jpg' suffix to get bare query names,
    # e.g. './validate/pics/000000000/x.jpg' -> '000000000/x'.
    predicted_images_dict = test(
        [(f.split("pics/")[-1]).split(".jpg")[0]
         for f in glob.glob("./validate/pics/000000000/*.jpg")],
        "./validate")
    print("Time: ", time.time() - start_time)