def set_picture_label(embedding_id, new_label, cluster, cluster_dict):
    """Relabel one embedding by moving it out of *cluster* into a new
    single-embedding cluster carrying *new_label*.

    The new cluster (and the modified old one, if it still has members) is
    persisted via DBManager inside one connection-wrapped transaction. If the
    old cluster is emptied it is removed both from *cluster_dict* and the DB.
    On IncompleteDatabaseOperation all in-memory changes are rolled back and
    the exception is re-raised.

    :param embedding_id: Id of the embedding to relabel.
    :param new_label: Label for the newly created cluster.
    :param cluster: Cluster currently containing the embedding.
    :param cluster_dict: ClusterDict holding the known clusters; mutated
                         in place.
    :raises IncompleteDatabaseOperation: re-raised after rollback when the DB
                                         update does not complete.
    """
    # TODO: Refactor! Extract parts to DBManager?
    # TODO: Don't accept label if it's the same as the old one!
    new_cluster_id = DBManager.get_max_cluster_id() + 1
    embedding = cluster.get_embedding(embedding_id)
    cluster.remove_embedding_by_id(embedding_id)
    new_cluster = Cluster(new_cluster_id, [embedding], [embedding_id], new_label)
    cluster_dict.add_cluster(new_cluster)

    # Capture *once* whether removing the embedding emptied the old cluster.
    # Both the DB worker and the rollback below depend on this state, and the
    # rollback re-adds the embedding first, after which get_size() can no
    # longer tell us what happened.
    old_cluster_emptied = cluster.get_size() == 0
    if old_cluster_emptied:
        cluster_dict.remove_cluster(cluster)
        modified_clusters = ClusterDict([new_cluster])
    else:
        modified_clusters = ClusterDict([new_cluster, cluster])

    def set_pic_label_worker(con):
        # Runs inside DBManager.connection_wrapper with a shared connection.
        if old_cluster_emptied:
            # TODO: Remove cluster like that???
            embeddings_row_dicts = DBManager.remove_cluster(cluster, con=con,
                                                            close_connections=False)
            emb_id_to_face_dict = make_emb_id_to_face_dict_from_row_dicts(embeddings_row_dicts)
            emb_id_to_img_id_dict = make_emb_id_to_img_id_dict_from_row_dicts(embeddings_row_dicts)
        else:
            emb_id_to_face_dict = None
            emb_id_to_img_id_dict = None
        DBManager.store_clusters(modified_clusters,
                                 emb_id_to_face_dict=emb_id_to_face_dict,
                                 emb_id_to_img_id_dict=emb_id_to_img_id_dict,
                                 con=con, close_connections=False)
        DBManager.store_certain_labels(cluster=new_cluster, con=con,
                                       close_connections=False)

    try:
        DBManager.connection_wrapper(set_pic_label_worker)
    except IncompleteDatabaseOperation:
        # Roll back the in-memory mutations so cluster_dict mirrors the DB.
        cluster.add_embedding(embedding, embedding_id)
        # BUG FIX: the original tested `cluster.get_size() == 0` here, i.e.
        # *after* re-adding the embedding — that is never true, so a cluster
        # removed above was never restored. Use the state captured up front.
        if old_cluster_emptied:
            cluster_dict.add_cluster(cluster)
        cluster_dict.remove_cluster(new_cluster)
        raise
def cluster_embeddings_no_split(self, embeddings, embeddings_ids=None,
                                existing_clusters_dict=None,
                                should_reset_cluster_ids=False,
                                final_clusters_only=True):
    """
    Build clusters from face embeddings stored in the given path using the
    specified classification threshold. (Currently handled as: All embeddings
    closer than the distance given by the classification threshold are placed
    in the same cluster. If cluster_save_path is set, store the resulting
    clusters as directories in the given path.

    :param should_reset_cluster_ids: If true, ids of the existing clusters are
    reset before the next free cluster id is computed.
    :param embeddings: Iterable containing the embeddings. If embeddings_ids
    is None, must consist of (id, embedding)-pairs
    :param embeddings_ids: Ordered iterable with the embedding ids. Must be at
    least as long as embeddings.
    :param existing_clusters_dict: Optional ClusterDict whose clusters are
    extended in place; embeddings already present in it are skipped.
    :param final_clusters_only: If true, only the final iterable of clusters
    is returned. Otherwise, return that final iterable, as well as a list of
    modified/newly created and deleted clusters
    :return: ClusterDict, or (ClusterDict, modified ClusterDict, removed
    ClusterDict) when final_clusters_only is false. NOTE(review): nothing in
    this variant ever adds to removed_clusters_ids, so the third element is
    always empty here.
    """
    # TODO: Allow embeddings_ids to be none? Get next id via DB query?
    # TODO: Allow embeddings_ids to be shorter than embeddings and 'fill up'
    #       remaining ids?
    # embeddings = list(embeddings)
    if not embeddings:
        # Empty input: return the same shape of result as the non-empty path.
        if final_clusters_only:
            return ClusterDict()
        return ClusterDict(), ClusterDict(), ClusterDict()

    if embeddings_ids is None:
        # Caller already supplied (id, embedding)-pairs.
        embeddings_with_ids = embeddings
    else:
        # if len(embeddings) > len(embeddings_ids):
        #     raise ValueError(f'Too few ids for embeddings ({len(embeddings_ids)} passed, but {len(embeddings)}'
        #                      f' needed)')
        embeddings_with_ids = zip(embeddings_ids, embeddings)

    if existing_clusters_dict is None:
        existing_clusters_dict = ClusterDict()
    else:
        # # Don't iterate over embeddings in existing clusters
        # embeddings_with_ids = dict(embeddings_with_ids)
        # existing_embeddings = existing_clusters_dict.get_embeddings()
        # remove_multiple(embeddings_with_ids, existing_embeddings)
        # embeddings_with_ids = embeddings_with_ids.items()

        # Don't iterate over embeddings in existing clusters
        def exists_in_any_cluster(emb_id, _):
            return existing_clusters_dict.any_cluster_with_emb(emb_id)

        embeddings_with_ids = starfilterfalse(exists_in_any_cluster,
                                              embeddings_with_ids)

    cluster_dict = existing_clusters_dict
    if should_reset_cluster_ids:
        cluster_dict.reset_ids()
        next_cluster_id = cluster_dict.get_max_id() + 1
    else:
        # New ids must not collide with either in-memory or persisted ones.
        max_existing_id = cluster_dict.get_max_id()
        max_db_id = DBManager.get_max_cluster_id()
        next_cluster_id = max(max_existing_id, max_db_id) + 1

    embeddings_with_ids = list(embeddings_with_ids)
    # Fixed seed: processing order is randomized but deterministic across
    # runs, so clustering results are reproducible.
    random.seed(0)
    random.shuffle(embeddings_with_ids)

    modified_clusters_ids, removed_clusters_ids = set(), set()
    for counter, (embedding_id, new_embedding) in enumerate(embeddings_with_ids,
                                                            start=1):
        print_progress(counter, "embedding_id iteration")
        closest_clusters = self.get_closest_clusters(cluster_dict,
                                                     new_embedding)

        # find cluster containing the closest embedding to new_embedding
        shortest_emb_dist, closest_cluster = self.find_closest_cluster_to_embedding(
            closest_clusters, new_embedding)

        if shortest_emb_dist <= self.classification_threshold:
            # Close enough: absorb the embedding into the nearest cluster.
            closest_cluster.add_embedding(new_embedding, embedding_id)
            modified_clusters_ids.add(closest_cluster.cluster_id)
        else:
            # Too far from everything: start a fresh singleton cluster.
            new_cluster = Cluster(next_cluster_id, [new_embedding],
                                  [embedding_id])
            next_cluster_id += 1
            cluster_dict.add_cluster(new_cluster)
            modified_clusters_ids.add(new_cluster.cluster_id)

    if final_clusters_only:
        return cluster_dict

    modified_clusters = cluster_dict.get_clusters_by_ids(
        modified_clusters_ids)
    removed_clusters = cluster_dict.get_clusters_by_ids(
        removed_clusters_ids)
    return cluster_dict, ClusterDict(modified_clusters), ClusterDict(
        removed_clusters)
def split_cluster(cls, cluster_to_split, next_cluster_id=None,
                  ret_new_next_id=False):
    """
    Split cluster into two new clusters as follows:
    1. Find two embeddings e1, e2 in the cluster with the greatest distance
       between them.
    2. Create a new cluster C1, C2 for each of the two.
    3. For each embedding e of the remaining embeddings: Add e to the cluster
       (C1 or C2) whose center is closer to it.
    The given cluster must contain at least 2 embeddings.

    :param ret_new_next_id: If true, also return the next free cluster id
    after the two ids consumed here.
    :param next_cluster_id: First cluster id to assign; queried from the DB
    when None.
    :param cluster_to_split: Cluster to be split
    :return: Two new clusters containing embeddings of old one (and the next
    free cluster id when ret_new_next_id is true). Both inherit the old
    cluster's label.
    """
    # TODO: Does this fail due to bad analogy to low-dim. space?!
    embeddings_with_ids = cluster_to_split.get_embeddings(
        with_embeddings_ids=True, as_dict=True)
    # Seed the two new clusters with the most distant embedding pair.
    (emb1_id, cluster_start_emb1), (
        emb2_id, cluster_start_emb2
    ) = cls.find_most_distant_embeddings(embeddings_with_ids)
    # Drop the two seeds from the dict so only the remainder is partitioned.
    remove_multiple(embeddings_with_ids, [emb1_id, emb2_id])
    label = cluster_to_split.label
    if next_cluster_id is None:
        next_cluster_id = DBManager.get_max_cluster_id() + 1
    new_cluster1_id, new_cluster2_id = next_cluster_id, next_cluster_id + 1

    new_cluster1, new_cluster2 = (Cluster(new_cluster1_id,
                                          [cluster_start_emb1], [emb1_id],
                                          label=label),
                                  Cluster(new_cluster2_id,
                                          [cluster_start_emb2], [emb2_id],
                                          label=label))

    # NOTE(review): the decorators presumably adapt the predicate so it can be
    # called with an (emb_id, embedding) item, discarding the id — confirm
    # against spread_args_decorator / ignore_first_n_args_decorator.
    @spread_args_decorator
    @ignore_first_n_args_decorator(n=1)
    def is_closer_to_cluster1(emb):
        dist_to_cluster1 = new_cluster1.compute_dist_to_center(emb)
        dist_to_cluster2 = new_cluster2.compute_dist_to_center(emb)
        return dist_to_cluster1 < dist_to_cluster2

    def try_split(cluster_embs_with_ids, new_cluster):
        # Add a partition's embeddings to new_cluster; a partition may be
        # empty, in which case split_items yields nothing to unpack.
        split_result = split_items(cluster_embs_with_ids)
        try:
            cluster_embs_ids, cluster_embs = split_result
        except ValueError:  # not enough values to unpack
            pass
        else:
            new_cluster.add_embeddings(cluster_embs, cluster_embs_ids)

    # NOTE(review): assumes partition returns (falsy-pred items, truthy-pred
    # items) in that order, hence cluster2 first — verify against partition's
    # definition.
    cluster2_embs_with_ids, cluster1_embs_with_ids = partition(
        is_closer_to_cluster1, embeddings_with_ids.items())
    try_split(cluster1_embs_with_ids, new_cluster1)
    try_split(cluster2_embs_with_ids, new_cluster2)
    new_clusters = (new_cluster1, new_cluster2)
    if ret_new_next_id:
        return new_clusters, new_cluster2_id + 1
    return new_clusters