Esempio n. 1
0
def set_picture_label(embedding_id, new_label, cluster, cluster_dict):
    """
    Move one embedding out of its cluster into a new, labelled cluster containing only that embedding.

    The in-memory structures (cluster and cluster_dict) are updated first. Afterwards the change is handed to
    DBManager via DBManager.connection_wrapper. If that raises IncompleteDatabaseOperation, the in-memory
    changes made here are undone and the exception is re-raised.

    :param embedding_id: Id of the embedding to relabel. It is looked up in, and removed from, cluster
    :param new_label: Label passed to the newly created cluster
    :param cluster: Cluster currently containing the embedding. If it becomes empty, it is removed from
    cluster_dict (and removed via DBManager.remove_cluster)
    :param cluster_dict: Cluster dict to which the new cluster is added
    :raises IncompleteDatabaseOperation: Re-raised after the in-memory rollback
    """
    # TODO: Refactor! Extract parts to DBManager?
    # TODO: Don't accept label if it's the same as the old one!
    new_cluster_id = DBManager.get_max_cluster_id() + 1
    embedding = cluster.get_embedding(embedding_id)
    cluster.remove_embedding_by_id(embedding_id)
    new_cluster = Cluster(new_cluster_id, [embedding], [embedding_id], new_label)
    cluster_dict.add_cluster(new_cluster)

    # Capture this once, right after the removal: the rollback further down re-adds the embedding first, so a
    # size check performed at that point could never detect that the cluster had been emptied and dropped.
    cluster_emptied = cluster.get_size() == 0
    if cluster_emptied:
        cluster_dict.remove_cluster(cluster)
        modified_clusters = ClusterDict([new_cluster])
    else:
        modified_clusters = ClusterDict([new_cluster, cluster])

    def set_pic_label_worker(con):
        if cluster_emptied:
            # TODO: Remove cluster like that???
            embeddings_row_dicts = DBManager.remove_cluster(cluster, con=con, close_connections=False)
            emb_id_to_face_dict = make_emb_id_to_face_dict_from_row_dicts(embeddings_row_dicts)
            emb_id_to_img_id_dict = make_emb_id_to_img_id_dict_from_row_dicts(embeddings_row_dicts)
        else:
            emb_id_to_face_dict = None
            emb_id_to_img_id_dict = None
        DBManager.store_clusters(modified_clusters, emb_id_to_face_dict=emb_id_to_face_dict,
                                 emb_id_to_img_id_dict=emb_id_to_img_id_dict, con=con, close_connections=False)
        DBManager.store_certain_labels(cluster=new_cluster, con=con, close_connections=False)

    try:
        DBManager.connection_wrapper(set_pic_label_worker)
    except IncompleteDatabaseOperation:
        # Undo the in-memory changes made above, in reverse: give the embedding back to its old cluster,
        # restore that cluster in cluster_dict if it had been dropped, and discard the new cluster.
        cluster.add_embedding(embedding, embedding_id)
        if cluster_emptied:
            cluster_dict.add_cluster(cluster)
        cluster_dict.remove_cluster(new_cluster)
        raise
Esempio n. 2
0
    def cluster_embeddings_no_split(self,
                                    embeddings,
                                    embeddings_ids=None,
                                    existing_clusters_dict=None,
                                    should_reset_cluster_ids=False,
                                    final_clusters_only=True):
        """
        Sort the given embeddings into clusters, without ever splitting a cluster.

        The embeddings are processed in a fixed pseudo-random order. For each one, the cluster returned by
        find_closest_cluster_to_embedding is looked up; if the reported distance is at most
        self.classification_threshold, the embedding is added to that cluster, otherwise a new cluster
        containing only that embedding is created.

        If embeddings is falsy (e.g. an empty list), new empty ClusterDict object(s) are returned, regardless of
        existing_clusters_dict.

        :param embeddings: Iterable containing the embeddings. If embeddings_ids is None, it must consist of
        (id, embedding)-pairs
        :param embeddings_ids: Ordered iterable with the embedding ids. It is paired with embeddings via zip
        :param existing_clusters_dict: Cluster dict to extend. It is modified in place and returned. Embeddings
        whose id any_cluster_with_emb reports as already present are skipped. If None, a new ClusterDict is used
        :param should_reset_cluster_ids: If true, reset_ids() is called on the cluster dict and new cluster ids
        continue after its max id. Otherwise, new ids continue after the larger of the cluster dict's max id and
        DBManager.get_max_cluster_id()
        :param final_clusters_only: If true, only the final cluster dict is returned. Otherwise, return a triple of
        that cluster dict, a ClusterDict of the modified/newly created clusters and a ClusterDict of the removed
        clusters (no cluster ids are ever recorded as removed by this method)
        :return: The cluster dict, or the triple described above
        """
        # TODO: Allow embeddings_ids to be none? Get next id via DB query?
        # TODO: Allow embeddings_ids to be shorter than embeddings and 'fill up' remaining ids?
        if not embeddings:
            return ClusterDict() if final_clusters_only else (ClusterDict(), ClusterDict(), ClusterDict())

        id_emb_pairs = embeddings if embeddings_ids is None else zip(embeddings_ids, embeddings)

        if existing_clusters_dict is None:
            existing_clusters_dict = ClusterDict()
        else:
            # Don't iterate over embeddings in existing clusters
            id_emb_pairs = starfilterfalse(
                lambda emb_id, _emb: existing_clusters_dict.any_cluster_with_emb(emb_id),
                id_emb_pairs)

        cluster_dict = existing_clusters_dict
        if should_reset_cluster_ids:
            cluster_dict.reset_ids()
            next_cluster_id = cluster_dict.get_max_id() + 1
        else:
            next_cluster_id = max(cluster_dict.get_max_id(), DBManager.get_max_cluster_id()) + 1

        # The pairs are only materialized here, i.e. after the ids above have been determined.
        shuffled_pairs = list(id_emb_pairs)
        # NOTE(review): seeding makes the processing order reproducible, but it also reseeds the module-level
        # random generator for all other users of it - confirm that this is intended.
        random.seed(0)
        random.shuffle(shuffled_pairs)

        touched_ids, dropped_ids = set(), set()
        for step, (emb_id, emb) in enumerate(shuffled_pairs, start=1):
            print_progress(step, "embedding_id iteration")
            candidates = self.get_closest_clusters(cluster_dict, emb)

            # find cluster containing the closest embedding to the current one
            min_dist, nearest = self.find_closest_cluster_to_embedding(candidates, emb)

            within_threshold = min_dist <= self.classification_threshold
            if within_threshold:
                nearest.add_embedding(emb, emb_id)
                touched_ids.add(nearest.cluster_id)
                continue

            fresh_cluster = Cluster(next_cluster_id, [emb], [emb_id])
            next_cluster_id += 1
            cluster_dict.add_cluster(fresh_cluster)
            touched_ids.add(fresh_cluster.cluster_id)

        if final_clusters_only:
            return cluster_dict

        touched_clusters = cluster_dict.get_clusters_by_ids(touched_ids)
        dropped_clusters = cluster_dict.get_clusters_by_ids(dropped_ids)
        return cluster_dict, ClusterDict(touched_clusters), ClusterDict(dropped_clusters)
Esempio n. 3
0
    def split_cluster(cls,
                      cluster_to_split,
                      next_cluster_id=None,
                      ret_new_next_id=False):
        """
        Split a cluster into two new clusters:
        1. Pick two start embeddings e1, e2 of the cluster via find_most_distant_embeddings.
        2. Create one new cluster per start embedding (C1 for e1, C2 for e2), both carrying the label of the
           cluster to split.
        3. Add each remaining embedding e to C1 if its distance to the center of C1 is strictly smaller than its
           distance to the center of C2, and to C2 otherwise.

        The given cluster must contain at least 2 embeddings.

        :param cluster_to_split: Cluster to be split
        :param next_cluster_id: Id for the first new cluster; the second one gets next_cluster_id + 1. If None,
        DBManager.get_max_cluster_id() + 1 is used
        :param ret_new_next_id: If true, additionally return the id following the second new cluster's id
        :return: Tuple of the two new clusters, or (that tuple, next id) if ret_new_next_id is true
        """
        # TODO: Does this fail due to bad analogy to low-dim. space?!
        remaining_embs = cluster_to_split.get_embeddings(with_embeddings_ids=True, as_dict=True)
        first_seed, second_seed = cls.find_most_distant_embeddings(remaining_embs)
        first_seed_id, first_seed_emb = first_seed
        second_seed_id, second_seed_emb = second_seed
        # The start embeddings are placed directly, so take them out of the ones still to be distributed.
        remove_multiple(remaining_embs, [first_seed_id, second_seed_id])
        label = cluster_to_split.label

        if next_cluster_id is None:
            next_cluster_id = DBManager.get_max_cluster_id() + 1
        first_id = next_cluster_id
        second_id = next_cluster_id + 1
        first_half = Cluster(first_id, [first_seed_emb], [first_seed_id], label=label)
        second_half = Cluster(second_id, [second_seed_emb], [second_seed_id], label=label)

        # The decorators adapt the predicate so that it can be applied to the (id, embedding) items below.
        @spread_args_decorator
        @ignore_first_n_args_decorator(n=1)
        def belongs_to_first_half(emb):
            return first_half.compute_dist_to_center(emb) < second_half.compute_dist_to_center(emb)

        def fill_cluster(embs_with_ids, target_cluster):
            unzipped = split_items(embs_with_ids)
            try:
                emb_ids, embs = unzipped
            except ValueError:
                # not enough values to unpack, i.e. nothing to add to this cluster
                return
            target_cluster.add_embeddings(embs, emb_ids)

        # NOTE(review): presumably partition yields the items failing the predicate first - confirm against its
        # definition.
        second_half_items, first_half_items = partition(belongs_to_first_half, remaining_embs.items())
        fill_cluster(first_half_items, first_half)
        fill_cluster(second_half_items, second_half)

        halves = (first_half, second_half)
        return (halves, second_id + 1) if ret_new_next_id else halves