Ejemplo n.º 1
0
def eval_process_image_dir(cluster_dict, images_path, max_num_proc_imgs=None, metric=2, threshold=0.73):
    """
    Evaluation variant of the image-directory pipeline: extract faces from
    images_path, then cluster all stored embeddings into cluster_dict.

    :param cluster_dict: Cluster dict updated in place with the clustering result.
    :param images_path: Directory containing the images to process.
    :param max_num_proc_imgs: Optional cap on the number of images to process.
    :param metric: Distance metric passed to EvalCoreAlgorithm.
    :param threshold: Classification (distance) threshold for clustering.
    :return: None. If the DB write fails, cluster_dict is restored to its
             previous state.
    """
    # Only keep a single face per image during evaluation.
    Models.altered_mtcnn.keep_all = False
    try:
        eval_process_faces(images_path, max_num_proc_imgs=max_num_proc_imgs)
    except IncompleteDatabaseOperation:
        # Face extraction was rolled back; nothing new to cluster.
        return

    # Snapshot so the in-memory dict can be restored if the DB write fails.
    cluster_dict_copy = cluster_dict.copy()

    def eval_process_image_dir_worker(con):
        # Runs inside a single DB transaction via connection_wrapper.
        embeddings_with_ids = list(DBManager.get_all_embeddings(with_ids=True))

        eval_core_algorithm = EvalCoreAlgorithm(metric=metric, classification_threshold=threshold)
        # passing result cluster dict already overwrites it
        clustering_result = eval_core_algorithm.cluster_embeddings_no_split(embeddings_with_ids,
                                                                            existing_clusters_dict=cluster_dict,
                                                                            should_reset_cluster_ids=True,
                                                                            final_clusters_only=False)
        _, modified_clusters_dict, removed_clusters_dict = clustering_result
        DBManager.overwrite_clusters_simplified(modified_clusters_dict, removed_clusters_dict, con=con,
                                                close_connections=False)

    try:
        DBManager.connection_wrapper(eval_process_image_dir_worker)
    except IncompleteDatabaseOperation:
        # DB changes were rolled back -> undo the in-memory changes too.
        overwrite_dict(cluster_dict, cluster_dict_copy)
Ejemplo n.º 2
0
def set_picture_label(embedding_id, new_label, cluster, cluster_dict):
    """
    Move the embedding with embedding_id out of cluster into a new
    single-embedding cluster labeled new_label, and persist the change.

    On a failed DB operation every in-memory change is rolled back and the
    IncompleteDatabaseOperation is re-raised.

    :param embedding_id: Id of the embedding to re-label.
    :param new_label: Label for the new cluster.
    :param cluster: Cluster currently containing the embedding.
    :param cluster_dict: ClusterDict tracking all clusters; updated in place.
    :raises IncompleteDatabaseOperation: If persisting the change failed.
    """
    # TODO: Refactor! Extract parts to DBManager?
    # TODO: Don't accept label if it's the same as the old one!
    new_cluster_id = DBManager.get_max_cluster_id() + 1
    embedding = cluster.get_embedding(embedding_id)
    cluster.remove_embedding_by_id(embedding_id)
    new_cluster = Cluster(new_cluster_id, [embedding], [embedding_id], new_label)
    cluster_dict.add_cluster(new_cluster)
    # Remember whether the old cluster became empty (and was therefore dropped
    # from cluster_dict), so the rollback below can restore it correctly.
    old_cluster_emptied = cluster.get_size() == 0
    if old_cluster_emptied:
        cluster_dict.remove_cluster(cluster)
        modified_clusters = ClusterDict([new_cluster])
    else:
        modified_clusters = ClusterDict([new_cluster, cluster])

    def set_pic_label_worker(con):
        if old_cluster_emptied:
            # TODO: Remove cluster like that???
            embeddings_row_dicts = DBManager.remove_cluster(cluster, con=con, close_connections=False)
            emb_id_to_face_dict = make_emb_id_to_face_dict_from_row_dicts(embeddings_row_dicts)
            emb_id_to_img_id_dict = make_emb_id_to_img_id_dict_from_row_dicts(embeddings_row_dicts)
        else:
            emb_id_to_face_dict = None
            emb_id_to_img_id_dict = None
        DBManager.store_clusters(modified_clusters, emb_id_to_face_dict=emb_id_to_face_dict,
                                 emb_id_to_img_id_dict=emb_id_to_img_id_dict, con=con, close_connections=False)
        DBManager.store_certain_labels(cluster=new_cluster, con=con, close_connections=False)

    try:
        DBManager.connection_wrapper(set_pic_label_worker)
    except IncompleteDatabaseOperation:
        # Roll back the in-memory changes.
        cluster.add_embedding(embedding, embedding_id)
        # BUG FIX: the emptiness check used to run *after* re-adding the
        # embedding (so get_size() could never be 0) and the removed cluster
        # was never restored to cluster_dict.
        if old_cluster_emptied:
            cluster_dict.add_cluster(cluster)
        cluster_dict.remove_cluster(new_cluster)
        raise
Ejemplo n.º 3
0
def reclassify(cluster_dict, embeddings_with_ids=None, con=None, close_connections=True, **kwargs):
    """
    Re-cluster embeddings from scratch, seeding with the clusters whose labels
    are marked certain, and overwrite cluster_dict with the result.

    :param cluster_dict: Cluster dict updated in place on success.
    :param embeddings_with_ids: Optional (id, embedding) pairs; fetched from
           the DB when None.
    :param con: Optional DB connection to reuse.
    :param close_connections: Whether the wrapper should close connections.
    :param kwargs: Ignored (handler-interface compatibility).
    :return: None. A failed DB operation leaves cluster_dict unchanged.
    """
    def reclassify_worker(con):
        # all operations in worker, so if any DB operation raises error, it is caught
        if embeddings_with_ids is not None:
            local_embeddings_with_ids = embeddings_with_ids
        else:
            local_embeddings_with_ids = list(DBManager.get_all_embeddings(with_ids=True))

        if not local_embeddings_with_ids:
            log_error('no embeddings found, nothing to edit')
            return

        # Seed clustering with the user-confirmed ("certain") clusters only.
        new_cluster_dict = DBManager.get_certain_clusters()
        core_algorithm = CoreAlgorithm()
        clustering_result = core_algorithm.cluster_embeddings(embeddings=local_embeddings_with_ids,
                                                              existing_clusters_dict=new_cluster_dict,
                                                              should_reset_cluster_ids=True,
                                                              final_clusters_only=False)
        _, modified_clusters_dict, removed_clusters_dict = clustering_result
        # NOTE(review): modified_clusters_dict is unpacked but unused;
        # new_cluster_dict is persisted instead — presumably correct because
        # clear_clusters=True rewrites everything, but confirm.
        DBManager.overwrite_clusters(new_cluster_dict, removed_clusters_dict, no_new_embs=True,
                                     clear_clusters=True, con=con, close_connections=False)
        overwrite_dict(cluster_dict, new_cluster_dict)

    try:
        DBManager.connection_wrapper(reclassify_worker, con=con, close_connections=close_connections)
    except IncompleteDatabaseOperation:
        # DB rollback already happened; in-memory dict was only touched on success.
        pass
Ejemplo n.º 4
0
def process_faces(images_path,
                  central_con=None,
                  local_con=None,
                  close_connections=True):
    """
    Ensure local tables exist and extract faces from images_path, all within
    one wrapped DB transaction spanning the central and local databases.

    :param images_path: Directory whose images should be processed.
    :param central_con: Optional existing central DB connection.
    :param local_con: Optional existing local DB connection.
    :param close_connections: Whether the wrapper should close connections.
    """
    if local_con is None:
        path_to_local_db = DBManager.get_local_db_file_path(images_path)
    else:
        # An open local connection already implies its DB file; no path needed.
        path_to_local_db = None

    def process_faces_worker(central_con, local_con):
        DBManager.create_local_tables(drop_existing_tables=False,
                                      path_to_local_db=path_to_local_db,
                                      con=local_con,
                                      close_connections=False)
        extract_faces(images_path,
                      central_con=central_con,
                      local_con=local_con,
                      close_connections=False)

    DBManager.connection_wrapper(process_faces_worker,
                                 path_to_local_db=path_to_local_db,
                                 central_con=central_con,
                                 local_con=local_con,
                                 with_central=True,
                                 with_local=True,
                                 close_connections=close_connections)
Ejemplo n.º 5
0
def clear_data(cluster_dict, **kwargs):
    """
    Interactively let the user clear local tables, global tables, both, or
    just the clustering data. Every choice must be confirmed by re-entering it.

    :param cluster_dict: In-memory cluster dict; emptied when global tables or
           clusters are cleared.
    :param kwargs: Ignored (handler-interface compatibility).
    """
    # TODO: Include deletion cascade!
    data_kinds = {
        'l': '[l]ocal tables',
        'g': '[g]lobal tables',
        'b': '[b]oth local and global tables',
        'c': '[c]lusters',
        'n': '[n]either'
    }
    warning = "----- WARNING: DESTRUCTIVE ACTION -----\n"

    # Yes/no question: does the user want to clear anything at all?
    should_clear_data_func = partial(
        get_user_decision,
        warning + "Would you like to clear the local/global data?"
        " Don't worry, you will have to re-confirm a 'yes'.")
    # Multiple-choice question over data_kinds; prompt supplied per call.
    data_kind_to_clear_func = partial(get_user_decision,
                                      choices_strs=tuple(data_kinds.values()),
                                      valid_choices=tuple(data_kinds.keys()))

    should_clear_data = should_clear_data_func()
    while should_clear_data == 'y':
        data_kind_to_clear = data_kind_to_clear_func(
            prompt=(warning + "Which kind(s) of data would you like to clear?"
                    " Don't worry, you will have to re-confirm your choice."))
        if data_kind_to_clear == 'n':
            should_clear_data = should_clear_data_func()
            continue

        # Human-readable choice name, e.g. '[l]ocal tables' -> 'local tables'.
        chosen_data_to_clear_str = data_kinds[data_kind_to_clear].replace(
            '[', '').replace(']', '')
        confirm_data_to_clear = data_kind_to_clear_func(prompt=(
            warning +
            f"Are you sure that you want to clear {chosen_data_to_clear_str}?"
            f" This action cannot be undone. To confirm your choice, simply re-enter it."
        ))

        # Confirmation must repeat the exact same choice.
        if confirm_data_to_clear != data_kind_to_clear:
            should_clear_data = should_clear_data_func()
            continue

        def clear_data_worker(con):
            # Closes over data_kind_to_clear of the current loop iteration;
            # the worker is run (and the closure consumed) within this iteration.
            if data_kind_to_clear in ('l', 'b'):
                # TODO: How to use local connections here? Rollback on multiple?
                # clear_local_tables()
                drop_local_tables()
            if data_kind_to_clear in ('g', 'b'):
                # clear_central_tables(con=con, close_connections=False)
                drop_central_tables(con=con, close_connections=False)
                overwrite_dict(cluster_dict, dict())
            if data_kind_to_clear == 'c':
                clear_clustering(con=con, close_connections=False)
                overwrite_dict(cluster_dict, dict())

        try:
            DBManager.connection_wrapper(clear_data_worker)
        except IncompleteDatabaseOperation:
            # Clearing failed and was rolled back; ask the user again.
            continue

        should_clear_data = 'n'
Ejemplo n.º 6
0
def set_cluster_label(cluster, new_label):
    """
    Persist cluster and assign it the label new_label.

    NOTE(review): the label is assigned only *after* DBManager.store_clusters
    runs, so the stored row may still carry the old label unless store_clusters
    reads the label lazily — confirm the intended ordering.
    """
    # TODO: Use certain_labels here too? (Probably not)
    # TODO: Outsource as function to DBManager?
    def set_cluster_label_worker(con):
        DBManager.store_clusters([cluster], con=con, close_connections=False)
        cluster.set_label(new_label)

    DBManager.connection_wrapper(set_cluster_label_worker)
Ejemplo n.º 7
0
 def process_faces_worker(central_con, local_con):
     """Create local tables (if missing) and extract faces in one transaction."""
     # Relies on the enclosing scope for path_to_local_db and images_path.
     DBManager.create_local_tables(drop_existing_tables=False,
                                   path_to_local_db=path_to_local_db,
                                   con=local_con,
                                   close_connections=False)
     extract_faces(images_path,
                   central_con=central_con,
                   local_con=local_con,
                   close_connections=False)
Ejemplo n.º 8
0
 def set_pic_label_worker(con):
     """Persist the relabeling; runs inside a wrapped DB transaction."""
     # Relies on the enclosing scope for cluster, modified_clusters and
     # new_cluster.
     if cluster.get_size() == 0:
         # TODO: Remove cluster like that???
         # Removing returns the rows so face/image mappings can be rebuilt.
         embeddings_row_dicts = DBManager.remove_cluster(cluster, con=con, close_connections=False)
         emb_id_to_face_dict = make_emb_id_to_face_dict_from_row_dicts(embeddings_row_dicts)
         emb_id_to_img_id_dict = make_emb_id_to_img_id_dict_from_row_dicts(embeddings_row_dicts)
     else:
         emb_id_to_face_dict = None
         emb_id_to_img_id_dict = None
     DBManager.store_clusters(modified_clusters, emb_id_to_face_dict=emb_id_to_face_dict,
                              emb_id_to_img_id_dict=emb_id_to_img_id_dict, con=con, close_connections=False)
     DBManager.store_certain_labels(cluster=new_cluster, con=con, close_connections=False)
Ejemplo n.º 9
0
    def eval_process_image_dir_worker(con):
        """Cluster all stored embeddings and persist the result (one transaction)."""
        # Relies on the enclosing scope for metric, threshold and cluster_dict.
        embeddings_with_ids = list(DBManager.get_all_embeddings(with_ids=True))

        eval_core_algorithm = EvalCoreAlgorithm(metric=metric, classification_threshold=threshold)
        # passing result cluster dict already overwrites it
        clustering_result = eval_core_algorithm.cluster_embeddings_no_split(embeddings_with_ids,
                                                                            existing_clusters_dict=cluster_dict,
                                                                            should_reset_cluster_ids=True,
                                                                            final_clusters_only=False)
        _, modified_clusters_dict, removed_clusters_dict = clustering_result
        DBManager.overwrite_clusters_simplified(modified_clusters_dict, removed_clusters_dict, con=con,
                                                close_connections=False)
Ejemplo n.º 10
0
def clear_local_tables(local_db_dir_path=None,
                       con=None,
                       close_connections=True):
    """
    Clear the local tables of the database found in local_db_dir_path.

    When no path is given, the user is prompted for one; if the user declines
    to provide a path, nothing is done.

    :param local_db_dir_path: Directory containing the local DB, or None.
    :param con: Optional DB connection to reuse.
    :param close_connections: Whether to close connections afterwards.
    """
    if local_db_dir_path is None:
        # No path supplied - let the user pick one interactively.
        local_db_dir_path = user_choose_local_db_dir_path()
        if local_db_dir_path is None:
            # User cancelled - nothing to clear.
            return

    db_file_path = DBManager.get_local_db_file_path(local_db_dir_path)
    DBManager.clear_local_tables(db_file_path,
                                 con=con,
                                 close_connections=close_connections)
Ejemplo n.º 11
0
def eval_extract_faces(path, check_if_known=True, max_num_proc_imgs=None, central_con=None, local_con=None,
                       close_connections=True):
    """
    Evaluation variant of face extraction: walk the images under path, store
    image rows, run MTCNN on each image and store one embedding per image.

    :param path: Directory containing the images.
    :param check_if_known: Skip images whose (name, mtime) pair is already stored.
    :param max_num_proc_imgs: Optional bound on processed images.
           NOTE(review): used as an inclusive upper bound on the image *id*
           (range(start_img_id, max_num_proc_imgs + 1)), not as a count —
           confirm that is intended.
    :param central_con: Optional central DB connection.
    :param local_con: Optional local DB connection.
    :param close_connections: Whether the wrapper should close connections.
    """
    path_to_local_db = DBManager.get_local_db_file_path(path)
    path_id = DBManager.get_path_id(path)
    if path_id is None:
        # path not yet known
        path_id = DBManager.store_directory_path(path, con=central_con, close_connections=False)
        DBManager.store_path_id(path_id, path_to_local_db=path_to_local_db, con=local_con, close_connections=False)
    imgs_names_and_date = set(DBManager.get_images_attributes(path_to_local_db=path_to_local_db))

    # Note: 'MAX' returns None / (None, ) as a default value
    # (assumes get_max_image_id / get_max_embedding_id coalesce that to an
    # int, otherwise the +1 below would raise — TODO confirm)
    max_img_id = DBManager.get_max_image_id(path_to_local_db=path_to_local_db)
    start_img_id = max_img_id + 1
    initial_max_embedding_id = DBManager.get_max_embedding_id()

    def get_counted_img_loader():
        # Pair each loaded image with its image id, starting after the
        # largest id already stored.
        img_loader = load_imgs_from_path(path, recursive=True, output_file_names=True, output_file_paths=True)
        if max_num_proc_imgs is not None:
            return zip(range(start_img_id, max_num_proc_imgs + 1), img_loader)
        return enumerate(img_loader, start=start_img_id)

    def store_embedding_row_dicts(con):
        print('----- get_embedding_row_dicts -----')
        # TODO: Also auto-increment emb_id etc.
        embedding_id = initial_max_embedding_id + 1
        for img_id, (img_path, img_name, img) in get_counted_img_loader():
            print_progress(img_id, 'image')

            # Skip images already stored with an unchanged modification time.
            last_modified = datetime.datetime.fromtimestamp(round(os.stat(img_path).st_mtime))
            if check_if_known and (img_name, last_modified) in imgs_names_and_date:
                continue

            DBManager.store_image(img_id=img_id, rel_file_path=img_name, last_modified=last_modified,
                                  path_to_local_db=path_to_local_db, con=local_con, close_connections=False)
            DBManager.store_image_path(img_id=img_id, path_id=path_id, con=central_con, close_connections=False)

            # Evaluation mode: a single face (or None) per image.
            face = Models.altered_mtcnn.forward_return_results(img)
            if face is None:
                log_error(f"no faces found in image '{img_path}'")
                continue

            embedding_row_dict = {Columns.cluster_id.col_name: 'NULL',
                                  Columns.embedding.col_name: face_to_embedding(face),
                                  Columns.thumbnail.col_name: face,
                                  Columns.image_id.col_name: img_id,
                                  Columns.embedding_id.col_name: embedding_id}
            DBManager.store_embedding(embedding_row_dict, con=con, close_connections=False)
            embedding_id += 1

    DBManager.connection_wrapper(store_embedding_row_dicts, con=central_con, close_connections=close_connections)
Ejemplo n.º 12
0
def run_program_with_user_stats():
    """
    Run the interactive main loop while timing each executed command.

    Collects (command name, duration) pairs plus the total runtime and,
    if the write flag is enabled, dumps them to command_stats_path.
    """
    write = False
    command_stats_path = r'C:\Users\Mischa\Desktop\Uni\20-21 WS\Bachelor\BA Papers\Datasets\faces 1999 caltech\commands_stats.txt'
    start_time = time.time()

    # Models.altered_mtcnn.keep_all = False
    init_program()
    cluster_dict = DBManager.load_cluster_dict()

    commands = []
    exit_cmd_name = str(Commands.exit)
    cmd_name = get_user_command()
    while cmd_name != exit_cmd_name:
        cmd_start = time.time()
        call_handler(Command.get_command(cmd_name).handler, cluster_dict=cluster_dict)
        cmd_end = time.time()
        commands.append([cmd_name, cmd_end - cmd_start])
        cmd_name = get_user_command()

    end_time = time.time()
    commands_str = '\n'.join(str(entry) for entry in commands) + '\n\n' + f'total runtime: {end_time - start_time}'
    if write:
        with open(command_stats_path, 'w') as file:
            file.write(commands_str)
Ejemplo n.º 13
0
def process_image_dir(cluster_dict, threshold=0.73, metric=2, **kwargs):
    """
    Extract faces from user-chosen images and cluster them.

    :param threshold: Classification (distance) threshold for clustering.
    :param metric: Distance metric passed to CoreAlgorithm.
    :param cluster_dict: Cluster dict updated in place with the result.
    :param kwargs: Ignored (handler-interface compatibility).
    :return: None. If the DB write fails, cluster_dict is restored.
    """
    # TODO: Store entered paths(?) --> Makes it easier if user wants to revisit them, but probs rarely?
    images_path = user_choose_images_path()
    try:
        process_faces(images_path)
    except IncompleteDatabaseOperation:
        # Face extraction was rolled back; nothing to cluster.
        return

    # Snapshot so the in-memory dict can be restored if the DB write fails.
    cluster_dict_copy = cluster_dict.copy()

    def cluster_processed_faces(con):
        embeddings_with_ids = list(DBManager.get_all_embeddings(with_ids=True))

        # TODO: Call reclassify handler here?
        # TODO: Clear existing clusters? Issues with ids etc.????
        core_algorithm = CoreAlgorithm(metric=metric,
                                       classification_threshold=threshold)
        # passing result cluster dict already overwrites it
        clustering_result = core_algorithm.cluster_embeddings(
            embeddings_with_ids,
            existing_clusters_dict=cluster_dict,
            should_reset_cluster_ids=True,
            final_clusters_only=False)
        _, modified_clusters_dict, removed_clusters_dict = clustering_result
        DBManager.overwrite_clusters_simplified(modified_clusters_dict,
                                                removed_clusters_dict,
                                                con=con,
                                                close_connections=False)
        # Re-number clusters in the DB, then reload the canonical state.
        reset_cluster_ids(con=con, close_connections=False)
        new_cluster_dict = DBManager.load_cluster_dict(con=con,
                                                       close_connections=False)
        overwrite_dict(cluster_dict, new_cluster_dict)

    try:
        DBManager.connection_wrapper(cluster_processed_faces)
    except IncompleteDatabaseOperation:
        # DB changes were rolled back -> undo the in-memory changes too.
        overwrite_dict(cluster_dict, cluster_dict_copy)
Ejemplo n.º 14
0
        def cluster_processed_faces(con):
            """Cluster all stored embeddings and persist the result (one transaction)."""
            # Relies on the enclosing scope for cluster_dict.
            embeddings_with_ids = list(
                DBManager.get_all_embeddings(with_ids=True))

            # TODO: Call reclassify handler here?
            # TODO: Clear existing clusters? Issues with ids etc.????
            core_algorithm = CoreAlgorithm()
            # passing result cluster dict already overwrites it
            clustering_result = core_algorithm.cluster_embeddings(
                embeddings_with_ids,
                existing_clusters_dict=cluster_dict,
                should_reset_cluster_ids=True,
                final_clusters_only=False)
            _, modified_clusters_dict, removed_clusters_dict = clustering_result
            DBManager.overwrite_clusters_simplified(modified_clusters_dict,
                                                    removed_clusters_dict,
                                                    con=con,
                                                    close_connections=False)
Ejemplo n.º 15
0
def run_program():
    """Run the interactive main loop until the user issues the exit command."""
    init_program()
    cluster_dict = DBManager.load_cluster_dict()

    exit_cmd_name = str(Commands.exit)
    cmd_name = get_user_command()
    while cmd_name != exit_cmd_name:
        handler = Command.get_command(cmd_name).handler
        call_handler(handler, cluster_dict=cluster_dict)
        cmd_name = get_user_command()
Ejemplo n.º 16
0
    def clear_data_measure(cluster_dict):
        """Benchmark helper: wipe local + central tables for DATASET_PATH."""
        local_db_dir_path = DATASET_PATH
        path_to_local_db = DBManager.get_local_db_file_path(local_db_dir_path)

        def clear_data_worker(central_con, local_con):
            # Clear both databases inside one wrapped transaction.
            DBManager.clear_local_tables(path_to_local_db,
                                         con=local_con,
                                         close_connections=False)
            clear_central_tables(con=central_con, close_connections=False)
            overwrite_dict(cluster_dict, dict())

        try:
            DBManager.connection_wrapper(clear_data_worker,
                                         path_to_local_db=path_to_local_db,
                                         with_central=True,
                                         with_local=True)
        except IncompleteDatabaseOperation as e:
            # Best-effort in a measurement run: log and carry on.
            print('clear_data_measure error')
            log_error(e)
Ejemplo n.º 17
0
    def store_embedding_row_dicts(con):
        """Store image rows and one embedding per image; runs in one transaction."""
        # Relies on the enclosing scope for initial_max_embedding_id,
        # get_counted_img_loader, check_if_known, imgs_names_and_date,
        # path_to_local_db, local_con, central_con and path_id.
        print('----- get_embedding_row_dicts -----')
        # TODO: Also auto-increment emb_id etc.
        embedding_id = initial_max_embedding_id + 1
        for img_id, (img_path, img_name, img) in get_counted_img_loader():
            print_progress(img_id, 'image')

            # Skip images already stored with an unchanged modification time.
            last_modified = datetime.datetime.fromtimestamp(round(os.stat(img_path).st_mtime))
            if check_if_known and (img_name, last_modified) in imgs_names_and_date:
                continue

            DBManager.store_image(img_id=img_id, rel_file_path=img_name, last_modified=last_modified,
                                  path_to_local_db=path_to_local_db, con=local_con, close_connections=False)
            DBManager.store_image_path(img_id=img_id, path_id=path_id, con=central_con, close_connections=False)

            # Single face (or None) per image in this variant.
            face = Models.altered_mtcnn.forward_return_results(img)
            if face is None:
                log_error(f"no faces found in image '{img_path}'")
                continue

            embedding_row_dict = {Columns.cluster_id.col_name: 'NULL',
                                  Columns.embedding.col_name: face_to_embedding(face),
                                  Columns.thumbnail.col_name: face,
                                  Columns.image_id.col_name: img_id,
                                  Columns.embedding_id.col_name: embedding_id}
            DBManager.store_embedding(embedding_row_dict, con=con, close_connections=False)
            embedding_id += 1
Ejemplo n.º 18
0
    def process_images_dir_measure(cluster_dict, n):
        """
        Benchmark helper: process n images from DATASET_PATH and cluster the
        resulting embeddings into cluster_dict.

        :param cluster_dict: Cluster dict updated in place on success.
        :param n: Number of images to process (forwarded to process_faces_measure).
        """
        images_path = DATASET_PATH
        try:
            print('------ PROCESSING FACES')
            process_faces_measure(images_path, n)
            print('------ DONE PROCESSING')
        except IncompleteDatabaseOperation as e:
            print('process_images_dir_measure error')
            log_error(e)
            return

        # Snapshot so the in-memory dict can be restored on failure.
        cluster_dict_copy = cluster_dict.copy()

        def cluster_processed_faces(con):
            embeddings_with_ids = list(
                DBManager.get_all_embeddings(with_ids=True))

            # TODO: Call reclassify handler here?
            # TODO: Clear existing clusters? Issues with ids etc.????
            core_algorithm = CoreAlgorithm()
            # passing result cluster dict already overwrites it
            clustering_result = core_algorithm.cluster_embeddings(
                embeddings_with_ids,
                existing_clusters_dict=cluster_dict,
                should_reset_cluster_ids=True,
                final_clusters_only=False)
            _, modified_clusters_dict, removed_clusters_dict = clustering_result
            DBManager.overwrite_clusters_simplified(modified_clusters_dict,
                                                    removed_clusters_dict,
                                                    con=con,
                                                    close_connections=False)

        try:
            DBManager.connection_wrapper(cluster_processed_faces)
        except IncompleteDatabaseOperation:
            # DB changes were rolled back -> undo the in-memory changes too.
            overwrite_dict(cluster_dict, cluster_dict_copy)
Ejemplo n.º 19
0
    def store_embedding_row_dicts(con):
        """Store image rows and all face embeddings per image (one transaction)."""
        # Relies on the enclosing scope for initial_max_embedding_id,
        # get_counted_img_loader, check_if_known, imgs_rel_paths_and_dates,
        # path_to_local_db, local_con, central_con and path_id.
        # TODO: Also auto-increment emb_id etc.
        max_embedding_id = initial_max_embedding_id
        for img_id, (img_abs_path, img_rel_path,
                     img) in get_counted_img_loader():
            # TODO: Implement automatic deletion cascade! (Using among other things on_conflict clause and FKs)
            #       ---> Done?
            # Check if image already stored --> don't process again
            # known = (name, last modified) as a pair known for this directory
            last_modified = datetime.datetime.fromtimestamp(
                round(os.stat(img_abs_path).st_mtime))
            if check_if_known and (img_rel_path,
                                   last_modified) in imgs_rel_paths_and_dates:
                continue

            DBManager.store_image(img_id=img_id,
                                  rel_file_path=img_rel_path,
                                  last_modified=last_modified,
                                  path_to_local_db=path_to_local_db,
                                  con=local_con,
                                  close_connections=False)
            DBManager.store_image_path(img_id=img_id,
                                       path_id=path_id,
                                       con=central_con,
                                       close_connections=False)

            # All faces found in the image (may be empty).
            faces = Models.altered_mtcnn.forward_return_results(img)
            if not faces:
                log_error(f"no faces found in image '{img_abs_path}'")
                continue

            # TODO: Better way to create these row_dicts?
            # One row per detected face, with consecutive embedding ids.
            embeddings_row_dicts = [{
                Columns.cluster_id.col_name:
                'NULL',
                Columns.embedding.col_name:
                face_to_embedding(face),
                Columns.thumbnail.col_name:
                face,
                Columns.image_id.col_name:
                img_id,
                Columns.embedding_id.col_name:
                embedding_id
            } for embedding_id, face in enumerate(faces,
                                                  start=max_embedding_id + 1)]
            DBManager.store_embeddings(embeddings_row_dicts,
                                       con=con,
                                       close_connections=False)
            max_embedding_id += len(faces)
Ejemplo n.º 20
0
        def store_embedding_row_dicts(con):
            """Store image rows and all face embeddings per image (one transaction)."""
            # Relies on the enclosing scope for initial_max_embedding_id,
            # get_counted_img_loader, check_if_known, imgs_names_and_date,
            # path_to_local_db, local_con, central_con and path_id.
            max_embedding_id = initial_max_embedding_id
            for img_id, (img_path, img_name, img) in get_counted_img_loader():
                # Check if image already stored --> don't process again
                # known = (name, last modified) as a pair known for this directory
                last_modified = datetime.datetime.fromtimestamp(
                    round(os.stat(img_path).st_mtime))
                if check_if_known and (img_name,
                                       last_modified) in imgs_names_and_date:
                    continue

                DBManager.store_image(img_id=img_id,
                                      rel_file_path=img_name,
                                      last_modified=last_modified,
                                      path_to_local_db=path_to_local_db,
                                      con=local_con,
                                      close_connections=False)
                DBManager.store_image_path(img_id=img_id,
                                           path_id=path_id,
                                           con=central_con,
                                           close_connections=False)

                # All faces found in the image (may be empty).
                faces = Models.altered_mtcnn.forward_return_results(img)
                if not faces:
                    log_error(f"no faces found in image '{img_path}'")
                    continue

                # One row per detected face, with consecutive embedding ids.
                embeddings_row_dicts = [{
                    Columns.cluster_id.col_name:
                    'NULL',
                    Columns.embedding.col_name:
                    face_to_embedding(face),
                    Columns.thumbnail.col_name:
                    face,
                    Columns.image_id.col_name:
                    img_id,
                    Columns.embedding_id.col_name:
                    embedding_id
                } for embedding_id, face in enumerate(
                    faces, start=max_embedding_id + 1)]
                DBManager.store_embeddings(embeddings_row_dicts,
                                           con=con,
                                           close_connections=False)
                max_embedding_id += len(faces)
Ejemplo n.º 21
0
def user_choose_local_db_dir_path():
    """
    Prompt the user for a directory containing a local database file.

    Re-prompts until the user enters an existing directory that contains a
    local database, or an empty string to cancel.

    :return: The validated directory path, or None if the user cancelled.
    """
    # TODO: Refactor, use user_choose function!
    local_db_dir_path = input(
        'Please enter a path containing a local table you would like to clear.\n'
    )
    while True:
        if not local_db_dir_path:
            # Empty input cancels the operation.
            local_db_dir_path = None
            break
        elif not os.path.exists(local_db_dir_path):
            log_error(f"unable to find path '{local_db_dir_path}'")
        elif not DBManager.is_local_db_in_dir(local_db_dir_path):
            # BUG FIX: the message used to contain a literal '{...}', which
            # formats the Ellipsis object instead of any meaningful value.
            log_error(
                f"unable to find a local database file in path '{local_db_dir_path}'"
            )
        else:
            break
        print("\nPlease try again.")
        # BUG FIX: the retry prompt used to ask for "a path with images of
        # people you would like to add", copied from a different chooser.
        local_db_dir_path = input(
            'Please enter a path containing a local table you would like to clear.\n'
        )
    return local_db_dir_path
Ejemplo n.º 22
0
    def cluster_embeddings_no_split(self,
                                    embeddings,
                                    embeddings_ids=None,
                                    existing_clusters_dict=None,
                                    should_reset_cluster_ids=False,
                                    final_clusters_only=True):
        """
        Build clusters from face embeddings stored in the given path using the specified classification threshold.
        (Currently handled as: All embeddings closer than the distance given by the classification threshold are placed
        in the same cluster. If cluster_save_path is set, store the resulting clusters as directories in the given path.

        :param should_reset_cluster_ids: If true, renumber existing cluster ids before clustering.
        :param embeddings: Iterable containing the embeddings. If embeddings_ids is None, must consist of
        (id, embedding)-pairs
        :param embeddings_ids: Ordered iterable with the embedding ids. Must be at least as long as embeddings.
        :param existing_clusters_dict: Optional ClusterDict of pre-existing clusters to extend.
        :param final_clusters_only: If true, only the final iterable of clusters is returned. Otherwise, return that
        final iterable, as well as a list of modified/newly created and deleted clusters
        :return: ClusterDict, or (ClusterDict, modified ClusterDict, removed ClusterDict).
        Note: in this no-split variant no clusters are ever removed, so the
        removed ClusterDict is always empty.
        """
        # TODO: Allow embeddings_ids to be none? Get next id via DB query?
        # TODO: Allow embeddings_ids to be shorter than embeddings and 'fill up' remaining ids?
        # embeddings = list(embeddings)
        if not embeddings:
            if final_clusters_only:
                return ClusterDict()
            return ClusterDict(), ClusterDict(), ClusterDict()

        if embeddings_ids is None:
            embeddings_with_ids = embeddings
        else:
            # if len(embeddings) > len(embeddings_ids):
            #     raise ValueError(f'Too few ids for embeddings ({len(embeddings_ids)} passed, but {len(embeddings)}'
            #                      f' needed)')
            embeddings_with_ids = zip(embeddings_ids, embeddings)

        if existing_clusters_dict is None:
            existing_clusters_dict = ClusterDict()
        else:
            # # Don't iterate over embeddings in existing clusters
            # embeddings_with_ids = dict(embeddings_with_ids)
            # existing_embeddings = existing_clusters_dict.get_embeddings()
            # remove_multiple(embeddings_with_ids, existing_embeddings)
            # embeddings_with_ids = embeddings_with_ids.items()
            # Don't iterate over embeddings in existing clusters
            def exists_in_any_cluster(emb_id, _):
                return existing_clusters_dict.any_cluster_with_emb(emb_id)

            embeddings_with_ids = starfilterfalse(exists_in_any_cluster,
                                                  embeddings_with_ids)

        # New clusters get ids above both the in-memory and the DB maximum.
        cluster_dict = existing_clusters_dict
        if should_reset_cluster_ids:
            cluster_dict.reset_ids()
            next_cluster_id = cluster_dict.get_max_id() + 1
        else:
            max_existing_id = cluster_dict.get_max_id()
            max_db_id = DBManager.get_max_cluster_id()
            next_cluster_id = max(max_existing_id, max_db_id) + 1

        # Fixed seed: deterministic shuffle so runs are reproducible.
        embeddings_with_ids = list(embeddings_with_ids)
        random.seed(0)
        random.shuffle(embeddings_with_ids)
        # removed_clusters_ids is never populated below (no-split variant).
        modified_clusters_ids, removed_clusters_ids = set(), set()
        for counter, (embedding_id,
                      new_embedding) in enumerate(embeddings_with_ids,
                                                  start=1):
            print_progress(counter, "embedding_id iteration")
            closest_clusters = self.get_closest_clusters(
                cluster_dict, new_embedding)

            # find cluster containing the closest embedding to new_embedding
            shortest_emb_dist, closest_cluster = self.find_closest_cluster_to_embedding(
                closest_clusters, new_embedding)

            if shortest_emb_dist <= self.classification_threshold:
                # Close enough: join the existing cluster.
                closest_cluster.add_embedding(new_embedding, embedding_id)
                modified_clusters_ids.add(closest_cluster.cluster_id)
            else:
                # Too far from everything: start a new singleton cluster.
                new_cluster = Cluster(next_cluster_id, [new_embedding],
                                      [embedding_id])
                next_cluster_id += 1
                cluster_dict.add_cluster(new_cluster)
                modified_clusters_ids.add(new_cluster.cluster_id)

        if final_clusters_only:
            return cluster_dict
        modified_clusters = cluster_dict.get_clusters_by_ids(
            modified_clusters_ids)
        removed_clusters = cluster_dict.get_clusters_by_ids(
            removed_clusters_ids)
        return cluster_dict, ClusterDict(modified_clusters), ClusterDict(
            removed_clusters)
Ejemplo n.º 23
0
def drop_central_tables(con=None, close_connections=True):
    """Drop the central database tables and recreate them empty.

    :param con: Optional already-open DB connection to reuse.
    :param close_connections: Whether to close connections when done.
    """
    # Recreating with drop_existing_tables=True drops any existing tables
    # first, so this acts as a full reset of the central schema.
    DBManager.create_central_tables(
        drop_existing_tables=True,
        con=con,
        close_connections=close_connections,
    )
Ejemplo n.º 24
0
def clear_clustering(con=None, close_connections=True):
    """Delete all stored clusters from the database.

    :param con: Optional already-open DB connection to reuse.
    :param close_connections: Whether to close connections when done.
    """
    DBManager.clear_clusters(
        con=con,
        close_connections=close_connections,
    )
Ejemplo n.º 25
0
def clear_central_tables(con=None, close_connections=True):
    """Empty the central database tables without dropping them.

    :param con: Optional already-open DB connection to reuse.
    :param close_connections: Whether to close connections when done.
    """
    DBManager.clear_central_tables(
        con=con,
        close_connections=close_connections,
    )
Ejemplo n.º 26
0
def extract_faces(path,
                  check_if_known=True,
                  central_con=None,
                  local_con=None,
                  close_connections=True):
    """Detect faces in all images under *path* and store them in the DBs.

    Registers *path* in the central and local databases if it is not yet
    known, then iterates over all images (recursively), skipping ones that
    were already processed, runs face detection on each new image and
    stores one embedding row (cluster id, embedding, thumbnail, image id,
    embedding id) per detected face.

    :param path: Directory whose images should be processed.
    :param check_if_known: If True, skip images whose (relative path,
        last-modified time) pair is already recorded for this directory.
    :param central_con: Optional open central-DB connection to reuse.
    :param local_con: Optional open local-DB connection to reuse.
    :param close_connections: Whether the wrapped DB operation should close
        the connections when done.
    """
    # TODO: Refactor (extract functions)? + rename
    # TODO: Generate Thumbnails differently? (E.g. via Image.thumbnail or sth. like that)
    # TODO: Store + update max_img_id and max_embedding_id somewhere rather than (always) get them via DB query?

    path_to_local_db = DBManager.get_local_db_file_path(path)
    path_id = DBManager.get_path_id(path)
    if path_id is None:
        # path not yet known
        # Register the directory centrally and record its id locally so
        # both databases agree on this directory.
        path_id = DBManager.store_directory_path(path,
                                                 con=central_con,
                                                 close_connections=False)
        DBManager.store_path_id(path_id,
                                path_to_local_db=path_to_local_db,
                                con=local_con,
                                close_connections=False)
    # (relative path, last-modified) pairs of already-processed images,
    # kept as a set for O(1) membership checks in the loop below.
    imgs_rel_paths_and_dates = set(
        DBManager.get_images_attributes(path_to_local_db=path_to_local_db))

    # Note: 'MAX' returns None / (None, ) as a default value
    # NOTE(review): if get_max_image_id can actually return None here,
    # `max_img_id + 1` would raise — confirm DBManager substitutes a
    # numeric default for an empty table.
    max_img_id = DBManager.get_max_image_id(path_to_local_db=path_to_local_db)
    start_img_id = max_img_id + 1
    initial_max_embedding_id = DBManager.get_max_embedding_id()

    def get_counted_img_loader():
        # Yield (img_id, (abs_path, rel_path, img)) pairs, with ids
        # continuing after the highest image id already stored locally.
        img_loader = load_imgs_from_path(path,
                                         recursive=True,
                                         output_file_names=True,
                                         output_file_paths=True)
        return enumerate(img_loader, start=start_img_id)

    def store_embedding_row_dicts(con):
        # Process every (new) image: store the image rows, detect faces,
        # and store one embedding row per detected face.
        # TODO: Also auto-increment emb_id etc.
        max_embedding_id = initial_max_embedding_id
        for img_id, (img_abs_path, img_rel_path,
                     img) in get_counted_img_loader():
            # TODO: Implement automatic deletion cascade! (Using among other things on_conflict clause and FKs)
            #       ---> Done?
            # Check if image already stored --> don't process again
            # known = (name, last modified) as a pair known for this director
            last_modified = datetime.datetime.fromtimestamp(
                round(os.stat(img_abs_path).st_mtime))
            if check_if_known and (img_rel_path,
                                   last_modified) in imgs_rel_paths_and_dates:
                continue

            # Record the image locally and link it to its path centrally.
            DBManager.store_image(img_id=img_id,
                                  rel_file_path=img_rel_path,
                                  last_modified=last_modified,
                                  path_to_local_db=path_to_local_db,
                                  con=local_con,
                                  close_connections=False)
            DBManager.store_image_path(img_id=img_id,
                                       path_id=path_id,
                                       con=central_con,
                                       close_connections=False)

            faces = Models.altered_mtcnn.forward_return_results(img)
            if not faces:
                log_error(f"no faces found in image '{img_abs_path}'")
                continue

            # TODO: Better way to create these row_dicts?
            # One row per face; embedding ids continue after the current
            # global maximum.
            embeddings_row_dicts = [{
                Columns.cluster_id.col_name:
                'NULL',
                Columns.embedding.col_name:
                face_to_embedding(face),
                Columns.thumbnail.col_name:
                face,
                Columns.image_id.col_name:
                img_id,
                Columns.embedding_id.col_name:
                embedding_id
            } for embedding_id, face in enumerate(faces,
                                                  start=max_embedding_id + 1)]
            DBManager.store_embeddings(embeddings_row_dicts,
                                       con=con,
                                       close_connections=False)
            max_embedding_id += len(faces)

    # Run the whole storing routine under one managed connection so the
    # operation can be treated as a unit.
    DBManager.connection_wrapper(store_embedding_row_dicts,
                                 con=central_con,
                                 close_connections=close_connections)
Ejemplo n.º 27
0
 def set_cluster_label_worker(con):
     """Persist *cluster* and then update its label in memory.

     NOTE(review): relies on ``cluster`` and ``new_label`` from the
     enclosing scope; ``con`` is the DB connection supplied by the
     connection wrapper. The DB write happens *before* ``set_label`` —
     confirm that ``store_clusters`` picks up the new label (e.g. via a
     separate mechanism), otherwise the old label is what gets stored.
     """
     DBManager.store_clusters([cluster], con=con, close_connections=False)
     cluster.set_label(new_label)
Ejemplo n.º 28
0
def user_choose_embedding_id(cluster):
    """Let the user pick an embedding id from the cluster's thumbnails.

    :param cluster: Cluster whose face thumbnails are offered for choice.
    :return: The embedding id chosen by the user.
    """
    # TODO: Refactor
    thumbnail_pairs = DBManager.get_thumbnails_from_cluster(
        cluster.cluster_id, with_embeddings_ids=True)
    faces_dict = dict(thumbnail_pairs)
    return user_choose_embedding_id_worker(faces_dict, cluster.label)
Ejemplo n.º 29
0
    def split_cluster(cls,
                      cluster_to_split,
                      next_cluster_id=None,
                      ret_new_next_id=False):
        """
        Split cluster into two new clusters as follows:
        1. Find two embeddings e1, e2 in the cluster with the greatest distance between them.
        2. Create a new cluster C1, C2 for each of the two.
        3. For each embedding e of the remaining embeddings:
               Add e to the cluster (C1 or C2) whose center is closer to it.

        The given cluster must contain at least 2 embeddings.

        :param ret_new_next_id: If True, additionally return the next free
            cluster id after the two newly assigned ones.
        :param next_cluster_id: First id to assign to the new clusters; if
            None, it is derived from the DB's current maximum cluster id.
        :param cluster_to_split: Cluster to be split
        :return: Two new clusters containing embeddings of old one
        """
        # TODO: Does this fail due to bad analogy to low-dim. space?!
        embeddings_with_ids = cluster_to_split.get_embeddings(
            with_embeddings_ids=True, as_dict=True)
        # The two most distant embeddings seed the two new clusters; they
        # are removed from the pool of embeddings still to be assigned.
        (emb1_id, cluster_start_emb1), (
            emb2_id, cluster_start_emb2
        ) = cls.find_most_distant_embeddings(embeddings_with_ids)
        remove_multiple(embeddings_with_ids, [emb1_id, emb2_id])
        # Both new clusters inherit the label of the split cluster.
        label = cluster_to_split.label

        if next_cluster_id is None:
            next_cluster_id = DBManager.get_max_cluster_id() + 1
        new_cluster1_id, new_cluster2_id = next_cluster_id, next_cluster_id + 1
        new_cluster1, new_cluster2 = (Cluster(new_cluster1_id,
                                              [cluster_start_emb1], [emb1_id],
                                              label=label),
                                      Cluster(new_cluster2_id,
                                              [cluster_start_emb2], [emb2_id],
                                              label=label))

        @spread_args_decorator
        @ignore_first_n_args_decorator(n=1)
        def is_closer_to_cluster1(emb):
            # Predicate over (embedding_id, embedding) items; the
            # decorators spread the pair and drop the id, leaving emb.
            dist_to_cluster1 = new_cluster1.compute_dist_to_center(emb)
            dist_to_cluster2 = new_cluster2.compute_dist_to_center(emb)
            return dist_to_cluster1 < dist_to_cluster2

        def try_split(cluster_embs_with_ids, new_cluster):
            # Unzip the (id, embedding) pairs and add them to new_cluster;
            # an empty group simply yields nothing to unpack.
            split_result = split_items(cluster_embs_with_ids)
            try:
                cluster_embs_ids, cluster_embs = split_result
            except ValueError:
                # not enough values to unpack
                pass
            else:
                new_cluster.add_embeddings(cluster_embs, cluster_embs_ids)

        # NOTE(review): assumes partition() returns the falsy group first
        # (itertools-recipe convention), hence cluster2 before cluster1 —
        # confirm against the project's partition implementation.
        cluster2_embs_with_ids, cluster1_embs_with_ids = partition(
            is_closer_to_cluster1, embeddings_with_ids.items())
        try_split(cluster1_embs_with_ids, new_cluster1)
        try_split(cluster2_embs_with_ids, new_cluster2)
        new_clusters = (new_cluster1, new_cluster2)

        if ret_new_next_id:
            return new_clusters, new_cluster2_id + 1
        return new_clusters
Ejemplo n.º 30
0
def view_person(cluster_dict, **kwargs):
    """
    1. Fetch which labels exist (incl. Unknown Person)
    2. Prompt user, which person/label they would like to view
    3. Fetch all image names/paths for that person
    4. Prompt user, which image they would like to view
    5. Show image
    6. Go to 2.

    :param cluster_dict: ClusterDict providing the known cluster labels.
    :param kwargs: Unused; accepted so all handlers share one signature.
    :return: None
    """
    # TODO: Make user choose file *name*, not path (and just inform them of the path they're on beforehand)
    # TODO: When only one choice (to pick path or image), make choice for user and inform them about it!
    # TODO: Refactor? (Extract functions)
    # TODO: Give option of renaming a file/directory?
    #       --> Best practices? How to do so *safely*?!)
    # TODO: How to include thumbnails and face ids in all of this?
    #       --> Give option to switch to/from edit_handler?

    # Pre-bind the three yes/no prompts that drive the nested loops below.
    get_label_decision = partial(get_user_decision,
                                 'Would you like to select another person?')
    get_image_decision = partial(
        get_user_decision,
        'Would you like to view another image of the person from this'
        ' directory?')
    get_directory_decision = partial(
        get_user_decision,
        'Would you like to select another directory containing images'
        ' of the person?')
    cluster_labels = cluster_dict.get_cluster_labels(
        unique=True)  # TODO: faster to use DB??
    # TODO: Extract some loop constructs as functions?
    # TODO: Are these interactions alright?
    # TODO: Catch errors!

    # Each loop runs until the user answers 'n'; a cancelled choice or a
    # failed DB operation re-asks the corresponding continue question.
    continue_label = ''
    while continue_label != 'n':
        chosen_label = user_choose_label(cluster_labels)
        if chosen_label is None:
            continue_label = get_label_decision()
            continue
        try:
            person_dir_paths_to_img_ids = DBManager.get_dir_paths_to_img_ids(
                chosen_label)
        except IncompleteDatabaseOperation:
            continue_label = get_label_decision()
            continue

        person_dir_paths = person_dir_paths_to_img_ids.keys()

        # Middle loop: pick a directory containing images of the person.
        continue_directory = ''
        while continue_directory != 'n':
            chosen_directory_path = user_choose_directory_path(
                person_dir_paths)
            if chosen_directory_path is None:
                continue_directory = get_directory_decision()
                continue

            image_ids = person_dir_paths_to_img_ids[chosen_directory_path]
            try:
                file_name_to_path_dict = DBManager.get_image_name_to_path_dict(
                    chosen_directory_path, image_ids)
            except IncompleteDatabaseOperation:
                continue_directory = get_directory_decision()
                continue

            # Inner loop: pick and display images from that directory.
            continue_image = ''
            while continue_image != 'n':
                print(
                    f"The currently chosen path is: '{chosen_directory_path}'."
                )
                chosen_image_path = user_choose_image_path(
                    file_name_to_path_dict)
                if chosen_image_path is None:
                    continue_image = get_image_decision()
                    continue
                chosen_image = Image.open(chosen_image_path)
                chosen_image.show()
                continue_image = get_image_decision()
            continue_directory = get_directory_decision()
        continue_label = get_label_decision()