Example #1
0
def merge_near_duplicates(near_duplicate_objects):
    """Merge adjacent pairs of near-duplicate results into one dictionary.

    Args:
        near_duplicate_objects: a sequence of tuples.
            Each tuple is a (SimhashIndex, image_dictionary), where
            image_dictionary is an object which contains simhash keys and
            image/filename values.
    Returns:
        A dictionary containing simhash keys and image/filename values:
          * an empty dictionary when the input is None or empty;
          * the single image_dictionary itself (not a copy) when the input
            holds exactly one tuple;
          * otherwise a new dictionary: every entry is merged with its
            successor and each pairwise result is folded in with
            dict.update, so on a key collision the later pair wins.
    """
    # Identity test: `== None` would dispatch to a user-defined __eq__.
    if near_duplicate_objects is None or len(near_duplicate_objects) == 0:
        return {}

    if len(near_duplicate_objects) == 1:
        # The only entry is an (index, image_dictionary) tuple.
        return near_duplicate_objects[0][1]

    # Snapshot the input so the adjacent-pair walk below only needs iteration.
    entries = list(near_duplicate_objects)

    final_dict = {}
    # Adjacent pairs: (0, 1), (1, 2), ..., (n - 2, n - 1).
    for (sim_index1, img_dict1), (sim_index2, img_dict2) in zip(entries, entries[1:]):
        # Build empty NearDuplicate objects and inject the precomputed state
        # instead of letting the constructor derive it from a file list.
        first_nd, second_nd = NearDuplicate([]), NearDuplicate([])
        first_nd.image_dictionary, second_nd.image_dictionary = img_dict1, img_dict2
        first_nd.simhash_index, second_nd.simhash_index = sim_index1, sim_index2

        # NOTE(review): merge_near_duplicate_dictionaries is defined outside
        # this block; it is assumed to return a mapping - confirm.
        final_dict.update(first_nd.merge_near_duplicate_dictionaries(second_nd))

    return final_dict
def merge_near_duplicates(near_duplicate_objects):
    """Merge adjacent pairs of near-duplicate results into one dictionary.

    Args:
        near_duplicate_objects: a sequence of tuples.
            Each tuple is a (SimhashIndex, image_dictionary), where
            image_dictionary is an object which contains simhash keys and
            image/filename values.
    Returns:
        A dictionary containing simhash keys and image/filename values:
          * an empty dictionary when the input is None or empty;
          * the single image_dictionary itself (not a copy) when the input
            holds exactly one tuple;
          * otherwise a new dictionary: every entry is merged with its
            successor and each pairwise result is folded in with
            dict.update, so on a key collision the later pair wins.
    """
    # Identity test: `== None` would dispatch to a user-defined __eq__.
    if near_duplicate_objects is None or len(near_duplicate_objects) == 0:
        return {}

    if len(near_duplicate_objects) == 1:
        # The only entry is an (index, image_dictionary) tuple.
        return near_duplicate_objects[0][1]

    # Snapshot the input so the adjacent-pair walk below only needs iteration.
    entries = list(near_duplicate_objects)

    final_dict = {}
    # Adjacent pairs: (0, 1), (1, 2), ..., (n - 2, n - 1).
    for (sim_index1, img_dict1), (sim_index2, img_dict2) in zip(entries, entries[1:]):
        # Build empty NearDuplicate objects and inject the precomputed state
        # instead of letting the constructor derive it from a file list.
        first_nd, second_nd = NearDuplicate([]), NearDuplicate([])
        first_nd.image_dictionary, second_nd.image_dictionary = img_dict1, img_dict2
        first_nd.simhash_index, second_nd.simhash_index = sim_index1, sim_index2

        # NOTE(review): merge_near_duplicate_dictionaries is defined outside
        # this block; it is assumed to return a mapping - confirm.
        final_dict.update(first_nd.merge_near_duplicate_dictionaries(second_nd))

    return final_dict
Example #3
0
def near_deduplicate_images(file_array, bit_distance, metadata=None):
    """Run near-duplicate detection over a list of file names.

    Args:
        file_array: the file names handed to NearDuplicate.
        bit_distance: forwarded to NearDuplicate as its ``k`` argument.
        metadata: optional value forwarded as ``metadata_dictionary``.
    Returns:
        A ``(simhash_index, image_dictionary)`` tuple read from the
        NearDuplicate instance after ``deduplicate_images()`` has run.
    """
    deduplicator = NearDuplicate(
        file_array,
        k=bit_distance,
        metadata_dictionary=metadata,
    )
    deduplicator.deduplicate_images()

    index = deduplicator.simhash_index
    images = deduplicator.image_dictionary
    return index, images
def near_deduplicate_images(file_array, bit_distance, metadata=None):
    """Run near-duplicate detection over a list of file names.

    Args:
        file_array: the file names handed to NearDuplicate.
        bit_distance: forwarded to NearDuplicate as its ``k`` argument.
        metadata: optional value forwarded as ``metadata_dictionary``.
    Returns:
        A ``(simhash_index, image_dictionary)`` tuple read from the
        NearDuplicate instance after ``deduplicate_images()`` has run.
    """
    deduplicator = NearDuplicate(
        file_array,
        k=bit_distance,
        metadata_dictionary=metadata,
    )
    deduplicator.deduplicate_images()

    index = deduplicator.simhash_index
    images = deduplicator.image_dictionary
    return index, images