def _find_duplicates_dict(
    self,
    encoding_map: Dict[str, list],
    min_similarity_threshold: float,
    scores: bool,
    outfile: Optional[str] = None,
) -> Dict:
    """
    Take in dictionary {filename: encoded image}, detect duplicates above the given cosine
    similarity threshold and return a dictionary containing key as filename and value as a list
    of duplicate filenames. Optionally, the cosine similarity scores can be returned instead of
    just duplicate filenames for each query file.

    Args:
        encoding_map: Dictionary with keys as file names and values as encoded images.
        min_similarity_threshold: Cosine similarity above which retrieved duplicates are valid.
        scores: Boolean indicating whether similarity scores are to be returned along with
                retrieved duplicates.
        outfile: Optional, name of the json file the results are saved to. Default is None.

    Returns:
        if scores is True, then a dictionary of the form
        {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)],
         'image2.jpg': [], ..}
        if scores is False, then a dictionary of the form
        {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'],
         'image2.jpg': ['image1_duplicate1.jpg', ..], ..}
    """
    # Image ids and feature-matrix rows stay aligned because dictionaries
    # preserve insertion order in Python >= 3.6.
    image_ids = np.array([*encoding_map.keys()])
    features = np.array([*encoding_map.values()])

    self.logger.info('Start: Calculating cosine similarities...')
    self.cosine_scores = get_cosine_similarity(features)
    # 2.0 is an out-of-range placeholder (cosine similarity <= 1.0) that lets
    # the filter below exclude each image's self-similarity on the diagonal.
    np.fill_diagonal(self.cosine_scores, 2.0)
    self.logger.info('End: Calculating cosine similarities.')

    self.results = {}
    for row_idx, row in enumerate(self.cosine_scores):
        duplicates_bool = (row >= min_similarity_threshold) & (row < 2)
        if scores:
            # Scores are stringified so the result is json-serializable.
            tmp = np.array([*zip(image_ids, list(map(str, row)))], dtype=object)
            duplicates = list(map(tuple, tmp[duplicates_bool]))
        else:
            duplicates = list(image_ids[duplicates_bool])
        self.results[image_ids[row_idx]] = duplicates

    if outfile:
        save_json(self.results, outfile)

    return self.results
def find_duplicates_to_remove(
    self,
    image_dir: PurePath = None,
    encoding_map: Dict[str, np.ndarray] = None,
    min_similarity_threshold: float = 0.9,
    outfile: Optional[str] = None,
) -> List:
    """
    Give out a list of image file names to remove based on the similarity threshold. Does not
    remove the mentioned files.

    Args:
        image_dir: Path to the directory containing all the images or dictionary with keys as
                   file names and values as numpy arrays which represent the CNN encoding for
                   the key image file.
        encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of
                      filenames and corresponding CNN encodings.
        min_similarity_threshold: Optional, threshold value (must be float between -1.0 and
                                  1.0). Default is 0.9
        outfile: Optional, name of the file to save the results, must be a json. Default is
                 None.

    Returns:
        duplicates: List of image file names that should be removed.

    Raises:
        ValueError: If neither image_dir nor encoding_map is provided.

    Example:
    ```
    from imagededup.methods import CNN
    myencoder = CNN()
    duplicates = myencoder.find_duplicates_to_remove(image_dir='path/to/images/directory',
    min_similarity_threshold=0.85)

    OR

    from imagededup.methods import CNN
    myencoder = CNN()
    duplicates = myencoder.find_duplicates_to_remove(encoding_map=<mapping filename to cnn
    encodings>, min_similarity_threshold=0.85, outfile='results.json')
    ```
    """
    # Fail loudly instead of silently returning None (the declared return
    # type is List) when the caller supplies no input source.
    if image_dir is None and encoding_map is None:
        raise ValueError('Provide either an image directory or an encoding map!')

    duplicates = self.find_duplicates(
        image_dir=image_dir,
        encoding_map=encoding_map,
        min_similarity_threshold=min_similarity_threshold,
        scores=False,
    )
    files_to_remove = get_files_to_remove(duplicates)
    if outfile:
        save_json(files_to_remove, outfile)
    return files_to_remove
def find_duplicates_to_remove(
    self,
    image_dir: PosixPath = None,
    encoding_map: Dict[str, str] = None,
    max_distance_threshold: int = 10,
    outfile: Optional[str] = None,
) -> List:
    """
    Give out a list of image file names to remove based on the hamming distance threshold.
    Does not remove the mentioned files.

    Args:
        image_dir: Path to the directory containing all the images or dictionary with keys as
                   file names and values as hash strings for the key image file.
        encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of
                      filenames and corresponding hashes.
        max_distance_threshold: Optional, hamming distance between two images below which
                                retrieved duplicates are valid. (must be an int between 0 and
                                64). Default is 10.
        outfile: Optional, name of the file to save the results.

    Returns:
        duplicates: List of image file names that are found to be duplicates of some other
                    file in the directory.

    Example:
    ```
    from imagededup.methods import <hash-method>
    myencoder = <hash-method>()
    duplicates = myencoder.find_duplicates_to_remove(image_dir='path/to/images/directory',
    max_distance_threshold=15)

    OR

    from imagededup.methods import <hash-method>
    myencoder = <hash-method>()
    duplicates = myencoder.find_duplicates_to_remove(encoding_map=<mapping filename to hashes>,
    max_distance_threshold=15, outfile='results.json')
    ```
    """
    result = self.find_duplicates(
        image_dir=image_dir,
        encoding_map=encoding_map,
        max_distance_threshold=max_distance_threshold,
        scores=False,
    )
    files_to_remove = get_files_to_remove(result)
    if outfile:
        save_json(files_to_remove, outfile)
    return files_to_remove
def _find_duplicates_dict(
    self,
    encoding_map: Dict[str, str],
    max_distance_threshold: int = 10,
    scores: bool = False,
    outfile: Optional[str] = None,
) -> Dict:
    """
    Take in dictionary {filename: encoded image}, detect duplicates below the given hamming
    distance threshold and return a dictionary containing key as filename and value as a list
    of duplicate filenames. Optionally, the hamming distances can be returned instead of just
    duplicate filenames for each query file.

    Args:
        encoding_map: Dictionary with keys as file names and values as encoded images (hashes).
        max_distance_threshold: Hamming distance between two images below which retrieved
                                duplicates are valid.
        scores: Boolean indicating whether hamming distance scores are to be returned along
                with retrieved duplicates.
        outfile: Optional, name of the json file the results are saved to. Default is None.

    Returns:
        if scores is True, then a dictionary of the form
        {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)],
         'image2.jpg': [], ..}
        if scores is False, then a dictionary of the form
        {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'],
         'image2.jpg': ['image1_duplicate1.jpg', ..], ..}
    """
    # send() is called for its progress-reporting side effect only; the
    # previously captured (and never read) return value has been dropped.
    self.send_data.send(
        'pHashing',
        message='Start: Evaluating hamming distances for getting duplicates',
    )
    result_set = HashEval(
        test=encoding_map,
        queries=encoding_map,
        distance_function=self.hamming_distance,
        threshold=max_distance_threshold,
        search_method='bktree',
    )
    self.send_data.send(
        'pHashing',
        message='End: Evaluating hamming distances for getting duplicates',
    )
    self.results = result_set.retrieve_results(scores=scores)
    if outfile:
        save_json(self.results, outfile)
    return self.results
def _find_duplicates_dict(
    self,
    encoding_map: Dict[str, str],
    max_distance_threshold: int = 10,
    scores: bool = False,
    outfile: Optional[str] = None,
    search_method: str = 'bktree' if sys.platform == 'win32' else 'brute_force_cython',
) -> Dict:
    """
    Detect duplicates within a {filename: hash} mapping.

    Every entry is compared against every other entry; pairs whose hamming distance falls
    below max_distance_threshold are reported. The result maps each filename to a list of its
    duplicate filenames, or, when scores is True, to a list of (filename, distance) tuples.

    Args:
        encoding_map: Dictionary with keys as file names and values as encoded images (hashes).
        max_distance_threshold: Hamming distance between two images below which retrieved
                                duplicates are valid.
        scores: Boolean indicating whether hamming distance scores are to be returned along
                with retrieved duplicates.
        outfile: Optional, name of the file to save the results. Default is None.
        search_method: Algorithm used to retrieve duplicates. Default is brute_force_cython
                       for Unix else bktree.

    Returns:
        if scores is True, then a dictionary of the form
        {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)],
         'image2.jpg': [], ..}
        if scores is False, then a dictionary of the form
        {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'],
         'image2.jpg': ['image1_duplicate1.jpg', ..], ..}
    """
    logger.info(
        'Start: Evaluating hamming distances for getting duplicates')
    evaluator = HashEval(
        test=encoding_map,
        queries=encoding_map,
        distance_function=self.hamming_distance,
        verbose=self.verbose,
        threshold=max_distance_threshold,
        search_method=search_method,
    )
    logger.info('End: Evaluating hamming distances for getting duplicates')

    self.results = evaluator.retrieve_results(scores=scores)

    if outfile:
        save_json(self.results, outfile)

    return self.results
def test_correct_saving_ints():
    """save_json round-trip: integer scores must survive as JSON ints."""
    res = {
        'image1.jpg': [('image1_duplicate1.jpg', 2), ('image1_duplicate2.jpg', 22)],
        'image2.jpg': [],
        'image3.jpg': [('image1_duplicate1.jpg', 43)],
    }
    save_file = 'myduplicates.json'
    # try/finally guarantees the scratch file is removed even when an
    # assertion fails, so a failed run does not pollute later runs.
    try:
        general_utils.save_json(results=res, filename=save_file)

        with open(save_file, 'r') as f:
            saved_json = json.load(f)

        assert len(saved_json) == 3  # all valid files present as keys
        assert isinstance(
            saved_json['image1.jpg'][0][1], int
        )  # saved score is of type 'int'
    finally:
        if os.path.exists(save_file):
            os.remove(save_file)  # clean up
def test_correct_saving_floats():
    """save_json round-trip: numpy float16/float32 scores must survive as JSON floats."""
    res = {
        'image1.jpg': [
            ('image1_duplicate1.jpg', np.float16(0.324)),
            ('image1_duplicate2.jpg', np.float16(0.324)),
        ],
        'image2.jpg': [],
        'image3.jpg': [('image1_duplicate1.jpg', np.float32(0.324))],
    }
    save_file = 'myduplicates.json'
    # try/finally guarantees the scratch file is removed even when an
    # assertion fails, so a failed run does not pollute later runs.
    try:
        general_utils.save_json(results=res, filename=save_file, float_scores=True)

        with open(save_file, 'r') as f:
            saved_json = json.load(f)

        assert len(saved_json) == 3  # all valid files present as keys
        assert isinstance(
            saved_json['image1.jpg'][0][1], float
        )  # saved score is of type 'float' for np.float16 score
        assert isinstance(
            saved_json['image3.jpg'][0][1], float
        )  # saved score is of type 'float' for np.float32 score
    finally:
        if os.path.exists(save_file):
            os.remove(save_file)  # clean up