Example No. 1
0
    def encode_images(self, image_dir=None, rglob=False):
        """
        Generate hashes for all images in a given directory of images.

        Args:
            image_dir: Path to the image directory.
            rglob: If True, scan the directory recursively; otherwise only
                   the top level is scanned.

        Returns:
            dictionary: A dictionary that contains a mapping of filenames and corresponding 64 character hash string
                        such as {'Image1.jpg': 'hash_string1', 'Image2.jpg': 'hash_string2', ...}

        Raises:
            ValueError: If image_dir is None or is not an existing directory.

        Example:
        ```
        from imagededup.methods import <hash-method>
        myencoder = <hash-method>()
        mapping = myencoder.encode_images('path/to/directory')
        ```
        """
        # Guard None explicitly: os.path.isdir(None) raises TypeError, which
        # would bypass the intended ValueError below.
        if image_dir is None or not os.path.isdir(image_dir):
            raise ValueError('Please provide a valid directory path!')

        image_dir = Path(image_dir)

        # Single code path for the recursive vs. flat scan; hidden files
        # (dot-prefixed) and non-files are skipped either way.
        walker = image_dir.rglob('*') if rglob else image_dir.glob('*')
        files = [
            i.absolute() for i in walker
            if not i.name.startswith('.') and i.is_file()
        ]

        logger.info('Start: Calculating hashes...')

        hashes = parallelise(self.encode_image, files, self.verbose)
        hash_initial_dict = dict(zip([f.name for f in files], hashes))
        # Drop falsy entries: encode_image yields None when an image file
        # cannot be processed.
        hash_dict = {k: v for k, v in hash_initial_dict.items() if v}

        logger.info('End: Calculating hashes!')
        return hash_dict
Example No. 2
0
def get_cosine_similarity(
    X: np.ndarray, chunk_size: int = 1000, threshold: int = 10000
) -> np.ndarray:
    """
    Compute the pairwise cosine-similarity matrix for the rows of X.

    Matrices with at most `threshold` rows are handled in a single call;
    larger ones are split into row chunks of `chunk_size` and processed in
    parallel to bound peak memory, then stacked back together.

    Args:
        X: 2-D feature matrix, one row per sample.
        chunk_size: Number of rows per chunk on the large-matrix path.
        threshold: Row count above which chunked computation is used.

    Returns:
        (n_rows, n_rows) array of cosine similarities.
    """
    n_rows = X.shape[0]

    if n_rows <= threshold:
        print('Small feature matrix for calculating cosine similarities...')
        return cosine_similarity(X)

    print('Large feature matrix thus calculating cosine similarities in chunks...')
    start_idxs = list(range(0, n_rows, chunk_size))
    end_idxs = start_idxs[1:] + [n_rows]
    # zip already yields the (start, end) index pairs; the previous
    # enumerate() produced an unused counter.
    cos_sim = parallelise(
        cosine_similarity_chunk,
        [(X, idxs) for idxs in zip(start_idxs, end_idxs)],
    )

    return np.vstack(cos_sim)
Example No. 3
0
    def _get_query_results(
            self, search_method_object: Union[BruteForce, BKTree]) -> None:
        """
        Get result for the query using specified search object. Populate the global query_results_map.

        Args:
            search_method_object: BruteForce or BKTree object to get results for the query.
        """
        query_names = list(self.queries.keys())
        # One (filename, encoding, search_object, threshold) tuple per query;
        # built directly from items() instead of zipping parallel lists.
        args = [
            (name, encoding, search_method_object, self.threshold)
            for name, encoding in self.queries.items()
        ]
        result_map_list = parallelise(self._searcher, args)
        result_map = dict(zip(query_names, result_map_list))

        # Sort each query's matches by ascending distance (tuple index 1).
        self.query_results_map = {
            k: sorted(v, key=lambda tup: tup[1])
            for k, v in result_map.items()
        }  # {'filename.jpg': [('dup1.jpg', 3)], 'filename2.jpg': [('dup2.jpg', 10)]}
def run_parallel(encoder, files):
    """Hash every file in `files` in parallel via `encoder` and return a
    mapping from each file to its computed hash."""
    from imagededup.utils.general_utils import parallelise

    computed = parallelise(encoder.encode_image, files, encoder.verbose)
    return {path: digest for path, digest in zip(files, computed)}