def main():
    """Fingerprint, cluster and copy/move the images found in ``IMAGE_PATH``.

    Uses the module-level settings ``IMAGE_PATH``, ``SIMILARITY`` and
    ``ACTION``. Every cluster returned by ``calc.cluster()`` gets its own
    zero-padded, numbered sub-directory below
    ``<IMAGE_PATH>/<icio.ic_base_dir>/clusters``. Images that are part of no
    cluster are placed directly inside that ``clusters`` directory.

    A pre-existing ``clusters`` directory is deleted before the new one is
    populated. ``ACTION`` selects ``shutil.copy`` (``'copy'``) or
    ``shutil.move`` (``'move'``); for any other value the directories are
    still created but no image is transferred.
    """
    clusters_path = os.path.join(IMAGE_PATH, icio.ic_base_dir, 'clusters')

    # The bottleneck is calc.fingerprints() called in this function, all other
    # operations are very fast. get_image_data() writes fingerprints to disk
    # and loads them again instead of re-calculating them.
    print('\nFingerprinting images...\n')
    images, fingerprints, _timestamps = icio.get_image_data(IMAGE_PATH)
    print('\nImage fingerprinting done.\n')

    # Run clustering on the fingerprints. Select clusters with similarity
    # index SIMILARITY.
    print('\nClustering images...\n')
    clusters = calc.cluster(fingerprints, sim=SIMILARITY)
    print('\nClustering done.\n')

    # Re-format clusters ({key: [cluster, ...]}) into a simple 2D list
    simple_clusters = [
        cluster
        for cluster_list in clusters.values()
        for cluster in cluster_list
    ]

    # Find unclustered images: everything in `images` that no cluster lists.
    # One set built up front replaces a new set per clustered image.
    clustered_images = {
        image for cluster in simple_clusters for image in cluster
    }
    unclustered_images = [
        image for image in images.keys() if image not in clustered_images
    ]

    if ACTION == 'copy':
        print('\nCopying images to clusters...\n')
        transfer = shutil.copy
    elif ACTION == 'move':
        print('\nMoving images to clusters...\n')
        transfer = shutil.move
    else:
        # Unknown action: keep the historical behaviour of transferring nothing
        transfer = None

    # Remove existing clusters (if present). A plain file of that name can be
    # left behind by runs that produced no cluster directories.
    if os.path.isdir(clusters_path):
        shutil.rmtree(clusters_path)
    elif os.path.exists(clusters_path):
        os.remove(clusters_path)

    # Always create the root directory. Without it, a run that yields no
    # clusters would make shutil.copy()/move() treat `clusters_path` as a
    # target *file name*, so every unclustered image would overwrite the
    # previous one.
    os.makedirs(clusters_path)

    # Move images into cluster folders
    cluster_dir_length = len(str(len(simple_clusters)))
    for i, cluster in enumerate(simple_clusters):
        cluster_name = str(i).zfill(cluster_dir_length)
        cluster_dir = os.path.join(clusters_path, cluster_name)
        os.makedirs(cluster_dir)
        if transfer is not None:
            for image in cluster:
                transfer(os.path.abspath(image), cluster_dir)

    # Move unclustered images too
    if transfer is not None:
        for image in unclustered_images:
            transfer(os.path.abspath(image), clusters_path)

    print('\nAll done!')
    print('Clustered images can be found in ' + clusters_path + '\n')
def test_cluster():
    """Cluster the ``ImagedirCtx`` images and compare with ``ctx.clusters``.

    Runs image arrays -> model -> fingerprints -> PCA -> clustering through
    the ``ic`` API and asserts that the result has the same keys, the same
    number of clusters per key and the same members per cluster as the
    reference stored on the context object.
    """
    # use API
    # test clustering
    with ImagedirCtx() as ctx:
        ias = ic.image_arrays(ctx.imagedir, size=(224, 224))
        model = ic.get_model()
        fps = ic.fingerprints(ias, model)
        fps = ic.pca(fps, n_components=0.95)
        clusters = ic.cluster(fps, sim=0.5)
        assert set(clusters.keys()) == set(ctx.clusters.keys())
        for nimg in ctx.clusters.keys():
            val_cluster_list = clusters[nimg]
            ref_cluster_list = ctx.clusters[nimg]
            # zip() stops at the shorter sequence, so a missing or surplus
            # cluster would otherwise go unnoticed.
            assert len(val_cluster_list) == len(ref_cluster_list), (
                f"nimg: {nimg}, expected {len(ref_cluster_list)} clusters, "
                f"got {len(val_cluster_list)}"
            )
            for val_clus, ref_clus in zip(val_cluster_list, ref_cluster_list):
                msg = f"ref_clus: {ref_clus}, val_clus: {val_clus}"
                assert set(ref_clus) == set(val_clus), msg
def search_cluster(keyword):
    """Download images for ``keyword`` and cluster them by content.

    The crawler searches for ``keyword`` and downloads images. The images are
    then read from ``downloads/<keyword>``, fingerprinted with the NN model,
    PCA-compressed and clustered. Cluster link directories are written below
    ``downloads/<keyword>/imagecluster/clusters`` and a cluster plot is
    created.

    Parameters
    ----------
    keyword : str
        Search term; also used as the name of the download sub-directory.

    Raises
    ------
    SystemExit
        With status 0 when the crawl is interrupted by the user, with status
        1 when searching or downloading fails with an error. The crawler is
        stopped first in both cases.
    """
    directory = 'downloads/%s' % keyword

    print('Starting crawler')
    searcher = crawler()
    try:
        print('Searching for %s' % keyword)
        searcher.search(keyword)
        print('Downloading')
        # `files` is only consumed by the disabled conversion step below
        files = searcher.download(32)
    except KeyboardInterrupt:
        # User abort: shut the crawler down and leave quietly
        searcher.stop()
        sys.exit(0)
    except Exception as err:
        # A failed crawl must neither be silent nor report success
        searcher.stop()
        print('Crawler failed: %r' % (err,), file=sys.stderr)
        sys.exit(1)

    # Disabled step, kept for reference:
    # print('Converting pictures into jpg')
    # for file in files:
    #     try:
    #         if not imghdr.what(file) == 'jpeg':
    #             im = Image.open(file)
    #             rgb_im = im.convert('RGB')
    #             rgb_im.save(file + '.jpg')
    #     except:
    #         pass

    images = icio.read_images(directory, size=(224, 224))

    # Create Keras NN model.
    model = calc.get_model()

    # Feed images through the model and extract fingerprints (feature vectors).
    print('Feeding images to the neural network to extract features')
    fingerprints = calc.fingerprints(images, model)

    # Optionally run a PCA on the fingerprints to compress the dimensions. Use
    # a cumulative explained variance ratio of 0.95.
    fingerprints = calc.pca(fingerprints, n_components=0.95)

    # Run clustering on the fingerprints. Select clusters with similarity
    # index sim=0.5.
    clusters = calc.cluster(fingerprints, sim=0.5)

    # Create dirs with links to images. Dirs represent the clusters the images
    # belong to.
    postproc.make_links(clusters, directory + '/imagecluster/clusters')

    # Plot images arranged in clusters. NOTE(review): the returned figure and
    # axes are neither saved nor shown here -- confirm whether a savefig()
    # call is missing.
    fig, ax = postproc.plot_clusters(clusters, images)
def test_low_level_api_and_clustering():
    """Run the low-level pipeline step by step and check every stage.

    Checks that each raw fingerprint is a ``np.ndarray`` of length 4096, that
    the fingerprint keys equal ``ctx.image_fns`` and that the clustering
    matches the reference in ``ctx.clusters`` (same keys, same number of
    clusters per key, same members per cluster).
    """
    # use low level API (same as get_image_data) but call all funcs
    # test clustering
    with ImagedirCtx() as ctx:
        images = icio.read_images(ctx.imagedir, size=(224, 224))
        model = ic.get_model()
        fingerprints = ic.fingerprints(images, model)
        for vv in fingerprints.values():
            assert isinstance(vv, np.ndarray)
            assert len(vv) == 4096, len(vv)
        fingerprints = ic.pca(fingerprints, n_components=0.95)
        clusters = ic.cluster(fingerprints, sim=0.5)
        assert set(clusters.keys()) == set(ctx.clusters.keys())
        assert len(fingerprints.keys()) == len(ctx.image_fns)
        assert set(fingerprints.keys()) == set(ctx.image_fns)
        for nimg in ctx.clusters.keys():
            val_cluster_list = clusters[nimg]
            ref_cluster_list = ctx.clusters[nimg]
            # zip() stops at the shorter sequence, so a missing or surplus
            # cluster would otherwise go unnoticed.
            assert len(val_cluster_list) == len(ref_cluster_list), (
                f"nimg: {nimg}, expected {len(ref_cluster_list)} clusters, "
                f"got {len(val_cluster_list)}"
            )
            for val_clus, ref_clus in zip(val_cluster_list, ref_cluster_list):
                msg = f"ref_clus: {ref_clus}, val_clus: {val_clus}"
                assert set(ref_clus) == set(val_clus), msg
# Create Keras NN model. model = calc.get_model() # Feed images through the model and extract fingerprints (feature vectors). fingerprints = calc.fingerprints(images, model) # Optionally run a PCA on the fingerprints to compress the dimensions. Use a # cumulative explained variance ratio of 0.95. fingerprints = calc.pca(fingerprints, n_components=0.95) # Read image timestamps. Need that to calculate the time distance, can be used # in clustering. timestamps = icio.read_timestamps('pics/') # Run clustering on the fingerprints. Select clusters with similarity index # sim=0.5. Mix 80% content distance with 20% timestamp distance (alpha=0.2). clusters = calc.cluster(fingerprints, sim=0.5, timestamps=timestamps, alpha=0.2) # Create dirs with links to images. Dirs represent the clusters the images # belong to. postproc.make_links(clusters, 'pics/imagecluster/clusters') # Plot images arranged in clusters and save plot. fig, ax = postproc.plot_clusters(clusters, images) fig.savefig('foo.png') postproc.plt.show()
from imagecluster import calc as ic
from imagecluster import postproc as pp

# Example: cluster the images below 'pics/' by content, link them into
# per-cluster directories and show them.

# Load all images into an in-memory database first, so they can be fed to the
# NN model quickly.
image_arrays = ic.image_arrays('pics/', size=(224, 224))

# Build the Keras NN model.
nn_model = ic.get_model()

# Push the images through the model; the outputs are the fingerprints
# (feature vectors).
raw_fingerprints = ic.fingerprints(image_arrays, nn_model)

# Optional compression step: PCA keeping a cumulative explained variance
# ratio of 0.95.
reduced_fingerprints = ic.pca(raw_fingerprints, n_components=0.95)

# Cluster the fingerprints with similarity index sim=0.5.
clusters = ic.cluster(reduced_fingerprints, sim=0.5)

# One directory per cluster, filled with links to the member images.
pp.make_links(clusters, 'pics/imagecluster/clusters')

# Show the images arranged in clusters.
pp.visualize(clusters, image_arrays)
#!/usr/bin/python3

# Plot the clustering dendrogram for the images below 'pics/' with a y axis
# relabelled in terms of the `sim` parameter.

from matplotlib import pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram

from imagecluster import calc as ic
from imagecluster import io as icio

pics = icio.read_images('pics/', size=(224, 224))
nn_model = ic.get_model()
fps = ic.fingerprints(pics, nn_model)

# extra_out=True additionally returns a dict; its 'Z' entry is the linkage
# matrix that dendrogram() draws.
clusters, cluster_info = ic.cluster(fps, sim=0.5, extra_out=True)
linkage_matrix = cluster_info['Z']

figure, axes = plt.subplots()
dendrogram(linkage_matrix, ax=axes)

# Re-label the y axis (values from Z[:,2]) according to our definition of the
# `sim` parameter: five evenly spaced ticks over the plotted data range,
# labelled from 1 (bottom) down to 0 (top).
y_axis = axes.yaxis
y_low, y_high = y_axis.get_data_interval()
tick_positions = np.linspace(y_low, y_high, 5)
y_axis.set_ticks(tick_positions)
y_axis.set_ticklabels(np.linspace(1, 0, len(tick_positions)))

axes.set_xlabel("image index")
axes.set_ylabel("sim")
figure.savefig('dendrogram.png')
plt.show()
from matplotlib import pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram

from imagecluster import calc as ic

# Draw the clustering dendrogram for the images below 'pics/' and label its
# y axis in units of the `sim` parameter.

image_arrays = ic.image_arrays('pics/', size=(224, 224))
nn_model = ic.get_model()
fingerprints = ic.fingerprints(image_arrays, nn_model)

# With extra_out=True the second return value carries the linkage matrix
# under the key 'Z'.
clusters, extra = ic.cluster(fingerprints, sim=0.5, extra_out=True)

fig, ax = plt.subplots()
dendrogram(extra['Z'], ax=ax)

# The y axis shows the values from Z[:,2]. Place five evenly spaced ticks
# over the data interval and label them 1 ... 0, matching our definition of
# the `sim` parameter.
data_min, data_max = ax.yaxis.get_data_interval()
tick_locations = np.linspace(data_min, data_max, 5)
tick_labels = np.linspace(1, 0, len(tick_locations))
ax.yaxis.set_ticks(tick_locations)
ax.yaxis.set_ticklabels(tick_labels)

ax.set_xlabel("image index")
ax.set_ylabel("sim")

fig.savefig('dendrogram.png')
plt.show()
#!/usr/bin/python3

# Minimal example: rely on the convenience function io.get_image_data() with
# default arguments only.

from imagecluster import calc
from imagecluster import io as icio
from imagecluster import postproc

# calc.fingerprints(), called inside get_image_data(), is the bottleneck; all
# other operations are very fast. get_image_data() writes the fingerprints to
# disk and loads them again instead of re-calculating them.
pics, fps, _timestamps = icio.get_image_data('downloads/cart icon/')

# Cluster the fingerprints with similarity index sim=0.5.
clusters = calc.cluster(fps, sim=0.5)

# One directory per cluster, each holding links to the images that belong to
# that cluster.
postproc.make_links(clusters, 'downloads/cart icon/imagecluster/clusters')

# Show the images arranged in clusters.
postproc.visualize(clusters, pics)
def main(imagedir, sim=0.5, layer='fc2', size=(224,224), links=True, vis=False,
         max_csize=None, pca=False, pca_params=None):
    """Example main app using this library.

    Upon first invocation, the image and fingerprint databases are built and
    written to disk. Each new invocation loads those and only repeats

        * clustering
        * creation of links to files in clusters
        * visualization (if `vis=True`)

    This is good for playing around with the `sim` parameter, for
    instance, which only influences clustering.

    Parameters
    ----------
    imagedir : str
        path to directory with images
    sim : float (0..1)
        similarity index (see :func:`calc.cluster`)
    layer : str
        which layer to use as feature vector (see
        :func:`calc.get_model`)
    size : tuple
        input image size (width, height), must match `model`, e.g.
        (224,224)
    links : bool
        create dirs with links
    vis : bool
        plot images in clusters
    max_csize : int, optional
        max number of images per cluster for visualization (see
        :mod:`~postproc`)
    pca : bool
        Perform PCA on fingerprints before clustering, using `pca_params`.
    pca_params : dict, optional
        kwargs to sklearn's PCA. ``None`` (the default) means
        ``dict(n_components=0.9)``.

    Notes
    -----
    imagedir : To select only a subset of the images, create an `imagedir` and
        symlink your selected images there. In the future, we may add support
        for passing a list of files, should the need arise. But then again,
        this function is only an example front-end.
    """
    # A None sentinel keeps the documented default without sharing one
    # mutable dict object between all calls.
    if pca_params is None:
        pca_params = dict(n_components=0.9)

    fps_fn = pj(imagedir, ic_base_dir, 'fingerprints.pk')
    ias_fn = pj(imagedir, ic_base_dir, 'images.pk')
    # Image arrays are loaded lazily: only when fingerprints must be computed
    # or when visualization is requested.
    ias = None
    if not os.path.exists(fps_fn):
        print(f"no fingerprints database {fps_fn} found")
        os.makedirs(os.path.dirname(fps_fn), exist_ok=True)
        model = ic.get_model(layer=layer)
        if not os.path.exists(ias_fn):
            print(f"create image array database {ias_fn}")
            ias = ic.image_arrays(imagedir, size=size)
            co.write_pk(ias, ias_fn)
        else:
            ias = co.read_pk(ias_fn)
        print("running all images through NN model ...")
        fps = ic.fingerprints(ias, model)
        co.write_pk(fps, fps_fn)
    else:
        print(f"loading fingerprints database {fps_fn} ...")
        fps = co.read_pk(fps_fn)
    if pca:
        fps = ic.pca(fps, **pca_params)
        # Peek at one fingerprint for its length; no need to copy all values
        # into a list first.
        print("pca dims:", next(iter(fps.values())).shape[0])
    print("clustering ...")
    clusters = ic.cluster(fps, sim)
    if links:
        pp.make_links(clusters, pj(imagedir, ic_base_dir, 'clusters'))
    if vis:
        if ias is None:
            ias = co.read_pk(ias_fn)
        pp.visualize(clusters, ias, max_csize=max_csize)