def search_cluster(keyword):
    """Download images for ``keyword`` and cluster them by similarity.

    Steps, in order: start a crawler, search for ``keyword``, request a
    download of 32 images, read the images from ``downloads/<keyword>``,
    extract fingerprints with the neural-network model, compress them with
    PCA, cluster them, create the cluster link directories below
    ``downloads/<keyword>/imagecluster/clusters`` and plot the clusters.

    Args:
        keyword: Search term handed to the crawler. It is also used as the
            name of the sub-directory of ``downloads/`` that is read back
            (assumes the crawler stores its downloads there).

    Raises:
        SystemExit: With status 0 if the search/download step is interrupted
            from the keyboard, with status 1 if it fails with any other
            error. The crawler is stopped first in both cases.
    """
    directory = 'downloads/%s' % keyword

    print('Starting crawler')
    searcher = crawler()
    try:
        print('Searching for %s' % keyword)
        searcher.search(keyword)
        print('Downloading')
        # The return value is not needed: the images are read back from
        # `directory` below.
        searcher.download(32)
    except KeyboardInterrupt:
        # A user abort is a clean shutdown, not a failure.
        searcher.stop()
        sys.exit(0)
    except Exception as err:
        # Report the failure and exit non-zero instead of silently
        # pretending that the run succeeded.
        searcher.stop()
        sys.stderr.write('Crawler failed for %s: %s\n' % (keyword, err))
        sys.exit(1)

    # NOTE: a step that converted non-JPEG downloads to JPEG used to run here
    # and is currently disabled.

    images = icio.read_images(directory, size=(224, 224))

    # Create Keras NN model.
    model = calc.get_model()

    # Feed images through the model and extract fingerprints (feature
    # vectors).
    print('Feeding images to the neural network to extract features')
    fingerprints = calc.fingerprints(images, model)

    # Run a PCA on the fingerprints to compress the dimensions. Use a
    # cumulative explained variance ratio of 0.95.
    fingerprints = calc.pca(fingerprints, n_components=0.95)

    # Run clustering on the fingerprints with similarity index sim=0.5.
    clusters = calc.cluster(fingerprints, sim=0.5)

    # Create dirs with links to images. Dirs represent the clusters the
    # images belong to.
    postproc.make_links(clusters, directory + '/imagecluster/clusters')

    # Plot images arranged in clusters. The returned figure is neither saved
    # nor shown here.
    postproc.plot_clusters(clusters, images)
def test_low_level_api_and_clustering():
    """Run the low-level API call by call and check the resulting clusters.

    Uses the low-level functions (same as ``get_image_data``) one at a time:
    ``read_images``, ``get_model``, ``fingerprints``, ``pca`` and ``cluster``.
    The raw fingerprints must be 4096-element ``np.ndarray`` objects, the
    fingerprint keys must match the context's image file names, and the
    clusters must match the reference clusters provided by ``ImagedirCtx``.
    """
    with ImagedirCtx() as ctx:
        images = icio.read_images(ctx.imagedir, size=(224, 224))
        model = ic.get_model()
        fingerprints = ic.fingerprints(images, model)
        for vv in fingerprints.values():
            assert isinstance(vv, np.ndarray)
            assert len(vv) == 4096, len(vv)

        fingerprints = ic.pca(fingerprints, n_components=0.95)
        clusters = ic.cluster(fingerprints, sim=0.5)

        assert set(clusters.keys()) == set(ctx.clusters.keys())
        assert len(fingerprints.keys()) == len(ctx.image_fns)
        assert set(fingerprints.keys()) == set(ctx.image_fns)

        for nimg in ctx.clusters.keys():
            val_group = clusters[nimg]
            ref_group = ctx.clusters[nimg]
            # zip() stops at the shorter input, so check the lengths first;
            # otherwise a missing or extra cluster would go unnoticed.
            assert len(val_group) == len(ref_group), (
                f"nimg: {nimg}, ref: {len(ref_group)} clusters, "
                f"val: {len(val_group)} clusters"
            )
            for val_clus, ref_clus in zip(val_group, ref_group):
                msg = f"ref_clus: {ref_clus}, val_clus: {val_clus}"
                assert set(ref_clus) == set(val_clus), msg
#!/usr/bin/env python3

# Step-by-step API walkthrough. Instead of calling get_image_data(), this
# script runs the functions it calls internally -- read_images(),
# get_model(), fingerprints(), pca(), read_timestamps() -- one by one, and
# shows more options such as time distance scaling.

from imagecluster import calc, io as icio, postproc

# The high-level call, kept for reference:
##images,fingerprints,timestamps = icio.get_image_data(
##    'pics/',
##    pca_kwds=dict(n_components=0.95),
##    img_kwds=dict(size=(224,224)))

pics_dir = 'pics/'
img_size = (224, 224)

# Build the in-memory image database; this helps to feed images to the NN
# model quickly.
images = icio.read_images(pics_dir, size=img_size)

# Keras NN model used for the feature extraction.
model = calc.get_model()

# Fingerprints (feature vectors): the result of feeding the images through
# the model.
fingerprints = calc.fingerprints(images, model)

# Optional PCA step to compress the fingerprint dimensions, with a cumulative
# explained variance ratio of 0.95.
fingerprints = calc.pca(fingerprints, n_components=0.95)

# Image timestamps; needed to calculate the time distance, which can be used
# in clustering.
timestamps = icio.read_timestamps(pics_dir)
#!/usr/bin/python3

from imagecluster import calc, io as icio, postproc
from ecosia_images import crawler

# Search for pictures with the crawler and request a download of 50 of them.
searcher = crawler(naming='hash')
searcher.search('chilaquiles')
searcher.download(50)

# Directory the images and their timestamps are read from.
image_dir = 'downloads/chilaquiles'
img_size = (224, 224)

images = icio.read_images(image_dir, size=img_size)

# Keras NN model used for the feature extraction.
model = calc.get_model()

# Fingerprints (feature vectors): the result of feeding the images through
# the model.
fingerprints = calc.fingerprints(images, model)
print(fingerprints)

# Optional PCA step to compress the fingerprint dimensions, with a cumulative
# explained variance ratio of 0.95.
fingerprints = calc.pca(fingerprints, n_components=0.95)
print(fingerprints)

# Image timestamps; needed to calculate the time distance, which can be used
# in clustering.
timestamps = icio.read_timestamps(image_dir)

# Run clustering on the fingerprints. Select clusters with similarity index
# sim=0.5. Mix 80% content distance with 20% timestamp distance (alpha=0.2).