def get_bags_of_sifts(image_paths):
    """Build a normalized bag-of-SIFT histogram for every image.

    The visual vocabulary is unpickled from 'vocab.pkl' in the current
    working directory. Each image is densely described with dsift, every
    descriptor is assigned to its nearest vocabulary entry (Euclidean
    distance), and the per-entry counts are normalized to sum to 1.

    Args:
        image_paths: iterable of image file paths.

    Returns:
        A NumPy array with one row per image and one column per
        vocabulary entry.
    """
    with open('vocab.pkl', 'rb') as handle:
        vocab = pickle.load(handle)
    vocab_size = len(vocab)

    image_feats = []
    start_time = time()
    print("Construct bags of sifts...")
    for path in image_paths:
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[1, 1], fast=True)
        dist = distance.cdist(vocab, descriptors, metric='euclidean')
        idx = np.argmin(dist, axis=0)
        # bincount guarantees that column k counts visual word k.
        # np.histogram with an integer `bins` spreads the bins over
        # [idx.min(), idx.max()] and misaligns them whenever the first or
        # last word is unused.
        hist = np.bincount(idx, minlength=vocab_size).astype('float64')
        total = hist.sum()
        if total > 0:  # leave an all-zero row instead of producing NaNs
            hist /= total
        image_feats.append(hist)
    image_feats = np.asarray(image_feats)
    end_time = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_time - start_time), " to construct bags of sifts.")
    return image_feats
def build_vocabulary(image_paths, vocab_size, step):
    """Cluster dense SIFT descriptors into a visual-word vocabulary.

    Every image in ``image_paths`` is loaded, described with
    ``dsift(img, step, fast=True)``, and the descriptors of all images are
    stacked and clustered with kmeans (PLUSPLUS initialization). A larger
    ``step`` yields fewer descriptors per image, which saves memory and
    speeds up clustering; only a representative sample is needed here.

    Args:
        image_paths: iterable of image file paths.
        vocab_size: number of clusters (visual words) requested.
        step: sampling step forwarded to dsift.

    Returns:
        The cluster centers returned by kmeans.
    """
    bag_of_features = []
    print("Extract SIFT features")
    for path in image_paths:
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step, fast=True)
        bag_of_features.append(descriptors)
    bag_of_features = np.concatenate(bag_of_features, axis=0).astype('float32')

    print("Compute vocab")
    start_t = time()
    vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS")
    end_t = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_t - start_t), " to compute vocab.")
    return vocab
def test_dsift_sift(window_size):
    """Check that dsift descriptors match sift evaluated on the same frames.

    The image is pre-smoothed for dsift, the dsift frames are extended with
    a constant scale column and a zero column, and sift is run on those
    frames. The relative spread of the descriptor difference must stay
    below 10%.
    """
    bin_size = 4
    magnif = 3
    scale = bin_size / magnif

    smoothed = gaussian(img, sigma=sqrt(scale**2 - 0.25))
    dense_frames, dense_desc = dsift(
        smoothed,
        size=bin_size,
        step=10,
        window_size=window_size,
        float_descriptors=True,
    )

    n_frames = dense_frames.shape[0]
    scale_col = np.ones(shape=(n_frames, )) * scale
    zero_col = np.zeros(shape=(n_frames, ))
    sift_frames = np.column_stack([dense_frames, scale_col, zero_col])

    _, sift_desc = sift(
        img,
        magnification=magnif,
        frames=sift_frames,
        first_octave=-1,
        n_levels=5,
        compute_descriptor=True,
        float_descriptors=True,
        window_size=window_size,
    )

    err = np.std(dense_desc - sift_desc) / np.std(dense_desc)
    assert err < 0.1
def image_class(images, features):
    """Compute a normalized bag-of-SIFT histogram for every image.

    Args:
        images: mapping whose values are iterables of images accepted by
            dsift. The keys are not used.
        features: visual vocabulary, one row per visual word.

    Returns:
        A NumPy array with one row per image (in iteration order) and one
        column per vocabulary entry. Rows are normalized to sum to 1; a row
        is all zeros when dsift returned None or no descriptor was counted.
    """
    vocab_size = len(features)
    image_feats = []
    print("Construct bags of sifts...")
    for _, value in tqdm(images.items()):
        for img in value:
            _, descriptors = dsift(img, step=[5, 5], fast=True)
            if descriptors is None:
                print("NONE")
                image_feats.append(np.zeros(vocab_size))
                continue
            dist = distance.cdist(features, descriptors, metric='euclidean')
            idx = np.argmin(dist, axis=0)
            # bincount keeps column k == visual word k. np.histogram with
            # an integer `bins` spans only [idx.min(), idx.max()] and
            # shifts the bins when the first or last word is unused.
            hist = np.bincount(idx, minlength=vocab_size).astype('float64')
            total = hist.sum()
            if total > 0:  # avoid NaNs for an empty descriptor set
                hist /= total
            image_feats.append(hist)
    return np.asarray(image_feats)
def test_dsift_steps():
    """dsift on a copy of half_img with step=[3, 4]: frame count and first frames."""
    # Step 3 in Y-Direction, 4 in X-Direction
    image = half_img.copy()
    frames, _ = dsift(image, step=[3, 4])

    expected_head = [[4.5, 4.5], [4.5, 8.5], [4.5, 12.5]]
    assert frames.shape[0] == 10416
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
def test_dsift_fast():
    """dsift(fast=True) on a copy of half_img: frame count, first frames, descriptor tail."""
    image = half_img.copy()
    frames, descriptors = dsift(image, fast=True)

    expected_head = [[4.5, 4.5], [4.5, 5.5], [4.5, 6.5]]
    expected_tail = [61, 45, 60]
    assert frames.shape[0] == 124241
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    assert_allclose(descriptors[0, -3:], expected_tail, rtol=1e-3)
def test_dsift_windowsize():
    """dsift(window_size=3) on a copy of half_img: frame count, first frames, descriptor tail."""
    image = half_img.copy()
    frames, descriptors = dsift(image, window_size=3)

    expected_head = [[4.5, 4.5], [4.5, 5.5], [4.5, 6.5]]
    expected_tail = [74, 55, 71]
    assert frames.shape[0] == 124241
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    assert_allclose(descriptors[0, -3:], expected_tail, rtol=1e-3)
def test_dsift_windowsize():
    """dsift(window_size=3) on img: frame count, first frames, first ten descriptor values."""
    frames, descriptors = dsift(img, window_size=3)

    expected_head = [[4.5, 4.5], [4.5, 5.5], [4.5, 6.5]]
    expected_desc = [99, 0, 0, 0, 0, 0, 157, 24, 52, 0]
    assert frames.shape[0] == 253009
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    assert_allclose(descriptors[0, :10], expected_desc, rtol=1e-3)
def test_dsift_steps():
    """dsift on img with step=[3, 4]: frame count, first frames, first ten descriptor values."""
    # Step 3 in Y-Direction, 4 in X-Direction
    frames, descriptors = dsift(img, step=[3, 4])

    expected_head = [[4.5, 4.5], [4.5, 8.5], [4.5, 12.5]]
    expected_desc = [99, 0, 0, 0, 0, 0, 150, 24, 56, 0]
    assert frames.shape[0] == 21168
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    # The descriptor comparison uses assert_allclose's default tolerance.
    assert_allclose(descriptors[0, :10], expected_desc)
def get_bags_of_sifts(image_paths):
    '''
    Build a normalized bag-of-SIFT histogram for every image.

    The vocabulary is read from 'vocab.pkl' (one visual word per row); it
    is stored on disk rather than passed in so it is not recomputed on every
    run. Each image is described with dense SIFT, every descriptor is
    assigned to its nearest visual word, and the counts are normalized so
    that images yielding different numbers of descriptors stay comparable.

    Input :
        image_paths : a list(N) of training images
    Output :
        image_feats : (N, d) feature, each row represent a feature of an
                      image; d is the number of visual words
    '''
    with open('vocab.pkl', 'rb') as handle:
        vocab = pickle.load(handle)
    vocab_size = len(vocab)

    image_feats = []
    start_time = time()
    print("Construct bags of sifts...")
    for path in image_paths:
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[1, 1], fast=True)
        dist = distance.cdist(vocab, descriptors, metric='euclidean')
        idx = np.argmin(dist, axis=0)
        # bincount keeps column k == visual word k. np.histogram with an
        # integer `bins` covers only [idx.min(), idx.max()] and misaligns
        # the bins whenever the first or last word is unused.
        hist = np.bincount(idx, minlength=vocab_size).astype('float64')
        total = hist.sum()
        if total > 0:  # leave an all-zero row instead of producing NaNs
            hist /= total
        image_feats.append(hist)
    image_feats = np.asarray(image_feats)
    end_time = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_time - start_time), " to construct bags of sifts.")
    return image_feats
def test_dsift_norm():
    """dsift(norm=True) on a copy of half_img: frames gain a third column."""
    image = half_img.copy()
    frames, descriptors = dsift(image, norm=True)

    n_frames, n_cols = frames.shape[0], frames.shape[-1]
    expected_head = [
        [4.5, 4.5, 1.6537],
        [4.5, 5.5, 1.7556],
        [4.5, 6.5, 1.8581],
    ]
    assert n_cols == 3
    assert n_frames == 124241
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    assert_allclose(descriptors[0, -3:], [65, 48, 62], rtol=1e-3)
def test_dsift_norm():
    """dsift(norm=True) on img: frames gain a third column.

    Checks the frame count, the first three frames and the first ten values
    of the first descriptor.
    """
    frames, descriptors = dsift(img, norm=True)
    assert frames.shape[-1] == 3
    assert frames.shape[0] == 253009
    # The leftover debug print of the whole frames array was removed; a
    # failing assert_allclose already reports the mismatching values.
    assert_allclose(
        frames[:3],
        [[4.5, 4.5, 0.2953], [4.5, 5.5, 0.2471], [4.5, 6.5, 0.2115]],
        rtol=1e-3)
    assert_allclose(descriptors[0, :10],
                    [99, 0, 0, 0, 0, 0, 150, 24, 56, 0],
                    rtol=1e-3)
def get_bags_of_sifts(image_paths):
    '''
    Build an L1-normalized bag-of-SIFT histogram for every image.

    The vocabulary (one visual word per row) is unpickled from
    'vocab_400.pkl'. Each image is described with dsift (step 5, fast
    mode), every descriptor is assigned to its nearest visual word, and the
    per-word counts are divided by their L1 norm. A histogram whose norm is
    zero is stored unchanged.

    Input :
        image_paths : a list(N) of training images
    Output :
        image_feats : (N, d) feature, each row represent a feature of an image
    '''
    with open('vocab_400.pkl', 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    n_words = vocab.shape[0]

    # One unit-wide bin per visual-word index: edges 0, 1, ..., n_words.
    bin_edges = range(n_words + 1)
    image_feats = np.zeros([len(image_paths), n_words])

    for row, path in enumerate(image_paths):
        pixels = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(pixels, step=[5, 5], fast=True)
        word_dist = distance.cdist(vocab, descriptors, 'euclidean')
        nearest_word = np.argmin(word_dist, axis=0)
        counts, _ = np.histogram(nearest_word, bins=bin_edges)
        l1_norm = np.linalg.norm(counts, ord=1, axis=0)
        image_feats[row, :] = counts if l1_norm == 0 else counts / l1_norm

    return image_feats
def create_sift_discription(train_data):
    """Extract dense SIFT descriptors for every sample in ``train_data``.

    Args:
        train_data: mapping from a key to a sequence of samples; element 1
            of each sample is handed to dsift as the image.

    Returns:
        A tuple ``(description_bag, sift_data_dic)``: a flat list holding
        every descriptor row in visiting order, and a dict mapping each key
        to the list of per-sample descriptor arrays.
    """
    description_bag = []
    sift_data_dic = {}
    for key, samples in train_data.items():
        per_sample = []
        for sample in samples:
            _, des = dsift(sample[1], step=[5, 5], fast=True)
            description_bag.extend(des)
            per_sample.append(des)
        sift_data_dic[key] = per_sample
    return description_bag, sift_data_dic
def get_bags_of_sifts(image_paths):
    '''
    Build a normalized bag-of-SIFT histogram for every image.

    The vocabulary (one visual word per row) is read from 'vocab.pkl'.
    Each image is described with dsift (step 5, fast mode), every
    descriptor is assigned to its nearest visual word, and the per-word
    counts are normalized to sum to 1 so that images yielding different
    numbers of descriptors stay comparable.

    Input :
        image_paths : a list(N) of training images
    Output :
        image_feats : (N, d) feature, each row represent a feature of an image
    '''
    # Use a context manager so the vocabulary file is closed after loading.
    with open('vocab.pkl', 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    vocab_size = vocab.shape[0]

    image_feats = []
    for image_path in image_paths:
        img = np.asarray(Image.open(image_path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        distance_matrix = distance.cdist(descriptors, vocab, 'euclidean')
        feature_idx = np.argmin(distance_matrix, axis=1)
        # bincount counts every word index directly, replacing the
        # unique/dict/fill round trip.
        histogram = np.bincount(feature_idx, minlength=vocab_size).astype('float64')
        total = histogram.sum()
        if total > 0:  # keep an all-zero row instead of producing NaNs
            histogram /= total
        image_feats.append(histogram)
        print(image_path)
    return np.asarray(image_feats)
def test_dsift_slow_fast():
    """Check that dsift's fast mode stays close to the default mode.

    Both variants run on the same pre-smoothed image with identical
    settings; the relative spread of the descriptor difference must stay
    below 10%.
    """
    bin_size = 4    # bin size in pixels
    magnif = 3      # bin size / keypoint scale
    scale = bin_size / magnif
    window_size = 5

    smoothed = gaussian(img, sigma=sqrt(scale**2 - 0.25))
    shared_args = dict(
        size=bin_size,
        step=10,
        window_size=window_size,
        float_descriptors=True,
    )
    _, reference = dsift(smoothed, **shared_args)
    _, approximate = dsift(smoothed, fast=True, **shared_args)

    err = np.std(approximate - reference) / np.std(reference)
    assert err < 0.1
def build_vocabulary(image_paths, vocab_size):
    """Cluster dense SIFT descriptors of the given images into a vocabulary.

    Args:
        image_paths: iterable of image file paths.
        vocab_size: number of clusters requested from kmeans.

    Returns:
        The cluster centers returned by kmeans (PLUSPLUS initialization).
    """
    print("SIFT features extracting")
    per_image = []
    for image_path in image_paths:
        pixels = np.asarray(Image.open(image_path), dtype='float32')
        per_image.append(dsift(pixels, step=[5, 5], fast=True)[1])

    stacked = np.concatenate(per_image, axis=0).astype('float32')
    print("Computing vocabulary")
    return kmeans(stacked, vocab_size, initialization="PLUSPLUS")
def get_bags_of_sifts(image_paths):
    '''
    Build a standardized bag-of-SIFT histogram for every image.

    The vocabulary (one visual word per row) is read from 'vocab.pkl'.
    Each image is described with dsift (step 5, fast mode) and every
    descriptor is assigned to its nearest visual word. The per-word counts
    are then standardized: the mean is subtracted and the result is divided
    by the standard deviation. When all counts are equal (standard
    deviation 0) the row is left as zeros.

    Input :
        image_paths : a list(N) of training images
    Output :
        image_feats : (N, d) feature, each row represent a feature of an image
    '''
    with open('vocab.pkl', 'rb') as voc:
        vocal_feats = pickle.load(voc)
    vocab_size = vocal_feats.shape[0]

    image_feats = np.zeros((len(image_paths), vocab_size))
    for i, path in enumerate(image_paths):
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        L2_dis = distance.cdist(vocal_feats, descriptors, 'euclidean')
        # One vectorized argmin over all descriptors replaces the
        # per-descriptor Python loop.
        nearest = np.argmin(L2_dis, axis=0)
        record = np.bincount(nearest, minlength=vocab_size).astype('float64')
        deviate = np.std(record)
        if deviate > 0:  # guard against 0/0 -> NaN for flat histograms
            image_feats[i, :] = (record - np.mean(record)) / deviate
    return image_feats
def build_vocabulary(image_paths, vocab_size):
    """Cluster dense SIFT descriptors of the given images into a vocabulary.

    Each path is printed as it is processed. Descriptors come from
    ``dsift(img, step=[5, 5], fast=True)`` and are clustered with kmeans
    (PLUSPLUS initialization).

    Args:
        image_paths: iterable of image file paths.
        vocab_size: number of clusters requested from kmeans.

    Returns:
        The cluster centers returned by kmeans.
    """
    bag_of_features = []
    print("Extract SIFT features")
    for path in image_paths:
        print(path)
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        bag_of_features.append(descriptors)
    bag_of_features = np.concatenate(bag_of_features, axis=0).astype('float32')

    print("Compute vocab")
    start_time = time()
    vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS")
    end_time = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_time - start_time), " to compute vocab.")
    return vocab
def get_bags_of_sifts(image_paths):
    """Build an L2-normalized bag-of-SIFT histogram for every image.

    The vocabulary is read from 'vocab.pkl'. Each image is described with
    dsift (step 9, fast mode), every descriptor is assigned to its nearest
    vocabulary entry, and the per-entry counts are divided by their
    Euclidean norm. A histogram whose norm is zero is stored unchanged.

    Args:
        image_paths: sequence of image file paths.

    Returns:
        A NumPy array of shape (len(image_paths), len(vocabulary)).
    """
    with open('vocab.pkl', 'rb') as vocab:
        vocabulary = pickle.load(vocab)
    vocab_size = len(vocabulary)

    image_feature = np.zeros((len(image_paths), vocab_size))
    for i, path in enumerate(image_paths):
        image = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(image, step=[9, 9], fast=True)
        dist = distance.cdist(vocabulary, descriptors, 'euclidean')
        mdist = np.argmin(dist, axis=0)
        histo, _ = np.histogram(mdist, range(vocab_size + 1))
        # Compute the norm once. The former third branch was unreachable
        # because the norm of integer counts is either zero or non-zero.
        norm = np.linalg.norm(histo)
        image_feature[i, :] = histo / norm if norm != 0 else histo
    return image_feature
def sift_features(images, size):
    """Cluster dense SIFT descriptors of all images into ``size`` centers.

    Args:
        images: mapping whose values are iterables of images accepted by
            dsift. The keys are not used.
        size: number of clusters requested from kmeans.

    Returns:
        The result of kmeans (PLUSPLUS initialization) on all collected
        descriptors cast to float32.
    """
    print("feature number", size)
    collected = []
    print("Extract SIFT features...")
    for _, group in tqdm(images.items()):
        for img in group:
            _, descriptors = dsift(img, step=[5, 5], fast=True)
            if descriptors is None:
                continue
            # Add each descriptor row as its own entry.
            collected.extend(descriptors)

    print("Compute kmeans in dimensions:", size)
    samples = np.array(collected).astype('float32')
    return kmeans(samples, size, initialization="PLUSPLUS")
def get_bags_of_sifts(image_paths):
    """Build a normalized bag-of-SIFT histogram for every image.

    The vocabulary (one visual word per row) is read from 'vocab.pkl'.
    Each image is described with dsift (step 5, fast mode), every
    descriptor is assigned to its nearest visual word, and the per-word
    counts are normalized to sum to 1. Each path is printed once its
    histogram has been computed.

    Args:
        image_paths: iterable of image file paths.

    Returns:
        A NumPy array with one row per image and one column per visual word.
    """
    # Use a context manager so the vocabulary file is closed after loading.
    with open('vocab.pkl', 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    vocab_size = vocab.shape[0]

    image_feats = []
    for image_path in image_paths:
        img = np.asarray(Image.open(image_path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        distance_matrix = distance.cdist(descriptors, vocab, 'euclidean')
        feature_idx = np.argmin(distance_matrix, axis=1)
        # bincount counts every word index directly, replacing the
        # unique/dict/fill round trip.
        histogram = np.bincount(feature_idx, minlength=vocab_size).astype('float64')
        total = histogram.sum()
        if total > 0:  # keep an all-zero row instead of producing NaNs
            histogram /= total
        image_feats.append(histogram)
        print(image_path)
    return np.asarray(image_feats)
def test_dsift_float_descriptors():
    """dsift(float_descriptors=True) must return float32 descriptors."""
    result = dsift(img, float_descriptors=True)
    descriptors = result[1]
    assert descriptors.dtype == np.float32
def test_dsift_float_descriptors():
    """dsift(float_descriptors=True) on a copy of img must return float32 descriptors."""
    image = img.copy()
    _, descriptors = dsift(image, float_descriptors=True)
    assert descriptors.dtype == np.float32
def test_dsift_non_float_descriptors():
    """dsift(float_descriptors=False) on a copy of img must return uint8 descriptors."""
    image = img.copy()
    _, descriptors = dsift(image, float_descriptors=False)
    assert descriptors.dtype == np.uint8
def build_vocabulary(image_paths, vocab_size):
    '''
    Sample dense SIFT descriptors from the training images, cluster them
    with kmeans, and return the cluster centers.

    NOTE: the original assignment text described a HOG + sklearn KMeans
    pipeline; this implementation uses dsift descriptors and kmeans with
    PLUSPLUS initialization instead.

    Inputs:
        image_paths: a Python list of image path strings
        vocab_size: an integer indicating the number of words desired for
                    the bag of words vocab set

    Outputs:
        The cluster centers returned by kmeans. Clustering reduces the full
        set of descriptors to a much smaller set of vocab_size
        representative points, which keeps matching efficient.
    '''
    bag_of_features = []
    print("Extract SIFT features")
    for path in tqdm(image_paths, desc='build_vocabulary'):
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        bag_of_features.append(descriptors)
    bag_of_features = np.concatenate(bag_of_features, axis=0).astype('float32')

    print("Compute vocab")
    start_time = time()
    vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS")
    end_time = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_time - start_time), " to compute vocab.")
    return vocab
def get_bags_of_words(image_paths):
    '''
    Compute a bag-of-words histogram for each image and return the
    histograms in an array.

    NOTE: the original assignment text described HOG features; this
    implementation uses dense SIFT (dsift, step 9, fast mode) and the
    vocabulary pickled in 'vocab.pkl'.

    For each descriptor the closest vocab word (Euclidean distance) is
    found and 1 is added to the histogram bin of that word. Each histogram
    is then divided by its Euclidean norm; a histogram whose norm is zero is
    stored unchanged.

    Inputs:
        image_paths: A Python list of strings, where each string is a
                     complete path to one image on the disk.

    Outputs:
        An nxd numpy matrix, where n is the number of images in image_paths
        and d is the number of vocabulary words.
    '''
    with open('vocab.pkl', 'rb') as v:
        vocab = pickle.load(v)
    vocab_size = len(vocab)

    image_feats = np.zeros((len(image_paths), vocab_size))
    # Wrap the list rather than the enumerate object so tqdm can read its
    # length and display a total.
    for i, path in enumerate(tqdm(image_paths, desc='get_bags_of_words')):
        image = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(image, step=[9, 9], fast=True)
        dist = distance.cdist(vocab, descriptors, 'euclidean')
        mdist = np.argmin(dist, axis=0)
        histo, _ = np.histogram(mdist, range(vocab_size + 1))
        norm = np.linalg.norm(histo)  # computed once per image
        image_feats[i, :] = histo if norm == 0 else histo / norm
    return image_feats
def build_vocabulary(image_paths, vocab_size):
    """
    Sample SIFT descriptors from the training images, cluster them with
    kmeans, and return the cluster centers.

    Each image is loaded in grayscale with load_image_gray and described
    with ``dsift(img, step=[10, 10], fast=True)``. Only a representative
    sample of descriptors is needed here, so the step can be coarser than
    the one used when building the per-image histograms. The frames
    returned by dsift are discarded.

    Args:
    -   image_paths: list of image paths.
    -   vocab_size: size of vocabulary

    Returns:
    -   vocab: the cluster centers returned by kmeans (PLUSPLUS
        initialization). Each center is a visual word.
    """
    # The former `dim` constant and zero-filled `vocab` placeholder were
    # never read (kmeans overwrote `vocab`), so they are gone.
    bag_of_features = []
    for path in image_paths:
        img = np.asarray(load_image_gray(path), dtype='float32')
        _, descriptors = dsift(img, step=[10, 10], fast=True)
        bag_of_features.append(descriptors)
    # Stack the per-image descriptor arrays into one array for clustering.
    bag_of_features = np.concatenate(bag_of_features, axis=0).astype('float32')

    print("Compute vocab")
    start_time = time.time()
    vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS")
    end_time = time.time()
    print("It takes ", (end_time - start_time), " to compute vocab.")
    return vocab
def get_bags_of_sifts(image_paths, vocab_filename):
    """
    Build a normalized bag-of-SIFT histogram for every image.

    SIFT features are extracted with dsift (step 5, fast mode) from the
    grayscale image, each descriptor is assigned to its nearest cluster
    center, and a histogram of how many times each cluster was used is
    built. The histogram is normalized to sum to 1 so that a larger image
    with more SIFT features does not look very different from a smaller
    version of the same image.

    Args:
    -   image_paths: paths to N images
    -   vocab_filename: path to the pickled vocabulary, an array in which
        each row is a kmeans centroid / visual word. It is stored on disk
        rather than passed in to avoid recomputing it every run.

    Returns:
    -   image_feats: N x d matrix, where d equals the number of clusters
        (entries in each image's histogram).
    """
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)

    feats = []
    start_time = time.time()
    print("Construct bags of sifts...")
    for path in image_paths:
        img = np.asarray(load_image_gray(path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        dist = distance.cdist(descriptors, vocab, metric='euclidean')
        # argmin finds the nearest center without fully sorting every row.
        closest_vocab = np.argmin(dist, axis=1)
        histogram = np.bincount(closest_vocab, minlength=vocab_size).astype('float64')
        # Sum once instead of once per bin, and skip the division for an
        # empty histogram so it stays all-zero rather than NaN.
        total = histogram.sum()
        if total > 0:
            histogram /= total
        feats.append(histogram)
    feats = np.asarray(feats)
    end_time = time.time()
    print("It takes ", (end_time - start_time), " to construct bags of sifts.")
    return feats
# or equivalently the number of entries in each image's histogram. # # You will want to construct SIFT features here in the same way you # # did in build_vocabulary.m (except for possibly changing the sampling # # rate) and then assign each local feature to its nearest cluster center # # and build a histogram indicating how many times each cluster was used. # # Don't forget to normalize the histogram, or else a larger image with more# # SIFT features will look very different from a smaller version of the same# # image. # ############################################################################ with open('vocab.pkl', 'rb') as f: vocab = pickle.load(f) vocab_size = len(vocab) len_img = len(image_paths) image_feats = np.zeros((len_img, vocab_size)) for idx, path in enumerate(image_paths): img = np.asarray(Image.open(path) , dtype='float32') frames, descriptors = dsift(img, step = step, fast=True) d = distance.cdist(vocab, descriptors, 'euclidean') nn_dist = np.argmin(d, axis=0) h, bins = np.histogram(nn_dist, bins=range(0,vocab_size+1)) norm = np.linalg.norm(h, ord=1) if norm==0: image_feats[idx,:] = h else: image_feats[idx,:] = h/norm ############################################################################# # END OF YOUR CODE # ############################################################################# return image_feats