Example #1
0
def get_bags_of_sifts(image_paths):
    """Build a normalized bag-of-visual-words histogram for every image.

    The vocabulary is unpickled from ``vocab.pkl`` in the current working
    directory; each of its ``len(vocab)`` rows is treated as one visual word.

    Args:
        image_paths: iterable of image paths readable by ``Image.open``.

    Returns:
        ``np.asarray`` of the per-image histograms, one row per path. Each
        row has ``len(vocab)`` entries that sum to 1 (or are all zero when
        the image produced no descriptors).
    """
    with open('vocab.pkl', 'rb') as handle:
        vocab = pickle.load(handle)
    vocab_size = len(vocab)

    image_feats = []

    start_time = time()
    print("Construct bags of sifts...")

    for path in image_paths:
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[1, 1], fast=True)
        # dist[k, j]: distance from visual word k to descriptor j.
        dist = distance.cdist(vocab, descriptors, metric='euclidean')
        nearest_word = np.argmin(dist, axis=0)
        # bincount guarantees that bin k counts word k.  np.histogram with an
        # integer ``bins`` stretches the bins over [idx.min(), idx.max()],
        # which misaligns them whenever word 0 or the last word is unused.
        counts = np.bincount(nearest_word,
                             minlength=vocab_size).astype('float64')
        total = counts.sum()
        if total > 0:  # avoid NaN rows for images without descriptors
            counts /= total

        image_feats.append(counts)

    image_feats = np.asarray(image_feats)

    end_time = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_time - start_time), " to construct bags of sifts.")

    return image_feats
Example #2
0
def build_vocabulary(image_paths, vocab_size, step):
    """Cluster dense SIFT descriptors of the given images into a vocabulary.

    Every image is loaded as a float32 array, dense SIFT descriptors are
    extracted with ``dsift(..., fast=True)`` and all descriptors are pooled
    and clustered with ``kmeans`` (k-means++ initialization). The resulting
    cluster centers are the visual-word vocabulary.

    Args:
        image_paths: iterable of image paths readable by ``Image.open``.
        vocab_size: number of clusters (visual words) requested from kmeans.
        step: sampling step forwarded to ``dsift``. A larger step yields
            fewer descriptors, which is usually enough for a representative
            sample and keeps clustering fast.

    Returns:
        Whatever ``kmeans`` returns for the pooled descriptors (the cluster
        centers).

    Raises:
        ValueError: if ``image_paths`` yields no images.
    """
    bag_of_features = []
    print("Extract SIFT features")
    for path in image_paths:
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step, fast=True)
        bag_of_features.append(descriptors)

    if not bag_of_features:
        # np.concatenate would raise the same exception type, but with a
        # message that does not point at the actual cause.
        raise ValueError("build_vocabulary needs at least one image path")

    bag_of_features = np.concatenate(bag_of_features, axis=0).astype('float32')
    print("Compute vocab")
    start_t = time()
    vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS")
    end_t = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_t - start_t), " to compute vocab.")
    return vocab
Example #3
0
def test_dsift_sift(window_size):
    """Dense SIFT descriptors should agree with ``sift`` at the same frames.

    ``dsift`` runs on a pre-smoothed copy of the module-level ``img``; its
    frames are then extended with two extra columns and handed to ``sift``
    on the unsmoothed image. The normalized standard deviation of the
    descriptor difference must stay below 10%.
    """
    spatial_bin = 4
    magnification = 3
    keypoint_scale = spatial_bin / magnification

    smoothing_sigma = sqrt(keypoint_scale**2 - 0.25)
    smoothed = gaussian(img, sigma=smoothing_sigma)
    dense_frames, dense_desc = dsift(smoothed,
                                     size=spatial_bin,
                                     step=10,
                                     window_size=window_size,
                                     float_descriptors=True)

    count = dense_frames.shape[0]
    # Presumably the scale and orientation columns expected by sift();
    # confirm against the sift() frame layout.
    scale_column = np.ones(shape=(count, )) * keypoint_scale
    zero_column = np.zeros(shape=(count, ))
    sift_frames = np.column_stack([dense_frames, scale_column, zero_column])

    _, sparse_desc = sift(img,
                          magnification=magnification,
                          frames=sift_frames,
                          first_octave=-1,
                          n_levels=5,
                          compute_descriptor=True,
                          float_descriptors=True,
                          window_size=window_size)

    relative_error = np.std(dense_desc - sparse_desc) / np.std(dense_desc)
    assert relative_error < 0.1
Example #4
0
def image_class(images, features):
    """Compute a normalized visual-word histogram for every image.

    Args:
        images: mapping whose values are iterables of images; only the
            values are used, in the mapping's iteration order.
        features: the vocabulary; ``len(features)`` is the histogram length
            and each row is compared against the descriptors with a
            Euclidean ``cdist``.

    Returns:
        ``np.asarray`` of the histograms, one row per image. Rows sum to 1;
        an image without descriptors contributes an all-zero row.
    """
    vocab_size = len(features)
    image_feats = []
    print("Construct bags of sifts...")

    for group in tqdm(images.values()):
        for img in group:
            _, descriptors = dsift(img, step=[5, 5], fast=True)
            if descriptors is None:
                print("NONE")
                image_feats.append(np.zeros(vocab_size))
                continue

            # dist[k, j]: distance from visual word k to descriptor j.
            dist = distance.cdist(features, descriptors, metric='euclidean')
            nearest_word = np.argmin(dist, axis=0)
            # bincount keeps bin k aligned with word k; np.histogram with an
            # integer ``bins`` spans only [idx.min(), idx.max()] and shifts
            # the bins when the first or last word is unused.
            counts = np.bincount(nearest_word,
                                 minlength=vocab_size).astype('float64')
            total = counts.sum()
            if total > 0:  # avoid NaN rows when no descriptor was found
                counts /= total
            image_feats.append(counts)

    return np.asarray(image_feats)
Example #5
0
def test_dsift_steps():
    """Check frame count and first frames for a [3, 4] sampling step."""
    image = half_img.copy()
    # Step 3 in Y-Direction, 4 in X-Direction
    grid_step = [3, 4]
    frames, _ = dsift(image, step=grid_step)

    expected_count = 10416
    expected_head = [[4.5, 4.5], [4.5, 8.5], [4.5, 12.5]]
    assert frames.shape[0] == expected_count
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
Example #6
0
def test_dsift_fast():
    """Check frames and a descriptor tail produced with ``fast=True``."""
    image = half_img.copy()
    frames, descriptors = dsift(image, fast=True)

    expected_head = [[4.5, 4.5], [4.5, 5.5], [4.5, 6.5]]
    expected_tail = [61, 45, 60]
    assert frames.shape[0] == 124241
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    # Last three components of the first descriptor.
    assert_allclose(descriptors[0, -3:], expected_tail, rtol=1e-3)
Example #7
0
def test_dsift_windowsize():
    """Check frames and a descriptor tail produced with ``window_size=3``."""
    image = half_img.copy()
    frames, descriptors = dsift(image, window_size=3)

    expected_head = [[4.5, 4.5], [4.5, 5.5], [4.5, 6.5]]
    expected_tail = [74, 55, 71]
    assert frames.shape[0] == 124241
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    # Last three components of the first descriptor.
    assert_allclose(descriptors[0, -3:], expected_tail, rtol=1e-3)
Example #8
0
def test_dsift_windowsize():
    """Check frames and a descriptor head for ``window_size=3`` on ``img``."""
    frames, descriptors = dsift(img, window_size=3)

    expected_head = [[4.5, 4.5], [4.5, 5.5], [4.5, 6.5]]
    expected_descriptor = [99, 0, 0, 0, 0, 0, 157, 24, 52, 0]
    assert frames.shape[0] == 253009
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    # First ten components of the first descriptor.
    assert_allclose(descriptors[0, :10], expected_descriptor, rtol=1e-3)
Example #9
0
def test_dsift_steps():
    """Check frames and a descriptor head for a [3, 4] step on ``img``."""
    # Step 3 in Y-Direction, 4 in X-Direction
    grid_step = [3, 4]
    frames, descriptors = dsift(img, step=grid_step)

    expected_head = [[4.5, 4.5], [4.5, 8.5], [4.5, 12.5]]
    expected_descriptor = [99, 0, 0, 0, 0, 0, 150, 24, 56, 0]
    assert frames.shape[0] == 21168
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    # Compared with assert_allclose's default tolerance.
    assert_allclose(descriptors[0, :10], expected_descriptor)
Example #10
0
def test_dsift_windowsize():
    """Check frames and a descriptor tail produced with ``window_size=3``."""
    image = half_img.copy()
    frames, descriptors = dsift(image, window_size=3)

    expected_head = [[4.5, 4.5], [4.5, 5.5], [4.5, 6.5]]
    expected_tail = [74, 55, 71]
    assert frames.shape[0] == 124241
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    # Last three components of the first descriptor.
    assert_allclose(descriptors[0, -3:], expected_tail, rtol=1e-3)
Example #11
0
def test_dsift_fast():
    """Check frames and a descriptor tail produced with ``fast=True``."""
    image = half_img.copy()
    frames, descriptors = dsift(image, fast=True)

    expected_head = [[4.5, 4.5], [4.5, 5.5], [4.5, 6.5]]
    expected_tail = [61, 45, 60]
    assert frames.shape[0] == 124241
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    # Last three components of the first descriptor.
    assert_allclose(descriptors[0, -3:], expected_tail, rtol=1e-3)
Example #12
0
def get_bags_of_sifts(image_paths):
    """Build a normalized bag-of-visual-words histogram for every image.

    Assumes ``vocab.pkl`` exists in the current working directory and holds
    the vocabulary: a matrix with one kmeans centroid (visual word) per row.
    It is loaded from disk rather than passed in so the expensive vocabulary
    computation is not repeated.

    For every image, dense SIFT descriptors are extracted, each descriptor
    is assigned to its nearest visual word and the assignments are counted.
    The counts are normalized so that images of different sizes (and hence
    different descriptor counts) remain comparable.

    Input :
        image_paths : a list(N) of training images
    Output :
        image_feats : (N, d) feature, each row represent a feature of an
            image; d is the number of visual words. A row is all zero when
            the image produced no descriptors.
    """
    with open('vocab.pkl', 'rb') as handle:
        vocab = pickle.load(handle)
    vocab_size = len(vocab)

    image_feats = []

    start_time = time()
    print("Construct bags of sifts...")

    for path in image_paths:
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[1, 1], fast=True)
        # dist[k, j]: distance from visual word k to descriptor j.
        dist = distance.cdist(vocab, descriptors, metric='euclidean')
        nearest_word = np.argmin(dist, axis=0)
        # bincount guarantees that bin k counts word k.  np.histogram with an
        # integer ``bins`` stretches the bins over [idx.min(), idx.max()],
        # which misaligns them whenever word 0 or the last word is unused.
        counts = np.bincount(nearest_word,
                             minlength=vocab_size).astype('float64')
        total = counts.sum()
        if total > 0:  # avoid NaN rows for images without descriptors
            counts /= total

        image_feats.append(counts)

    image_feats = np.asarray(image_feats)

    end_time = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_time - start_time), " to construct bags of sifts.")

    return image_feats
Example #13
0
def test_dsift_norm():
    """With ``norm=True`` the frames carry a third column; check its values."""
    image = half_img.copy()
    frames, descriptors = dsift(image, norm=True)

    expected_head = [
        [4.5, 4.5, 1.6537],
        [4.5, 5.5, 1.7556],
        [4.5, 6.5, 1.8581],
    ]
    expected_tail = [65, 48, 62]
    assert frames.shape[-1] == 3
    assert frames.shape[0] == 124241
    assert_allclose(frames[:3], expected_head, rtol=1e-3)
    # Last three components of the first descriptor.
    assert_allclose(descriptors[0, -3:], expected_tail, rtol=1e-3)
Example #14
0
def test_dsift_norm():
    """With ``norm=True`` the frames carry a third column; check its values.

    Runs on the module-level ``img`` and verifies the frame count, the
    first three frames and the first ten components of the first descriptor.
    """
    frames, descriptors = dsift(img, norm=True)

    assert frames.shape[-1] == 3
    assert frames.shape[0] == 253009
    # The leftover debug print of the full frame array was removed; on a
    # failure assert_allclose already reports the mismatching values.
    assert_allclose(
        frames[:3],
        [[4.5, 4.5, 0.2953], [4.5, 5.5, 0.2471], [4.5, 6.5, 0.2115]],
        rtol=1e-3)
    assert_allclose(descriptors[0, :10], [99, 0, 0, 0, 0, 0, 150, 24, 56, 0],
                    rtol=1e-3)
Example #15
0
def get_bags_of_sifts(image_paths):
    """Build an L1-normalized bag-of-visual-words histogram per image.

    The vocabulary is read from ``vocab_400.pkl`` in the current working
    directory; each row is one kmeans centroid (visual word). For every
    image, dense SIFT descriptors (step 5, fast mode) are assigned to their
    nearest visual word and the assignments are counted into one bin per
    word. Normalizing keeps images with different descriptor counts
    comparable.

    Input :
        image_paths : a list(N) of training images
    Output :
        image_feats : (N, d) feature, each row represent a feature of an
            image; d is the number of visual words
    """
    with open('vocab_400.pkl', 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    word_count = vocab.shape[0]

    # One row per image, pre-filled with zeros.
    image_feats = np.zeros([len(image_paths), word_count])
    # Integer edges 0..word_count give exactly one bin per word index.
    word_edges = range(word_count + 1)

    for row, image_path in enumerate(image_paths):
        pixels = np.asarray(Image.open(image_path), dtype='float32')
        _, descriptors = dsift(pixels, step=[5, 5], fast=True)
        # Rows are visual words, columns are descriptors.
        word_distances = distance.cdist(vocab, descriptors, 'euclidean')
        nearest_word = np.argmin(word_distances, axis=0)
        word_hist, _ = np.histogram(nearest_word, bins=word_edges)
        l1_norm = np.linalg.norm(word_hist, ord=1, axis=0)
        # An empty histogram is stored unscaled to avoid dividing by zero.
        image_feats[row, :] = word_hist if l1_norm == 0 else word_hist / l1_norm

    return image_feats
Example #16
0
def create_sift_discription(train_data):
    """Extract dense SIFT descriptors for every sample in ``train_data``.

    Args:
        train_data: mapping from a key to a sequence of samples. Element
            ``[1]`` of each sample is passed to ``dsift`` (presumably the
            image array; confirm against the caller).

    Returns:
        A pair ``(description_bag, sift_data_dic)``: a flat list of every
        descriptor row across all samples, and a dict mapping each key to
        the list of per-sample descriptor arrays.
    """
    all_descriptors = []
    descriptors_by_key = {}

    for label, samples in train_data.items():
        label_descriptors = []
        for sample in samples:
            _, des = dsift(sample[1], step=[5, 5], fast=True)
            label_descriptors.append(des)
            # extend() flattens the array into individual descriptor rows.
            all_descriptors.extend(des)
        descriptors_by_key[label] = label_descriptors

    return all_descriptors, descriptors_by_key
Example #17
0
def get_bags_of_sifts(image_paths):
    """Build a normalized bag-of-visual-words histogram for every image.

    Assumes ``vocab.pkl`` exists in the current working directory and holds
    the vocabulary: a matrix with one kmeans centroid (visual word) per row.
    It is loaded from disk rather than passed in so the expensive vocabulary
    computation is not repeated.

    For every image, dense SIFT descriptors (step 5, fast mode) are assigned
    to their nearest visual word and the assignments are counted. The counts
    are normalized so that images with different descriptor counts remain
    comparable. Each processed path is printed as progress output.

    Input :
        image_paths : a list(N) of training images
    Output :
        image_feats : (N, d) feature, each row represent a feature of an
            image; d is the number of visual words. A row is all zero when
            the image produced no descriptors.
    """
    # The context manager closes the file; a bare open() leaked the handle.
    with open('vocab.pkl', 'rb') as handle:
        vocab = pickle.load(handle)
    vocab_size = vocab.shape[0]

    image_feats = []
    for image_path in image_paths:
        img = np.asarray(Image.open(image_path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        # distance_matrix[j, k]: distance from descriptor j to word k.
        distance_matrix = distance.cdist(descriptors, vocab, 'euclidean')
        feature_idx = np.argmin(distance_matrix, axis=1)

        # bincount replaces the hand-built unique/dict histogram.
        histogram = np.bincount(feature_idx,
                                minlength=vocab_size).astype('float64')
        total = histogram.sum()
        if total > 0:  # avoid NaN rows for images without descriptors
            histogram /= total

        image_feats.append(histogram)
        print(image_path)

    return np.asarray(image_feats)
Example #18
0
def test_dsift_slow_fast():
    """The ``fast=True`` descriptors must stay close to the default ones.

    Both runs use the same pre-smoothed image and parameters; the normalized
    standard deviation of the descriptor difference must stay below 10%.
    """
    # bin size in pixels
    bin_size = 4
    # bin size / keypoint scale
    magnif = 3
    scale = bin_size / magnif

    smoothed = gaussian(img, sigma=sqrt(scale**2 - 0.25))
    # Arguments shared by the reference and the fast run.
    shared_args = dict(size=bin_size,
                       step=10,
                       window_size=5,
                       float_descriptors=True)

    _, reference = dsift(smoothed, **shared_args)
    _, approximate = dsift(smoothed, fast=True, **shared_args)

    relative_error = np.std(approximate - reference) / np.std(reference)
    assert relative_error < 0.1
Example #19
0
def build_vocabulary(image_paths, vocab_size):
    """Cluster dense SIFT descriptors of the given images into a vocabulary.

    Args:
        image_paths: iterable of image paths readable by ``Image.open``.
        vocab_size: number of clusters requested from ``kmeans``.

    Returns:
        The result of ``kmeans`` (k-means++ initialization) on the pooled
        float32 descriptors of all images.
    """
    print("SIFT features extracting")
    # dsift returns (frames, descriptors); only the descriptors are pooled.
    per_image_descriptors = [
        dsift(np.asarray(Image.open(path), dtype='float32'),
              step=[5, 5],
              fast=True)[1]
        for path in image_paths
    ]
    pooled = np.concatenate(per_image_descriptors, axis=0).astype('float32')

    print("Computing vocabulary")
    return kmeans(pooled, vocab_size, initialization="PLUSPLUS")
Example #20
0
def get_bags_of_sifts(image_paths):
    """Build a standardized bag-of-visual-words histogram for every image.

    Assumes ``vocab.pkl`` exists in the current working directory and holds
    the vocabulary: a matrix with one kmeans centroid (visual word) per row.
    It is loaded from disk rather than passed in so the expensive vocabulary
    computation is not repeated.

    For every image, dense SIFT descriptors (step 5, fast mode) are assigned
    to their nearest visual word and the assignments are counted. Unlike a
    sum-to-one normalization, the counts are standardized here: the mean is
    subtracted and the result divided by the standard deviation.

    Input :
        image_paths : a list(N) of training images
    Output :
        image_feats : (N, d) feature, each row represent a feature of an
            image; d is the number of visual words. A row stays all zero
            when its counts have zero standard deviation (for example when
            the image produced no descriptors).
    """
    with open('vocab.pkl', 'rb') as voc:
        vocal_feats = pickle.load(voc)
    vocab_size = vocal_feats.shape[0]

    image_feats = np.zeros((len(image_paths), vocab_size))

    for i, path in enumerate(image_paths):
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        # L2_dis[k, j]: distance from visual word k to descriptor j.
        L2_dis = distance.cdist(vocal_feats, descriptors, 'euclidean')
        # One vectorized argmin + bincount replaces the per-descriptor loop.
        nearest_word = np.argmin(L2_dis, axis=0)
        record = np.bincount(nearest_word,
                             minlength=vocab_size).astype('float64')

        deviate = np.std(record)
        if deviate > 0:  # a constant histogram would divide by zero -> NaN
            image_feats[i, :] = (record - np.mean(record)) / deviate

    return image_feats
Example #21
0
def build_vocabulary(image_paths, vocab_size):
    """Cluster dense SIFT descriptors of the given images into a vocabulary.

    Each path is printed as it is processed. Descriptors are extracted with
    ``dsift(step=[5, 5], fast=True)``, pooled, cast to float32 and clustered
    with ``kmeans`` (k-means++ initialization).

    Args:
        image_paths: iterable of image paths readable by ``Image.open``.
        vocab_size: number of clusters requested from ``kmeans``.

    Returns:
        Whatever ``kmeans`` returns for the pooled descriptors (the cluster
        centers).

    Raises:
        ValueError: if ``image_paths`` yields no images.
    """
    bag_of_features = []
    print("Extract SIFT features")
    for path in image_paths:
        print(path)
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        bag_of_features.append(descriptors)

    if not bag_of_features:
        # np.concatenate would raise the same exception type, but with a
        # message that does not point at the actual cause.
        raise ValueError("build_vocabulary needs at least one image path")

    bag_of_features = np.concatenate(bag_of_features, axis=0).astype('float32')
    print("Compute vocab")
    start_time = time()
    vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS")
    end_time = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_time - start_time), " to compute vocab.")

    return vocab
Example #22
0
def get_bags_of_sifts(image_paths):
    """Build an L2-normalized bag-of-visual-words histogram per image.

    The vocabulary is unpickled from ``vocab.pkl`` in the current working
    directory; each of its ``len(vocabulary)`` entries is one visual word.
    Dense SIFT descriptors (step 9, fast mode) are assigned to their nearest
    word and counted into one bin per word.

    Args:
        image_paths: sequence of image paths readable by ``Image.open``.

    Returns:
        Array of shape ``(len(image_paths), len(vocabulary))``. Each row has
        unit Euclidean norm, or is all zero when the image produced no
        descriptors.
    """
    with open('vocab.pkl', 'rb') as vocab:
        vocabulary = pickle.load(vocab)
    vocab_size = len(vocabulary)
    image_feature = np.zeros((len(image_paths), vocab_size))

    for i, path in enumerate(image_paths):
        image = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(image, step=[9, 9], fast=True)

        # dist[k, j]: distance from visual word k to descriptor j.
        dist = distance.cdist(vocabulary, descriptors, 'euclidean')
        nearest_word = np.argmin(dist, axis=0)
        histo = np.bincount(nearest_word, minlength=vocab_size)

        # The norm is computed once. It is a finite value >= 0 for integer
        # counts, so the old third "something wrong" branch was unreachable.
        norm = np.linalg.norm(histo)
        image_feature[i, :] = histo / norm if norm != 0 else histo

    return image_feature
Example #23
0
def sift_features(images, size):
    """Cluster the dense SIFT descriptors of all images into ``size`` words.

    Args:
        images: mapping whose values are iterables of images; the keys are
            not used.
        size: number of clusters requested from ``kmeans``.

    Returns:
        The result of ``kmeans`` (k-means++ initialization) on all collected
        descriptor rows, cast to float32.
    """
    print("feature number", size)

    descriptor_rows = []
    print("Extract SIFT features...")
    for _key, group in tqdm(images.items()):
        for img in group:
            _, descriptors = dsift(img, step=[5, 5], fast=True)
            if descriptors is None:
                continue
            # extend() adds one entry per descriptor row.
            descriptor_rows.extend(descriptors)

    print("Compute kmeans in dimensions:", size)

    training_matrix = np.array(descriptor_rows).astype('float32')
    return kmeans(training_matrix, size, initialization="PLUSPLUS")
Example #24
0
def get_bags_of_sifts(image_paths):
    """Build a normalized bag-of-visual-words histogram for every image.

    The vocabulary is unpickled from ``vocab.pkl`` in the current working
    directory; each row is one visual word. Dense SIFT descriptors (step 5,
    fast mode) are assigned to their nearest word, counted, and the counts
    normalized to sum to 1. Each processed path is printed as progress
    output.

    Args:
        image_paths: iterable of image paths readable by ``Image.open``.

    Returns:
        ``np.asarray`` of the per-image histograms, one row per path. A row
        is all zero when the image produced no descriptors.
    """
    # The context manager closes the file; a bare open() leaked the handle.
    with open('vocab.pkl', 'rb') as handle:
        vocab = pickle.load(handle)
    vocab_size = vocab.shape[0]

    image_feats = []
    for image_path in image_paths:
        img = np.asarray(Image.open(image_path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        # distance_matrix[j, k]: distance from descriptor j to word k.
        distance_matrix = distance.cdist(descriptors, vocab, 'euclidean')
        feature_idx = np.argmin(distance_matrix, axis=1)

        # bincount replaces the hand-built unique/dict histogram.
        histogram = np.bincount(feature_idx,
                                minlength=vocab_size).astype('float64')
        total = histogram.sum()
        if total > 0:  # avoid NaN rows for images without descriptors
            histogram /= total

        image_feats.append(histogram)
        print(image_path)

    return np.asarray(image_feats)
Example #25
0
def test_dsift_float_descriptors():
    """``float_descriptors=True`` must yield float32 descriptors."""
    _frames, float_desc = dsift(img, float_descriptors=True)
    produced_dtype = float_desc.dtype
    assert produced_dtype == np.float32
Example #26
0
def test_dsift_float_descriptors():
    """``float_descriptors=True`` must yield float32 descriptors."""
    image = img.copy()
    _frames, float_desc = dsift(image, float_descriptors=True)
    produced_dtype = float_desc.dtype
    assert produced_dtype == np.float32
Example #27
0
def test_dsift_non_float_descriptors():
    """``float_descriptors=False`` must yield uint8 descriptors."""
    image = img.copy()
    _frames, int_desc = dsift(image, float_descriptors=False)
    produced_dtype = int_desc.dtype
    assert produced_dtype == np.uint8
def build_vocabulary(image_paths, vocab_size):
    '''
    Sample dense SIFT descriptors from the training images, cluster them
    with kmeans, and return the cluster centers.

    NOTE: the original assignment text for this function described HOG
    features (skimage.feature.hog) clustered with sklearn.cluster.KMeans.
    This implementation uses dense SIFT instead: every image is loaded as a
    float32 array, descriptors are extracted with
    dsift(img, step=[5, 5], fast=True), pooled across all images, cast to
    float32 and clustered with kmeans(..., initialization="PLUSPLUS").
    Whatever extracts the bag-of-words histograms later must use the same
    descriptor settings, otherwise the features cannot be matched against
    this vocabulary.

    Inputs:
        image_paths: a Python list of image path strings
         vocab_size: an integer indicating the number of words desired for the
                     bag of words vocab set
    Outputs:
        The value returned by kmeans for the pooled descriptors, i.e. the
        vocab_size cluster centers.
    Raises:
        ValueError: if image_paths is empty.
    '''
    bag_of_features = []

    print("Extract SIFT features")
    for path in tqdm(image_paths, desc='build_vocabulary'):
        img = np.asarray(Image.open(path), dtype='float32')
        _, descriptors = dsift(img, step=[5, 5], fast=True)
        bag_of_features.append(descriptors)

    if not bag_of_features:
        # np.concatenate would raise the same exception type, but with a
        # message that does not point at the actual cause.
        raise ValueError("build_vocabulary needs at least one image path")

    bag_of_features = np.concatenate(bag_of_features, axis=0).astype('float32')
    print("Compute vocab")
    start_time = time()
    vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS")
    end_time = time()
    # Elapsed time is end - start; the reverse printed a negative duration.
    print("It takes ", (end_time - start_time), " to compute vocab.")
    return vocab
def get_bags_of_words(image_paths):
    '''
    Build an L2-normalised bag-of-visual-words histogram for every image.

    The vocabulary is read from 'vocab.pkl' in the current working directory.
    Each image is loaded with PIL, dense SIFT descriptors are extracted with
    dsift(step=[9, 9], fast=True), every descriptor is assigned to its nearest
    vocabulary word (Euclidean distance), and the per-word counts are scaled
    to unit L2 norm. An image whose histogram is all zeros keeps a zero row.

    Inputs:
        image_paths: A Python list of strings, where each string is a complete
                     path to one image on the disk.
    Outputs:
        An nxd numpy matrix, where n is the number of images in image_paths and
        d is the number of vocabulary words (len(vocab)).

    NOTE(review): the vocabulary must have been built from the same kind of
    descriptor for the nearest-word lookup to be meaningful -- confirm against
    whatever code wrote vocab.pkl.
    '''
    # NOTE: pickle can execute arbitrary code on load; vocab.pkl must be a
    # trusted, locally produced file.
    with open('vocab.pkl', 'rb') as v:
        vocab = pickle.load(v)

    vocab_size = len(vocab)
    image_feats = np.zeros((len(image_paths), vocab_size))

    # Wrap the list itself (not the enumerate object) so tqdm can read its
    # length and display a real progress bar.
    for i, path in enumerate(tqdm(image_paths, desc='get_bags_of_words')):
        # Release the file handle as soon as the pixels are copied out.
        with Image.open(path) as handle:
            image = np.asarray(handle, dtype='float32')
        _, descriptors = dsift(image, step=[9, 9], fast=True)

        # dist[k, m] is the distance from vocab word k to descriptor m, so
        # the arg-min over axis 0 is the nearest word for each descriptor.
        dist = distance.cdist(vocab, descriptors, 'euclidean')
        nearest_word = np.argmin(dist, axis=0)
        histo = np.bincount(nearest_word, minlength=vocab_size)

        norm = np.linalg.norm(histo)
        if norm > 0:
            image_feats[i, :] = histo / norm
        # Otherwise the row stays at its initial zeros.
    return image_feats
def build_vocabulary(image_paths, vocab_size):
  """
  Sample dense SIFT descriptors from the training images, cluster them with
  kmeans, and return the cluster centers as the visual-word vocabulary.

  Every image is loaded in grayscale via load_image_gray(path), converted to
  a float32 array and passed to dsift() with a 10-pixel step and fast=True.
  The descriptors of all images are stacked into one float32 matrix and
  clustered with kmeans(..., initialization="PLUSPLUS").

  Notes carried over from the assignment handout:
  -   frames, descriptors = vlfeat.sift.dsift(img)
        http://www.vlfeat.org/matlab/vl_dsift.html
          -  frames is a N x 2 matrix of locations, which is thrown away here.
          -  descriptors is a N x 128 matrix of SIFT features.
  -   cluster_centers = vlfeat.kmeans.kmeans(X, K)
          -  X is a N x d numpy array of sampled SIFT features.
          -  K is the number of clusters desired (vocab_size).
  -   A coarser step is acceptable here than in get_bags_of_sifts because
      only a representative sample of descriptors is needed for clustering.

  Args:
  -   image_paths: list of image paths.
  -   vocab_size: size of vocabulary

  Returns:
  -   vocab: This is a vocab_size x d numpy array (vocabulary). Each row is a
      cluster center / visual word

  Raises:
  -   ValueError: if image_paths yields no images, since there is nothing
      to cluster.
  """
  descriptor_blocks = []
  for path in image_paths:
    # Load the grayscale image and convert it to a float32 numpy array.
    img = np.asarray(load_image_gray(path), dtype='float32')
    # Only the descriptors are needed; the frame locations are discarded.
    _, descriptors = dsift(img, step=[10, 10], fast=True)
    descriptor_blocks.append(descriptors)

  # Fail with a clear message instead of numpy's generic
  # "need at least one array to concatenate".
  if not descriptor_blocks:
    raise ValueError("build_vocabulary needs at least one image path")

  # Stack the per-image descriptor matrices into one array for clustering.
  bag_of_features = np.concatenate(descriptor_blocks, axis=0).astype('float32')

  print("Compute vocab")
  start_time = time.time()
  vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS")
  end_time = time.time()
  print("It takes ", (end_time - start_time), " to compute vocab.")

  return vocab
def get_bags_of_sifts(image_paths, vocab_filename):
  """
  Build a normalised bag-of-SIFT histogram for every image.

  The vocabulary is unpickled from vocab_filename. Each image is loaded in
  grayscale via load_image_gray(path), dense SIFT descriptors are extracted
  with dsift(step=[5, 5], fast=True), every descriptor is assigned to its
  nearest vocabulary row (Euclidean distance), and the per-word counts are
  divided by their sum so that images with different numbers of descriptors
  remain comparable. An image with no descriptors gets an all-zero histogram
  instead of NaNs.

  Notes carried over from the assignment handout:
  -   frames, descriptors = vlfeat.sift.dsift(img)
          http://www.vlfeat.org/matlab/vl_dsift.html
        frames is a M x 2 matrix of locations, which is thrown away here.
        descriptors is a M x 128 matrix of SIFT features.

  Args:
  -   image_paths: paths to N images
  -   vocab_filename: Path to the precomputed vocabulary.
          This function assumes that vocab_filename exists and contains a
          pickled vocab_size x 128 ndarray where each row is a kmeans
          centroid or visual word. It is loaded from disk rather than passed
          in to avoid recomputing the vocabulary every run.

  Returns:
  -   image_feats: N x d matrix, where d is the number of vocabulary words
          (len(vocab)). For an empty image_paths the result has shape (0, d).
  """
  # NOTE: pickle can execute arbitrary code on load; vocab_filename must
  # point at a trusted, locally produced file.
  with open(vocab_filename, 'rb') as f:
    vocab = pickle.load(f)
  vocab_size = len(vocab)

  feats = []

  start_time = time.time()
  print("Construct bags of sifts...")

  for path in image_paths:
    # Load the grayscale image and convert it to a float32 numpy array.
    img = np.asarray(load_image_gray(path), dtype='float32')
    # Only the descriptors are needed; the frame locations are discarded.
    _, descriptors = dsift(img, step=[5, 5], fast=True)

    # dist[m, k] is the distance from descriptor m to vocab word k; the
    # arg-min along axis 1 picks the nearest word without a full sort.
    dist = distance.cdist(descriptors, vocab, metric='euclidean')
    closest_vocab = np.argmin(dist, axis=1)

    # Count how often each word was chosen, then normalise to sum to 1.
    histogram = np.bincount(closest_vocab, minlength=vocab_size).astype(float)
    total = histogram.sum()
    if total > 0:
      histogram /= total

    feats.append(histogram)

  # The explicit reshape keeps the documented N x d shape even when N == 0.
  feats = np.asarray(feats, dtype=float).reshape(len(feats), vocab_size)

  end_time = time.time()
  print("It takes ", (end_time - start_time), " to construct bags of sifts.")

  return feats
    # or equivalently the number of entries in each image's histogram.         #
    
    # You will want to construct SIFT features here in the same way you        #
    # did in build_vocabulary.m (except for possibly changing the sampling     #
    # rate) and then assign each local feature to its nearest cluster center   #
    # and build a histogram indicating how many times each cluster was used.   #
    # Don't forget to normalize the histogram, or else a larger image with more#
    # SIFT features will look very different from a smaller version of the same#
    # image.                                                                   #
    ############################################################################    
     with open('vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)
    len_img = len(image_paths)
    image_feats = np.zeros((len_img, vocab_size))
    for idx, path in enumerate(image_paths):
        img = np.asarray(Image.open(path) , dtype='float32')
        frames, descriptors = dsift(img, step = step, fast=True)
        d = distance.cdist(vocab, descriptors, 'euclidean')
        nn_dist = np.argmin(d, axis=0)
        h, bins = np.histogram(nn_dist, bins=range(0,vocab_size+1))
        norm = np.linalg.norm(h, ord=1)
        if norm==0:
            image_feats[idx,:] = h
        else:
            image_feats[idx,:] = h/norm
    #############################################################################
    #                                END OF YOUR CODE                           #
    #############################################################################
    return image_feats