def get_bags_of_sifts(image_paths, vocab_filename):
    # load vocabulary
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    # dummy features variable
    feats = []

    #############################################################################
    # TODO: YOUR CODE HERE                                                      #
    #############################################################################

    for i in tqdm.trange(len(image_paths), desc='getting bags of SIFT'):
        img = load_image_gray(image_paths[i]).astype(DTYPE)
        _, descriptors = vlfeat.sift.dsift(img,
                                           step=4,
                                           fast=True,
                                           float_descriptors=True)
        d_norm = np.linalg.norm(descriptors, axis=1)
        idx_nonzero = np.nonzero(d_norm)
        d_norm = d_norm[idx_nonzero].astype(DTYPE)
        descriptors = descriptors[idx_nonzero].astype(DTYPE)
        descriptors /= d_norm[:, None]
        assignments = vlfeat.kmeans.kmeans_quantize(descriptors, vocab)
        feat, _ = np.histogram(assignments, bins=vocab.shape[0])
        feat = feat.astype('float32')
        feat /= np.linalg.norm(feat)
        feats.append(feat)

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return feats
Example #2
def get_bags_of_sifts(image_paths, vocab_filename):

    # load vocabulary
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    # dummy features variable
    feats = []

    for path in image_paths:
        image = load_image_gray(path)
        _, descriptors = vlfeat.sift.dsift(image, step=5, fast=True)
        descriptors = np.float32(descriptors)
        centers = vlfeat.kmeans.kmeans_quantize(descriptors, vocab)
        feature, _ = np.histogram(centers,
                                  bins=np.linspace(0,
                                                   len(vocab),
                                                   num=len(vocab) + 1))
        feature = (feature / np.linalg.norm(feature))**(0.3)
        feats.append(feature)

    feats = np.asarray(feats)
    feats = np.reshape(feats, (len(image_paths), len(vocab)))

    return feats
Example #3
def get_fisher_encoding(image_paths, stat_filename):
    with open(stat_filename, 'rb') as f:
        stats = pickle.load(f)

    means = stats[:, 0:128]
    covariances = stats[:, 128:256]
    priors = stats[:, 256]

    feats = []
    for i in range(len(image_paths)):
        image = load_image_gray(image_paths[i])
        [locations, SIFT_features] = vlfeat.sift.dsift(image.astype('float32'),
                                                       fast=True,
                                                       step=5,
                                                       size=8)
        result = vlfeat.fisher.fisher(SIFT_features.astype('float32'),
                                      means,
                                      covariances,
                                      priors,
                                      improved=True)
        feats.append(result)

    feats = np.array(feats)

    return feats
Example #4
def build_vocabulary(image_paths, vocab_size):

    dim = 128  # length of the SIFT descriptors that you are going to compute.
    image = load_image_gray(image_paths[0])
    vs = 20
    vb = 9
    _, X = vlfeat.sift.dsift(image, step=vs, size=vb, fast=True)

    for i in range(1, len(image_paths)):
        image = load_image_gray(image_paths[i])
        _, descriptors = vlfeat.sift.dsift(image, step=vs, size=vb, fast=True)
        X = np.vstack((X, descriptors))

    X = np.float32(X)
    vocab = vlfeat.kmeans.kmeans(X, vocab_size)

    return vocab
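Calling np.vstack inside the loop copies the accumulated array on every iteration, which gets slow for large training sets. A minimal variant of the same function, assuming the same load_image_gray and vlfeat helpers; build_vocabulary_fast is an illustrative name:
def build_vocabulary_fast(image_paths, vocab_size, step=20, size=9):
    # collect per-image descriptors in a list; a single vstack at the end
    all_descriptors = []
    for path in image_paths:
        image = load_image_gray(path)
        _, descriptors = vlfeat.sift.dsift(image, step=step, size=size, fast=True)
        all_descriptors.append(descriptors)

    X = np.float32(np.vstack(all_descriptors))
    return vlfeat.kmeans.kmeans(X, vocab_size)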
def build_vocabulary(image_paths, vocab_size):
  """
  This function will sample SIFT descriptors from the training images,
  cluster them with kmeans, and then return the cluster centers.



  Args:
  -   image_paths: list of image paths.
  -   vocab_size: size of vocabulary

  Returns:
  -   vocab: This is a vocab_size x d numpy array (vocabulary). Each row is a
      cluster center / visual word
  """


  dim = 128      # length of the SIFT descriptors that you are going to compute.
  vocab = np.zeros((vocab_size, dim))

  N = 400        # descriptors sampled per image
  StepSize = 10

  TotalImages = len(image_paths)

  for i in range(TotalImages):
    Image = load_image_gray(image_paths[i])
    Frames, Descriptors = vlfeat.sift.dsift(Image, fast=True, step=StepSize)

    # sample N random descriptor rows from this image
    idx = np.random.randint(0, high=Descriptors.shape[0], size=N)
    Descriptors = Descriptors[idx, :]

    if i == 0:
      SIFT = Descriptors
    else:
      SIFT = np.vstack((SIFT, Descriptors))

  SIFT = SIFT.astype(float)

  ClusterCenters = vlfeat.kmeans.kmeans(SIFT, vocab_size)
  vocab = ClusterCenters

  return vocab
def parallel_func(i, image_paths, step, vocab, vocab_size):
    image = load_image_gray(image_paths[i])
    _, descriptors = vlfeat.sift.dsift(image, fast=True, step=step)
    assignments = vlfeat.kmeans.kmeans_quantize(
        descriptors.astype('float64'), vocab)
    bags_of_sifts = np.zeros((1, vocab_size))
    for assignment in assignments:
        bags_of_sifts[0, assignment] += 1
    return bags_of_sifts / np.linalg.norm(bags_of_sifts)
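parallel_func computes a single image's bag-of-SIFT row, which makes it easy to map over images. A minimal driver sketch, assuming parallel_func is defined at module top level (multiprocessing needs picklable workers) and the vocabulary is already loaded; get_bags_of_sifts_parallel is an illustrative name:
import numpy as np
from functools import partial
from multiprocessing import Pool

def get_bags_of_sifts_parallel(image_paths, vocab, step=5):
    worker = partial(parallel_func, image_paths=image_paths, step=step,
                     vocab=vocab, vocab_size=vocab.shape[0])
    with Pool() as pool:
        rows = pool.map(worker, range(len(image_paths)))
    return np.vstack(rows)  # N x vocab_size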
def get_bags_of_sifts(image_paths, vocab_filename):
  """


  Args:
  -   image_paths: paths to N images
  -   vocab_filename: Path to the precomputed vocabulary.
          This function assumes that vocab_filename exists and contains an
          vocab_size x 128 ndarray 'vocab' where each row is a kmeans centroid
          or visual word. This ndarray is saved to disk rather than passed in
          as a parameter to avoid recomputing the vocabulary every run.

  Returns:
  -   image_feats: N x d matrix, where d is the dimensionality of the
          feature representation. In this case, d will equal the number of
          clusters or equivalently the number of entries in each image's
          histogram (vocab_size) below.
  """
  # load vocabulary
  with open(vocab_filename, 'rb') as f:
    vocab = pickle.load(f)

  vocab_size = vocab.shape[0]
  TotalImages = len(image_paths)
  StepSize = 3
  feats = np.zeros((TotalImages, vocab_size))

  for i in range(TotalImages):
    Image = load_image_gray(image_paths[i])
    Frames, Descriptors = vlfeat.sift.dsift(Image, fast=True, step=StepSize)
    Descriptors = Descriptors.astype(float)

    assignments = vlfeat.kmeans.kmeans_quantize(Descriptors, vocab)

    # density=True gives a normalized histogram, so larger images with more
    # SIFT features stay comparable to smaller ones
    AssignmentHist, edges = np.histogram(assignments, bins=vocab_size, density=True)

    feats[i, :] = AssignmentHist

  return feats
Example #8
def get_tiny_images(image_paths):
    """
  This feature is inspired by the simple tiny images used as features in
  80 million tiny images: a large dataset for non-parametric object and
  scene recognition. A. Torralba, R. Fergus, W. T. Freeman. IEEE
  Transactions on Pattern Analysis and Machine Intelligence, vol.30(11),
  pp. 1958-1970, 2008. http://groups.csail.mit.edu/vision/TinyImages/

  To build a tiny image feature, simply resize the original image to a very
  small square resolution, e.g. 16x16. You can either resize the images to
  square while ignoring their aspect ratio or you can crop the center
  square portion out of each image. Making the tiny images zero mean and
  unit length (normalizing them) will increase performance modestly.

  Useful functions:
  -   cv2.resize
  -   use load_image(path) to load a RGB images and load_image_gray(path) to
      load grayscale images

  Args:
  -   image_paths: list of N elements containing image paths

  Returns:
  -   feats: N x d numpy array of resized and then vectorized tiny images
            e.g. if the images are resized to 16x16, d would be 256
  """
    # dummy feats variable
    feats = []

    #############################################################################
    # TODO: YOUR CODE HERE                                                      #
    #############################################################################
    for i in range(len(image_paths)):
        image = load_image_gray(image_paths[i])
        image = cv2.resize(image, (16, 16))


        Ir = image.flatten()              # vectorize the 16x16 tiny image
        Izm = Ir - np.mean(Ir)            # zero mean
        Iul = Izm / np.max(np.abs(Izm))   # scale so the max magnitude is 1
        feats.append(Iul)

    feats = np.array(feats)


    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return feats
def transform_image(filepath, n_dim):
    original_img = load_image_gray(filepath)
    scale = n_dim / original_img.shape[0]
    label = rescale(original_img,
                    scale=scale,
                    mode='reflect',
                    multichannel=False)
    theta = generate_theta(n_dim)
    data = radon(label, theta=theta, circle=False)
    # data = torch.from_numpy(data)
    # label = torch.from_numpy(label)
    # theta = torch.from_numpy(theta)
    return data, label, theta
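transform_image relies on skimage.transform (rescale, radon) and a generate_theta helper that is not shown here. A plausible sketch of that helper, assuming it only returns n_dim evenly spaced projection angles in degrees (the real definition may differ):
import numpy as np
from skimage.transform import radon, rescale

def generate_theta(n_dim):
    # hypothetical helper: n_dim evenly spaced projection angles for radon()
    return np.linspace(0., 180., n_dim, endpoint=False)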
def get_tiny_images(image_paths):
  """
  This feature is inspired by the simple tiny images used as features in
  80 million tiny images: a large dataset for non-parametric object and
  scene recognition. A. Torralba, R. Fergus, W. T. Freeman. IEEE
  Transactions on Pattern Analysis and Machine Intelligence, vol.30(11),
  pp. 1958-1970, 2008. http://groups.csail.mit.edu/vision/TinyImages/


  Args:
  -   image_paths: list of N elements containing image paths

  Returns:
  -   feats: N x d numpy array of resized and then vectorized tiny images
            e.g. if the images are resized to 16x16, d would be 256
  """


  TotalImages = len(image_paths)
  Resize = 16
  feats = np.zeros((TotalImages, Resize*Resize))

  for i in range(TotalImages):

    # take each image from the given path and resize it
    Image = load_image_gray(image_paths[i])
    ResizedImage = cv2.resize(Image, (Resize, Resize))

    # create a feature vector from the resized image
    Feature = np.reshape(ResizedImage, (1, Resize*Resize))

    # zero mean and unit variance
    FeatureNew = (Feature - np.mean(Feature)) / np.std(Feature)

    feats[i, :] = FeatureNew

  return feats
def get_tiny_images(image_paths):
    """
  This feature is inspired by the simple tiny images used as features in
  80 million tiny images: a large dataset for non-parametric object and
  scene recognition. A. Torralba, R. Fergus, W. T. Freeman. IEEE
  Transactions on Pattern Analysis and Machine Intelligence, vol.30(11),
  pp. 1958-1970, 2008. http://groups.csail.mit.edu/vision/TinyImages/

  To build a tiny image feature, simply resize the original image to a very
  small square resolution, e.g. 16x16. You can either resize the images to
  square while ignoring their aspect ratio or you can crop the center
  square portion out of each image. Making the tiny images zero mean and
  unit length (normalizing them) will increase performance modestly.

  Useful functions:
  -   cv2.resize
  -   use load_image(path) to load a RGB images and load_image_gray(path) to
      load grayscale images

  Args:
  -   image_paths: list of N elements containing image paths

  Returns:
  -   feats: N x d numpy array of resized and then vectorized tiny images
            e.g. if the images are resized to 16x16, d would be 256
  """
    # feats matrix: one 256-dim row per image
    feats = np.zeros((len(image_paths), 256))
    for x, y in enumerate(image_paths):
        image1 = load_image_gray(y)
        image2 = cv2.resize(image1, (16, 16))
        image_mean = np.mean(image2)
        normalized_image = image2 / image_mean

        flat_image = np.ndarray.flatten(normalized_image)
        feats[x, :] = flat_image

    return feats
def get_tiny_images(image_paths):
  """
  This feature is inspired by the simple tiny images used as features in
  80 million tiny images: a large dataset for non-parametric object and
  scene recognition. A. Torralba, R. Fergus, W. T. Freeman. IEEE
  Transactions on Pattern Analysis and Machine Intelligence, vol.30(11),
  pp. 1958-1970, 2008. http://groups.csail.mit.edu/vision/TinyImages/

  To build a tiny image feature, simply resize the original image to a very
  small square resolution, e.g. 16x16. You can either resize the images to
  square while ignoring their aspect ratio or you can crop the center
  square portion out of each image. Making the tiny images zero mean and
  unit length (normalizing them) will increase performance modestly.

  Useful functions:
  -   cv2.resize
  -   use load_image(path) to load a RGB images and load_image_gray(path) to
      load grayscale images

  Args:
  -   image_paths: list of N elements containing image paths

  Returns:
  -   feats: N x d numpy array of resized and then vectorized tiny images
            e.g. if the images are resized to 16x16, d would be 256
  """
  #############################################################################
  # TODO: YOUR CODE HERE                                                      #
  #############################################################################
  h = 16  # height
  w = 16  # width
  feats = np.zeros((len(image_paths), h*w))  # zero matrix of shape (number of samples, 16*16)
  for i, path in enumerate(image_paths):  # for all the images
    image = load_image_gray(path)  # load grayscale image
    img_reshape = cv2.resize(image, (16, 16)).flatten()  # resize and flatten into a 256-vector
    image_normalized = (img_reshape - np.mean(img_reshape)) / np.std(img_reshape)  # normalize the vector
    feats[i, :] = image_normalized  # save the feature for each image

  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################

  return feats
Example #13
def get_tiny_images(image_paths):
    """
    This feature is inspired by the simple tiny images used as features in
    80 million tiny images: a large dataset for non-parametric object and
    scene recognition. A. Torralba, R. Fergus, W. T. Freeman. IEEE
    Transactions on Pattern Analysis and Machine Intelligence, vol.30(11),
    pp. 1958-1970, 2008. http://groups.csail.mit.edu/vision/TinyImages/

    To build a tiny image feature, simply resize the original image to a very
    small square resolution, e.g. 16x16. You can either resize the images to
    square while ignoring their aspect ratio or you can crop the center
    square portion out of each image. Making the tiny images zero mean and
    unit length (normalizing them) will increase performance modestly.

    Useful functions:
    -   cv2.resize
    -   use load_image(path) to load a RGB images and load_image_gray(path) to
        load grayscale images

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: N x d numpy array of resized and then vectorized tiny images
              e.g. if the images are resized to 16x16, d would be 256
    """
    # dummy feats variable
    feats = []

    #############################################################################
    # TODO: YOUR CODE HERE                                                      #
    #############################################################################

    for img_path in image_paths:
        img = load_image_gray(img_path)
        feat = cv2.resize(img, (24, 24),
                          interpolation=cv2.INTER_AREA).flatten()
        feat_zero_mean = feat - np.mean(feat)
        feat_unit_length = feat_zero_mean / np.linalg.norm(feat_zero_mean)
        feats.append(feat_unit_length)

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return feats
def get_tiny_images(image_paths):
    """
  This feature is inspired by the simple tiny images used as features in
  80 million tiny images: a large dataset for non-parametric object and
  scene recognition. A. Torralba, R. Fergus, W. T. Freeman. IEEE
  Transactions on Pattern Analysis and Machine Intelligence, vol.30(11),
  pp. 1958-1970, 2008. http://groups.csail.mit.edu/vision/TinyImages/

  To build a tiny image feature, simply resize the original image to a very
  small square resolution, e.g. 16x16. You can either resize the images to
  square while ignoring their aspect ratio or you can crop the center
  square portion out of each image. Making the tiny images zero mean and
  unit length (normalizing them) will increase performance modestly.

  Useful functions:
  -   cv2.resize
  -   use load_image(path) to load a RGB images and load_image_gray(path) to
      load grayscale images

  Args:
  -   image_paths: list of N elements containing image paths

  Returns:
  -   feats: N x d numpy array of resized and then vectorized tiny images
            e.g. if the images are resized to 16x16, d would be 256
  """
    # dummy feats variable

    #############################################################################
    # TODO: YOUR CODE HERE                                                      #
    #############################################################################
    d = 16
    N = len(image_paths)
    feats = np.zeros([N, d * d])
    for x in range(N):
        temp = np.reshape(cv2.resize(load_image_gray(image_paths[x]), (d, d)),
                          (1, -1))
        temp -= np.average(temp)
        temp /= (np.sum((temp)**2, axis=None))**0.5  #np.linalg.norm(temp,2)
        feats[x, :] = temp
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return feats
def get_tiny_images(image_paths):
    """
    This feature is inspired by the simple tiny images used as features in
    80 million tiny images: a large dataset for non-parametric object and
    scene recognition. A. Torralba, R. Fergus, W. T. Freeman. IEEE
    Transactions on Pattern Analysis and Machine Intelligence, vol.30(11),
    pp. 1958-1970, 2008. http://groups.csail.mit.edu/vision/TinyImages/

    To build a tiny image feature, simply resize the original image to a very
    small square resolution, e.g. 16x16. You can either resize the images to
    square while ignoring their aspect ratio or you can crop the center
    square portion out of each image. Making the tiny images zero mean and
    unit length (normalizing them) will increase performance modestly.

    Useful functions:
    -   cv2.resize
    -   use load_image(path) to load a RGB images and load_image_gray(path) to
        load grayscale images

    Args:
    -   image_paths: list of N elements containing image paths

    Returns:
    -   feats: N x d numpy array of resized and then vectorized tiny images
              e.g. if the images are resized to 16x16, d would be 256
    """
    # parameter
    width = 16

    N = len(image_paths)
    d = width * width
    # dummy feats variable
    feats = np.zeros((N, d))
    for i in range(N):
        image = load_image_gray(image_paths[i])
        image = cv2.resize(image, (width, width),
                           interpolation=cv2.INTER_LINEAR)
        image = np.reshape(image, (1, d))
        image -= np.mean(image)
        image_normalized = image / np.std(image)
        feats[i, :] = image_normalized
    return feats
Example #16
def get_tiny_images(image_paths):
    feats = []

    w = 16
    h = 16
    N = len(image_paths)

    for path in image_paths:
        image = load_image_gray(path)
        img = cv2.resize(image, (w, h))
        feature = np.reshape(img, (1, w * h))
        feature -= np.mean(feature)
        feature /= np.linalg.norm(feature)
        feats.append(feature)

    feats = np.asarray(feats)
    feats = np.reshape(feats, (N, w * h))

    return feats
def get_tiny_images(image_paths):
    # dummy feats variable
    feats = []

    #############################################################################
    # TODO: YOUR CODE HERE                                                      #
    #############################################################################

    for img_path in image_paths:
        img = load_image_gray(img_path).astype(np.float32)
        feat = cv2.resize(img, (24, 24), interpolation=cv2.INTER_AREA)
        feat = feat.flatten()
        feat -= np.mean(feat, dtype=DTYPE)
        feat /= np.linalg.norm(feat)
        feats.append(feat)

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return feats
Example #18
def get_tiny_images(image_paths):
    """
  This feature is inspired by the simple tiny images used as features in
  80 million tiny images: a large dataset for non-parametric object and
  scene recognition. A. Torralba, R. Fergus, W. T. Freeman. IEEE
  Transactions on Pattern Analysis and Machine Intelligence, vol.30(11),
  pp. 1958-1970, 2008. http://groups.csail.mit.edu/vision/TinyImages/

  To build a tiny image feature, simply resize the original image to a very
  small square resolution, e.g. 16x16. You can either resize the images to
  square while ignoring their aspect ratio or you can crop the center
  square portion out of each image. Making the tiny images zero mean and
  unit length (normalizing them) will increase performance modestly.

  Useful functions:
  -   cv2.resize
  -   use load_image(path) to load a RGB images and load_image_gray(path) to
      load grayscale images

  Args:
  -   image_paths: list of N elements containing image paths

  Returns:
  -   feats: N x d numpy array of resized and then vectorized tiny images
            e.g. if the images are resized to 16x16, d would be 256
  """
    # dummy feats variable
    m = 16
    M = m**2
    v = np.ones((1, M))  # placeholder row, dropped below
    for i in image_paths:
        im = load_image_gray(i)
        im = cv2.resize(im, (m, m))
        im = im.reshape((1, M))
        im = im / np.linalg.norm(im)  # unit length (no zero-mean here)
        v = np.append(v, im, axis=0)

    feats = v[1:]  # drop the placeholder row

    return feats
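A quick usage sketch for the tiny-image features, assuming the 1-NN scene classifier this pipeline is normally paired with; nearest_neighbor_classify and the path/label variables are illustrative:
import numpy as np
from scipy.spatial import distance

def nearest_neighbor_classify(train_feats, train_labels, test_feats):
    # 1-NN: each test image takes the label of its closest training image
    dists = distance.cdist(test_feats, train_feats, metric='euclidean')
    nearest = np.argmin(dists, axis=1)
    return [train_labels[i] for i in nearest]

# train_feats = get_tiny_images(train_image_paths)
# test_feats = get_tiny_images(test_image_paths)
# predictions = nearest_neighbor_classify(train_feats, train_labels, test_feats)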
def build_vocabulary(image_paths, vocab_size):
    # length of the SIFT descriptors that you are going to compute.
    dim = 128
    vocab = np.zeros((vocab_size, dim))

    #############################################################################
    # TODO: YOUR CODE HERE                                                      #
    #############################################################################
    feats = []
    for i in tqdm.trange(len(image_paths), desc='getting a vocab SIFT'):
        img = load_image_gray(image_paths[i]).astype(np.float32)
        _, descriptors = vlfeat.sift.dsift(img,
                                           step=8,
                                           fast=True,
                                           float_descriptors=True)
        d_norm = np.linalg.norm(descriptors, axis=1)
        idx_nonzero = np.nonzero(d_norm)
        descriptors = descriptors[idx_nonzero].astype(DTYPE)
        # L2-normalize the surviving (non-zero) descriptors
        d_norm = np.linalg.norm(descriptors, axis=1)
        descriptors /= d_norm[:, None]
        feats.append(descriptors)

    feats = np.vstack(feats)
    vocab = vlfeat.kmeans.kmeans(
        np.asarray(feats, dtype=DTYPE),
        vocab_size,
        initialization='PLUSPLUS',  # RANDSEL, PLUSPLUS
        distance='l2',  # l1, l2
        algorithm='LLOYD')  # LLOYD, ELKAN

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return vocab
Example #20
def build_gaussian_gmm(image_paths, vocab_size):
    ## Used to build a Gaussian mixture model (GMM) over dense SIFT features
    bin_size = 8

    level = 3
    data = []
    for i in range(len(image_paths)):
        image = load_image_gray(image_paths[i])
        for j in range(level):
            # low-pass filter, then downsample by a factor of 2 per level
            G_low = cv2.getGaussianKernel(9, 2)
            filtered_image = cv2.filter2D(image, -1, G_low @ G_low.T)
            scale = 0.5 ** j
            resize_image = cv2.resize(filtered_image, None, fx=scale, fy=scale)
            [locations,
             SIFT_features] = vlfeat.sift.dsift(resize_image.astype('float32'),
                                                fast=True,
                                                step=15,
                                                size=bin_size)
            data.append(SIFT_features.astype('float32'))

    # fit the GMM on all accumulated descriptors, not just the last image's
    data = np.vstack(data)
    [means, covariances, priors] = vlfeat.gmm.gmm(data, vocab_size)

    stats = [means, covariances, priors]
    return stats
def kernel_codebook_encoding(image_paths, vocab_filename, gamma = 1):
    # load vocabulary
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    # dummy features variable
    feats = []

    for img_path in image_paths:
        img = load_image_gray(img_path)
        _, descriptors = vlfeat.sift.dsift(img, fast = True, step = 10)
        
        # Equivalent to K-mean center assignments:
        # First, Calculate the distance to the centers defined in vocab
        
        D = sklearn_pairwise.pairwise_distances(descriptors.astype('float64'), 
                                            vocab.astype('float64'), 
                                            metric = 'euclidean')   #(N,vocab_size)
        
        # kernel weighting: K(x, u) = exp(-gamma * ||x - u|| / 2)
        D = np.exp(-gamma*0.5*D)            #(N,vocab_size)
        
        # Normalize
        D = D/np.sum(D, axis = 1)[:,None]   #(N,vocab_size)--axis=1--> (N,) -[:,None]-> (N,1) 
        
        # hist
        hists = np.sum(D, axis = 0)         #(N,vocab_size)--axis=0--> (vocab_size)
        hists = hists/np.linalg.norm(hists)
        
        feats.append(list(hists))

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return np.array(feats).astype('float64')
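For comparison, the hard-assignment histogram used elsewhere in these examples corresponds to collapsing the same distance matrix with an argmin instead of the kernel weighting. A minimal sketch, reusing the sklearn_pairwise alias from the function above:
def hard_assignment_hist(descriptors, vocab):
    # each descriptor votes only for its single nearest visual word
    D = sklearn_pairwise.pairwise_distances(descriptors.astype('float64'),
                                            vocab.astype('float64'),
                                            metric='euclidean')
    assignments = np.argmin(D, axis=1)
    hist, _ = np.histogram(assignments, bins=np.arange(vocab.shape[0] + 1))
    return hist / np.linalg.norm(hist)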
def build_vocabulary(image_paths, vocab_size):
  """
  This function will sample SIFT descriptors from the training images,
  cluster them with kmeans, and then return the cluster centers.

  Useful functions:
  -   Use load_image(path) to load RGB images and load_image_gray(path) to load
          grayscale images
  -   frames, descriptors = vlfeat.sift.dsift(img)
        http://www.vlfeat.org/matlab/vl_dsift.html
          -  frames is a N x 2 matrix of locations, which can be thrown away
          here (but possibly used for extra credit in get_bags_of_sifts if
          you're making a "spatial pyramid").
          -  descriptors is a N x 128 matrix of SIFT features
        Note: there are step, bin size, and smoothing parameters you can
        manipulate for dsift(). We recommend debugging with the 'fast'
        parameter. This approximate version of SIFT is about 20 times faster to
        compute. Also, be sure not to use the default value of step size. It
        will be very slow and you'll see relatively little performance gain
        from extremely dense sampling. You are welcome to use your own SIFT
        feature code! It will probably be slower, though.
  -   cluster_centers = vlfeat.kmeans.kmeans(X, K)
          test_image_feats
            -  X is a N x d numpy array of sampled SIFT features, where N is
               the number of features sampled. N should be pretty large!
            -  K is the number of clusters desired (vocab_size)
               cluster_centers is a K x d matrix of cluster centers. This is
               your vocabulary.

  Args:
  -   image_paths: list of image paths.
  -   vocab_size: size of vocabulary

  Returns:
  -   vocab: This is a vocab_size x d numpy array (vocabulary). Each row is a
      cluster center / visual word
  """
  # Load images from the training set. To save computation time, you don't
  # necessarily need to sample from all images, although it would be better
  # to do so. You can randomly sample the descriptors from each image to save
  # memory and speed up the clustering. Or you can simply call vl_dsift with
  # a large step size here, but a smaller step size in get_bags_of_sifts.
  #
  # For each loaded image, get some SIFT features. You don't have to get as
  # many SIFT features as you will in get_bags_of_sift, because you're only
  # trying to get a representative sample here.
  #
  # Once you have tens of thousands of SIFT features from many training
  # images, cluster them with kmeans. The resulting centroids are now your
  # visual word vocabulary.

  
  #############################################################################
  # TODO: YOUR CODE HERE                                                      #
  #############################################################################
    
  dim = 128      # length of the SIFT descriptors that you are going to compute.

  vocab = np.zeros((vocab_size,dim)) # initialization of vocab
  bag_of_features = [] 
    
    
  for path in image_paths:
    img = np.asarray(load_image_gray(path), dtype='float32')  # grayscale image as a float array
    frames, descriptors = dsift(img, step=[10,10], fast=True)  # dense SIFT, step size 10, fast approximation
    bag_of_features.append(descriptors)

  bag_of_features = np.concatenate(bag_of_features, axis=0).astype('float32')  # list into an array
    
    
  print("Compute vocab")
  start_time = time.time()
  vocab = kmeans(bag_of_features, vocab_size, initialization="PLUSPLUS") # k-means cluster centers
  end_time = time.time()
  print("It takes ", (end_time - start_time), " to compute vocab.")
  
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################

  return vocab
def get_bags_of_sifts(image_paths, vocab_filename):
  """
  This feature representation is described in the handout, lecture
  materials, and Szeliski chapter 14.
  You will want to construct SIFT features here in the same way you
  did in build_vocabulary() (except for possibly changing the sampling
  rate) and then assign each local feature to its nearest cluster center
  and build a histogram indicating how many times each cluster was used.
  Don't forget to normalize the histogram, or else a larger image with more
  SIFT features will look very different from a smaller version of the same
  image.

  Useful functions:
  -   Use load_image(path) to load RGB images and load_image_gray(path) to load
          grayscale images
  -   frames, descriptors = vlfeat.sift.dsift(img)
          http://www.vlfeat.org/matlab/vl_dsift.html
        frames is a M x 2 matrix of locations, which can be thrown away here
          (but possibly used for extra credit in get_bags_of_sifts if you're
          making a "spatial pyramid").
        descriptors is a M x 128 matrix of SIFT features
          note: there are step, bin size, and smoothing parameters you can
          manipulate for dsift(). We recommend debugging with the 'fast'
          parameter. This approximate version of SIFT is about 20 times faster
          to compute. Also, be sure not to use the default value of step size.
          It will be very slow and you'll see relatively little performance
          gain from extremely dense sampling. You are welcome to use your own
          SIFT feature code! It will probably be slower, though.
  -   assignments = vlfeat.kmeans.kmeans_quantize(data, vocab)
          finds the cluster assigments for features in data
            -  data is a M x d matrix of image features
            -  vocab is the vocab_size x d matrix of cluster centers
            (vocabulary)
            -  assignments is a Mx1 array of assignments of feature vectors to
            nearest cluster centers, each element is an integer in
            [0, vocab_size)

  Args:
  -   image_paths: paths to N images
  -   vocab_filename: Path to the precomputed vocabulary.
          This function assumes that vocab_filename exists and contains an
          vocab_size x 128 ndarray 'vocab' where each row is a kmeans centroid
          or visual word. This ndarray is saved to disk rather than passed in
          as a parameter to avoid recomputing the vocabulary every run.

  Returns:
  -   image_feats: N x d matrix, where d is the dimensionality of the
          feature representation. In this case, d will equal the number of
          clusters or equivalently the number of entries in each image's
          histogram (vocab_size) below.
  """

  #############################################################################
  # TODO: YOUR CODE HERE                                                      #
  #############################################################################

  # load vocabulary
  with open(vocab_filename, 'rb') as f:
    vocab = pickle.load(f)

  # dummy features variable
  feats = []

  start_time = time.time()
  print("Construct bags of sifts...")

  for path in image_paths:
    img = np.asarray(load_image_gray(path), dtype='float32')  # read the image
    frames, descriptors = dsift(img, step=[5,5], fast=True)  # dense SIFT with step size 5
    dist = distance.cdist(descriptors, vocab, metric='euclidean')  # euclidean distance to each cluster center
    closest_vocab = np.argsort(dist, axis=1)[:, 0]  # index of the nearest cluster center
    ind, count = np.unique(closest_vocab, return_counts=True)  # unique centers and their counts
    histogram = np.zeros(len(vocab))
    histogram[ind] += count
    histogram = [float(i) / sum(histogram) for i in histogram]  # normalize the histogram

    feats.append(histogram)

  feats = np.asarray(feats)  # list to array

  end_time = time.time()
  print("It takes ", (end_time - start_time), " to construct bags of sifts.")





 

  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################

  return feats
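Taken together, the vocabulary is built once, pickled to vocab_filename, and reused by get_bags_of_sifts, exactly as the docstrings above describe. A minimal end-to-end sketch; the path, size, and train/test variables are illustrative:
import pickle

vocab_filename = 'vocab.pkl'
vocab_size = 200  # illustrative

# build the visual vocabulary once and cache it on disk
vocab = build_vocabulary(train_image_paths, vocab_size)
with open(vocab_filename, 'wb') as f:
    pickle.dump(vocab, f)

# both splits are then encoded against the same cached vocabulary
train_feats = get_bags_of_sifts(train_image_paths, vocab_filename)
test_feats = get_bags_of_sifts(test_image_paths, vocab_filename)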
Example #24
def build_vocabulary(image_paths, vocab_size):
    """
  This function will sample SIFT descriptors from the training images,
  cluster them with kmeans, and then return the cluster centers.

  Useful functions:
  -   Use load_image(path) to load RGB images and load_image_gray(path) to load
          grayscale images
  -   frames, descriptors = vlfeat.sift.dsift(img)
        http://www.vlfeat.org/matlab/vl_dsift.html
          -  frames is a N x 2 matrix of locations, which can be thrown away
          here (but possibly used for extra credit in get_bags_of_sifts if
          you're making a "spatial pyramid").
          -  descriptors is a N x 128 matrix of SIFT features
        Note: there are step, bin size, and smoothing parameters you can
        manipulate for dsift(). We recommend debugging with the 'fast'
        parameter. This approximate version of SIFT is about 20 times faster to
        compute. Also, be sure not to use the default value of step size. It
        will be very slow and you'll see relatively little performance gain
        from extremely dense sampling. You are welcome to use your own SIFT
        feature code! It will probably be slower, though.
  -   cluster_centers = vlfeat.kmeans.kmeans(X, K)
          http://www.vlfeat.org/matlab/vl_kmeans.html
            -  X is a N x d numpy array of sampled SIFT features, where N is
               the number of features sampled. N should be pretty large!
            -  K is the number of clusters desired (vocab_size)
               cluster_centers is a K x d matrix of cluster centers. This is
               your vocabulary.

  Args:
  -   image_paths: list of image paths.
  -   vocab_size: size of vocabulary

  Returns:
  -   vocab: This is a vocab_size x d numpy array (vocabulary). Each row is a
      cluster center / visual word
  """
    # Load images from the training set. To save computation time, you don't
    # necessarily need to sample from all images, although it would be better
    # to do so. You can randomly sample the descriptors from each image to save
    # memory and speed up the clustering. Or you can simply call vl_dsift with
    # a large step size here, but a smaller step size in get_bags_of_sifts.
    #
    # For each loaded image, get some SIFT features. You don't have to get as
    # many SIFT features as you will in get_bags_of_sift, because you're only
    # trying to get a representative sample here.
    #
    # Once you have tens of thousands of SIFT features from many training
    # images, cluster them with kmeans. The resulting centroids are now your
    # visual word vocabulary.

    dim = 128  # length of the SIFT descriptors that you are going to compute.
    vocab = np.zeros((vocab_size, dim))

    #############################################################################
    # TODO: YOUR CODE HERE                                                      #
    #############################################################################
    total_SIFT_features = np.zeros((20 * len(image_paths), dim))
    index = 0

    for i in range(len(image_paths)):
        image = load_image_gray(image_paths[i]).astype('float32')

        [locations, SIFT_features] = vlfeat.sift.dsift(image,
                                                       fast=True,
                                                       step=15)

        # keep 20 randomly chosen descriptors per image
        rand_permutation = np.random.permutation(SIFT_features.shape[0])

        for j in range(20):
            k = rand_permutation[j]
            total_SIFT_features[j + index, :] = SIFT_features[k, :]
        index = index + 20

    vocab = vlfeat.kmeans.kmeans(total_SIFT_features.astype('float32'),
                                 vocab_size)


    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return vocab
Example #25
def get_spyramid_fisher_encoding(image_paths, stat_filename):
    with open(stat_filename, 'rb') as f:
        stats = pickle.load(f)

    means = stats[:, 0:128]
    covariances = stats[:, 128:256]
    priors = stats[:, 256]

    def encode(patch):
        # dense SIFT followed by an (improved) Fisher vector encoding
        _, sift_features = vlfeat.sift.dsift(patch.astype('float32'),
                                             fast=True,
                                             step=5,
                                             size=8)
        return vlfeat.fisher.fisher(sift_features.astype('float32'),
                                    means,
                                    covariances,
                                    priors,
                                    improved=True)

    feats = []
    for i in range(len(image_paths)):
        image = load_image_gray(image_paths[i])
        W = image.shape[0]
        L = image.shape[1]

        # spatial pyramid: level 0 is the whole image, level 1 a 2x2 grid,
        # level 2 a 4x4 grid; each cell gets its own Fisher encoding
        encodings = []
        for level in range(3):
            cells = 2 ** level
            for r in range(cells):
                for c in range(cells):
                    patch = image[r * W // cells:(r + 1) * W // cells,
                                  c * L // cells:(c + 1) * L // cells]
                    encodings.append(encode(patch))

        feats.append(np.concatenate(encodings))

    feats = np.array(feats)

    return feats
Example #26
def build_spyramid_gmm(image_paths, vocab_size):

    level = 2

    data = []
    for i in range(len(image_paths)):
        image = load_image_gray(image_paths[i])
        W = image.shape[0]
        L = image.shape[1]
        [locations,
         SIFT_features_L0] = vlfeat.sift.dsift(image.astype('float32'),
                                               fast=True,
                                               step=15,
                                               bin=8)
        data = np.hstack(data, SIFT_features_L0)

        img_L1_1 = image[0:int16(W / 2), 0:int16(L / 2)]
        img_L1_2 = image[0:int16(W / 2), int16(L / 2):L]
        img_L1_3 = image[int16(W / 2):W, 0:int16(L / 2)]
        img_L1_4 = image[int16(W / 2):W, int16(L / 2):L]
        [locations,
         SIFT_features_L1_1] = vlfeat.sift.dsift(img_L1_1.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 bin=8)
        [locations,
         SIFT_features_L1_2] = vlfeat.sift.dsift(img_L1_2.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 bin=8)
        [locations,
         SIFT_features_L1_3] = vlfeat.sift.dsift(img_L1_3.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 bin=8)
        [locations,
         SIFT_features_L1_4] = vlfeat.sift.dsift(img_L1_4.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 bin=8)
        data = np.hstack(data, SIFT_features_L1_1, SIFT_features_L1_2,
                         SIFT_features_L1_3, SIFT_features_L1_4)

        # level-2: 4 x 4 grid of sub-windows (rows and columns in quarters)
        img_L2_1 = image[0:int(W / 4), 0:int(L / 4)]
        img_L2_2 = image[0:int(W / 4), int(L / 4):int(L / 2)]
        img_L2_3 = image[0:int(W / 4), int(L / 2):int(3 * L / 4)]
        img_L2_4 = image[0:int(W / 4), int(3 * L / 4):L]

        img_L2_5 = image[int(W / 4):int(W / 2), 0:int(L / 4)]
        img_L2_6 = image[int(W / 4):int(W / 2), int(L / 4):int(L / 2)]
        img_L2_7 = image[int(W / 4):int(W / 2), int(L / 2):int(3 * L / 4)]
        img_L2_8 = image[int(W / 4):int(W / 2), int(3 * L / 4):L]

        img_L2_9 = image[int(W / 2):int(3 * W / 4), 0:int(L / 4)]
        img_L2_10 = image[int(W / 2):int(3 * W / 4), int(L / 4):int(L / 2)]
        img_L2_11 = image[int(W / 2):int(3 * W / 4), int(L / 2):int(3 * L / 4)]
        img_L2_12 = image[int(W / 2):int(3 * W / 4), int(3 * L / 4):L]

        img_L2_13 = image[int(3 * W / 4):W, 0:int(L / 4)]
        img_L2_14 = image[int(3 * W / 4):W, int(L / 4):int(L / 2)]
        img_L2_15 = image[int(3 * W / 4):W, int(L / 2):int(3 * L / 4)]
        img_L2_16 = image[int(3 * W / 4):W, int(3 * L / 4):L]

        [locations,
         SIFT_features_L2_1] = vlfeat.sift.dsift(img_L2_1.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 size=8)
        [locations,
         SIFT_features_L2_2] = vlfeat.sift.dsift(img_L2_2.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 size=8)
        [locations,
         SIFT_features_L2_3] = vlfeat.sift.dsift(img_L2_3.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 size=8)
        [locations,
         SIFT_features_L2_4] = vlfeat.sift.dsift(img_L2_4.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 size=8)
        data.extend([SIFT_features_L2_1, SIFT_features_L2_2,
                     SIFT_features_L2_3, SIFT_features_L2_4])

        [locations,
         SIFT_features_L2_5] = vlfeat.sift.dsift(img_L2_5.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 size=8)
        [locations,
         SIFT_features_L2_6] = vlfeat.sift.dsift(img_L2_6.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 size=8)
        [locations,
         SIFT_features_L2_7] = vlfeat.sift.dsift(img_L2_7.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 size=8)
        [locations,
         SIFT_features_L2_8] = vlfeat.sift.dsift(img_L2_8.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                 size=8)
        data.extend([SIFT_features_L2_5, SIFT_features_L2_6,
                     SIFT_features_L2_7, SIFT_features_L2_8])

        [locations,
         SIFT_features_L2_9] = vlfeat.sift.dsift(img_L2_9.astype('float32'),
                                                 fast=True,
                                                 step=15,
                                                  size=8)
        [locations,
         SIFT_features_L2_10] = vlfeat.sift.dsift(img_L2_10.astype('float32'),
                                                  fast=True,
                                                  step=15,
                                                  size=8)
        [locations,
         SIFT_features_L2_11] = vlfeat.sift.dsift(img_L2_11.astype('float32'),
                                                  fast=True,
                                                  step=15,
                                                  size=8)
        [locations,
         SIFT_features_L2_12] = vlfeat.sift.dsift(img_L2_12.astype('float32'),
                                                  fast=True,
                                                  step=15,
                                                  size=8)
        data.extend([SIFT_features_L2_9, SIFT_features_L2_10,
                     SIFT_features_L2_11, SIFT_features_L2_12])

        [locations,
         SIFT_features_L2_13] = vlfeat.sift.dsift(img_L2_13.astype('float32'),
                                                  fast=True,
                                                  step=15,
                                                  size=8)
        [locations,
         SIFT_features_L2_14] = vlfeat.sift.dsift(img_L2_14.astype('float32'),
                                                  fast=True,
                                                  step=15,
                                                  size=8)
        [locations,
         SIFT_features_L2_15] = vlfeat.sift.dsift(img_L2_15.astype('float32'),
                                                  fast=True,
                                                  step=15,
                                                  size=8)
        [locations,
         SIFT_features_L2_16] = vlfeat.sift.dsift(img_L2_16.astype('float32'),
                                                  fast=True,
                                                  step=15,
                                                  size=8)
        data.extend([SIFT_features_L2_13, SIFT_features_L2_14,
                     SIFT_features_L2_15, SIFT_features_L2_16])

    # stack every region's descriptors into one (N, 128) array and fit the GMM
    data = np.vstack(data).astype('float32')
    [means, covariances, priors] = vlfeat.gmm.gmm(data, vocab_size)

    stats = [means, covariances, priors]

    return stats
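# The 21 sub-windows above (whole image, 2x2 grid, 4x4 grid) can also be
# generated with loops instead of hand-written slices. A minimal equivalent
# sketch, assuming the same module-level numpy/cyvlfeat imports used
# throughout; the helper name pyramid_descriptors is hypothetical, and each
# patch is assumed large enough for the SIFT bin geometry:
def pyramid_descriptors(image, max_level=2, step=15, size=8):
    descs = []
    for level in range(max_level + 1):
        cells = 2 ** level                 # 1, 2, 4 cells per side
        rows = image.shape[0] // cells
        cols = image.shape[1] // cells
        for r in range(cells):
            for c in range(cells):
                patch = image[r * rows:(r + 1) * rows,
                              c * cols:(c + 1) * cols]
                _, d = vlfeat.sift.dsift(patch.astype('float32'),
                                         fast=True, step=step, size=size)
                descs.append(d)
    return np.vstack(descs)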
Example #27
def get_bags_of_sifts(image_paths, vocab_filename):
    """
  This feature representation is described in the handout, lecture
  materials, and Szeliski chapter 14.
  You will want to construct SIFT features here in the same way you
  did in build_vocabulary() (except for possibly changing the sampling
  rate) and then assign each local feature to its nearest cluster center
  and build a histogram indicating how many times each cluster was used.
  Don't forget to normalize the histogram, or else a larger image with more
  SIFT features will look very different from a smaller version of the same
  image.

  Useful functions:
  -   Use load_image(path) to load RGB images and load_image_gray(path) to load
          grayscale images
  -   frames, descriptors = vlfeat.sift.dsift(img)
          http://www.vlfeat.org/matlab/vl_dsift.html
        frames is a M x 2 matrix of locations, which can be thrown away here
          (but possibly used for extra credit in get_bags_of_sifts if you're
          making a "spatial pyramid").
        descriptors is a M x 128 matrix of SIFT features
          note: there are step, bin size, and smoothing parameters you can
          manipulate for dsift(). We recommend debugging with the 'fast'
          parameter. This approximate version of SIFT is about 20 times faster
          to compute. Also, be sure not to use the default value of step size.
          It will be very slow and you'll see relatively little performance
          gain from extremely dense sampling. You are welcome to use your own
          SIFT feature code! It will probably be slower, though.
  -   assignments = vlfeat.kmeans.kmeans_quantize(data, vocab)
          finds the cluster assigments for features in data
            -  data is a M x d matrix of image features
            -  vocab is the vocab_size x d matrix of cluster centers
            (vocabulary)
            -  assignments is a Mx1 array of assignments of feature vectors to
            nearest cluster centers, each element is an integer in
            [0, vocab_size)

  Args:
  -   image_paths: paths to N images
  -   vocab_filename: Path to the precomputed vocabulary.
          This function assumes that vocab_filename exists and contains an
          vocab_size x 128 ndarray 'vocab' where each row is a kmeans centroid
          or visual word. This ndarray is saved to disk rather than passed in
          as a parameter to avoid recomputing the vocabulary every run.

  Returns:
  -   image_feats: N x d matrix, where d is the dimensionality of the
          feature representation. In this case, d will equal the number of
          clusters or equivalently the number of entries in each image's
          histogram (vocab_size) below.
  """
    # load vocabulary
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    # dummy features variable

    vocab_size = vocab.shape[0]
    feats = []

    for i in range(len(image_paths)):
        image = load_image_gray(image_paths[i]).astype('float32')
        [locations, SIFT_features] = vlfeat.sift.dsift(image,
                                                       fast=True,
                                                       step=10)
        SIFT_features = SIFT_features.astype('float32')

        Hist = np.zeros(vocab_size)
        D = sklearn_pairwise.pairwise_distances(SIFT_features, vocab)
        closest = np.argmin(D, axis=1)  # nearest visual word per descriptor
        for j in closest:
            Hist[j] += 1

        Hist = Hist / np.linalg.norm(Hist)

        feats.append(Hist)

        # Alternative: quantize with vlfeat and histogram the assignments:
        #   assignments = vlfeat.kmeans.kmeans_quantize(SIFT_features, vocab)
        #   Hist = np.bincount(assignments, minlength=vocab_size).astype(float)
        #   Hist = Hist / np.linalg.norm(Hist)
        #   feats.append(Hist)

    feats = np.array(feats)

    return feats
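# Quick usage check for this implementation (hypothetical paths and vocab
# file): each row of the returned matrix is an L2-normalized histogram.
feats = get_bags_of_sifts(['data/test/img_0001.jpg'], 'vocab.pkl')
print(feats.shape)               # (1, vocab_size)
print(np.linalg.norm(feats[0]))  # ~1.0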
def build_vocabulary(image_paths, vocab_size):
    """
  This function will sample SIFT descriptors from the training images,
  cluster them with kmeans, and then return the cluster centers.

  Useful functions:
  -   Use load_image(path) to load RGB images and load_image_gray(path) to load
          grayscale images
  -   frames, descriptors = vlfeat.sift.dsift(img)
        http://www.vlfeat.org/matlab/vl_dsift.html
          -  frames is a N x 2 matrix of locations, which can be thrown away
          here (but possibly used for extra credit in get_bags_of_sifts if
          you're making a "spatial pyramid").
          -  descriptors is a N x 128 matrix of SIFT features
        Note: there are step, bin size, and smoothing parameters you can
        manipulate for dsift(). We recommend debugging with the 'fast'
        parameter. This approximate version of SIFT is about 20 times faster to
        compute. Also, be sure not to use the default value of step size. It
        will be very slow and you'll see relatively little performance gain
        from extremely dense sampling. You are welcome to use your own SIFT
        feature code! It will probably be slower, though.
  -   cluster_centers = vlfeat.kmeans.kmeans(X, K)
          http://www.vlfeat.org/matlab/vl_kmeans.html
            -  X is a N x d numpy array of sampled SIFT features, where N is
               the number of features sampled. N should be pretty large!
            -  K is the number of clusters desired (vocab_size)
               cluster_centers is a K x d matrix of cluster centers. This is
               your vocabulary.

  Args:
  -   image_paths: list of image paths.
  -   vocab_size: size of vocabulary

  Returns:
  -   vocab: This is a vocab_size x d numpy array (vocabulary). Each row is a
      cluster center / visual word
  """
    # Load images from the training set. To save computation time, you don't
    # necessarily need to sample from all images, although it would be better
    # to do so. You can randomly sample the descriptors from each image to save
    # memory and speed up the clustering. Or you can simply call vl_dsift with
    # a large step size here, but a smaller step size in get_bags_of_sifts.
    #
    # For each loaded image, get some SIFT features. You don't have to get as
    # many SIFT features as you will in get_bags_of_sift, because you're only
    # trying to get a representative sample here.
    #
    # Once you have tens of thousands of SIFT features from many training
    # images, cluster them with kmeans. The resulting centroids are now your
    # visual word vocabulary.

    dim = 128  # length of the SIFT descriptors that you are going to compute.

    # stack dense SIFT descriptors from every training image
    SIFT = []
    for y in range(len(image_paths)):
        image = load_image_gray(image_paths[y])
        frames, descriptors = vlfeat.sift.dsift(image, step=15, fast=True)
        SIFT.append(descriptors)

    SIFT = np.vstack(SIFT).astype(np.float32)

    # cluster the descriptors; the centroids are the visual-word vocabulary
    vocab = vlfeat.kmeans.kmeans(SIFT, vocab_size)

    return vocab
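# Per the docstrings, the vocabulary is pickled so it is not rebuilt on every
# run; a minimal sketch (train_paths and the filename are hypothetical):
vocab = build_vocabulary(train_paths, vocab_size=200)
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)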
def build_vocabulary(image_paths, vocab_size):
    """
    This function will sample SIFT descriptors from the training images,
    cluster them with kmeans, and then return the cluster centers.

    Useful functions:
    -   Use load_image(path) to load RGB images and load_image_gray(path) to load
          grayscale images
    -   frames, descriptors = vlfeat.sift.dsift(img)
        http://www.vlfeat.org/matlab/vl_dsift.html
          -  frames is a N x 2 matrix of locations, which can be thrown away
          here (but possibly used for extra credit in get_bags_of_sifts if
          you're making a "spatial pyramid").
          -  descriptors is a N x 128 matrix of SIFT features
        Note: there are step, bin size, and smoothing parameters you can
        manipulate for dsift(). We recommend debugging with the 'fast'
        parameter. This approximate version of SIFT is about 20 times faster to
        compute. Also, be sure not to use the default value of step size. It
        will be very slow and you'll see relatively little performance gain
        from extremely dense sampling. You are welcome to use your own SIFT
        feature code! It will probably be slower, though.
    -   cluster_centers = vlfeat.kmeans.kmeans(X, K)
          http://www.vlfeat.org/matlab/vl_kmeans.html
            -  X is a N x d numpy array of sampled SIFT features, where N is
               the number of features sampled. N should be pretty large!
            -  K is the number of clusters desired (vocab_size)
               cluster_centers is a K x d matrix of cluster centers. This is
               your vocabulary.

    Args:
    -   image_paths: list of image paths.
    -   vocab_size: size of vocabulary

    Returns:
    -   vocab: This is a vocab_size x d numpy array (vocabulary). Each row is a
      cluster center / visual word
    """
    # Load images from the training set. To save computation time, you don't
    # necessarily need to sample from all images, although it would be better
    # to do so. You can randomly sample the descriptors from each image to save
    # memory and speed up the clustering. Or you can simply call vl_dsift with
    # a large step size here, but a smaller step size in get_bags_of_sifts.
    #
    # For each loaded image, get some SIFT features. You don't have to get as
    # many SIFT features as you will in get_bags_of_sift, because you're only
    # trying to get a representative sample here.
    #
    # Once you have tens of thousands of SIFT features from many training
    # images, cluster them with kmeans. The resulting centroids are now your
    # visual word vocabulary.

    dim = 128      # length of the SIFT descriptors that you are going to compute.
    vocab = np.zeros((vocab_size,dim))

    #############################################################################
    # TODO: YOUR CODE HERE                                                      #
    #############################################################################
    
    # Loop over the images and randomly sample descriptors from each one,
    # so the total comes to roughly 20,000 SIFT features
    stack_descriptors = []
    sample_size = 20000 // len(image_paths)

    for img_path in image_paths:
        img = load_image_gray(img_path)
        _, descriptors = vlfeat.sift.dsift(img, fast=True, step=20)

        sample_indexes = np.random.permutation(len(descriptors))[:sample_size]
        stack_descriptors.append(descriptors[sample_indexes])

    # np.vstack copes with images that yield fewer than sample_size descriptors
    stack_descriptors = np.vstack(stack_descriptors)

    # k-means clustering to find the centers (assumes sklearn.cluster.KMeans
    # is imported)
    kmeans = KMeans(n_clusters=vocab_size, random_state=0).fit(stack_descriptors)
    vocab = kmeans.cluster_centers_

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return vocab
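# The docstring's vlfeat route is a drop-in alternative to sklearn's KMeans
# here; a one-line sketch on the same stacked descriptors:
#   vocab = vlfeat.kmeans.kmeans(stack_descriptors.astype('float32'), vocab_size)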
def get_bags_of_sifts(image_paths, vocab_filename):
    """
    This feature representation is described in the handout, lecture
    materials, and Szeliski chapter 14.
    You will want to construct SIFT features here in the same way you
    did in build_vocabulary() (except for possibly changing the sampling
    rate) and then assign each local feature to its nearest cluster center
    and build a histogram indicating how many times each cluster was used.
    Don't forget to normalize the histogram, or else a larger image with more
    SIFT features will look very different from a smaller version of the same
    image.

    Useful functions:
    -   Use load_image(path) to load RGB images and load_image_gray(path) to load
          grayscale images
    -   frames, descriptors = vlfeat.sift.dsift(img)
          http://www.vlfeat.org/matlab/vl_dsift.html
        frames is an M x 2 matrix of locations, which can be thrown away here
          (but possibly used for extra credit in get_bags_of_sifts if you're
          making a "spatial pyramid").
        descriptors is a M x 128 matrix of SIFT features
          note: there are step, bin size, and smoothing parameters you can
          manipulate for dsift(). We recommend debugging with the 'fast'
          parameter. This approximate version of SIFT is about 20 times faster
          to compute. Also, be sure not to use the default value of step size.
          It will be very slow and you'll see relatively little performance
          gain from extremely dense sampling. You are welcome to use your own
          SIFT feature code! It will probably be slower, though.
    -   assignments = vlfeat.kmeans.kmeans_quantize(data, vocab)
          finds the cluster assignments for features in data
            -  data is a M x d matrix of image features
            -  vocab is the vocab_size x d matrix of cluster centers
            (vocabulary)
            -  assignments is a Mx1 array of assignments of feature vectors to
            nearest cluster centers, each element is an integer in
            [0, vocab_size)

    Args:
    -   image_paths: paths to N images
    -   vocab_filename: Path to the precomputed vocabulary.
          This function assumes that vocab_filename exists and contains a
          vocab_size x 128 ndarray 'vocab' where each row is a kmeans centroid
          or visual word. This ndarray is saved to disk rather than passed in
          as a parameter to avoid recomputing the vocabulary every run.

    Returns:
    -   image_feats: N x d matrix, where d is the dimensionality of the
          feature representation. In this case, d will equal the number of
          clusters or equivalently the number of entries in each image's
          histogram (vocab_size) below.
    """
    # load vocabulary
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    # dummy features variable
    feats = []

    #############################################################################
    # TODO: YOUR CODE HERE                                                      #
    #############################################################################
    
    for img_path in image_paths:
        img = load_image_gray(img_path)
        _, descriptors = vlfeat.sift.dsift(img, fast = True, step = 10)
        
        # Equivalent to k-means center assignment:
        # first, compute each descriptor's distance to the centers in vocab

        D = sklearn_pairwise.pairwise_distances(descriptors.astype('float64'),
                                                vocab.astype('float64'),
                                                metric='euclidean')  # (N, vocab_size)
        
        # second, assign each descriptor to its nearest center, histogram the
        # labels, normalize, and append

        labels = np.argmin(D, axis=1)  # (N, vocab_size) --argmin--> (N,)

        hists = np.bincount(labels, minlength=len(vocab))
        hists = hists / np.linalg.norm(hists)

        feats.append(hists)

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return np.array(feats).astype('float64')
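# Typical call pattern for this pair of functions (hypothetical path lists;
# the vocabulary is assumed to have been built and pickled once, as sketched
# after build_vocabulary above):
train_feats = get_bags_of_sifts(train_paths, 'vocab.pkl')
test_feats = get_bags_of_sifts(test_paths, 'vocab.pkl')
# both are (N, vocab_size) float64 arrays with L2-normalized rows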