def __compute_vocabulary(cx2_desc, train_cxs, vocab_size, cache_dir=None):
    '''Computes a vocabulary of size vocab_size given a set of training data'''
    # Read params
    akm_flann_params   = params.BOW_AKMEANS_FLANN_PARAMS
    words_flann_params = params.BOW_WORDS_FLANN_PARAMS
    max_iters          = params.AKMEANS_MAX_ITERS
    # Make a training set of descriptors to build the vocabulary
    tx2_desc   = cx2_desc[train_cxs]
    train_desc = np.vstack(tx2_desc)
    num_train_desc = train_desc.shape[0]
    if vocab_size > num_train_desc:
        msg = '[mc2] vocab_size(%r) > #train_desc(%r)' % (vocab_size, num_train_desc)
        helpers.printWARN(msg)
        vocab_size = num_train_desc / 2
    # Cluster descriptors into a visual vocabulary
    matcher_uid = params.get_matcher_uid(with_train=True, with_indx=False)
    words_uid = 'words_' + matcher_uid
    _, words = algos.precompute_akmeans(train_desc, vocab_size, max_iters,
                                        akm_flann_params, cache_dir,
                                        force_recomp=False, same_data=False,
                                        uid=words_uid)
    # Index the vocabulary for fast nearest neighbor search
    words_flann = algos.precompute_flann(words, cache_dir, uid=words_uid,
                                         flann_params=words_flann_params)
    return words, words_flann
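# --- Illustrative sketch (not part of the pipeline) --------------------------
# A minimal stand-in for __compute_vocabulary above: stack the training
# descriptors and cluster them into vocab_size visual words. The real code
# clusters with algos.precompute_akmeans (approximate k-means over a FLANN
# index, with caching); scipy.cluster.vq.kmeans2 is assumed here purely for
# illustration, and this helper is hypothetical and not called elsewhere.
def _sketch_compute_vocabulary(tx2_desc, vocab_size):
    import numpy as np
    from scipy.cluster.vq import kmeans2
    train_desc = np.vstack(tx2_desc).astype(np.float64)
    num_train_desc = train_desc.shape[0]
    # Mirror the guard above: never ask for more words than training descriptors
    if vocab_size > num_train_desc:
        vocab_size = num_train_desc // 2
    # Each row of words is one visual word (a cluster center in descriptor space)
    words, _ = kmeans2(train_desc, vocab_size, minit='points')
    return words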
def __index_database_to_vocabulary(cx2_desc, words, words_flann, indexed_cxs, cache_dir):
    '''Assigns each database chip a visual-vector and returns data for the inverted file'''
    # TODO: Save precomputations here
    print('[mc2] Assigning each database chip a bag-of-words vector')
    num_indexed = len(indexed_cxs)
    ax2_cx, ax2_fx, ax2_desc = __aggregate_descriptors(cx2_desc, indexed_cxs)
    # Build UID
    matcher_uid = params.get_matcher_uid()
    data_uid = helpers.hashstr(ax2_desc)
    uid = data_uid + '_' + matcher_uid
    try:
        cx2_vvec = io.smart_load(cache_dir, 'cx2_vvec', uid, '.cPkl')  # sparse
        wx2_cxs  = io.smart_load(cache_dir, 'wx2_cxs', uid, '.npy')
        wx2_fxs  = io.smart_load(cache_dir, 'wx2_fxs', uid, '.npy')
        wx2_idf  = io.smart_load(cache_dir, 'wx2_idf', uid, '.npy')
        print('[mc2] successful cache load: vocabulary indexed database.')
        return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf
    #helpers.CacheException as ex:
    except IOError as ex:
        print(repr(ex))
    # Assign each descriptor to its nearest visual word
    print('[mc2] quantizing each descriptor to a word')
    print('[mc2] ...this may take awhile with no indication of progress')
    tt1 = helpers.Timer('quantizing each descriptor to a word')
    ax2_wx, _ = words_flann.nn_index(ax2_desc, 1, checks=128)
    tt1.toc()
    # Build inverse word to ax map
    tt2 = helpers.Timer('database_indexing')
    print('')
    print('[mc2] building inverse word to ax map')
    wx2_axs = [[] for _ in xrange(len(words))]
    for ax, wx in enumerate(ax2_wx):
        wx2_axs[wx].append(ax)
    # Compute inverted file: words -> database
    print('[mc2] building inverted file word -> database')
    wx2_cxs = np.array([[ax2_cx[ax] for ax in ax_list] for ax_list in wx2_axs])
    wx2_fxs = np.array([[ax2_fx[ax] for ax in ax_list] for ax_list in wx2_axs])
    # Build sparse visual vectors with term frequency weights
    print('[mc2] building sparse visual words')
    coo_cols   = ax2_wx
    coo_rows   = ax2_cx
    coo_values = np.ones(len(ax2_cx), dtype=BOW_DTYPE)
    coo_format = (coo_values, (coo_rows, coo_cols))
    coo_cx2_vvec = spsparse.coo_matrix(coo_format, dtype=np.float, copy=True)
    cx2_tf_vvec  = spsparse.csr_matrix(coo_cx2_vvec, copy=False)
    # Compute idf_w = log(Number of documents / Number of docs containing word_j)
    print('[mc2] computing tf-idf')
    wx2_df  = np.array([len(set(cxs)) + 1 for cxs in wx2_cxs], dtype=np.float)
    wx2_idf = np.array(np.log2(np.float(num_indexed) / wx2_df))
    # Compute tf-idf
    print('[mc2] preweighting with tf-idf')
    cx2_tfidf_vvec = algos.sparse_multiply_rows(cx2_tf_vvec, wx2_idf)
    # Normalize
    print('[mc2] normalizing')
    cx2_vvec = algos.sparse_normalize_rows(cx2_tfidf_vvec)
    tt2.toc()
    # Save to cache
    print('[mc2] saving to cache')
    r'''
    input_data = ax2_desc
    data = cx2_vvec
    uid = 'cx2_vvec' + matcher_uid
    '''
    io.smart_save(cx2_vvec, cache_dir, 'cx2_vvec', uid, '.cPkl')  # sparse
    io.smart_save(wx2_cxs,  cache_dir, 'wx2_cxs', uid, '.npy')
    io.smart_save(wx2_fxs,  cache_dir, 'wx2_fxs', uid, '.npy')
    io.smart_save(wx2_idf,  cache_dir, 'wx2_idf', uid, '.npy')
    return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf
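# --- Illustrative sketch (not part of the pipeline) --------------------------
# A minimal stand-in for __index_database_to_vocabulary above, showing the core
# math without the caching, timers, or FLANN index: quantize each descriptor to
# its nearest word, build the inverted file, then form tf-idf weighted,
# L2-normalized bag-of-words vectors. scipy.cluster.vq.vq replaces the FLANN
# nn_index call and plain scipy.sparse operations replace
# algos.sparse_multiply_rows / algos.sparse_normalize_rows; this helper and its
# arguments are hypothetical and only meant to illustrate the logic.
def _sketch_index_database(words, ax2_desc, ax2_cx, num_chips):
    import numpy as np
    import scipy.sparse as spsparse
    from scipy.cluster.vq import vq
    # Quantize: each aggregated descriptor -> index of its nearest visual word
    ax2_wx, _ = vq(ax2_desc.astype(np.float64), words.astype(np.float64))
    # Inverted file: word index -> list of aggregated descriptor indexes
    wx2_axs = [[] for _ in range(len(words))]
    for ax, wx in enumerate(ax2_wx):
        wx2_axs[wx].append(ax)
    # Term-frequency matrix: one row per chip (cx), one column per word (wx)
    ones = np.ones(len(ax2_wx), dtype=np.float64)
    cx2_tf = spsparse.coo_matrix((ones, (ax2_cx, ax2_wx)),
                                 shape=(num_chips, len(words))).tocsr()
    # idf_w = log2(num_chips / (1 + number of chips containing word w))
    wx2_df = np.array([1 + len(set(ax2_cx[ax] for ax in axs)) for axs in wx2_axs],
                      dtype=np.float64)
    wx2_idf = np.log2(float(num_chips) / wx2_df)
    # Scale each word column by its idf, then L2-normalize each chip's row
    cx2_tfidf = cx2_tf.dot(spsparse.diags(wx2_idf))
    row_norms = np.sqrt(np.asarray(cx2_tfidf.multiply(cx2_tfidf).sum(axis=1)).ravel())
    row_norms[row_norms == 0] = 1.0
    cx2_vvec = spsparse.diags(1.0 / row_norms).dot(cx2_tfidf)
    return cx2_vvec, wx2_axs, wx2_idf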