Example no. 1
def __compute_vocabulary(cx2_desc, train_cxs, vocab_size, cache_dir=None):
    '''Computes a vocabulary of size vocab_size given a set of training data'''
    # Read params
    akm_flann_params = params.BOW_AKMEANS_FLANN_PARAMS
    words_flann_params = params.BOW_WORDS_FLANN_PARAMS
    max_iters = params.AKMEANS_MAX_ITERS
    # Make a training set of descriptors to build the vocabulary
    tx2_desc = cx2_desc[train_cxs]
    train_desc = np.vstack(tx2_desc)
    num_train_desc = train_desc.shape[0]
    if vocab_size > num_train_desc:
        msg = '[mc2] vocab_size(%r) > #train_desc(%r)' % (vocab_size,
                                                          num_train_desc)
        helpers.printWARN(msg)
        vocab_size = num_train_desc // 2
    # Cluster descriptors into a visual vocabulary
    matcher_uid = params.get_matcher_uid(with_train=True, with_indx=False)
    words_uid = 'words_' + matcher_uid
    _, words = algos.precompute_akmeans(train_desc,
                                        vocab_size,
                                        max_iters,
                                        akm_flann_params,
                                        cache_dir,
                                        force_recomp=False,
                                        same_data=False,
                                        uid=words_uid)
    # Index the vocabulary for fast nearest neighbor search
    words_flann = algos.precompute_flann(words,
                                         cache_dir,
                                         uid=words_uid,
                                         flann_params=words_flann_params)
    return words, words_flann
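
The helpers algos.precompute_akmeans and algos.precompute_flann (together with the params module) are project-specific and not shown here. For reference, below is a minimal, self-contained sketch of the same vocabulary-construction step, using scikit-learn's MiniBatchKMeans and scipy's cKDTree as stand-ins for the cached approximate k-means and the FLANN index; every name in it is illustrative, not the project's API.

# Sketch: cluster training descriptors into visual words and index them for
# fast nearest-neighbor lookup (stand-ins for algos.precompute_akmeans /
# algos.precompute_flann; parameter names are illustrative).
import numpy as np
from scipy.spatial import cKDTree
from sklearn.cluster import MiniBatchKMeans

def build_vocabulary_sketch(train_desc, vocab_size, max_iters=100, seed=0):
    '''Clusters training descriptors into vocab_size visual words and
       returns the words plus a nearest-neighbor index over them.'''
    train_desc = np.asarray(train_desc, dtype=np.float32)
    num_train_desc = train_desc.shape[0]
    if vocab_size > num_train_desc:
        print('warning: vocab_size(%r) > #train_desc(%r)' % (vocab_size, num_train_desc))
        vocab_size = num_train_desc // 2
    km = MiniBatchKMeans(n_clusters=vocab_size, max_iter=max_iters, random_state=seed)
    km.fit(train_desc)
    words = km.cluster_centers_    # (vocab_size, desc_dim) array of visual words
    words_index = cKDTree(words)   # index the words for fast NN queries
    return words, words_index

A descriptor is then quantized with words_index.query(desc, k=1), which plays the role of words_flann.nn_index in the snippets here.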
Example no. 2
def __index_database_to_vocabulary(cx2_desc, words, words_flann, indexed_cxs,
                                   cache_dir):
    '''Assigns each database chip a visual-vector and returns 
       data for the inverted file'''
    # TODO: Save precomputations here
    print('[mc2] Assigning each database chip a bag-of-words vector')
    num_indexed = len(indexed_cxs)
    ax2_cx, ax2_fx, ax2_desc = __aggregate_descriptors(cx2_desc, indexed_cxs)
    # Build UID
    matcher_uid = params.get_matcher_uid()
    data_uid = helpers.hashstr(ax2_desc)
    uid = data_uid + '_' + matcher_uid
    try:
        cx2_vvec = io.smart_load(cache_dir, 'cx2_vvec', uid, '.cPkl')  #sparse
        wx2_cxs = io.smart_load(cache_dir, 'wx2_cxs', uid, '.npy')
        wx2_fxs = io.smart_load(cache_dir, 'wx2_fxs', uid, '.npy')
        wx2_idf = io.smart_load(cache_dir, 'wx2_idf', uid, '.npy')
        print('[mc2] successful cache load: vocabulary-indexed database.')
        return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf
    except IOError as ex:  # could also catch helpers.CacheException here
        print(repr(ex))

    print('[mc2] quantizing each descriptor to a word')
    # Assign each descriptor to its nearest visual word
    print('[mc2] ...this may take a while with no indication of progress')
    tt1 = helpers.Timer('quantizing each descriptor to a word')
    ax2_wx, _ = words_flann.nn_index(ax2_desc, 1, checks=128)
    tt1.toc()
    # Build inverse word to ax
    tt2 = helpers.Timer('database_indexing')
    print('')
    print('[mc2] building inverse word to ax map')
    wx2_axs = [[] for _ in range(len(words))]
    for ax, wx in enumerate(ax2_wx):
        wx2_axs[wx].append(ax)
    # Compute inverted file: words -> database
    print('[mc2] building inverted file word -> database')
    wx2_cxs = np.array([[ax2_cx[ax] for ax in ax_list] for ax_list in wx2_axs],
                       dtype=object)
    wx2_fxs = np.array([[ax2_fx[ax] for ax in ax_list] for ax_list in wx2_axs],
                       dtype=object)
    # Build sparse visual vectors with term frequency weights
    print('[mc2] building sparse visual words')
    coo_cols = ax2_wx
    coo_rows = ax2_cx
    coo_values = np.ones(len(ax2_cx), dtype=BOW_DTYPE)
    coo_format = (coo_values, (coo_rows, coo_cols))
    coo_cx2_vvec = spsparse.coo_matrix(coo_format, dtype=np.float64, copy=True)
    cx2_tf_vvec = spsparse.csr_matrix(coo_cx2_vvec, copy=False)
    # Compute idf_w = log2(num_documents / (num_docs_containing_word_w + 1))
    print('[mc2] computing tf-idf')
    wx2_df = np.array([len(set(cxs)) + 1 for cxs in wx2_cxs], dtype=np.float64)
    wx2_idf = np.log2(float(num_indexed) / wx2_df)
    # Compute tf-idf
    print('[mc2] preweighting with tf-idf')
    cx2_tfidf_vvec = algos.sparse_multiply_rows(cx2_tf_vvec, wx2_idf)
    # Normalize
    print('[mc2] normalizing')
    cx2_vvec = algos.sparse_normalize_rows(cx2_tfidf_vvec)
    tt2.toc()
    # Save to cache
    print('[mc2] saving to cache')
    r'''
    input_data = ax2_desc
    data = cx2_vvec
    uid='cx2_vvec'+matcher_uid
    '''
    io.smart_save(cx2_vvec, cache_dir, 'cx2_vvec', uid, '.cPkl')  #sparse
    io.smart_save(wx2_cxs, cache_dir, 'wx2_cxs', uid, '.npy')
    io.smart_save(wx2_fxs, cache_dir, 'wx2_fxs', uid, '.npy')
    io.smart_save(wx2_idf, cache_dir, 'wx2_idf', uid, '.npy')
    return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf
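
The weighting and normalization above rely on the project helpers algos.sparse_multiply_rows and algos.sparse_normalize_rows. For reference, here is a minimal sketch of the same indexing math written directly against scipy.sparse, with scipy's cKDTree standing in for the FLANN quantizer; all names are illustrative, and the caching layer (io.smart_save / io.smart_load) is omitted.

# Sketch: quantize descriptors to words, count them into a sparse term-frequency
# matrix, weight columns by idf, and L2-normalize each row (chip).
import numpy as np
import scipy.sparse as spsparse
from scipy.spatial import cKDTree

def index_database_sketch(words, ax2_desc, ax2_cx, num_chips):
    '''Returns a (num_chips x num_words) tf-idf weighted, L2-normalized
       sparse matrix of bag-of-words vectors, one row per chip.'''
    num_words = len(words)
    # Assign each descriptor to its nearest visual word (stand-in for FLANN).
    _, ax2_wx = cKDTree(words).query(np.asarray(ax2_desc, dtype=np.float64), k=1)
    # Sparse term-frequency matrix: row = chip index, column = word index.
    # Duplicate (chip, word) pairs are summed, giving per-chip word counts.
    ones = np.ones(len(ax2_cx), dtype=np.float64)
    cx2_tf = spsparse.csr_matrix((ones, (ax2_cx, ax2_wx)),
                                 shape=(num_chips, num_words))
    # idf_w = log2(num_chips / (num_chips_containing_word_w + 1))
    wx2_df = cx2_tf.getnnz(axis=0) + 1
    wx2_idf = np.log2(float(num_chips) / wx2_df)
    # Scale each column by its idf weight via a diagonal matrix.
    cx2_tfidf = cx2_tf.dot(spsparse.diags(wx2_idf))
    # L2-normalize rows so chips can be compared by cosine similarity.
    row_norms = np.sqrt(np.asarray(cx2_tfidf.multiply(cx2_tfidf).sum(axis=1))).ravel()
    row_norms[row_norms == 0] = 1.0
    cx2_vvec = spsparse.diags(1.0 / row_norms).dot(cx2_tfidf)
    return cx2_vvec

The idf weighting down-weights words that occur in many chips, and the row-wise L2 normalization makes the dot product of two visual vectors a cosine similarity; the inverted-file arrays wx2_cxs and wx2_fxs in the original simply record, per word, which chips and feature indices were assigned to it, independently of this weighting.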