Code example #1
File: bagofwords.py  Project: Erotemic/hotspotter
import scipy.sparse as spsparse  # NumPy, the FLANN index, and the project's `algos` module are assumed in scope

def __quantize_desc_to_tfidf_vvec(desc, wx2_idf, words, words_flann):
    # Assign each descriptor to its nearest visual word via the FLANN index
    fx2_wx, _ = words_flann.nn_index(desc, 1, checks=128)
    # TODO: soft assignment here
    # Build a sparse visual vector with term-frequency weights
    lil_vvec = spsparse.lil_matrix((len(words), 1))
    for wx in fx2_wx:
        lil_vvec[wx, 0] += 1
    tf_vvec = spsparse.csr_matrix(lil_vvec.T, copy=False)
    # Weight term frequencies by inverse document frequency (tf-idf)
    tfidf_vvec = algos.sparse_multiply_rows(tf_vvec, wx2_idf)
    # L2-normalize the tf-idf vector
    vvec = algos.sparse_normalize_rows(tfidf_vvec)
    return vvec, fx2_wx
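Below is a minimal, self-contained sketch of the same quantize-weight-normalize pipeline using only NumPy and SciPy: a brute-force nearest-neighbor search stands in for the FLANN index, and the inline weighting and normalization stand in for the project's algos.sparse_multiply_rows and algos.sparse_normalize_rows helpers. The function name quantize_to_tfidf and the toy data are hypothetical.

import numpy as np
import scipy.sparse as spsparse

def quantize_to_tfidf(desc, wx2_idf, words):
    # Brute-force nearest visual word per descriptor (stand-in for FLANN)
    dists = ((desc[:, None, :] - words[None, :, :]) ** 2).sum(axis=2)
    fx2_wx = dists.argmin(axis=1)
    # Term-frequency counts over the vocabulary
    counts = np.bincount(fx2_wx, minlength=len(words)).astype(np.float64)
    # tf-idf weighting (stand-in for algos.sparse_multiply_rows)
    tfidf = counts * wx2_idf
    # L2 normalization (stand-in for algos.sparse_normalize_rows)
    norm = np.linalg.norm(tfidf)
    if norm > 0:
        tfidf /= norm
    # Return the same sparse row-vector form the original produces
    return spsparse.csr_matrix(tfidf), fx2_wx

# Toy usage: 10 random 8-dim descriptors against a 4-word vocabulary
rng = np.random.default_rng(0)
words = rng.normal(size=(4, 8))
desc = rng.normal(size=(10, 8))
vvec, fx2_wx = quantize_to_tfidf(desc, np.ones(4), words)
print(vvec.shape, fx2_wx)  # (1, 4) and each descriptor's assigned word index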
Code example #2
File: bagofwords.py  Project: Erotemic/hotspotter
def __index_database_to_vocabulary(cx2_desc, words, words_flann, indexed_cxs,
                                   cache_dir):
    '''Assigns each database chip a visual vector and returns
       data for the inverted file'''
    # TODO: Save precomputations here
    print('[mc2] Assigning each database chip a bag-of-words vector')
    num_indexed = len(indexed_cxs)
    ax2_cx, ax2_fx, ax2_desc = __aggregate_descriptors(cx2_desc, indexed_cxs)
    # Build UID
    matcher_uid = params.get_matcher_uid()
    data_uid = helpers.hashstr(ax2_desc)
    uid = data_uid + '_' + matcher_uid
    try:
        cx2_vvec = io.smart_load(cache_dir, 'cx2_vvec', uid, '.cPkl')  #sparse
        wx2_cxs = io.smart_load(cache_dir, 'wx2_cxs', uid, '.npy')
        wx2_fxs = io.smart_load(cache_dir, 'wx2_fxs', uid, '.npy')
        wx2_idf = io.smart_load(cache_dir, 'wx2_idf', uid, '.npy')
        print('[mc2] successful cache load: vocabulary-indexed database.')
        return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf
    except IOError as ex:  # TODO: catch helpers.CacheException instead
        print(repr(ex))

    print('[mc2] quantizing each descriptor to a word')
    # Assign each descriptor to its nearest visual word
    print('[mc2] ...this may take a while with no indication of progress')
    tt1 = helpers.Timer('quantizing each descriptor to a word')
    ax2_wx, _ = words_flann.nn_index(ax2_desc, 1, checks=128)
    tt1.toc()
    # Build inverse word to ax
    tt2 = helpers.Timer('database_indexing')
    print('')
    print('[mc2] building inverse word to ax map')
    wx2_axs = [[] for _ in range(len(words))]
    for ax, wx in enumerate(ax2_wx):
        wx2_axs[wx].append(ax)
    # Compute inverted file: words -> database
    print('[mc2] building inverted file word -> database')
    wx2_cxs = np.array([[ax2_cx[ax] for ax in ax_list] for ax_list in wx2_axs], dtype=object)
    wx2_fxs = np.array([[ax2_fx[ax] for ax in ax_list] for ax_list in wx2_axs], dtype=object)
    # Build sparse visual vectors with term frequency weights
    print('[mc2] building sparse visual words')
    coo_cols = ax2_wx
    coo_rows = ax2_cx
    coo_values = np.ones(len(ax2_cx), dtype=BOW_DTYPE)
    coo_format = (coo_values, (coo_rows, coo_cols))
    coo_cx2_vvec = spsparse.coo_matrix(coo_format, dtype=np.float64, copy=True)
    cx2_tf_vvec = spsparse.csr_matrix(coo_cx2_vvec, copy=False)
    # Compute idf_w = log2(num documents / (num documents containing word_w + 1));
    # the +1 smoothing guards against words that appear in no document
    print('[mc2] computing tf-idf')
    wx2_df = np.array([len(set(cxs)) + 1 for cxs in wx2_cxs], dtype=np.float64)
    wx2_idf = np.log2(num_indexed / wx2_df)
    # Compute tf-idf
    print('[mc2] preweighting with tf-idf')
    cx2_tfidf_vvec = algos.sparse_multiply_rows(cx2_tf_vvec, wx2_idf)
    # Normalize
    print('[mc2] normalizing')
    cx2_vvec = algos.sparse_normalize_rows(cx2_tfidf_vvec)
    tt2.toc()
    # Save to cache
    print('[mc2] saving to cache')
    io.smart_save(cx2_vvec, cache_dir, 'cx2_vvec', uid, '.cPkl')  #sparse
    io.smart_save(wx2_cxs, cache_dir, 'wx2_cxs', uid, '.npy')
    io.smart_save(wx2_fxs, cache_dir, 'wx2_fxs', uid, '.npy')
    io.smart_save(wx2_idf, cache_dir, 'wx2_idf', uid, '.npy')
    return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf
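For reference, here is a runnable sketch of this indexing pass end to end on made-up data: build the word-to-descriptor inverted file, construct the chip-by-word term-frequency matrix in COO format, derive smoothed idf weights from document frequencies, then tf-idf-weight and L2-normalize each row. FLANN quantization is skipped (word assignments are given directly), caching is omitted, and the dense weighting/normalization steps stand in for the sparse algos helpers; every name and value below is illustrative.

import numpy as np
import scipy.sparse as spsparse

# Hypothetical toy data: 7 aggregated descriptors across 3 chips, 5-word vocabulary
num_words, num_indexed = 5, 3
ax2_cx = np.array([0, 0, 1, 1, 1, 2, 2])  # chip index of each descriptor
ax2_wx = np.array([0, 2, 2, 2, 4, 0, 1])  # assigned visual word of each descriptor

# Inverted file: word index -> descriptor indexes -> chip indexes
wx2_axs = [[] for _ in range(num_words)]
for ax, wx in enumerate(ax2_wx):
    wx2_axs[wx].append(ax)
wx2_cxs = [[ax2_cx[ax] for ax in axs] for axs in wx2_axs]

# Sparse chip-by-word term-frequency matrix built in COO format
coo_values = np.ones(len(ax2_cx), dtype=np.float64)
cx2_tf = spsparse.coo_matrix((coo_values, (ax2_cx, ax2_wx)),
                             shape=(num_indexed, num_words)).tocsr()

# idf_w = log2(num documents / (num documents containing word w + 1))
wx2_df = np.array([len(set(cxs)) + 1 for cxs in wx2_cxs], dtype=np.float64)
wx2_idf = np.log2(num_indexed / wx2_df)

# tf-idf weighting and row-wise L2 normalization
# (dense here for clarity; the project keeps these steps sparse via its algos helpers)
cx2_tfidf = cx2_tf.toarray() * wx2_idf
norms = np.linalg.norm(cx2_tfidf, axis=1, keepdims=True)
cx2_vvec = spsparse.csr_matrix(cx2_tfidf / np.maximum(norms, 1e-12))
print(cx2_vvec.toarray())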