Python LSHashの例、lshash.lshash.LSHash Pythonの例

コード例 #1

0

ファイルを表示

def lsh(p_hash_size, distance_func):
    """
    Approximate a KNN lookup for one randomly chosen vipno with LSHash.

    :param p_hash_size: factor multiplied by the number of vipno columns to
        obtain the hash size passed to LSHash
    :param distance_func: distance function forwarded to ``LSHash.query``
    :return: tuple ``(neighbours, query_vipno)`` where ``neighbours`` is the
        list of result vipnos with the first hit dropped and ``query_vipno``
        is the column label that was used as the query
    """
    datas_set, datas_matrix = get_data()
    # Number of columns in the matrix; every column is indexed as one vipno.
    vipno_nums = len(datas_matrix[0])

    # Position (not label) of the randomly selected query column.
    random_vipno = random.randint(0, vipno_nums - 1)

    # Input dimension is the length of a single column.
    engine = LSHash(int(vipno_nums * p_hash_size), len(datas_matrix[:, 0]))
    for column in range(vipno_nums):
        point = datas_matrix[:, column]
        # The column label travels along as extra data so that query results
        # can be reported as vipnos.
        label = datas_set.columns[column]
        engine.index(point, extra_data=label)

    # Ask for six hits: the first one is expected to be the query column
    # itself and is discarded below.
    hits = engine.query(datas_matrix[:, random_vipno],
                        num_results=6,
                        distance_func=distance_func)
    vipno_res = [hit[0][1] for hit in hits]

    print("distance func:", distance_func)
    print("knn output(from 1 to 5): {}".format(vipno_res[1:]))

    return vipno_res[1:], datas_set.columns[random_vipno]

コード例 #2

0

ファイルを表示

ファイル: app.py プロジェクト: vizlab-fsc/phylo

def build_index():
    """Build an LSHash index over every (hash, id) row returned by the session query.

    Returns:
        The populated ``LSHash`` instance; each indexed point carries the
        image id as its extra data.
    """
    index = LSHash(32, HASH_SIZE**2)
    rows = session.query(models.Image.hash, models.Image.id).all()
    for hex_hash, image_id in rows:
        flat_hash = hash.hex_to_hash(hex_hash).hash.flatten()
        # Reinterpret the flattened hash array as unsigned bytes before indexing.
        index.index(flat_hash.view(np.uint8), extra_data=image_id)
    return index

コード例 #3

0

ファイルを表示

    def constructfeature(self, hash_size, input_dim, num_hashtables):
        """Extract a feature vector per image and index the vectors with LSHash.

        Loads the checkpoint at ``self.network``, rebuilds the network from the
        metadata stored in it, runs feature extraction over the images found
        under ``self.img_dir`` and indexes every vector with its image path as
        extra data.

        Args:
            hash_size: hash length handed to ``LSHash`` (converted with ``int``).
            input_dim: feature dimension handed to ``LSHash`` (converted with ``int``).
            num_hashtables: number of hash tables (converted with ``int``).

        Returns:
            Tuple ``(feature_dict, lsh)``: a dict mapping image path to its
            feature vector, and the populated ``LSHash`` index.
        """
        import ast

        multiscale = '[1]'
        print(">> Loading network:\n>>>> '{}'".format(self.network))
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        state = torch.load(self.network)
        meta = state['meta']
        # Architecture, pooling, mean and std are required in the checkpoint
        # metadata; the remaining flags fall back to False when absent.
        net_params = {
            'architecture': meta['architecture'],
            'pooling': meta['pooling'],
            'local_whitening': meta.get('local_whitening', False),
            'regional': meta.get('regional', False),
            'whitening': meta.get('whitening', False),
            'mean': meta['mean'],
            'std': meta['std'],
            'pretrained': False,
        }
        # Network initialization from the stored parameters and weights.
        net = init_network(net_params)
        net.load_state_dict(state['state_dict'])
        print(">>>> loaded network: ")
        print(net.meta_repr())
        # Parse the scale list as a literal instead of evaluating it as code.
        ms = list(ast.literal_eval(multiscale))
        print(">>>> Evaluating scales: {}".format(ms))
        # Move the network to the GPU when one is available; always use eval mode.
        if torch.cuda.is_available():
            net.cuda()
        net.eval()

        # Input transform: tensor conversion followed by the normalisation
        # statistics recorded in the network metadata.
        normalize = transforms.Normalize(
            mean=net.meta['mean'],
            std=net.meta['std']
        )
        transform = transforms.Compose([
            transforms.ToTensor(),
            normalize
        ])

        # Extract one vector per database image.
        print('>> database images...')
        images = ImageProcess(self.img_dir).process()
        # NOTE(review): 1024 is presumably the image size expected by
        # extract_vectors - confirm against its definition.
        vecs, img_paths = extract_vectors(net, images, 1024, transform, ms=ms)
        # The transpose pairs each image path with one row; assumes vecs is
        # laid out as (dim, n_images) - TODO confirm.
        feature_dict = dict(zip(img_paths, list(vecs.detach().cpu().numpy().T)))

        # Index every feature vector, keeping the image path as extra data.
        lsh = LSHash(hash_size=int(hash_size), input_dim=int(input_dim), num_hashtables=int(num_hashtables))
        for img_path, vec in feature_dict.items():
            lsh.index(vec.flatten(), extra_data=img_path)

        # The feature dict and the index are returned, not persisted here.
        print("extract feature is done")
        return feature_dict, lsh

コード例 #4

0

ファイルを表示

ファイル: Question1.py プロジェクト: DigitalSprite/DataMiningClass

def getLSHashOutput(filename, hash_size, k):
    """Index every column of the matrix loaded from *filename* and print the
    LSHash query result for one randomly chosen column.

    ``hash_size`` is a factor: the hash length passed to LSHash is
    ``int(hash_size * number_of_columns)``. ``k + 1`` results are requested
    with the euclidean distance. Nothing is returned.
    """
    matrix = getMatrix(filename)
    first_row = matrix.iloc[0]
    total_num = len(first_row)
    first_column = matrix.iloc[:, 0]
    lsh = LSHash(hash_size=int(hash_size * total_num), input_dim=len(first_column))
    for position in range(total_num):
        lsh.index(input_point=matrix.iloc[:, position], extra_data=matrix.columns[position])
    out_num = rand.randint(0, total_num - 1)
    # The distance function is passed explicitly as 'euclidean' here.
    neighbours = lsh.query(query_point=matrix.iloc[:, out_num],
                           num_results=k + 1,
                           distance_func='euclidean')
    print(neighbours)

コード例 #5

0

ファイルを表示

def sphere():
    """Smoke-test LSHash on 1000 random unit vectors in three dimensions.

    Queries with a slightly perturbed copy of the first vector and asserts
    that at least ten hits come back and that the tenth hit's cosine distance
    lies in ``(0.0003, 0.05]``.
    """
    points = np.random.normal(size=(1000, 3))
    index = LSHash(10, 3, num_hashtables=5)
    for point in points:
        # In-place division: ``point`` is a row view, so ``points`` itself
        # ends up normalised, which the query below relies on.
        point /= np.linalg.norm(point)
        index.index(point)
    offset = np.array([-0.001, 0.001, -0.001])
    closest = index.query(points[0] + offset, distance_func="cosine")
    assert len(closest) >= 10
    tenth_distance = closest[9][-1]
    assert 0.05 >= tenth_distance > 0.0003

コード例 #6

0

ファイルを表示

    def build_index(self, k: int, hash_size: int = 10, num_hashtables: int = 1,
                    store_file: str = None, overwrite: bool = False):
        """Cluster all extracted features and index every image by its cluster histogram.

        Steps:
            1. Fit mini-batch K-means with ``k`` clusters on all previously
               extracted features and read the cluster label of each feature.
            2. For every image, count how many of its features fall into each
               cluster; this histogram is stored on the image's metadata.
            3. Index the histogram in an LSHash structure with the image name
               as extra data.

        Args:
            k: number of K-means clusters; also the LSHash input dimension.
            hash_size: length of the resulting binary hash.
            num_hashtables: number of hash tables used for lookups.
            store_file: path of the .npz file holding the random matrices,
                forwarded to LSHash as ``matrices_filename``.
            overwrite: forwarded to LSHash; whether an existing matrices file
                is overwritten.

        Returns:
            None. ``self._kmeans`` and ``self._lsh`` are set, and each entry of
            ``self._img_dict`` gains a ``'histogram'`` key.
        """
        assert 0 < k < len(self._all_feats)
        assert hash_size > 0 and num_hashtables > 0

        # Mini-batch K-means copes better with large amounts of data.
        logger.info('Calculating {} key featurs...'.format(k))
        self._kmeans = MiniBatchKMeans(n_clusters=k)
        self._kmeans.fit(np.array(self._all_feats))
        labels = self._kmeans.labels_
        logger.info('Start indexing each image.')

        self._lsh = LSHash(hash_size=hash_size,
                           input_dim=k,
                           num_hashtables=num_hashtables,
                           matrices_filename=store_file,
                           overwrite=overwrite)
        indexed = 0
        progress_bar = tqdm(total=len(self._img_dict))
        # Edges at -0.5, 0.5, ..., k - 0.5 give one bin per integer label 0..k-1.
        bin_edges = np.arange(-0.5, k + 0.5, 1)
        for img_name, img_meta in self._img_dict.items():
            try:
                first = img_meta['start_index']
                last = first + img_meta['num_feats']
                counts, _edges = np.histogram(labels[first:last], bins=bin_edges)
                img_meta['histogram'] = counts
                # Store the image in the hash tables under its histogram.
                self._lsh.index(input_point=counts, extra_data=img_name)
            except Exception as e:
                logger.warning(e)
                logger.warning('Error when indexing image: {}'.format(img_name))
            else:
                indexed += 1
            progress_bar.update(1)
        progress_bar.close()
        logger.info('Successfully indexed {} images.'.format(indexed))

コード例 #7

0

ファイルを表示

def hyperspheres(X=16, num_samples=200000):
    """ Demonstrate curse of dimensionality and where LSH starts to fail

    For every dimensionality D from 2 up to the number of columns of X, the
    first D columns of each sample are normalised in place, indexed, and a
    random unit query vector is looked up with the cosine distance. The
    distances reported by LSHash at ranks 100, 10, 2 and 1 are compared with
    the exhaustively computed ones.

    Args:
      X: either an int (number of columns of a freshly drawn uniform sample
         matrix with ``num_samples`` rows) or an existing 2-D array, which is
         modified in place.
      num_samples: number of rows drawn when X is an int.

    Returns:
      lsh, X, dfs -- the LSHash index of the last dimensionality, the sample
      matrix, and a list of four DataFrames (ranks 100, 10, 2, 1).

    >>> import pandas as pd
    >>> lsh, vectors, dfs = hyperspheres(16)
    >>> for df in dfs:
    ...     print(df)
    """
    if isinstance(X, int):
        X = np.random.uniform(size=(num_samples, X))
    closest = []
    secondclosest = []
    tenthclosest = []
    hundredthclosest = []
    for dim in range(2, X.shape[1] + 1):
        lsh = LSHash(int(64 / dim) + dim, dim, num_hashtables=dim)

        # Random unit-length query vector.
        q = np.random.uniform(size=(dim,))
        q /= np.linalg.norm(q)

        distances = []
        for x in X[:, :dim]:
            # ``x`` is a view into X, so this normalisation persists.
            x /= np.linalg.norm(x)
            # Cosine distance: one minus the dot product of two unit vectors.
            distances.append(1. - np.sum(x * q))
            lsh.index(x)
        distances = sorted(distances)
        print(distances[:10])
        closest10 = lsh.query(q, distance_func='cosine')

        N = len(closest10)

        def ranked_row(rank):
            # Clamp the rank to the number of hits; 2. marks "no hit at all".
            pos = min(rank, N - 1)
            return [dim, N, closest10[pos][-1] if N else 2., distances[pos]]

        hundredthclosest.append(ranked_row(99))
        tenthclosest.append(ranked_row(9))
        secondclosest.append(ranked_row(1))
        closest.append([dim, N, closest10[0][-1] if N else 2., distances[0]])

        per_rank = (hundredthclosest, tenthclosest, secondclosest, closest)
        print("is correct: 100th 10th 2nd 1st")
        for rows in per_rank:
            latest = rows[-1]
            print(round(latest[-1], 14) == round(latest[-2], 14))
        print("distances: 100th 10th 2nd 1st")
        for rows in per_rank:
            print(rows[-1])
    dfs = []
    for rank, rows in zip([100, 10, 2, 1], [hundredthclosest, tenthclosest, secondclosest, closest]):
        frame = pd.DataFrame(rows, columns='D N dist{} true_dist{}'.format(rank, rank).split()).round(14)
        frame['correct{}'.format(rank)] = frame['dist{}'.format(rank)] == frame['true_dist{}'.format(rank)]
        dfs.append(frame)
    return lsh, X, dfs

コード例 #8

0

ファイルを表示

ファイル: compare.py プロジェクト: jinzitian/LSHCos

def b(r, dim, vector):
    """Index every vector of the module-level ``xxx`` and time one cosine query.

    Args:
        r: hash size passed to LSHash.
        dim: input dimension passed to LSHash.
        vector: query vector (must provide ``tolist()``).

    Returns:
        Tuple ``(similarity, seconds)``: one minus the distance of the first
        hit, or ``-2`` when the query returns nothing, together with the
        elapsed query time.
    """
    lsh = LSHash(r, dim)
    for _, candidate in xxx:
        lsh.index(candidate.tolist())
    started = time.perf_counter()
    hits = lsh.query(vector.tolist(), 10, 'cosine')
    finished = time.perf_counter()
    elapsed = finished - started
    # Convert each (point, distance) pair into (point, similarity).
    similarities = [(point, 1 - distance) for point, distance in hits]
    if not similarities:
        return -2, elapsed
    return similarities[0][1], elapsed

コード例 #9

0

ファイルを表示

ファイル: save_embeddings.py プロジェクト: kavithacd/self_supervised_learning

def save_embedding_hash(hash_params, save_path, img_names, features_dict):
    """Index every feature vector with LSHash and pickle the index to *save_path*.

    Args:
        hash_params: dict with the keys ``'hash_size'``, ``'num_tables'`` and
            ``'dim'`` (dimension of a feature vector).
        save_path: destination file for the pickled index.
        img_names: not used by this function; kept for interface compatibility.
        features_dict: mapping of image path to feature vector; each vector is
            flattened before indexing.

    Returns:
        None.
    """
    k = hash_params['hash_size']  # hash size
    L = hash_params['num_tables']  # number of tables
    d = hash_params['dim']  # dimension of a feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)

    # Index every image, keeping its path as extra data.
    for img_path, vec in tqdm(features_dict.items()):
        lsh.index(vec.flatten(), extra_data=img_path)

    # Use a context manager so the file is flushed and closed even if
    # pickling fails part-way through.
    with open(save_path, "wb") as fh:
        pickle.dump(lsh, fh)

コード例 #10

0

ファイルを表示

def lsh(p_hash_size, distance_funcs):
    """
    Run the LSHash KNN approximation for one random vipno under several
    distance functions and plot the query time of each.

    :param p_hash_size: factor multiplied by the number of vipno columns to
        obtain the hash size passed to LSHash
    :param distance_funcs: iterable of distance functions, each forwarded to
        ``LSHash.query`` in turn
    :return: None; results and timings are printed and a bar chart is shown
    """
    datas_set, datas_matrix = get_data()
    # Number of columns in the matrix; every column is indexed as one vipno.
    vipno_nums = len(datas_matrix[0])

    # Position (not label) of the randomly selected query column.
    random_vipno = random.randint(0, vipno_nums - 1)

    # Input dimension is the length of a single column.
    engine = LSHash(int(vipno_nums * p_hash_size), len(datas_matrix[:, 0]))
    for column in range(vipno_nums):
        # The column label is stored as extra data so hits can be reported
        # as vipnos.
        engine.index(datas_matrix[:, column],
                     extra_data=datas_set.columns[column])

    print("hash size: {}".format(vipno_nums * p_hash_size))
    print("input vipno: {}".format(datas_set.columns[random_vipno]))

    ends = []
    for distance_func in distance_funcs:
        start = datetime.datetime.now()
        # Six hits are requested; the first is expected to be the query
        # column itself and is dropped when printing.
        hits = engine.query(datas_matrix[:, random_vipno],
                            num_results=6,
                            distance_func=distance_func)
        vipno_res = [hit[0][1] for hit in hits]
        elapsed = (datetime.datetime.now() - start).total_seconds()
        ends.append(elapsed)
        print("distance func:", distance_func)
        print("knn output(from 1 to 5): {}".format(vipno_res[1:]))
        print("time:", elapsed)

    # Bar chart comparing the query time per distance function.
    bar_style = dict(alpha=0.9,
                     width=0.35,
                     facecolor='lightskyblue',
                     edgecolor='white',
                     label='time',
                     lw=1)
    plt.bar(distance_funcs, ends, **bar_style)
    plt.legend(loc="upper left")
    plt.show()

コード例 #11

0

ファイルを表示

ファイル: test_spheres.py プロジェクト: totalgood/LSHash

def hyperspheres_10D(X=None):
    """ Demonstrate curse of dimensionality and where LSH starts to fail

    For every dimensionality D from 2 up to (but excluding) the number of
    columns of X, the first D columns of each sample are indexed and a random
    unit query vector is looked up with the cosine distance. The distances
    reported by LSHash at ranks 10, 2 and 1 are recorded next to the
    exhaustively computed ones.

    Args:
      X: 2-D sample array, modified in place (rows are normalised). When
         omitted, a uniform random (200000, 10) array is drawn on each call.

    Returns:
      lsh, X, closest, secondclosest, tenthclosest

    >>> import pandas as pd
    >>> lsh, vectors, rank1, rank2, rank10 = hyperspheres_10D()  # doctest: +SKIP
    >>> pd.DataFrame(rank2)  # doctest: +SKIP
    >>> pd.DataFrame(rank10)  # doctest: +SKIP
    """
    if X is None:
        # Drawn per call: an array default in the signature would be allocated
        # at import time and shared (and normalised in place) across calls.
        X = np.random.uniform(size=(200000, 10))
    tenthclosest = []
    secondclosest = []
    closest = []
    for D in range(2, X.shape[1]):
        lsh = LSHash(int(1024 * 8182. / D) + D, D, num_hashtables=D)

        # Random unit-length query vector.
        q = np.random.uniform(size=(D, ))
        q /= np.linalg.norm(q)

        distances = []
        for x in X[:, :D]:
            # The point is indexed before it is normalised in place below.
            lsh.index(x)
            x /= np.linalg.norm(x)
            # Cosine distance: one minus the dot product of two unit vectors.
            distances += [1. - np.sum(x * q)]
        distances = sorted(distances)
        print(distances[:10])
        closest10 = lsh.query(q, distance_func='cosine')

        N = len(closest10)
        # Each row: [D, rank actually used (-1 if no hits), LSH distance
        # (2. if no hits), exhaustively computed distance at that rank].
        tenthclosest += [[
            D,
            min(9, N - 1) if N else -1,
            closest10[min(9, N - 1)][-1] if N else 2., distances[min(9, N - 1)]
        ]]
        secondclosest += [[
            D,
            min(1, N - 1) if N else -1,
            closest10[min(1, N - 1)][-1] if N else 2., distances[min(1, N - 1)]
        ]]
        closest += [[
            D, 0 if N else -1, closest10[0][-1] if N else 2., distances[0]
        ]]
        print(tenthclosest[-1])
        print(secondclosest[-1])
        print(closest[-1])
    return lsh, X, closest, secondclosest, tenthclosest

コード例 #12

0

ファイルを表示

ファイル: stitch_tiles.py プロジェクト: afruehstueck/MapStitch

def generate_lsh(images):
    """Index the four edges of every image in an LSHash structure.

    Args:
        images: array whose first axis enumerates images; ``images.shape[1]``
            is used as the LSHash input dimension.

    Returns:
        The populated LSHash instance. Each indexed edge carries
        ``(image_index, side)`` as extra data, where side is one of the
        module constants TOP, RIGHT, BOTTOM, LEFT.
    """
    image_size = images.shape[1]
    engine = LSHash(8, image_size)

    # Build the locality sensitive hash from the edges of all images.
    for idx, image in enumerate(tqdm(images)):
        top, right, bottom, left = get_all_edges_from_array(image)
        labelled_edges = ((top, TOP), (right, RIGHT), (bottom, BOTTOM), (left, LEFT))
        for edge, side in labelled_edges:
            engine.index(edge, extra_data=(idx, side))
    return engine

コード例 #13

0

ファイルを表示

def knn(df, k, coefficient):
    """Approximate the k nearest columns of a randomly picked column with LSHash.

    Args:
        df: DataFrame whose columns are the points to index (one column per
            vipno); the number of rows is the LSHash input dimension.
        k: number of neighbours to return.
        coefficient: factor multiplied by the number of columns to obtain the
            LSHash hash size.

    Returns:
        Tuple ``(random_vip, neighbours)``: the label of the query column and
        at most ``k`` labels of other columns in the order returned by the
        query.
    """
    hash_size = int(coefficient * df.shape[1])
    lsh = LSHash(hash_size, input_dim=df.shape[0])
    for vipno in df:
        lsh.index(df[vipno], extra_data=vipno)
    # Sample one column label; indexing with the sampled Series yields a
    # single-column DataFrame.
    random_column = df[df.columns.to_series().sample(1)]
    random_vip = random_column.columns.values[0]
    logging.info('random vipno: {}'.format(random_vip))
    # One extra result is requested because the query column itself is
    # expected among the hits.
    res = lsh.query(random_column.values.flatten())[0:k + 1]
    logging.info('vipno in ranked order using kNN(k = {}):'.format(k))
    # Drop the query column, then honour k (the limit used to be a fixed 5,
    # which silently truncated the result for k > 5).
    knns = [item[0][1] for item in res if item[0][1] != random_vip][:k]
    for neighbour in knns:
        logging.info(neighbour)
    return random_vip, knns

コード例 #14

0

ファイルを表示

ファイル: ls_hash.py プロジェクト: DigitalSprite/2018_-Data-Mining-Project-

def getLSHashOutput(filename, hash_size, k):
    """Index every column of the matrix loaded from *filename* and return the
    column positions of the hits for one randomly chosen query column.

    Args:
        filename: passed to ``getMatrix`` to load the matrix.
        hash_size: factor; the hash length passed to LSHash is
            ``int(hash_size * number_of_columns)``.
        k: number of neighbours wanted; ``k + 1`` results are requested.

    Returns:
        List of integer column positions, one per hit, in query order.
    """
    matrix = getMatrix(filename)
    # A former loop that copied rows into an unused variable named ``list``
    # was removed: it shadowed the builtin and indexed rows with a
    # column-count range, which raises IndexError when there are more columns
    # than rows.
    total_num = len(matrix.iloc[0])
    lsh = LSHash(hash_size=int(hash_size * total_num), input_dim=len(matrix.iloc[:, 0]))
    for i in range(total_num):
        lsh.index(input_point=matrix.iloc[:, i], extra_data=matrix.columns[i])
    out_num = rand.randint(0, total_num - 1)
    # The distance function is passed explicitly as 'euclidean' here.
    m = lsh.query(query_point=matrix.iloc[:, out_num], num_results=k + 1, distance_func='euclidean')
    print("输入的vipno是" + str(matrix.columns[out_num]) + "\n其桶中的vipno有：")
    bucket = []
    for hit in m:
        vipno = hit[0][1]
        print(vipno)
        # argwhere returns a 2-D array of matching positions; take the first
        # match explicitly, because int() on a non-0-d array is deprecated in
        # recent NumPy releases.
        tag = np.argwhere(matrix.columns == vipno)
        bucket.append(int(tag[0][0]))
    return bucket

コード例 #15

0

ファイルを表示

ファイル: LShash_util.py プロジェクト: 39239580/res_sys_tool-new-

    def __init__(self,
                 hash_size,
                 input_dim,
                 num_of_hashtables=1,
                 storage=None,
                 matrices_filename=None,
                 overwrite=False):
        """
        Create the wrapped LSHash object; every argument is forwarded unchanged.

        :param hash_size:
            The length of the resulting binary hash as an integer. E.g., 32
            means the resulting binary hash will be 32 bits long.

        :param input_dim:
            The dimension of the input vector. E.g., a grey-scale picture of
            30x30 pixels will have an input dimension of 900.

        :param num_of_hashtables:
            (optional) The number of hash tables used for multiple lookups.

        :param storage:
            (optional) Storage backend selection for the index.
            NOTE(review): earlier documentation described both a plain backend
            name (e.g. redis) and a ``{backend_name: config}`` dictionary;
            confirm which form the wrapped LSHash expects.

        :param matrices_filename:
            (optional) Path to the compressed numpy file ending with `.npz`
            where the uniform random planes are stored, or are to be stored if
            the file does not exist yet.

        :param overwrite:
            (optional) Whether to overwrite the matrices file if it already
            exists.
        """
        lsh_options = dict(
            hash_size=hash_size,                  # length of the binary hash
            input_dim=input_dim,                  # dimension of the input vectors
            num_of_hashtables=num_of_hashtables,  # number of lookup tables
            storage=storage,                      # index storage backend
            matrices_filename=matrices_filename,  # .npz file for the random planes
            overwrite=overwrite,                  # overwrite an existing matrices file
        )
        self.hash_object = LSHash(**lsh_options)

コード例 #16

0

ファイルを表示

ファイル: multiprocess_predict.py プロジェクト: DableUTeeF/seven2

def predict_image_class(query_folder, target_features, cache_dict, class_minimum):
    """Find the smallest euclidean distance between the target features and
    any image in one query folder.

    Args:
        query_folder: folder name below the module-level ``query_path``.
        target_features: feature tensor of the target image; row 0 is compared.
        cache_dict: feature cache passed to (and returned by) ``memory_cache``;
            the rebinding here is local to this function.
        class_minimum: mapping that receives the result under ``query_folder``
            as a ``(distance, query_folder)`` tuple, or ``(inf, 0)`` when the
            folder holds no image closer than infinity.

    Returns:
        None; per-image timings are printed.
    """
    minimum = (float('inf'), 0)
    for query_image_path in os.listdir(os.path.join(query_path, query_folder)):
        started = time.time()
        query = os.path.join(query_path, query_folder, query_image_path)
        path_elapsed = time.time() - started
        cache_file = os.path.join(cache_path, query_folder, query_image_path + '.pth')
        cache_dict, query_features = memory_cache(cache_dict, model.model, query, cache_file, transform)
        features_elapsed = time.time() - started
        target_vec = target_features.cpu().numpy()[0]
        query_vec = query_features.cpu().numpy()[0]
        distance = LSHash.euclidean_dist(target_vec, query_vec)
        distance_elapsed = time.time() - started
        # Cumulative seconds since the start of this iteration.
        print(path_elapsed, features_elapsed, distance_elapsed)
        if distance < minimum[0]:
            minimum = (distance, query_folder)
    class_minimum[query_folder] = minimum

コード例 #17

0

ファイルを表示

ファイル: lsh_fastai.py プロジェクト: thebba2013/utilities_python

# Map each image path to its feature vector (key: image path, value: visual embedding).
feature_dict = dict(zip(img_path, sf.features))

# Persist the features; the context manager closes the file deterministically.
with open(path/"feature_dict.p", "wb") as fh:
    pickle.dump(feature_dict, fh)


# Reload the features from disk.
# NOTE: pickle.load executes arbitrary code; only load files written by this script.
with open(path/'feature_dict.p', 'rb') as fh:
    feature_dict = pickle.load(fh)


## Locality Sensitive Hashing
# params
k = 10  # hash size
L = 5  # number of tables
d = 512  # Dimension of Feature vector
lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)


# LSH on all the images; the image path is stored as extra data.
for img_path, vec in feature_dict.items():
    print(img_path)
    print(vec)
    lsh.index(vec.flatten(), extra_data=img_path)


## Exporting as pickle
with open(path/'lsh.p', "wb") as fh:
    pickle.dump(lsh, fh)

コード例 #18

0

ファイルを表示

trade_mat  # show the table (notebook cell output)

# # 2. LSH

# In[9]:

from lshash.lshash import LSHash
import random

# In[10]:

# Results file for all hash sizes.
# NOTE(review): no close() is visible in this snippet - confirm the file is
# closed later or switch to a `with` block.
o = open('lsh_output.txt', 'w')  # create a file to write the results

# Build a fresh index for each hash-size factor e; the hash size is
# round(n_vip * e) and the input dimension is n_plu.
for e in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]:
    lsh = LSHash(round(n_vip * e), n_plu)
    for v in vipno:
        # One point per vipno, with the vipno stored as extra data.
        feature = list(trade_mat[v])
        lsh.index(feature, extra_data=v)

    # pick up a random vipno
    # NOTE(review): random.randint includes both bounds, so this yields
    # 1..n_vip. If vipno holds n_vip items indexed from 0, position 0 is never
    # chosen and n_vip is out of range - confirm and consider
    # random.randrange(n_vip).
    pick_vip = random.randint(1, n_vip)
    pick_vip = vipno[pick_vip]
    # NOTE(review): the message says "* n_plu" while the hash size above is
    # computed from n_vip - confirm which is intended.
    o.write("Hash_size = {} * n_plu \n".format(e))
    o.write("Pick up a vip: {}\n".format(pick_vip))

    # lsh query and write the results; six hits are requested with the
    # hamming distance and the first hit is skipped in the loop below.
    candi = lsh.query(list(trade_mat[pick_vip]), 6, distance_func='hamming')
    #     print(len(candi))
    for i, item in enumerate(candi[1:]):
        # Second element of each hit is read as its distance.
        dist = item[1]

コード例 #19

0

ファイルを表示

from src.utils import generate_data
import numpy as np
from lshash.lshash import LSHash

# Fixed seed so the generated data and random projections are reproducible.
np.random.seed(18)
dimension = 10   # length of each data vector
size = 1000      # number of data vectors
hash_size = 3    # hash length passed to both implementations
num_tables = 8   # number of hash tables passed to both implementations

# Consistency check: index the same data with the project's LSH class and
# with LSHash using identical projection planes, then assert that every
# bucket of the former matches the corresponding bucket of the latter.
if __name__ == '__main__':
    data = generate_data(size, dimension)

    # =============
    # NOTE(review): `LSH` is not imported in the visible part of the file -
    # presumably it comes from a project module; confirm.
    lsh = LSH(hash_size, dimension, num_tables)
    lsh.index(data)

    lsh_1 = LSHash(hash_size, dimension, num_tables)
    # make the projection planes the same
    lsh_1.uniform_planes = lsh.projections

    # LSHash is fed one vector at a time.
    for d in data:
        lsh_1.index(d)

    # Compare table by table, key by key. Only keys present in the LSH tables
    # are checked; extra keys in the LSHash tables would go unnoticed.
    for i in range(num_tables):
        t1 = lsh.hash_tables[i]
        t2 = lsh_1.hash_tables[i]

        for k in t1:
            assert t1[k] == t2.get_val(k)

コード例 #20

0

ファイルを表示

# Process the images in random order.
random.shuffle(image_path_list)

# Size every image is resized to before it is fed to the network.
inputShape = (224, 224)

# VGG16 without its classification head is used as the feature extractor.
model = VGG16(weights='imagenet', include_top=False)
preprocess = imagenet_utils.preprocess_input

## LSHash Params

k = 10  # hash size
L = 5  # number of tables
d = 25088  # Dimension of Feature vector from VGG16 bottleneck

lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)

for cnt, image_path in enumerate(image_path_list):
    print(cnt)
    print(image_path)
    # A stray `continue` used to sit here; it made everything below
    # unreachable, so an empty index was pickled.
    image = load_img(image_path)
    image = image.resize(inputShape)
    image = img_to_array(image)
    image = preprocess(image)
    # Add the batch axis expected by model.predict.
    image = np.expand_dims(image, axis=0)
    image_pred_features = model.predict(image)[0]
    # Index the flattened features with the image path as extra data.
    lsh.index(image_pred_features.flatten(), extra_data=image_path)

# Context manager so the pickle file is flushed and closed.
with open('pick_keras/lsh.p', "wb") as fh:
    pickle.dump(lsh, fh)

コード例 #21

0

ファイルを表示

ファイル: main.py プロジェクト: yhy1993824/lsHash

    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] += 1
        else:
            print('word:', word, 'is not in the list_vec')
    return returnvec


if __name__ == '__main__':
    # Word lists of every paper under ./paper, plus their classes and vocabulary.
    datalist, classlist, vocabset = textprocess('./paper')
    stop_word_file = './stopwords_cn.txt'
    stop_word_set = make_word_set(stop_word_file)
    feature_words = word_dict(vocabset, 0, stop_word_set)
    trainMat = []

    # Index the bag-of-words vector of every training document.
    lsh = LSHash(hash_size=10, input_dim=len(feature_words))
    for postinDoc in datalist:
        trainMat_vec = bagof_word2vec(feature_words, postinDoc)
        trainMat.append(trainMat_vec)
        lsh.index(trainMat_vec)

    # Tokenise the test document and vectorise it the same way.
    testfile = './test.txt'
    with open(testfile, 'r', encoding='utf-8') as handle:
        text = handle.read()
    testlist = [jieba.lcut(text, cut_all=False)]
    testvect = bagof_word2vec(feature_words, testlist[0])

    # Print the closest indexed vector and its position in the training matrix.
    nearest = lsh.query(testvect, num_results=1)
    best_point = list(nearest[0][0])
    print(best_point)
    print(trainMat.index(list(nearest[0][0])))

コード例 #22

0

ファイルを表示

ファイル: groupTweet.py プロジェクト: Ten000hours/WebScience

#  convert the words into word frequency matrix
vectorizer1 = CountVectorizer()

X = vectorizer1.fit_transform(corpus)
# get the keywords in corpus
# NOTE(review): get_feature_names() was removed in newer scikit-learn
# releases in favour of get_feature_names_out() - confirm the pinned version.
word = vectorizer1.get_feature_names()


transformer = TfidfTransformer()

#  calculate the TF-IDF values
tfidf = transformer.fit_transform(X)



# Index fixed-length (8 values) vectors with a 6-bit hash.
lsh = LSHash(6, 8)
# construct the centriodSet
centriodSet = []
Ind = 0
# NOTE(review): the upper bound is count - 1, so the last document is not
# visited - confirm this is intended.
for i in range(0, DBconnection.DBconnection.count(a.dbconnect_to_collection()) - 1):
    centriod = []
    j = 0
    # indptr[i + 1] - indptr[i] is the number of stored values of row i
    # (presumably a CSR matrix); at most the first 8 are taken.
    while j < (tfidf.indptr[i + 1] - tfidf.indptr[i]):
        if j > 7:
            break
        # NOTE(review): Ind is never advanced in the visible code, so every
        # row reads tfidf.data starting at offset 0; tfidf.indptr[i] looks
        # like the intended offset - confirm.
        centriod.append(round(tfidf.data[Ind + j], 2))
        j += 1
    # Pad short vectors up to 8 entries; the pad value is 1 (0 + 1).
    if len(centriod) < 8:
        for index in range(8 - len(centriod)):
            centriod.append(0 + 1)
    lsh.index(centriod)

コード例 #23

0

ファイルを表示

def predict():
    """Classify every target image by its nearest query image and report accuracy.

    For each target image whose folder also exists under the query path, the
    euclidean distance between its features and the (cached) features of every
    query image is computed; the folder of the closest query image is the
    prediction. Predictions are pickled to ``cls_eval.pk`` and the fraction of
    correct predictions is printed.

    Returns:
        None.
    """
    model = Model(ResNet(predict=True))
    model.compile(torch.optim.SGD(model.model.parameters(),
                                  lr=0.001,
                                  momentum=0.9,
                                  weight_decay=1e-4),
                  ContrastiveLoss(),
                  metric=None,
                  device='cuda')
    model.load_weights(
        '/home/palm/PycharmProjects/seven2/snapshots/pairs/5/epoch_1_0.012463876953125.pth'
    )
    model.model.eval()

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose(
        [transforms.Resize((224, 224)),
         transforms.ToTensor(), normalize])

    target_path = '/home/palm/PycharmProjects/seven/images/test6/train'
    query_path = '/home/palm/PycharmProjects/seven/images/cropped6/train'
    cache_path = '/home/palm/PycharmProjects/seven/caches'
    cache_dict = {}
    predicted_dict = {}
    correct = 0
    count = 0
    # The query folder listing does not change during the run, so read it once
    # instead of once per target folder and once per target image.
    query_folders = os.listdir(query_path)
    known_folders = set(query_folders)
    with torch.no_grad():
        for target_image_folder in os.listdir(target_path):
            # Only classes that also have query images can be predicted.
            if target_image_folder not in known_folders:
                continue
            predicted_dict[target_image_folder] = {}
            for target_image_path in os.listdir(
                    os.path.join(target_path, target_image_folder)):
                count += 1
                target = os.path.join(target_path, target_image_folder,
                                      target_image_path)
                target_image_ori = Image.open(target)
                target_image = transform(target_image_ori)
                # Wrap the single image in a batch of size one.
                x = torch.zeros((1, 3, 224, 224))
                x[0] = target_image
                target_features = model.model._forward_impl(x.cuda())
                # Converted once per target instead of once per query image.
                target_vec = target_features.cpu().numpy()[0]
                minimum = (float('inf'), 0)
                for query_folder in query_folders:
                    for query_image_path in os.listdir(
                            os.path.join(query_path, query_folder)):
                        query = os.path.join(query_path, query_folder,
                                             query_image_path)
                        cache_dict, query_features = memory_cache(
                            cache_dict, model.model, query,
                            os.path.join(cache_path, query_folder,
                                         query_image_path + '.pth'), transform)
                        y = LSHash.euclidean_dist(
                            target_vec,
                            query_features.cpu().numpy()[0])
                        if y < minimum[0]:
                            minimum = (y, query_folder)
                print(*minimum, target_image_folder)
                predicted_dict[target_image_folder][
                    target_image_path] = minimum[1]
                if minimum[1] == target_image_folder:
                    correct += 1
    # Accuracy is correct / count; the former count / correct was inverted and
    # raised ZeroDivisionError when nothing was predicted correctly.
    print(correct / count if count else 0.0)
    with open('cls_eval.pk', 'wb') as fh:
        pk.dump(predicted_dict, fh)

コード例 #24

0

ファイルを表示

class ImageSearchEngine(object):
    """Image retrieval pipeline combining ORB descriptors, K-means and LSHash.

    Descriptors of every loaded image are pooled into one flat list. At
    indexing time the pool is clustered, each image is summarised by the
    histogram of cluster labels of its own descriptors, and that histogram
    is stored in an LSHash index keyed by the image file name.
    """

    def __init__(self):
        # Flat pool of descriptors from all loaded images, in load order.
        self._all_feats = []
        # Image name -> {'start_index', 'num_feats'}; build_index() also adds
        # a 'histogram' entry.
        self._img_dict = {}
        # Both are created by build_index().
        self._kmeans = None
        self._lsh = None

    def load_images(self, img_list: list) -> int:
        """Read images and collect their ORB descriptors for later indexing.

        An image that raises at any step is logged and skipped; it does not
        abort the run.

        Args:
            img_list: file names of the images to load.

        Returns:
            Number of images whose descriptors were stored.
        """
        loaded = 0
        pbar = tqdm(total=len(img_list))
        for path in img_list:
            try:
                gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
                _keypoints, descriptors = orb.detectAndCompute(gray, None)
                # Remember which slice of the shared pool belongs to this
                # image before the pool grows.
                offset = len(self._all_feats)
                n_desc = len(descriptors)
                meta = {'start_index': offset,
                        'num_feats': n_desc}
                self._img_dict[path] = meta
                self._all_feats.extend(list(descriptors))
                loaded += 1
            except Exception as err:
                logger.warning(err)
                logger.warning('Error processing {}'.format(path))
            pbar.update(1)
        pbar.close()
        summary = 'Successfully loaded {} images, extracted {} features.'
        logger.info(summary.format(loaded, len(self._all_feats)))
        return loaded

    def build_index(self, k: int, hash_size: int = 10, num_hashtables: int = 1,
                    store_file: str = None, overwrite: bool = False):
        """Cluster the pooled descriptors and index every image in LSHash.

        Steps:
            1. Fit MiniBatchKMeans with ``k`` clusters on all pooled
               descriptors and read back the label of each descriptor.
            2. For each image, histogram the labels of its own descriptors
               into ``k`` unit-width bins; this is stored under the image's
               'histogram' key.
            3. Insert the histogram into a fresh LSHash index with the image
               name as extra data.

        An image that raises during steps 2-3 is logged and skipped.

        Args:
            k: number of K-means clusters; must satisfy
                ``0 < k < number of pooled descriptors``.
            hash_size: forwarded to LSHash as ``hash_size``; must be > 0.
            num_hashtables: forwarded to LSHash as ``num_hashtables``;
                must be > 0.
            store_file: forwarded to LSHash as ``matrices_filename``.
            overwrite: forwarded to LSHash as ``overwrite``.

        Raises:
            AssertionError: if ``k``, ``hash_size`` or ``num_hashtables``
                violates the constraints above.
        """
        assert 0 < k < len(self._all_feats)
        assert hash_size > 0
        assert num_hashtables > 0

        logger.info('Calculating {} key featurs...'.format(k))
        # Mini-batch variant chosen by the original author for large pools.
        self._kmeans = MiniBatchKMeans(n_clusters=k)
        self._kmeans.fit(np.array(self._all_feats))
        labels = self._kmeans.labels_
        logger.info('Start indexing each image.')

        self._lsh = LSHash(hash_size=hash_size,
                           input_dim=k,
                           num_hashtables=num_hashtables,
                           matrices_filename=store_file,
                           overwrite=overwrite)
        indexed = 0
        pbar = tqdm(total=len(self._img_dict))
        # Edges at -0.5, 0.5, ..., k - 0.5: one bin per integer label 0..k-1.
        edges = np.arange(-0.5, k + 0.5, 1)
        for name, meta in self._img_dict.items():
            try:
                lo = meta['start_index']
                hi = lo + meta['num_feats']
                counts, _edges = np.histogram(labels[lo:hi], bins=edges)
                meta['histogram'] = counts
                self._lsh.index(input_point=counts, extra_data=name)
                indexed += 1
            except Exception as err:
                logger.warning(err)
                logger.warning('Error when indexing image: {}'.format(name))
            pbar.update(1)
        pbar.close()
        logger.info('Successfully indexed {} images.'.format(indexed))

    def search(self, img_name: str, num_results: int = None,
               distance_func: str = None) -> list:
        """Query the index with the label histogram of one image.

        Args:
            img_name: file name of the query image.
            num_results: forwarded unchanged to ``LSHash.query``.
            distance_func: forwarded unchanged to ``LSHash.query``; when it
                is None the choice of metric is left to LSHash.

        Returns:
            Whatever ``LSHash.query`` returns, or an empty list if any step
            raised (the exception is logged as a warning).

        Raises:
            AssertionError: if build_index() has not been run yet.
        """
        assert self._lsh is not None
        assert self._kmeans is not None
        try:
            gray = cv2.imread(img_name, cv2.IMREAD_GRAYSCALE)
            _keypoints, descriptors = orb.detectAndCompute(gray, None)
            labels = self._kmeans.predict(descriptors)
            n_clusters = len(self._kmeans.cluster_centers_)
            edges = np.arange(-0.5, n_clusters + 0.5, 1)
            counts, _edges = np.histogram(labels, bins=edges)
            return self._lsh.query(counts, num_results=num_results,
                                   distance_func=distance_func)
        except Exception as err:
            logger.warning(err)
            return []

    def dump(self, pkl_file: str = 'model.pkl'):
        """Pickle this engine instance to ``pkl_file``."""
        with open(pkl_file, 'wb') as sink:
            pickle.dump(self, sink)

    @property
    def num_images(self) -> int:
        """Number of images currently registered in the engine."""
        return len(self._img_dict)

コード例 #25

0

ファイルを表示

                                  weight_decay=1e-4),
                  ContrastiveLoss(),
                  metric=None,
                  device='cuda')
    model.load_weights(
        '/home/palm/PycharmProjects/seven2/snapshots/pairs/4/epoch_0_0.016697616640688282.pth'
    )
    model.model.eval()

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose(
        [transforms.Resize((224, 224)),
         transforms.ToTensor(), normalize])

    lsh = LSHash(hash_size=16, input_dim=1024, num_hashtables=5)

    target_path = '/home/palm/PycharmProjects/seven/images/cropped2/unknown/obj'
    query_path = '/home/palm/PycharmProjects/seven/images/cropped2/train'
    cache_path = '/home/palm/PycharmProjects/seven/caches'
    cache_dict = {}
    with torch.no_grad():
        for target_image_path in os.listdir(target_path):
            target = os.path.join(target_path, target_image_path)
            target_image_ori = Image.open(target)
            target_image = transform(target_image_ori)
            x = torch.zeros((1, 3, 224, 224))
            x[0] = target_image
            target_features = model.model._forward_impl(x.cuda())
            minimum = (float('inf'), 0)
            for query_folder in os.listdir(query_path):

コード例 #26

0

ファイルを表示

ファイル: lsh_test.py プロジェクト: DableUTeeF/seven2

                                  momentum=0.9,
                                  weight_decay=1e-4),
                  ContrastiveLoss(),
                  metric=None,
                  device='cuda')
    model.load_weights(
        '/home/palm/PycharmProjects/seven2/snapshots/pairs/3/epoch_0_0.03454810580774366.pth'
    )
    model.model.eval()
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose(
        [transforms.Resize((224, 224)),
         transforms.ToTensor(), normalize])

    lsh = LSHash(hash_size=16, input_dim=1024, num_hashtables=5)

    cache_folder = '/home/palm/PycharmProjects/seven/caches'
    with torch.no_grad():
        target_image_ori = Image.open(
            '/home/palm/PycharmProjects/seven/images/cropped2/unknown/obj/0_036.jpg'
        )
        target_image = transform(target_image_ori)
        x = torch.zeros((1, 3, 224, 224))
        x[0] = target_image
        target_features = model.model._forward_impl(x.cuda()).cpu()
        minimum = (float('inf'), 0)
        ts = []
        # for class_folder in os.listdir(cache_folder):
        #     for file in os.listdir(os.path.join(cache_folder, class_folder)):
        #         cache = torch.load(os.path.join(cache_folder, class_folder, file)).cpu()

コード例 #27

0

ファイルを表示

ファイル: LShash_util.py プロジェクト: 39239580/res_sys_tool-new-

def test_lshash():
    """Index three 8-dimensional points and print the query result for a fourth."""
    # LSHash(6, 8): a 6-bit hash over 8-dimensional input vectors.
    lsh = LSHash(6, 8)
    samples = (
        [1, 2, 3, 4, 5, 6, 7, 8],
        [2, 3, 4, 5, 6, 7, 8, 9],
        [10, 12, 99, 1, 5, 31, 2, 3],
    )
    for sample in samples:
        lsh.index(sample)
    # The probe differs from the first sample only in its last component.
    probe = [1, 2, 3, 4, 5, 6, 7, 7]
    print(lsh.query(probe))

コード例 #28

0

ファイルを表示

ファイル: LShash_util.py プロジェクト: 39239580/res_sys_tool-new-

class LocalSensitiveHash(object):
    """Thin wrapper that owns an ``LSHash`` instance and forwards to it."""

    # Distance-function names accepted by lsh_query().
    _DISTANCE_FUNCS = frozenset({
        "hamming", "euclidean", "true_euclidean", "centred_euclidean",
        "cosine", "l1norm"
    })

    def __init__(self,
                 hash_size,
                 input_dim,
                 num_of_hashtables=1,
                 storage=None,
                 matrices_filename=None,
                 overwrite=False):
        """Create the underlying ``LSHash`` object.

        Every argument is forwarded unchanged to ``LSHash`` under the same
        keyword name.

        :param hash_size:
            Length of the resulting binary hash as an integer. E.g., 32
            means the resulting binary hash will be 32 bits long.

        :param input_dim:
            Dimension of the input vector. E.g., a grey-scale picture of
            30x30 pixels will have an input dimension of 900.

        :param num_of_hashtables:
            (optional) Number of hash tables used for multiple lookups.

        :param storage:
            (optional) Storage backend configuration for the index.
            NOTE(review): the original text described a dictionary of the
            form ``{backend_name: config}`` with ``backend_name`` either
            ``dict`` or ``redis`` (e.g.
            ``{"redis": {"host": hostname, "port": port_num}}``) - confirm
            against the installed LSHash version.

        :param matrices_filename:
            (optional) Path to the compressed numpy file ending with
            extension ``.npz`` where the uniform random planes are stored,
            or are to be stored if the file does not exist yet.

        :param overwrite:
            (optional) Whether to overwrite the matrices file if it already
            exists.
        """
        self.hash_object = LSHash(
            hash_size=hash_size,
            input_dim=input_dim,
            num_of_hashtables=num_of_hashtables,
            storage=storage,
            matrices_filename=matrices_filename,
            overwrite=overwrite)

    def lsh_index(self, input_point, extra_data=None):
        """Add one data point to the wrapped LSHash index.

        :param input_point: array or tuple of size ``input_dim``.
        :param extra_data: (optional) extra data stored together with
            ``input_point``.
        :return: None
        """
        self.hash_object.index(input_point=input_point, extra_data=extra_data)

    def lsh_query(self,
                  query_point,
                  num_results=None,
                  distance_fun="euclidean"):
        """Look up a point in the wrapped LSHash index.

        :param query_point: array or tuple of size ``input_dim``.
        :param num_results: (optional) number of results to return, in
            ranked order. Forwarded unchanged to ``LSHash.query``.
        :param distance_fun: (optional) name of the distance function used
            to rank the candidate set; defaults to ``"euclidean"``. One of:
            ``"hamming"``, ``"euclidean"``, ``"true_euclidean"``,
            ``"centred_euclidean"``, ``"cosine"``, ``"l1norm"``.
        :return: the value returned by ``LSHash.query``.
        :raises AssertionError: if ``distance_fun`` is not one of the names
            listed above.
        """
        # Kept as an assert so existing callers see the same exception type.
        assert distance_fun in self._DISTANCE_FUNCS, (
            "unsupported distance_fun: {!r}".format(distance_fun))
        return self.hash_object.query(query_point=query_point,
                                      num_results=num_results,
                                      distance_func=distance_fun)

コード例 #29

0

ファイルを表示

         target_image = transform(target_image_ori)
         x = torch.zeros((1, 3, 224, 224))
         x[0] = target_image
         target_features = model.model._forward_impl(x.cuda())
         for query_folder in os.listdir(query_path):
             for query_image_path in os.listdir(
                     os.path.join(query_path, query_folder)):
                 query = os.path.join(query_path, query_folder,
                                      query_image_path)
                 cache_dict, query_features = memory_cache(
                     cache_dict, model.model, query,
                     os.path.join(cache_path, query_folder,
                                  query_image_path + '.pth'),
                     transform)
                 y = LSHash.euclidean_dist(
                     target_features.cpu().numpy()[0],
                     query_features.cpu().numpy()[0])
                 if y < minimum[0]:
                     minimum = (y, query_folder)
     if minimum[0] > 1:
         minimum = (minimum[0], 'obj')
     # print(minimum)
     obj = ET.SubElement(root, 'object')
     ET.SubElement(obj, 'name').text = minimum[1]
     bndbx = ET.SubElement(obj, 'bndbox')
     ET.SubElement(bndbx, 'xmin').text = str(b[0])
     ET.SubElement(bndbx, 'ymin').text = str(b[1])
     ET.SubElement(bndbx, 'xmax').text = str(b[2])
     ET.SubElement(bndbx, 'ymax').text = str(b[3])
 print(time.time() - start_time)
 # cv2.imshow(f'im_{i}', draw)

コード例 #30

0

ファイルを表示

import gym
import numpy as np
from PIL import Image
from lshash.lshash import LSHash
from collections import deque
from random import random
from diskcache import FanoutCache, Cache
# Disk-backed cache opened at path 'cache' and emptied immediately, so each
# run starts without entries from a previous run. Presumably the Q-table
# (state -> per-action values), judging by its name - TODO confirm.
qtable = Cache('cache')
qtable.clear()

# Gym environment 'Breakout-v0'.
env = gym.make('Breakout-v0')
# LSHash built with positional arguments (500, 8192); presumably
# (hash_size, input_dim) - TODO confirm. preprocess() yields 64 * 64 = 4096
# values per frame, so 8192 presumably means two frames per indexed
# vector - TODO confirm.
lshs = LSHash(500, 8192)
# NOTE(review): presumably Q-learning hyper-parameters; their use is not
# visible in this part of the file.
LEARNING_RATE = 0.15
DISCOUNT = 0.95
EPISODES = 25000

def preprocess(obs):
    """Turn one observation frame into a flat vector of 64 * 64 = 4096 values.

    The frame is wrapped in a PIL image, resized to 64x64, converted to
    PIL mode '1', and flattened into a one-dimensional uint8 array.

    Args:
        obs: array accepted by ``Image.fromarray``.
            NOTE(review): presumably the frame returned by the gym
            environment - confirm against the caller.

    Returns:
        One-dimensional ``np.uint8`` array of length 4096.
    """
    # Use the argument: the original read a global named ``observation``
    # and silently ignored whatever the caller passed in.
    image = Image.fromarray(obs)
    image = image.resize((64, 64))
    image = image.convert(mode='1')
    return np.array(image, dtype=np.uint8).flatten()

def get_action(obs_seq):
    query = lshs.query(obs_seq, num_results=1)
    if len(query) <= 0:
        lshs.index(obs_seq)
        actions = np.ones(env.action_space.n)
        qtable[obs_seq] = actions
    elif query[0][1] >= 10:
        lshs.index(obs_seq)