Example #1
def get_hash2img():
    if os.path.exists(redis_rdb):
        lsh = LSHash(hash_len, 960, storage_config=redis_config,
                     matrices_filename=matrices_file)
        return lsh
    else:
        return create_hash2img()
def traceLSHash(queryName, hashSize):
    #queryName ="hamming_query_12_3"
    # indices of the trajectories to run hashQuery on
    indexList = [14, 249, 479, 689, 899]

    XYMatrix = DateTransform()

    resultList = []
    nearList = []

    lsh = LSHash(hashSize, 44107)
    tid = 1

    for traceList in XYMatrix:
        lsh.index(input_point=traceList, extra_data=tid)
        tid += 1

    resultFile = open(queryName + '.txt', 'w')

    for index in indexList:
        queryList = lsh.query(XYMatrix[index], distance_func="hamming")
        for result in queryList:
            resultStr = str(index + 1) + " : " + str(result[0][1]) + " " + str(
                result[1]) + "\n"
            nearList.append(result[0][1])
            resultFile.write(resultStr)
        resultList.append(nearList)
        nearList = []

    resultFile.close()

    writeHTML(resultList, queryName, "hashQuery")
    print(resultList)
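Most of the examples on this page follow the same three-step LSHash workflow: construct an index with a hash size and an input dimension, index vectors (optionally attaching extra_data), then query. A minimal, self-contained sketch, with toy 8-dimensional vectors chosen purely for illustration:

from lshash import LSHash

lsh = LSHash(hash_size=6, input_dim=8)
lsh.index([1, 2, 3, 4, 5, 6, 7, 8], extra_data="point-a")
lsh.index([2, 3, 4, 5, 6, 7, 8, 9], extra_data="point-b")

# With extra_data attached, each result is ((vector, extra_data), distance).
for (vec, name), dist in lsh.query([1, 2, 3, 4, 5, 6, 7, 7], num_results=2):
    print(name, dist)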
Example #3
    def test_lshash_redis(self):
        """
        Test external lshash module
        """
        config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
        lsh = LSHash(self.hash_size, self.input_dim, 1, config)
        for i in range(self.nb_elements):
            lsh.index(list(self.els[i]))
            lsh.index(
                list(self.els[i])
            )  # multiple insertions should be prevented by the library

        hasht = lsh.hash_tables[0]
        itms = [hasht.get_list(k) for k in hasht.keys()]

        for itm in itms:
            for el in itm:
                assert itms.count(
                    itm) == 1  # have multiple insertions been prevented?
                assert el in self.els

        for el in self.els:
            res = lsh.query(list(el), num_results=1,
                            distance_func='euclidean')[0]
            el_v, el_dist = res
            assert el_v in self.els
            assert el_dist == 0
        del lsh
Example #4
def init_lsh(args):

    d = int(args.d)
    nuse = int(args.n)
    off = int(args.o)
    random_dims = int(args.r)

    random_sampling = True
    if args.q == 'y':
        random_sampling = False

    lsh = LSHash(64,
                 d,
                 random_sampling,
                 args.t,
                 args.u,
                 args.host,
                 random_dims,
                 1,
                 storage_config=args.s,
                 matrices_filename='project_plane.npz')

    np_feature_vecs = load_features(args.f, args.v, nuse, d, lsh, args.e, off,
                                    args.i)

    return (lsh, np_feature_vecs)
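Note: the long positional argument list here (the random-sampling flag, args.t, args.u, args.host, random_dims) goes beyond the stock lshash constructor, which takes only hash_size, input_dim, num_hashtables plus the storage_config/matrices_filename keywords; this example appears to target an extended fork of the library.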
Example #5
 def test_lshash_redis_extra_val(self):
     """
     Test external lshash module
     """
     config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
     lsh = LSHash(self.hash_size, self.input_dim, 1, config)
     for i in range(self.nb_elements):
         lsh.index(list(self.els[i]), self.el_names[i])
         lsh.index(list(self.els[i]),
                   self.el_names[i])  # multiple insertions
     hasht = lsh.hash_tables[0]
     itms = [hasht.get_list(k) for k in hasht.keys()]
     for itm in itms:
         assert itms.count(itm) == 1
         for el in itm:
             assert el[0] in self.els
             assert el[1] in self.el_names
     for el in self.els:
         res = lsh.query(list(el), num_results=1,
                         distance_func='euclidean')[0]
         # vector and name are in the first element of the tuple res[0]
         el_v, el_name = res[0]
         # the distance is in the second element of the tuple
         el_dist = res[1]
         assert el_v in self.els
         assert el_name in self.el_names
         assert el_dist == 0
     del lsh
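The two Redis tests above also show the two shapes a query result can take, depending on whether extra_data was supplied at index time. A minimal sketch of the difference, using toy 4-dimensional vectors:

from lshash import LSHash

lsh_plain = LSHash(6, 4)
lsh_plain.index([0, 1, 2, 3])
vec, dist = lsh_plain.query([0, 1, 2, 3], num_results=1)[0]  # (vector, distance)

lsh_named = LSHash(6, 4)
lsh_named.index([0, 1, 2, 3], extra_data="point-a")
(vec, name), dist = lsh_named.query([0, 1, 2, 3], num_results=1)[0]  # ((vector, extra_data), distance)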
Example #6
def write_json_lsh(hash_size, grid):
    '''
    Put the generated LSH road results into JSON and save them.
    :param hash_size: list of hash sizes
    :param grid: the processed grid array
    :return: none
    '''
    data_lsh = {}
    for size in hash_size:
        print(size)
        print('list')
        data_lsh[size] = []
        lsh = LSHash(size, 44107)
        count = 0
        for line in grid:
            lsh.index(line, extra_data=count)
            count += 1
        for id in road_id:
            roads = []
            res = lsh.query(grid[id])
            print(len(res))
            for r in res:
                roads.append(pack_data(r[0][1]))
            data_lsh[size].append({id: roads})

    import json
    with open('result_lsh.json', 'w') as f:
        json.dump(data_lsh, f)  # write real JSON rather than the dict's repr
Example #7
def create_feature(list_author, net):
    global example_image_dir
    list_feature = list()
    image_paths = list()
    ## Locality Sensitive Hashing
    k = 10  # hash size
    L = 5  # number of tables
    d = 58  # Dimension of Feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)
    for subfolder in list_author.keys():
        subfolder_path = os.path.join(example_image_dir, subfolder)
        count_items = len([
            name for name in os.listdir(subfolder_path)
            if os.path.isfile(os.path.join(subfolder_path, name))
        ])
        # print(subfolder)
        sum_acc = 0
        sum_confiden = 0

        for img in os.listdir(subfolder_path):
            image_path = os.path.join(subfolder_path, img)
            author, confidence, feature = predict_author_single_img(
                net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)
    with open('lsh.p', "wb") as f:
        pickle.dump(lsh, f)
    return lsh, image_paths, list_feature
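In this snippet k, L, and d map directly onto the LSHash constructor: a larger hash_size (k) makes buckets finer so fewer candidates collide, more hashtables (L) raise the chance that true neighbours share a bucket in at least one table, and d must match the feature length produced by predict_author_single_img.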
Example #8
 def test_lshash_extra_val(self):
     lsh = LSHash(self.hash_size,
                  self.input_dim,
                  1,
                  storage_config={'dict': None})
     for i in range(self.nb_elements):
         lsh.index(list(self.els[i]), self.el_names[i])
     hasht = lsh.hash_tables[0]
     itms = [hasht.get_list(k) for k in hasht.keys()]
     for itm in itms:
         for el in itm:
             self.assertIn(el[0], self.els)
             self.assertIn(el[1], self.el_names)
     for el in self.els:
         # res is a list, so we need to select the first entry only
         res = lsh.query(list(el), num_results=1,
                         distance_func='euclidean')[0]
         # vector and name are in the first element of the tuple res[0]
         el_v, el_name = res[0]
         # the distance is in the second element of the tuple
         el_dist = res[1]
         self.assertIn(el_v, self.els)
         self.assertIn(el_name, self.el_names)
         self.assertEqual(el_dist, 0)
     del lsh
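Examples #3, #5, and #8 differ mainly in the storage_config argument: {'dict': None} keeps the hash tables in an in-memory Python dict (the default), while the "redis" form persists them in a Redis database. A sketch of the two forms side by side, with the host/port values taken from the tests above (the Redis variant needs the redis package, and a reachable server before any index/query call):

from lshash import LSHash

in_memory = LSHash(6, 8, 1, storage_config={'dict': None})
redis_backed = LSHash(6, 8, 1,
                      storage_config={"redis": {"host": "localhost", "port": 6379, "db": 15}})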
Example #9
def knn(data_array, data, hash_size_input, data_shape):

    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])

    # get a random pos
    vipno_pos = rd.randint(0, data_shape[1] - 1)  # randint is inclusive at both ends

    # calculate and output
    for k in [1, 2, 3, 4, 5]:
        print('hash size: %d' % hash_size_input)
        print('value k: %d' % k)
        print('target vipno: %d' % data.columns[vipno_pos])

        result = []
        for res in lsh.query(data_array[:, vipno_pos],
                             num_results=k + 1,
                             distance_func='euclidean'):
            result.append(res[0][1])

        print('results: ')
        print(result[1:])
Example #10
def k_nn_lsh(k, word, decade_matrix, index_dict):
    index_dict = dict(map(reversed, index_dict.items()))
    num_rows = decade_matrix.get_shape()[0]
    lsh = LSHash(6, num_rows)
    for i in range(num_rows):
        print(i)
        lsh.index(decade_matrix.getrow(i).todense())
    return lsh.query(word)
Example #11
    def Mainfunc(self, mat_addr):
        np.set_printoptions(suppress=True, precision=6, threshold=8)
        s = sio.loadmat(mat_addr)
        svec = s['FFE']
        datalen = len(svec)
        n1, n2, n3 = np.shape(svec)
        data = np.zeros((n1, 87212))
        m = 0
        for i in range(n2):
            for j in range(n3):
                if svec[:, i, j].all() != 0:
                    data[:, m] = svec[:, i, j]
                    m = m + 1
        # print data[:,0]
        dataves = np.transpose(data)
        modelindex = list(set(np.random.randint(1, 87212, size=10000)))

        lsh_model = LSHash(7, n1)
        for jj in modelindex:
            lsh_model.index(dataves[jj, :])

        # to test the program: query a random sample of the indexed vectors
        starttest = 1  # start test index
        endtest = 5
        testindex = random.sample(modelindex, 1)  # sample size = number of test queries

        test = np.zeros((len(testindex), n1))
        for i in range(len(testindex)):
            #  print dataves[testindex[i],:]
            test[i, :] = dataves[testindex[i], :]
        # print len(test)
        output = open('result.txt', 'w')
        timee = open('time.txt', 'w')
        for queryi in range(len(testindex)):
            if test[queryi, :].all() != 0:
                starttime = time.time()
                Atemp = lsh_model.query(test[queryi, :], 5, 'cosine')
                print(str(Atemp[0]).split(')')[0].replace('(', ''))
                # write the five nearest results, one per line
                for hit in Atemp[:5]:
                    output.write(str(hit).split(')')[0].replace('(', '') + '\n')

                endtime = time.time()
                timee.write(str(endtime - starttime) + '\n')
                # output.write(A)
                output.write('\n')

        output.close()
        timee.close()
Example #12
    def build_index(self, X):
        f = X.shape[1]
        n = X.shape[0]

        lsh = LSHash(hash_size=32, input_dim=f, num_hashtables=100)
        for i in range(n):
            lsh.index(X[i], i)

        return lsh
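Since i is passed positionally as extra_data, every stored entry carries its row number. A self-contained sketch of the same pattern with toy data (the array shape and query are made up for illustration):

import numpy as np
from lshash import LSHash

X = np.random.rand(100, 16)
lsh = LSHash(hash_size=32, input_dim=16, num_hashtables=100)
for i in range(X.shape[0]):
    lsh.index(X[i], i)  # row index stored as extra_data

# each hit is ((vector, row_index), distance); X[0] will at least match itself
for (vec, row_id), dist in lsh.query(X[0], num_results=3):
    print(row_id, dist)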
Example #13
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    with open(bits_tid_pickle, "rb") as f:
        data = pickle.load(f)
    # These parameters can be tuned; see https://github.com/kayzh/LSHash for details.
    lsh = LSHash(13, 128, num_hashtables=1)
    for key in data.keys():
        lsh.index(np.array([int(tmp) for tmp in key]))
    with open(lsh_pickle, "wb") as out:
        pickle.dump(lsh, out, -1)
Example #14
def lshSearch(dataBase2, test2, num):

    lsh = LSHash(30, 216)

    def CreateIndex(array):
        for item in array:
            lsh.index(item)
    CreateIndex(dataBase2)
    test2 = test2.reshape((216,))
    res = lsh.query(test2, num, distance_func='true_euclidean')
    return res
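The distance_func here differs from the test snippets: lshash computes 'euclidean' as a squared Euclidean distance for speed, while 'true_euclidean' requests the exact norm; the ranking is the same, only the reported distance values differ.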
Example #15
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    with open(bits_tid_pickle, "rb") as f:
        data = pickle.load(f)

    # input_dim=10 matches the length of the binary keys (github.com/kayzh/LSHash)
    lsh = LSHash(13, 10, num_hashtables=1)
    for key in data.keys():
        lsh.index(np.array([int(tmp) for tmp in key]))
    with open(lsh_pickle, "wb") as out:
        pickle.dump(lsh, out, -1)
Example #16
def get_lshash(filename):
    lsh = LSHash(30, 8)
    try:
        with open(filename) as f:
            content = f.readlines()
            content = [x.strip('\n') for x in content]
    except Exception:
        print("Cannot find the file.")
        return lsh  # avoid a NameError below if the file could not be read

    for row in content:
        row = row.split(",")
        row = list(map(int, row))
        tmp = row[:8]
        lsh.index(tmp, str(row[8]))

    return lsh
Example #17
def generateSingleHash(X, planesFileName, n_bits=64):
    """
    Generate a n_bits long hash for each input in X
    :param X:
    :param n_bits:
    :return:
    """
    import utils

    # overwrite old matrixes an build some random new ones
    fileName = os.path.join(utils.lsh_planes_dir, planesFileName + '.npz')
    lsh = LSHash(n_bits,
                 np.shape(X)[0],
                 matrices_filename=fileName,
                 overwrite=False)

    return lsh._hash(lsh.uniform_planes[0], X.tolist())
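Note that _hash and uniform_planes are private internals of LSHash (hence the leading underscore): this example bypasses index/query entirely and just projects X onto the first table's stored random planes to obtain the raw bit string, so it may break across lshash versions. Examples #24 and #27 rely on the same trick.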
Example #18
def lshTOfind(path):
    lsh = LSHash(50, 361)
    with open('newindex.csv') as f:
        index = csv.reader(f)
        for r in index:
            features = [int(float(i)) for i in r[1:]]
            lsh.index(features)
    try:
        f_v = getfeatures(path)
        ans = lsh.query(f_v)
        if ans:
            return searchid(int(ans[0][0][360] / 10000))
    except Exception:
        return []
Example #19
def knn(data_array, data, hash_size_input, data_shape, vipno_pos, k):

    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])

    # calculate and output
    result = []
    for res in lsh.query(data_array[:, vipno_pos],
                         num_results=k + 1,
                         distance_func='euclidean'):
        result.append(res[0][1])

    return result[1:]
Example #20
def test_lshash():
    lsh = LSHash(6, 8, 1)
    for i in range(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # res is a tuple containing the vector and the distance
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
Example #21
def classify_nearest_neighbor_lsh(k):
    lsh = LSHash(3, 12)
    labels = load_labels()

    for genre, song_genres_ids in labels.groupby('category'):
        print('Indexing genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2)):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id),
                               header=None)
            for val in song.values:
                lsh.index(val, extra_data=genre)

    total_count = 0
    match_count = 0
    for genre, song_genres_ids in labels.groupby('category'):
        print('Expected genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2), num_values):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id),
                               header=None)
            genre_freqs = {}

            split_song = np.array_split(song, 5,
                                        axis=0)  # Split song into sections
            for s in split_song:
                avg_song_val = np.mean(s)  # Take average of each section
                neighbours = lsh.query(avg_song_val, num_results=k)
                for neighbour in neighbours:
                    neighbour_genre = neighbour[0][1]  # extra_data stored at index time
                    genre_freqs[neighbour_genre] = genre_freqs.get(neighbour_genre, 0) + 1

            actual_genre = max(genre_freqs, key=genre_freqs.get)
            print('Predicted genre: {}'.format(actual_genre))
            total_count += 1
            if genre == actual_genre:
                match_count += 1

    print('Matched {} out of {} songs: {}%'.format(
        match_count, total_count, (match_count / total_count) * 100))
Example #22
def detect_subevent(filename):
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    outputVector = []
    tempDict = {}
    outputdict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, len(dictionary))
    index = 0
    count = 0
    for index in range(len(corpus)):
        #print str(index)+",",
        #print corpus[index]
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        #print getSparseVector(denseVector)
        result = lsh.query(denseVector, num_results=5, distance_func="cosine")
        #print result
        #no similar tweets
        count += 1
        if not result:
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
            #continue

        else:
            for r in result:
                if tempDict[getSparseVector(r[0])] in outputdict:
                    outputdict[tempDict[getSparseVector(r[0])]].append(index)
                    break
        #print count,

    #print outputdict
    with open(outputFile, 'w') as out:
        for key in outputdict:
            line = str(key)
            for i in outputdict[key]:
                line += ", " + str(i)
            out.write(line + "\n")

    print "Please check the output file:", outputFile
Example #23
def create_feature(train_image_dir, classes, net):
    list_feature = list()
    image_paths = list()
    ## Locality Sensitive Hashing
    k = 10 # hash size
    L = 5  # number of tables
    d = 58 # Dimension of Feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)
    for each_object in classes:
        each_object_path = os.path.join(train_image_dir, each_object)
        list_img = next(os.walk(each_object_path))[2]
        print("hashing class: ", each_object, " which has: ", len(list_img))
        for img in list_img:
            image_path = os.path.join(each_object_path, img)
            feature = get_feature_single_img(net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)
    return lsh, image_paths, list_feature
Example #24
def test():
    import utils

    trueIds, testSet = utils.load_test_set('fc7', 'raw', 0)

    lsh = LSHash(128,
                 np.shape(testSet[0])[0],
                 matrices_filename='lsh_planes.data.npz',
                 overwrite=True)

    for idx, input_point in enumerate(testSet):
        hash_value = lsh._hash(lsh.uniform_planes[0], input_point.tolist())
        print(hash_value)

        lsh.index(input_point, idx)

    print(lsh.query(testSet[3], 3))

    return None
Example #25
 def test_lshash(self):
     lsh = LSHash(self.hash_size, self.input_dim, 1)
     for i in range(self.nb_elements):
         lsh.index(list(self.els[i]))
         lsh.index(list(self.els[i]))  # multiple insertions
     hasht = lsh.hash_tables[0]
     itms = [hasht.get_list(k) for k in hasht.keys()]
     for itm in itms:
         self.assertEqual(itms.count(itm), 1)
         for el in itm:
             self.assertIn(el, self.els)
     for el in self.els:
         res = lsh.query(list(el), num_results=1,
                         distance_func='euclidean')[0]
         # res is a tuple containing the vector and the distance
         el_v, el_dist = res
         self.assertIn(el_v, self.els)
         self.assertEqual(el_dist, 0)
     del lsh
Example #26
def index_room():
    '''
    Index image features with the LSH algorithm.
    :return:
    '''
    files = glob("./data/features/*.csv")
    files_ids = [
        filename.split("\\")[-1].replace(".csv", "") for filename in files
    ]
    X = np.load("data/train.npy")
    X = X.reshape(X.shape[0], -1)
    encoder = load_model("data/encoder.h5")
    dimension = 100
    lsh_hash = LSHash(hash_size=32, input_dim=dimension)
    compress_feature = encoder.predict(X)
    for num, ele in enumerate(compress_feature.tolist()):
        lsh_hash.index(ele, extra_data=files_ids[num])
    with open("data/lsh.pkl", "wb") as fh:
        pickle.dump(lsh_hash, fh)
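A hypothetical counterpart that reloads the pickled index and queries it; the file name and the 100-dimensional feature size follow the function above, and the zero vector is only a placeholder for a real encoder feature:

import pickle

with open("data/lsh.pkl", "rb") as fh:
    lsh_hash = pickle.load(fh)

query_vec = [0.0] * 100  # placeholder: substitute a real encoder.predict(...) feature
# each hit is ((vector, file_id), distance)
for (vec, file_id), dist in lsh_hash.query(query_vec, num_results=5):
    print(file_id, dist)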
Example #27
def generateHashes(X, scalar, planesFileName, n_bits=64):
    """
    Generate a n_bits long hash for each input in X
    :param X:
    :param n_bits:
    :return:
    """
    import utils

    # overwrite old matrixes an build some random new ones
    fileName = os.path.join(utils.lsh_planes_dir, planesFileName + '.npz')
    lsh = LSHash(n_bits,
                 np.shape(X[0])[0],
                 matrices_filename=fileName,
                 overwrite=False)
    hashValues = []
    for input_point in X:
        input_point = scalar.transform(input_point)
        hashValues.append(lsh._hash(lsh.uniform_planes[0], input_point))

    return hashValues
Example #28
def test_lshash_extra_val():
    lsh = LSHash(6, 8, 1)
    for i in range(num_elements):
        lsh.index(list(els[i]), el_names[i])
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        # res is a list, so we need to select the first entry only
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
Example #29
def lshTOfind(path):
    lsh = LSHash(10, 360)
    with open('copyindex.csv') as f:
        index = csv.reader(f)
        for r in index:
            features = [float(i) for i in r[1:]]
            lsh.index(features[:360], extra_data=features[360])
    try:
        f_v = getfeatures(path)
        #print f_v
        ans = lsh.query(f_v[:360], 15)
        if ans:
            return [int(i[0][1] / 10000) for i in ans]
        #searchid(int(ans[0][0][360]/10000))
    except Exception:
        return []
Example #30
def create_hash2img():
    img2gist = get_img2gist()
    lsh = LSHash(hash_len, 960, storage_config=redis_config,
                 matrices_filename=matrices_file)
    count = 0
    total_num = len(img2gist)
    for name, gist_v in img2gist.items():
        count += 1
        lsh.index(gist_v, name)
        sys.stdout.write('%d/%d\r    ' % (count, total_num))
        sys.stdout.flush()

    print('bucket ratio: %d/%d' % (len(lsh.hash_tables[0].keys()), 2 ** hash_len))
    return lsh
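Examples #1 and #30 share module-level globals (hash_len, matrices_file, redis_rdb, redis_config) defined elsewhere in their project. A hypothetical set of definitions, with the storage_config layout borrowed from Example #3; all values are illustrative:

hash_len = 13
matrices_file = 'lsh_matrices.npz'      # saved random projection planes
redis_rdb = '/var/lib/redis/dump.rdb'   # an existing dump file signals a built index
redis_config = {"redis": {"host": "localhost", "port": 6379, "db": 0}}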