def test():
    import utils
    import numpy as np
    from lshash import LSHash

    trueIds, testSet = utils.load_test_set('fc7', 'raw', 0)

    lsh = LSHash(128, np.shape(testSet[0])[0], matrices_filename='lsh_planes.data.npz', overwrite=True)

    for idx, input_point in enumerate(testSet):
        hash_value = lsh._hash(lsh.uniform_planes[0], input_point.tolist())
        print(hash_value)

        lsh.index(input_point, idx)

    print(lsh.query(testSet[3], 3))

    return None
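
# The _hash call above reaches into lshash internals just to print each bucket key.
# A minimal sketch of the same index/query flow using only the public API; the
# 4096-dim fc7 feature size and the synthetic data are assumptions, not part of
# utils.load_test_set.
import numpy as np
from lshash import LSHash

test_set = np.random.rand(10, 4096)  # stand-in for the fc7 test set

lsh = LSHash(128, test_set.shape[1])
for idx, point in enumerate(test_set):
    lsh.index(point, extra_data=idx)

print(lsh.query(test_set[3], num_results=3))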
Example #2
 def test_lshash_extra_val(self):
     lsh = LSHash(self.hash_size,
                  self.input_dim,
                  1,
                  storage_config={'dict': None})
     for i in range(self.nb_elements):
         lsh.index(list(self.els[i]), self.el_names[i])
     hasht = lsh.hash_tables[0]
     itms = [hasht.get_list(k) for k in hasht.keys()]
     for itm in itms:
         for el in itm:
             self.assertIn(el[0], self.els)
             self.assertIn(el[1], self.el_names)
     for el in self.els:
         # res is a list, so we need to select the first entry only
         res = lsh.query(list(el), num_results=1,
                         distance_func='euclidean')[0]
          # vector and name are in the first element of the tuple res[0]
         el_v, el_name = res[0]
         # the distance is in the second element of the tuple
         el_dist = res[1]
         self.assertIn(el_v, self.els)
         self.assertIn(el_name, self.el_names)
         self.assertEqual(el_dist, 0)
     del lsh
def plot_similar_tats_idx(idx: int,
                          feature_dict: dict,
                          lsh_variable: LSHash,
                          n_items: int = 6,
                          distance_func: str = 'hamming') -> plt.Figure:
    """Takes an input index for the training set and plots the closest matching tattoos to that input
    tattoo.

    Args:
        idx : index of the tattoo in the training set
        feature_dict : maps image locations to the feature vectors taken from the
        CNN output before the final layer.
        lsh_variable : trained lsh model used to query the input image
        n_items : number of items to return
        distance_func : The distance function. Currently it needs to be one of ("hamming",
        "euclidean", "true_euclidean", "centred_euclidean", "cosine", "l1norm"). By default
        "hamming" will be used.

    Returns:
        Matplotlib grid plot of the index image first and n other similar images.

    """
    response = lsh_variable.query(feature_dict[list(
        feature_dict.keys())[idx]].flatten(),
                                  num_results=n_items + 1,
                                  distance_func=distance_func)

    return plot_similar_tats_query(response,
                                   n_items=n_items + 1,
                                   distance_func=distance_func)
Example #4
def test_lshash_redis():
    """
    Test external lshash module
    """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    sr = StrictRedis(**config['redis'])
    sr.flushdb()

    lsh = LSHash(6, 8, 1, config)
    for i in range(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions should be prevented by the library
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert itms.count(itm) == 1  # have multiple insertions been prevented?
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
    sr.flushdb()
Example #5
def test_lshash_redis_extra_val():
    """
    Test external lshash module
    """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    sr = StrictRedis(**config['redis'])
    sr.flushdb()

    lsh = LSHash(6, 8, 1, config)
    for i in range(num_elements):
        lsh.index(list(els[i]), el_names[i])
        lsh.index(list(els[i]), el_names[i])  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
    sr.flushdb()
Example #6
def knn(data_array, data, hash_size_input, data_shape):

    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])

    # get a random pos
    vipno_pos = rd.randint(0, data_shape[1])

    # calculate and output
    for k in [1, 2, 3, 4, 5]:
        print('hash size: %d' % hash_size_input)
        print('value k: %d' % k)
        print('target vipno: %d' % data.columns[vipno_pos])

        result = []
        for res in lsh.query(data_array[:, vipno_pos],
                             num_results=k + 1,
                             distance_func='euclidean'):
            result.append(res[0][1])

        print('results: ')
        print(result[1:])
Example #7
 def test_lshash_redis_extra_val(self):
     """
     Test external lshash module
     """
     config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
     lsh = LSHash(self.hash_size, self.input_dim, 1, config)
     for i in range(self.nb_elements):
         lsh.index(list(self.els[i]), self.el_names[i])
         lsh.index(list(self.els[i]),
                   self.el_names[i])  # multiple insertions
     hasht = lsh.hash_tables[0]
     itms = [hasht.get_list(k) for k in hasht.keys()]
     for itm in itms:
         assert itms.count(itm) == 1
         for el in itm:
             assert el[0] in self.els
             assert el[1] in self.el_names
     for el in self.els:
         res = lsh.query(list(el), num_results=1,
                         distance_func='euclidean')[0]
          # vector and name are in the first element of the tuple res[0]
         el_v, el_name = res[0]
         # the distance is in the second element of the tuple
         el_dist = res[1]
         assert el_v in self.els
         assert el_name in self.el_names
         assert el_dist == 0
     del lsh
def traceLSHash(queryName, hashSize):
    #queryName ="hamming_query_12_3"
    # trajectory indexes to run the hash query on
    indexList = [14, 249, 479, 689, 899]

    XYMatrix = DateTransform()

    resultList = []
    nearList = []

    lsh = LSHash(hashSize, 44107)
    tid = 1

    for traceList in XYMatrix:
        lsh.index(input_point=traceList, extra_data=tid)
        tid += 1

    resultFile = open(queryName + '.txt', 'w')

    for index in indexList:
        queryList = lsh.query(XYMatrix[index], distance_func="hamming")
        for result in queryList:
            resultStr = str(index + 1) + " : " + str(result[0][1]) + " " + str(
                result[1]) + "\n"
            nearList.append(result[0][1])
            resultFile.write(resultStr)
        resultList.append(nearList)
        nearList = []

    resultFile.close()

    writeHTML(resultList, queryName, "hashQuery")
    print(resultList)
Example #9
    def test_lshash_redis(self):
        """
        Test external lshash module
        """
        config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
        lsh = LSHash(self.hash_size, self.input_dim, 1, config)
        for i in range(self.nb_elements):
            lsh.index(list(self.els[i]))
            lsh.index(
                list(self.els[i])
            )  # multiple insertions should be prevented by the library

        hasht = lsh.hash_tables[0]
        itms = [hasht.get_list(k) for k in hasht.keys()]

        for itm in itms:
            for el in itm:
                assert itms.count(
                    itm) == 1  # have multiple insertions been prevented?
                assert el in self.els

        for el in self.els:
            res = lsh.query(list(el), num_results=1,
                            distance_func='euclidean')[0]
            el_v, el_dist = res
            assert el_v in self.els
            assert el_dist == 0
        del lsh
Example #10
def write_json_lsh(hash_size, grid):
    '''
    Build LSH indexes over the generated routes and store the query results as JSON.
    :param hash_size: list of hash sizes
    :param grid: the processed raster grid array
    :return: none
    '''
    import json

    data_lsh = {}
    for size in hash_size:
        print(size)
        print('list')
        data_lsh[size] = []
        lsh = LSHash(size, 44107)
        count = 0
        for line in grid:
            lsh.index(line, extra_data=count)
            count += 1
        for id in road_id:
            roads = []
            res = lsh.query(grid[id])
            print(len(res))
            for r in res:
                roads.append(pack_data(r[0][1]))
            data_lsh[size].append({id: roads})

    with open('result_lsh.json', 'w') as f:
        json.dump(data_lsh, f)  # write real JSON rather than the Python repr of the dict
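
# A hedged sketch of driving write_json_lsh: road_id and pack_data are read from
# module scope inside the function, so both are stubbed here as assumptions; the
# 44107 input dimension is hardcoded in the function above.
import numpy as np

road_id = [14, 49, 79]                     # trajectory ids to query (assumed global)
pack_data = lambda rid: {'road_id': rid}   # stand-in for the real packer (assumed)
grid = np.random.rand(100, 44107)          # stand-in raster grid

write_json_lsh(hash_size=[10], grid=grid)  # writes result_lsh.json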
Example #11
def k_nn_lsh(k, word, decade_matrix, index_dict):
    index_dict = dict(map(reversed, index_dict.items()))
    num_rows, num_cols = decade_matrix.get_shape()
    lsh = LSHash(6, num_cols)  # input_dim must match the length of each row vector
    for i in range(num_rows):
        print(i)
        lsh.index(decade_matrix.getrow(i).todense())
    return lsh.query(word)
def eventIdentification(dictionaryFile, corpusFile, outputFile):
	outputVector = []
	tempDict = {}
	
	corpus = corpora.MmCorpus(corpusFile)
	dictionary = corpora.Dictionary.load(dictionaryFile)
	#print "Unique Tokens:", dictionary.__len__()
	lsh = LSHash(20, dictionary.__len__())
	index = 0
	for index in range(len(corpus)):
		denseVector = getDenseVector(corpus[index], lsh.input_dim)
		result = lsh.query(denseVector)
		
		#print denseVector
		
		#no similar tweets
		if(result == []):
			#print "No Similar Tweets for: ", index
			tempDict[tuple(denseVector)] = len(outputVector)
			outputVector.append([index])
			lsh.index(denseVector)
			continue
		
		assigned = False
		for vector in result:
			if(getDistance(vector, denseVector) == True):
				ev = tempDict[tuple(vector[0])]
				outputVector[ev].append(index)
				tempDict[tuple(denseVector)] = ev
				#for ind in range(len(outputVector)):
					#done = False
					#for tweetNo in outputVector[ind]:
						#if (tweetNo == tempDict[tuple(vector[0])]):
							#outputVector[ind].append(index)
							#done = True
							#break
					#if done == True:
						#break
				assigned = True
				break
		
		if assigned == False:
			tempDict[tuple(denseVector)] = len(outputVector)
			outputVector.append([index])
			
		lsh.index(denseVector)
		
		
	with open(outputFile, 'w') as out:
		for vector in outputVector:
			line = ""
			for index in vector:
				line += "," + str(index)
			out.write(line[1:]+"\n")
	
	del outputVector
	del tempDict
Example #13
    def Mainfunc(self, mat_addr):
        np.set_printoptions(suppress=True, precision=6, threshold=8)
        s = sio.loadmat(mat_addr)
        svec = s['FFE']
        datalen = len(svec)
        n1, n2, n3 = np.shape(svec)
        data = np.zeros((n1, 87212))
        m = 0
        for i in range(n2):
            for j in range(n3):
                if svec[:, i, j].all() != 0:
                    data[:, m] = svec[:, i, j]
                    m = m + 1
        # print data[:,0]
        dataves = np.transpose(data)
        modelindex = list(set(np.random.randint(1, 87212, size=10000)))

        lsh_model = LSHash(7, n1)
        for jj in modelindex:
            lsh_model.index(dataves[jj, :])

        # if you want to test a program
        starttest = 1  # start test index
        endtest = 5
        testindex = random.sample(modelindex,
                                  1)  # SIZE IS THE NUMBER OF TEST FUNCTIONS

        test = np.zeros((len(testindex), n1))
        for i in range(len(testindex)):
            #  print dataves[testindex[i],:]
            test[i, :] = dataves[testindex[i], :]
        # print len(test)
        output = open('result.txt', 'w')
        timee = open('time.txt', 'w')
        for queryi in range(len(testindex)):
            if test[queryi, :].all() != 0:
                starttime = time.time()
                Atemp = lsh_model.query(test[queryi, :], 5, 'cosine')
                print((str(Atemp[0]).split(')')[0]).replace('(', ''))
                # write the five nearest results, stripped of tuple punctuation
                for res in Atemp[:5]:
                    output.write((str(res).split(')')[0]).replace('(', '') + '\n')

                endtime = time.time()
                timee.write(str(endtime - starttime) + '\n')
                # output.write(A)
                output.write('\n')

        output.close()
        timee.close()
Example #15
def lshSearch(dataBase2, test2, num):

    lsh = LSHash(30, 216)

    def CreateIndex(array):
        for item in array:
            lsh.index(item)
    CreateIndex(dataBase2)
    test2 = test2.reshape((216,))
    res = lsh.query(test2, num, distance_func='true_euclidean')
    return res
Example #16
def lshTOfind(path):
    lsh = LSHash(50,361)
    f = open('newindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [int(float(i)) for i in r[1:]]
        lsh.index(features)
        count += 1
    try:
        f_v = getfeatures(path)
        ans = lsh.query(f_v)
        if ans != []:
            return searchid(int(ans[0][0][360]/10000))
    except Exception:
        return []
Example #17
def knn(data_array, data, hash_size_input, data_shape, vipno_pos, k):

    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])

    # calculate and output
    result = []
    for res in lsh.query(data_array[:, vipno_pos],
                         num_results=k + 1,
                         distance_func='euclidean'):
        result.append(res[0][1])

    return result[1:]
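
# For reference, a sketch of calling this variant. The DataFrame layout (one
# column per vipno, features down the rows) is inferred from the indexing loop
# above; the data itself is synthetic.
import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.rand(64, 20),
                    columns=['vip_{}'.format(i) for i in range(20)])
data_array = data.values

neighbours = knn(data_array, data, hash_size_input=8,
                 data_shape=data_array.shape, vipno_pos=3, k=5)
print(neighbours)  # up to k vipno labels nearest to column 3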
Example #18
class LshIndexer(Indexer):
	PARAMETERS = {'hash_size': 6,
				  'input_dim': 128,
				  'num_of_hashtables': 1,
				  'storage': {'redis': {'host':'localhost', 'port': 6379}}}

	def initialize_store(self, parameters):
		self.store = LSHash(parameters['hash_size'],
							parameters['input_dim'],
							parameters['num_of_hashtables'],
							parameters['storage'])

	def index(self, features):
		for feature in features:
			self.store.index(feature.data, feature.file_id)

	def query(self, feature, num_results=5):
		return self.store.query(feature, num_results)
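
# A sketch of exercising LshIndexer. The Feature record with .data and .file_id
# is a hypothetical stand-in inferred from index(); a no-argument Indexer
# constructor and a local redis on port 6379 are also assumptions.
from collections import namedtuple

Feature = namedtuple('Feature', ['data', 'file_id'])  # hypothetical record shape

indexer = LshIndexer()
indexer.initialize_store(LshIndexer.PARAMETERS)
indexer.index([Feature(data=[0.1] * 128, file_id='img_001.jpg'),
               Feature(data=[0.9] * 128, file_id='img_002.jpg')])
print(indexer.query([0.1] * 128, num_results=2))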
Example #19
def test_lshash():
    lsh = LSHash(6, 8, 1)
    for i in range(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # res is a tuple containing the vector and the distance
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
def classify_nearest_neighbor_lsh(k):
    lsh = LSHash(3, 12)
    labels = load_labels()

    for genre, song_genres_ids in labels.groupby('category'):
        print('Indexing genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2)):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id),
                               header=None)
            for val in song.values:
                lsh.index(val, extra_data=genre)

    total_count = 0
    match_count = 0
    for genre, song_genres_ids in labels.groupby('category'):
        print('Expected genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2), num_values):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id),
                               header=None)
            genre_freqs = {}

            split_song = np.array_split(song, 5,
                                        axis=0)  # Split song into sections
            for s in split_song:
                avg_song_val = np.mean(s)  # Take average of each section
                neighbours = lsh.query(avg_song_val, num_results=k)
                for neighbour in neighbours:
                    neighbour_genre = neighbour[0][1]
                    genre_freqs[neighbour_genre] = genre_freqs.get(neighbour_genre, 0) + 1

            predicted_genre = max(genre_freqs, key=genre_freqs.get)
            print('Predicted genre: {}'.format(predicted_genre))
            total_count += 1
            if genre == predicted_genre:
                match_count += 1

    print('Matched {} out of {} songs: {}%'.format(
        match_count, total_count, (match_count / total_count) * 100))
def detect_subevent(filename):
	dictionaryFile = filename + ".dict"
	corpusFile = filename + ".mm"
	outputFile = filename + ".out"
	outputVector = []
	tempDict = {}
	outputdict={}
	corpus = corpora.MmCorpus(corpusFile)
	dictionary = corpora.Dictionary.load(dictionaryFile)
	lsh = LSHash(30, dictionary.__len__())
	index = 0
	for index in range(len(corpus)):
		#print str(index)+",",
		#print corpus[index]
		denseVector = getDenseVector(corpus[index], lsh.input_dim)
		#print getSparseVector(denseVector)
		result = lsh.query(denseVector, num_results = 50, distance_func = "euclidean")
		#print result
		#no similar tweets
		
		if(result == []):
			outputdict[index]=[]
			tempDict[getSparseVector(denseVector)] = index
			lsh.index(denseVector)
			#continue
		
		else:
			for r in result:
				if tempDict[getSparseVector(r[0])] in outputdict:
					outputdict[tempDict[getSparseVector(r[0])]].append(index)
					break
			
		
		
	#print outputdict
	with open(outputFile, 'w') as out:
		for key in outputdict.keys():
			line = str(key) 
			for i in outputdict[key]:
				line += ", " + str(i)
			out.write(line+"\n")
	
	print "Please check the output file:", outputFile
def detect_subevent(filename):
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    outputVector = []
    tempDict = {}
    outputdict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, dictionary.__len__())
    index = 0
    count = 0
    for index in range(len(corpus)):
        #print str(index)+",",
        #print corpus[index]
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        #print getSparseVector(denseVector)
        result = lsh.query(denseVector, num_results=5, distance_func="cosine")
        #print result
        #no similar tweets
        count += 1
        if (result == []):
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
            #continue

        else:
            for r in result:
                if tempDict[getSparseVector(r[0])] in outputdict:
                    outputdict[tempDict[getSparseVector(r[0])]].append(index)
                    break
        #print count,

    #print outputdict
    with open(outputFile, 'w') as out:
        for key in outputdict.keys():
            line = str(key)
            for i in outputdict[key]:
                line += ", " + str(i)
            out.write(line + "\n")

    print "Please check the output file:", outputFile
Example #23
 def test_lshash(self):
     lsh = LSHash(self.hash_size, self.input_dim, 1)
     for i in range(self.nb_elements):
         lsh.index(list(self.els[i]))
         lsh.index(list(self.els[i]))  # multiple insertions
     hasht = lsh.hash_tables[0]
     itms = [hasht.get_list(k) for k in hasht.keys()]
     for itm in itms:
         self.assertEqual(itms.count(itm), 1)
         for el in itm:
             self.assertIn(el, self.els)
     for el in self.els:
         res = lsh.query(list(el), num_results=1,
                         distance_func='euclidean')[0]
         # res is a tuple containing the vector and the distance
         el_v, el_dist = res
         self.assertIn(el_v, self.els)
         self.assertEqual(el_dist, 0)
     del lsh
Example #24
def test_lshash_extra_val():
    lsh = LSHash(6, 8, 1)
    for i in range(num_elements):
        lsh.index(list(els[i]), el_names[i])
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        # res is a list, so we need to select the first entry only
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
def subEventDetection(dictionaryFile, corpusFile, outputFile):
	outputVector = []
	tempDict = {}
	corpus = corpora.MmCorpus(corpusFile)
	dictionary = corpora.Dictionary.load(dictionaryFile)
	lsh = LSHash(30, dictionary.__len__())
	index = 0
	for index in range(len(corpus)):
		denseVector = getDenseVector(corpus[index], lsh.input_dim)
		result = lsh.query(denseVector, num_results = 50, distance_func = "cosine")
		#no similar tweets
		if(result == []):
			outputVector.append([index])
			continue
		assigned = False
		for vector in result:
			if(getDistance(vector, denseVector) == True):
				for ind in range(len(outputVector)):
					done = False
					for tweetNo in outputVector[ind]:
						if (tweetNo == tempDict[tuple(vector[0])]):
							outputVector[ind].append(index)
							done = True
							break
					if done == True:
						break
				assigned = True
				break
		if assigned == False:
			outputVector.append([index])
		lsh.index(denseVector)
		tempDict[tuple(denseVector)] = index
	with open(outputFile, 'w') as out:
		for vector in outputVector:
			line = ""
			for index in vector:
				line += ", " + str(index)
			out.write(line[2:]+"\n")
	print "Please check the output file:", outputFile
Example #26
def lshTOfind(path):
    lsh = LSHash(10, 360)
    f = open('copyindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [float(i) for i in r[1:]]
        lsh.index(features[:360], features[360])
        count += 1
    try:
        f_v = getfeatures(path)
        #print f_v
        ans = lsh.query(f_v[:360], 15)
        if ans != []:
            res = []
            for i in ans:
                res.append(int(i[0][1] / 10000))
            return res
        #searchid(int(ans[0][0][360]/10000))
    except Exception:
        return []
class Searcher:

    _DIST_FUNCTIONS = ["hamming", "euclidean", "true_euclidean", "centred_euclidean", "cosine", "l1norm"]
    index = None

    def __init__(self, dataset):
        self.create_index(dataset)

    def create_index(self, items, hash_size=6):
        input_dim = len(next(iter(items.values())))  # works on Python 3 dict views
        self.index = LSHash(hash_size, input_dim)
        for key in items:
            self.index.index(items[key], extra_data=key)
        return True

    def query(self, query_item, num_results=10, distance_function='cosine'):
        if distance_function not in self._DIST_FUNCTIONS:
            raise Exception("{0} not supported".format(distance_function))
        results = self.index.query(query_item, num_results=num_results, distance_func=distance_function)
        return self.parse_results(results)

    def parse_results(self, results):
        return {x[0][1]:x[1] for x in results}
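
# Usage sketch: the dataset is a dict of id -> vector (all the same length, since
# create_index reads the dimensionality off the first value), and results come
# back keyed by the extra_data ids.
dataset = {
    'a': [1, 2, 3, 4, 5, 6],
    'b': [2, 3, 4, 5, 6, 7],
    'c': [9, 1, 4, 1, 8, 2],
}
searcher = Searcher(dataset)
print(searcher.query([1, 2, 3, 4, 5, 6], num_results=2))  # e.g. {'a': 0.0, 'b': ...}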
Example #28
def filterDataset(fileIn, fileOut, fileNodes, threshold):
    '''
    Reads filteredTaxiData.txt and filters out lines that are farther than a
    threshold (0.1 mile) from every node in the OSM graph. A data entry is kept
    if its distance to at least one OSM node is below the threshold.
    '''
    # Dimension of our vector space
    lsh = LSHash(hash_size=10, input_dim=2)

    nodes = GetNodes(fileNodes)
    for node in nodes:
        v = np.array(node, dtype=float)
        lsh.index(v)
    bunch = []
    bunch_size = 5000
    count_lines_read = 0
    count_lines_written = 0
    with open(fileIn, "r") as fin, open(fileOut, "w") as fout:
        for line in fin:
            [latitude, longitude] = dataToGraph.lineToPoint(line)
            query = np.array((latitude, longitude), dtype=float)
            result = lsh.query(query, num_results=1)
            closest_node = result[0][0]
            count_lines_read += 1
            if vin((latitude, longitude), closest_node).miles < threshold:
                line = replacePointByOSMnode(line, closest_node)
                bunch.append(line)
                if len(bunch) == bunch_size:
                    fout.writelines(bunch)
                    count_lines_written += len(bunch)
                    bunch = []
                    if (count_lines_written % 10 == 0):
                        print("%d written / %d read" %
                              (count_lines_written, count_lines_read))
        fout.writelines(bunch)
        count_lines_written += len(bunch)
        print("%d lines written" % count_lines_written)
def hash_item_pic_v1(pic_folder):
    """
    当前版本采用HardNet直接进行特征输出,并且对整张图作为特征区域进行特征向量输出,会在图片所在同级
    目录输出一个图片与hash值编码的映射文件

    这种方案出来的结果是如果图片有平移则特征向量会有差距

    :param pic_folder:  所有图片所在文件夹
    :return:    是否成功
    """
    try:
        # 计算所有图片的特征向量
        desc = HardNetDescriptor()
        print(colored("HardNet模型加载完成", color='blue'))
        # 使用LSH
        lsh = LSHash(16, 128)
        img_feature_vector = {}
        with open(pic_folder + '_item_hash.txt', 'w') as to_write:
            img_file_list = glob(os.path.join(pic_folder, '*_[0-9].jpg'))
            for m_img_file in tqdm(img_file_list, desc='training'):
                fv = desc.describle([
                    np.array(
                        Image.open(m_img_file).convert('L').resize((32, 32))),
                ])[0]
                img_feature_vector[m_img_file] = fv
                lsh.index(fv, extra_data=m_img_file)
            for m_img_file in tqdm(img_file_list, desc='writing output'):
                res = lsh.query(img_feature_vector[m_img_file],
                                distance_func='centred_euclidean')
                # print all the nearby images
                print(m_img_file, '|'.join(map(lambda x: x[0][1], res)))
        return True
    except Exception as e:
        print(colored("错误:%s" % str(e), color='red'))
        return False
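
# Invocation takes just the folder. A sketch assuming the HardNet weights are
# available and the images follow the *_[0-9].jpg naming used by the glob above;
# 'item_pics' is a hypothetical path.
ok = hash_item_pic_v1('item_pics')  # writes item_pics_item_hash.txt next to the folder
print('succeeded' if ok else 'failed')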
Example #30
def main(argv):
    parser = argparse.ArgumentParser(prog='INDEX')
    parser.add_argument('source', help='path to the source metadata file')
    parser.add_argument('--hash-size', help='Hash size.', type=int, default=10)
    parser.add_argument('--num-tables',
                        help='Number of tables.',
                        type=int,
                        default=5)
    parser.add_argument('--query-index',
                        help='Index to use for query.',
                        type=int,
                        default=0)

    args = parser.parse_args(argv[1:])

    # read in the data file
    data = pandas.read_csv(args.source, sep='\t')

    # params
    k = args.hash_size  # hash size
    L = args.num_tables  # number of tables
    d = len(data['features'][0].split(','))

    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)

    # indexing
    for i in range(0, len(data)):
        lsh.index(np.asarray(data['features'][i].split(',')).astype('float64'),
                  extra_data=data['filename'][i])

    # query a vector q_vec
    response = lsh.query(
        np.asarray(
            data['features'][args.query_index].split(',')).astype('float64'))

    pprint(response)
Example #31
class feature_comparer():
    def __init__(self, fea_dim, compare_thresh):
        self.lsh = LSHash(bit_num, fea_dim, compare_kernel_num)
        self.fv_dict = {}
        self.compare_thresh = compare_thresh

    def load(self, filename):
        with open(filename, 'r') as f:
            for line in f:
                fv = line.split(':')[0]
                id = line.split(':')[1]
                self.fv_dict[fv] = id

                # parse "v1, v2, ..." back into a float vector and index it
                fv_array = [float(x) for x in fv[1:-1].split(',')]
                self.lsh.index(fv_array)

    def insert(self, feature, id):
        self.fv_dict[str(feature)[1:-1]] = str(id)
        self.lsh.index(feature)

    def match(self, feature):
        q = self.lsh.query(feature, distance_func='cosine')
        if len(q) == 0:
            return False, -1
        mindis = q[0][1]
        if mindis < self.compare_thresh:
            return True, self.fv_dict[str(q[0][0])[1:-1]]
        else:
            return False, -1
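
# bit_num and compare_kernel_num are globals the constructor reads; a sketch with
# assumed values, valid when they live in the same module as the class.
bit_num = 10            # assumed global: hash size in bits
compare_kernel_num = 1  # assumed global: number of hash tables

comparer = feature_comparer(fea_dim=128, compare_thresh=0.2)
comparer.insert([0.5] * 128, 1001)
matched, matched_id = comparer.match([0.5] * 128)
print(matched, matched_id)  # True and '1001' on an exact re-query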
from __future__ import print_function
from __future__ import division

from scipy.spatial.distance import cosine
from tqdm import tqdm
import numpy
from lshash import LSHash
import time

start = time.time()
lsh = LSHash(8, 300)
sample_word_embeds = []
for i in tqdm(range(20000)):
    word_embed = numpy.random.rand(300)
    lsh.index(word_embed)

    if i % 500 == 0:
        sample_word_embeds.append(word_embed)

print("Indexing takes {} seconds".format(time.time() - start))

start = time.time()
for word_embed in sample_word_embeds:
    print('-' * 80)
    results = lsh.query(word_embed, num_results=None, distance_func='cosine')
    print("Num result: {}".format(len(results)))
    print('Nearest neighbor cosine distance:')
    print("    {} | {}".format(results[1][1], cosine(results[1][0], word_embed)))

print('Query takes average {} seconds'.format((time.time() - start) / len(sample_word_embeds)))
Example #33
import pickle

import numpy
from lshash import LSHash

def vectorize(string):
    # encode up to 25 characters as their ordinal values, zero-padded
    vec = numpy.zeros(25, dtype=int)
    for i in range(len(string)):
        vec[i] = ord(string[i])
    return vec

def decode(vec):
    # drop the zero padding and map ordinals back to characters
    chars = [chr(int(v)) for v in vec]
    return ''.join(c for c in chars if c != '\x00')


lsh = LSHash(1, 25, storage_config={'dict':'9'}, matrices_filename = '../advs/THE FIVE ORANGE PIPS.npz')

f = open('../advs/THE FIVE ORANGE PIPS.tok', 'rb')
tok = pickle.load(f)

for word in tok:
    lsh.index(vectorize(word))

res = lsh.query(vectorize('orang'), num_results = 3,  distance_func = 'l1norm')

print(len(res))
print([decode(r[0]) for r in res])
Example #34
for i, sample in enumerate(samples[train_n:]):
    total = total + 1
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="cosine") # test rate: 91.326531, 196 files took 52901.431 ms
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="l1norm") # test rate: 91.326531, 196 files took 35271.345 ms
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="euclidean") # test rate: 90.816327, 196 files took 24904.888 ms
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="true_euclidean") # test rate: 89.795918, 196 files took 17713.646 ms
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="centred_euclidean") # test rate: 52.040816, 196 files took 9000.577 ms

    
    # BLOCKS = 1, ORIENTATIONS = (8, 8, 3), DIMENSION = 57, test rate: 89.285714, 196 files took 9997.003 ms
    # BLOCKS = 2, ORIENTATIONS = (8, 8, 3), DIMENSION = 228, test rate: 91.326531, 196 files took 17227.878 ms
    # BLOCKS = 3, ORIENTATIONS = (8, 8, 3), DIMENSION = 513, test rate: 98.469388, 196 files took 64944.190 ms
    # BLOCKS = 4, ORIENTATIONS = (8, 8, 4), DIMENSION = 960, test rate: 95.408163, 196 files took 47667.006 ms
    # BLOCKS = 5, ORIENTATIONS = (8, 8, 3), DIMENSION = 1425, test rate: 93.367347, 196 files took 71029.642 ms
    #rs = lsh.query(leargist.color_gist(Image.open(fn), nblocks=BLOCKS, orientations=ORIENTATIONS), num_results=1, distance_func="l1norm") 
    rs = lsh.query(sample, num_results=1, distance_func=DISTANCE_FUNC) 
    if rs and rs[0][0][1] == responses[train_n:][i]:
        correct = correct + 1
#    if rs:
#        rs = [r[0][1] for r in rs]
#        try:
#            idx = rs.index(responses[train_n:][i])
#        except ValueError:
#            idx = -1
#        if idx != -1:
#            correct = correct + 1
    #else:
    #    print CHARS[rs[0][0][1]], " => ", CHARS[responses[train_n:][i]]

t2 = time.time()
print "test rate: %f, %d files took %0.3f ms" % (correct/float(total)*100, total, (t2 - t1) * 1000.0)
Example #35
#Bloom filter
from pybloom import BloomFilter
from random import randrange
import numpy.random as nprnd

#### How does it work for a range of numbers?
f = BloomFilter(capacity=10000, error_rate=0.001)
[f.add(x) for x in range(10000)]
hits = 0
for i in range(10000):
    #print(i in f)
    hits = hits + (i in f)
print("Accuracy for range of numbers", (hits / 10000) * 100)
#### How does it work for random numbers?
f = BloomFilter(capacity=10000, error_rate=0.001)
randomNumbers = nprnd.randint(10000000, size=10000)
[f.add(x) for x in randomNumbers]
hits = 0
for i in range(10000):
    #print(i in f)
    hits = hits + (i in f)
print("Accuracy for random numbers", (hits / 10000) * 100)


# Locality-sensitive hash function
from lshash import LSHash
lsh = LSHash(6, 8)
lsh.index([1,2,3,4,5,6,7,8])
lsh.query([1, 2, 3, 4, 5, 6, 7, 7], num_results=None, distance_func="euclidean")  # the query vector must match input_dim=8
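
# Throughout these examples, each element lsh.query returns is a
# (vector, distance) tuple, or ((vector, extra_data), distance) when extra_data
# was supplied at indexing time, so the nearest hit unpacks like this:
results = lsh.query([1, 2, 3, 4, 5, 6, 7, 7], num_results=1, distance_func="euclidean")
if results:
    vector, distance = results[0]
    print(vector, distance)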
Example #36
    def Mainfunc(self, mat_addr, base, result_folder, binary_file):
        # all binary_func_name entries from the base data
        Total_binary_func = []  # binary:function
        #  np.set_printoptions(suppress=True, precision=6, threshold=8)
        s = sio.loadmat(mat_addr)
        svec = s['FFE']
        datalen = len(svec)
        n1, n2, n3 = np.shape(svec)
        test_dict = {
            'core': [0, 12],
            'curl': [48, 60],
            'libgmp': [60, 72],
            'busybox': [72, 84],
            'openssl': [84, 96],
            'sqlite': [96, 108]
        }
        compareDict = {
            'core_arm_o0': 4,
            'core_arm_o1': 5,
            'core_arm_o2': 6,
            'core_arm_o3': 7,
            'curl_arm_o0': 52,
            'curl_arm_o1': 53,
            'curl_arm_o2': 54,
            'curl_arm_o3': 55,
            'libgmp.so.10.3.2_arm_O0': 64,
            'libgmp.so.10.3.2_arm_O1': 65,
            'libgmp.so.10.3.2_arm_O2': 66,
            'libgmp.so.10.3.2_arm_O3': 67,
            'busybox_arm_o0': 72,
            'busybox_arm_o1': 73,
            'busybox_arm_o2': 74,
            'busybox_arm_o3': 75,
            'openssl_arm_o0': 84,
            'openssl_arm_o1': 85,
            'openssl_arm_o2': 86,
            'openssl_arm_o3': 87,
            'sqlite_arm_o0': 96,
            'sqlite_arm_o1': 97,
            'sqlite_arm_o2': 98,
            'sqlite_arm_o3': 99,
        }
        FUNCTIONNUMBER = {
            'coreutils_dir_X86_O0': 290,
            'coreutils_dir_X86_O1': 239,
            'coreutils_dir_X86_O2': 291,
            'coreutils_dir_X86_O3': 255,
            'coreutils_dir_arm_O0': 451,
            'coreutils_dir_arm_O1': 368,
            'coreutils_dir_arm_O2': 377,
            'coreutils_dir_arm_O3': 334,
            'coreutils_dir_mips_O0': 306,
            'coreutils_dir_mips_O1': 247,
            'coreutils_dir_mips_O2': 242,
            'coreutils_dir_mips_O3': 244,
            'coreutils_du_X86_O0': 237,
            'coreutils_du_X86_O1': 182,
            'coreutils_du_X86_O2': 211,
            'coreutils_du_X86_O3': 176,
            'coreutils_du_arm_O0': 529,
            'coreutils_du_arm_O1': 393,
            'coreutils_du_arm_O2': 387,
            'coreutils_du_arm_O3': 329,
            'coreutils_du_mips_O0': 401,
            'coreutils_du_mips_O1': 288,
            'coreutils_du_mips_O2': 273,
            'coreutils_du_mips_O3': 248,
            'coreutils_ls_X86_O0': 290,
            'coreutils_ls_X86_O1': 239,
            'coreutils_ls_X86_O2': 291,
            'coreutils_ls_X86_O3': 255,
            'coreutils_ls_arm_O0': 451,
            'coreutils_ls_arm_O1': 368,
            'coreutils_ls_arm_O2': 377,
            'coreutils_ls_arm_O3': 334,
            'coreutils_ls_mips_O0': 306,
            'coreutils_ls_mips_O1': 247,
            'coreutils_ls_mips_O2': 242,
            'coreutils_ls_mips_O3': 244,
            'coreutils_vdir_X86_O0': 290,
            'coreutils_vdir_X86_O1': 239,
            'coreutils_vdir_X86_O2': 291,
            'coreutils_vdir_X86_O3': 255,
            'coreutils_vdir_arm_O0': 451,
            'coreutils_vdir_arm_O1': 368,
            'coreutils_vdir_arm_O2': 377,
            'coreutils_vdir_arm_O3': 334,
            'coreutils_vdir_mips_O0': 306,
            'coreutils_vdir_mips_O1': 247,
            'coreutils_vdir_mips_O2': 242,
            'coreutils_vdir_mips_O3': 244,
            'curl_X86_O0': 128,
            'curl_X86_O1': 102,
            'curl_X86_O2': 152,
            'curl_X86_O3': 134,
            'curl_arm_O0': 263,
            'curl_arm_O1': 223,
            'curl_arm_O2': 213,
            'curl_arm_O3': 209,
            'curl_mips_O0': 130,
            'curl_mips_O1': 107,
            'curl_mips_O2': 169,
            'curl_mips_O3': 186,
            'libgmp.so.10.3.2_X86_O0': 621,
            'libgmp.so.10.3.2_X86_O1': 568,
            'libgmp.so.10.3.2_X86_O2': 591,
            'libgmp.so.10.3.2_X86_O3': 571,
            'libgmp.so.10.3.2_arm_O0': 971,
            'libgmp.so.10.3.2_arm_O1': 876,
            'libgmp.so.10.3.2_arm_O2': 854,
            'libgmp.so.10.3.2_arm_O3': 844,
            'libgmp.so.10.3.2_mips_O0': 606,
            'libgmp.so.10.3.2_mips_O1': 551,
            'libgmp.so.10.3.2_mips_O2': 545,
            'libgmp.so.10.3.2_mips_O3': 544,
            'busybox_arm_o0': 3216,
            'busybox_arm_o1': 2128,
            'busybox_arm_o2': 2099,
            'busybox_arm_o3': 1730,
            'busybox_mips_o0': 2900,
            'busybox_mips_o1': 2243,
            'busybox_mips_o2': 1726,
            'busybox_mips_o3': 1381,
            'busybox_x86_o0': 3196,
            'busybox_x86_o1': 2390,
            'busybox_x86_o2': 2542,
            'busybox_x86_o3': 2045,
            'openssl_arm_o0': 1778,
            'openssl_arm_o1': 1692,
            'openssl_arm_o2': 1675,
            'openssl_arm_o3': 1658,
            'openssl_mips_o0': 414,
            'openssl_mips_o1': 333,
            'openssl_mips_o2': 333,
            'openssl_mips_o3': 324,
            'openssl_x86_o0': 414,
            'openssl_x86_o1': 322,
            'openssl_x86_o2': 350,
            'openssl_x86_o3': 333,
            'sqlite_arm_o0': 2876,
            'sqlite_arm_o1': 2058,
            'sqlite_arm_o2': 1972,
            'sqlite_arm_o3': 1805,
            'sqlite_mips_o0': 2701,
            'sqlite_mips_o1': 1936,
            'sqlite_mips_o2': 1830,
            'sqlite_mips_o3': 1705,
            'sqlite_x86_o0': 2693,
            'sqlite_x86_o1': 1931,
            'sqlite_x86_o2': 1967,
            'sqlite_x86_o3': 1772,
        }

        FUNCTIONNAME = []
        func_name = open(binary_file, 'r')
        func_contents = func_name.readlines()
        for func_content in func_contents:
            FUNCTIONNAME.append(func_content.split("'")[1])

        # # confirm the database offsets
        # binary_db_num = []
        # for binary in FUNCTIONNAME:
        #     sql = "select * from " + self.table + " where binary_name=" + "'" + binary + "'"
        #     self.DODB.cursor.execute(sql)
        #     rows = self.DODB.cursor.fetchall()
        #     binary_db_num.append({binary:len(rows)})
        # print(binary_db_num)
        # exit()

        # core: only for DIR
        imodel_name = 'openssl_arm_o3'
        imodel_BIN_name = 'openssl_arm_o3'
        imodel = compareDict[imodel_name]
        # full name of the input binary
        imdel_s = self.GetSqlStart(FUNCTIONNUMBER, FUNCTIONNAME,
                                   imodel_BIN_name)
        # determine the database range
        imodel_s_n = [imdel_s, FUNCTIONNUMBER[imodel_BIN_name]]

        itest_name = 'openssl_arm_o0'
        itest_BIN_name = 'openssl_arm_o0'
        itest = compareDict[itest_name]
        itest_s = self.GetSqlStart(FUNCTIONNUMBER, FUNCTIONNAME,
                                   itest_BIN_name)
        itest_s_n = [itest_s, FUNCTIONNUMBER[itest_BIN_name]]

        ######## pairwise comparison
        data = np.zeros((n1, 3500))
        test = np.zeros((n1, 3500))
        model_num = 0
        test_num = 0
        for j in range(n3):
            if svec[:, imodel, j].all() != 0:
                data[:, model_num] = svec[:, imodel, j]
                model_num = model_num + 1
            if svec[:, itest, j].all() != 0:
                test[:, test_num] = svec[:, itest, j]
                test_num = test_num + 1
        dataves = np.transpose(data)
        testves = np.transpose(test)
        #    output_total = open(result_folder + 'result_total.txt', 'w')

        model = np.zeros((model_num, n1))
        lsh_model = LSHash(7, n1)
        for jj in range(model_num):
            lsh_model.index(dataves[jj, :])
            model[jj, :] = dataves[jj, :]

        test = np.zeros((test_num, n1))
        for ii in range(test_num):
            test[ii, :] = testves[ii, :]

        ##############################################################################
        itest_func_list = self.GetFuncListFromFeature(test, itest_s_n[0],
                                                      itest_s_n[1])
        #imodel_func_list = self.GetFuncListFromFeature(model,imodel_s_n[0],imodel_s_n[1])
        print('target_list get success\n')

        # Inmodel_Total = self.GetInmodelTotal(imodel_func_list,itest_func_list)
        Inmodel_NUM = 0.0
        output = open(result_folder + 'BetweenTestRecored' + '.txt', 'a')
        # SelectDB = Date_Analysis()
        for queryi in range(test_num):
            key = 20
            test_funcname = itest_func_list[queryi]
            if test[queryi, :].all() != 0:
                Atemp = lsh_model.query(test[queryi, :], key, 'euclidean')
                for i in range(0, key):
                    if i < len(Atemp):
                        try:
                            feature_str = str(
                                Atemp[i]).split(')')[0].split('(')[2]
                            feature_list = feature_str.split(',')
                            feature_array = self.SelectDB.ListStr2ArrayFloat(
                                feature_list)
                            temp = self.SelectDB.DataAccuray(feature_array)
                            str_data = temp.astype(str)
                            feature = "-".join(str_data)
                            rows = self.SelectDB.DatafromFeature(
                                feature, imodel_s_n[0], imodel_s_n[1])
                            select_funcname = rows[0][1]
                            if test_funcname.find(select_funcname):
                                Inmodel_NUM = Inmodel_NUM + 1
                                print('Get One')
                                break
                            else:
                                pass
                        except Exception as e:
                            print(e)
                            print(str(Atemp[i]))
                    else:
                        print('AtempLen:', len(Atemp), ' ', 'key:', key)
                        break
        res = str(float('%.4f' % (Inmodel_NUM / len(itest_func_list))))
        msg = itest_name + '----->' + imodel_name + \
              ' Res:' + res + ' Inmodel_NUM:' + str(Inmodel_NUM) +\
            ' Test_NUM:' + str(len(itest_func_list)) #+ ' Model_NUM:' + str(len(imodel_func_list))# ' Inmodel_Total:' + str(Inmodel_Total)# +\
        output.write(msg + '\n')
        print(msg)
        output.close()
    img = Image.open(dataset+str(x)+fileext)
    pixel = np.array(img)
    #onedarray = pixel.ravel()
    hist,bins = np.histogram(pixel.ravel(),256,[0,256])
    listing=list(hist[0:255])
    big_array.append(listing)
    lsh.index(listing)

input_array=np.array(big_array)

img = Image.open(queryset+"10"+fileext)
pixel = np.array(img)
#onedarray = pixel.ravel()
hist,bins = np.histogram(pixel.ravel(),256,[0,256])
listing=list(hist[0:255])
k=lsh.query(listing,distance_func="l1norm")
vector = np.matrix(k)
length=len(k)
if length > 0:
    for output in range(length):
        if (k[output][1] < 800):
            test=np.array(k[output][0])
            result=np.where((input_array == test).all(axis=1))
            image_number.append(result[0][0]+1)
        
        #arr2 = np.asarray(k[output][0]).reshape(shape)
        #vector = np.matrix(np.uint8(arr2))
        #img2 = Image.fromarray(np.uint8(arr2),'RGB')
        #img2.show()
 #onedarray = pixel.ravel()
 #image_size.append[x]=onedarray
Example #38
        if value[:INPUT_DIMENSION] == list(search_item):
            return key
    return None


#Getting all tweets from the twitter_search3 collection
temp_db_collection = db.twitter_search3.find().limit(490)
#Converting string to ASCII and storing in store_ascii_tweets.
for data in temp_db_collection:
    lsh.index(conversion_to_get_ascii(data, True))
    store_ascii_tweets.append(data)

#Formation of Groups
for data in store_ascii_tweets:
    if data["id"] not in temp:
        output = lsh.query(conversion_to_get_ascii(data, False))
        temp.add(data["id"])
        if len(output) > 0:
            for value in output:
                loc = None
                tweetid = back_to_string(value[0])
                temp.add(tweetid)
                tweetobj = dictionary_group[tweetid]
                if tweetobj["user"]["geo_enabled"] == True:
                    loc = tweetobj["place"]["name"]
                    groups[loc] = len(output)
                    break

    if (data["user"]["geo_enabled"] == True and data["place"] is not None):
        place = data["place"]["name"]
        t = 1
def main():

    parser = argparse.ArgumentParser(description = 'Tools for hamming distance-based image retrieval by cuda')
    parser.add_argument('-f', help = 'The filename of image raw features (SIFT).')
    parser.add_argument('-v', default = 'fvecs', help = 'The format of image raw features.')
    parser.add_argument('-s', default = 'dict', help = 'The method of indexing storage.')
    parser.add_argument('-d', default = '128', help = 'Dimensions of raw image feature.')
    parser.add_argument('-o', default = '0', help = 'Offset of accessing raw image features.')
    parser.add_argument('-n', default = '1', help = 'Number of raw image features to read.')
    parser.add_argument('-i', default = 'n', help = 'Whether to perform indexing step.')
    parser.add_argument('-e', help = 'The dirname of indexing folder.')
    parser.add_argument('-k', default = '10', help = 'Number of retrieved images.')
    parser.add_argument('-r', default = '32', help = 'Number of dimensions randomly sampled.')
    parser.add_argument('-c', default = 'n', help = 'Whether to perform compressing step.')
    parser.add_argument('-q', default = 'n', help = 'Whether to sequentially sampling.')
    parser.add_argument('-p', default = 'n', help = 'Whether to perform querying in compressed domain.')
    parser.add_argument('-g', default = 'y', help = 'GPU mode. default is "yes".')
    parser.add_argument('-l', default = 'n', help = 'VLQ base64 mode. Load VLQ base64 encoding compressed dict.')
    parser.add_argument('-b', default = '1', help = 'Expanding level of search buckets.')
    parser.add_argument('-t', default = 'int32', help = 'FastDict type (int32, int8, string).')
    parser.add_argument('-u', default = 'local', help = 'CUDA client type (local, net).')
    parser.add_argument('-host', default = 'localhost', help = 'CUDA server address.')
 

    args = parser.parse_args()

    d = int(args.d)
    nuse = int(args.n)
    off = int(args.o)
    random_dims = int(args.r)

    random_sampling = True
    if args.q == 'y':
        random_sampling = False

    lsh = LSHash(64, d, random_sampling, args.t, args.u, args.host, random_dims, 1, storage_config = args.s, matrices_filename = 'project_plane.npz')
    np_feature_vecs = load_features(args.f, args.v, nuse, d, lsh, args.e, off, args.i)

    if args.c == 'y':
        if args.e != None and args.s == 'random':
            lsh.load_index(args.e)
            print "compressing index..."
            lsh.compress_index(args.e)
            print "compressing done."
        else:
            print "Please specify generated indexing file."
            sys.exit(0)

    if args.c != 'y' and args.i != 'y' and args.e != None and args.s == 'random':
        if args.p == 'y':
            print "loading compressed index."
            lsh.load_compress_index(args.e, (args.l == 'y'))
            print "loading done."
        else:
            print "loading index."
            lsh.load_index(args.e)
            print "loading done."
        if args.p != 'y':
            retrieved = lsh.query(np_feature_vecs[1], num_results=int(args.k), expand_level=int(args.b), distance_func='hamming')
        else:
            retrieved = lsh.query_in_compressed_domain(np_feature_vecs[1], num_results=int(args.k), expand_level=int(args.b), distance_func='hamming', gpu_mode=args.g, vlq_mode=args.l)
        print(retrieved)
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 04 15:29:56 2015

@author: MaGesh
"""
import numpy as np
from scipy.ndimage import imread
from lshash import LSHash
lsh = LSHash(20, 32*32)  # 32*32 is the input dimension, with 20-bit hashes
resultSet=[]
for i in range(1,100001):
    print(i)
    X="F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\dataset\\"+str(i)+".bmp"
    im=imread(X,flatten=True)
    single_array=im.flatten()
    lsh.index(single_array)  # hash each image into its bucket
for i in range(1,11):
    print i,"for querying"    
    X1="F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\Query\\"+str(i)+".bmp"
    imQ=imread(X1,flatten=True) #converting to grey scale
    imFlatten=imQ.flatten()
    value=lsh.query(imFlatten,distance_func="euclidean") #querying the nearest points
    resultSet.append(value)
for u in range(6):
    lsh = LSHash(int(hash_size[u]), dimension)
    for i in range(1000):
        lsh.index(metric[i])
    for v in trade:
        f.write("Hash_size: ")
        f.write(str(int(hash_size[u])))
        f.write('\n')
        f.write("As for Tid ")
        f.write(str(v))
        f.write('\n')
        f.write('--------------------------------')
        f.write('\n')

        row = lsh.query(metric[v - 1], distance_func="hamming")
        for g in row:
            f.write("Index ")
            f.write(str(1 + (findByRow(metric, list(g[0])))[0]))
            f.write(" Distance is :")
            f.write(str(g[1]))
            f.write('\n')

f.write("Distance Func Euclidean")
f.write('\n')
f.write('\n')
f.write('############################')
f.write('\n')

for u in range(6):
Example #42
class LshManager(object):

    def __init__(self):
        self.lshIndexList = []


        # create a list of lsh indexes
        self.lsh = LSHash(NUMBER_OF_BITS_PER_HASH, NUM_TOPICS, num_hashtables=NUMBER_OF_LSH_INDEXES,
                          storage_config={"redis": {"host": "localhost", "port": 6379}})

    def clearIndex(self):
        redis.Redis().flushall()

    # adds a document to all lsh indexes
    def addDocument(self, document):
        lsa_vector = document.vectors["LSA"]

        dense_vector = self._sparseToDenseConverter(lsa_vector)

        if not hasattr(document, "timestamp"):
            document.timestamp = str(datetime.datetime.now())

        extra = json.dumps(str(document._id))

        # detect duplicates
        #result = self.lsh.query(dense_vector, num_results=1, distance_func="cosine")
        #if result:
        #    nearest = result[0]
        #    if nearest[1] > DUPLICATE_SIMILARITY_THRESHOLD:
        #        extra = ast.literal_eval(ast.literal_eval(nearest[0])[1])
        #        doctitle = getDatabaseConnection().holist.articles.find({"_id": extra}).next()["title"]
        #        ln.warn("Detected duplicate for %s (ID %s): %s.", document.title, document._id, extra)
        #        return

        self.lsh.index(dense_vector, extra_data=extra)  # extra MUST be hashable

    # takes a document and returns database ids of similar documents
    # uses cosine function to determine similarity
    def getSimilarDocuments(self, document, num_docs=7):
        if isinstance(document, Document):
            lsa_vector = document.vectors["LSA"]
        else:
            lsa_vector = document

        dense_vector = self._sparseToDenseConverter(lsa_vector)

        client = getDatabaseConnection()

        resultSet = set()
        results = []

        for result in self.lsh.query(dense_vector, num_results=num_docs, distance_func="cosine"):
            # example:
            # [
            #   (((1, 2, 3), "{'extra1':'data'}"), 0),
            #   (((1, 1, 3), "{'extra':'data'}"), 1)
            # ]
            extra = ast.literal_eval(ast.literal_eval(result[0])[1])

            clientDoc = bsonToClientBson(client.holist.articles.find({"_id": extra}).next())
            clientDoc['lsa'] = self._sparseToDenseConverter(clientDoc['lsa'])
            jsonstr = json.dumps(clientDoc)

            if not jsonstr in resultSet:
                resultSet.add(jsonstr)
                results.append(clientDoc)

        ln.debug("retrieved %s documents.", len(results))
        return results

    # converts a vector in sparse format to a vector in dense format
    def _sparseToDenseConverter(self, sparseVector):
        dense = {}
        for x in range(NUM_TOPICS):
            dense[x] = 0

        for dim, val in sparseVector:
            dense[dim] = val
        return [value for key, value in dense.items()]
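
# The sparse-to-dense helper above just scatters (dimension, value) pairs into a
# zero vector of length NUM_TOPICS; a standalone sketch of the same transformation
# with an assumed topic count.
NUM_TOPICS = 5  # assumed module-level constant

def sparse_to_dense(sparse_vector, num_topics=NUM_TOPICS):
    dense = [0] * num_topics
    for dim, val in sparse_vector:
        dense[dim] = val
    return dense

print(sparse_to_dense([(0, 0.5), (3, 1.2)]))  # [0.5, 0, 0, 1.2, 0]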
Example #43
    return np.array(data).reshape(20, 20).T

limit = 10
data = loadmat('ex3data1.mat')
X, y = data['X'], data['y']
dim=X.shape[1]
hash_size = int(np.ceil(np.log2(len(y))))
print('hash_size:', hash_size)
lsh=LSHash(hash_size, dim)
#prepare
for x in X:
    lsh.index(x)
#test
for _ in range(10):
    i = np.random.randint(0,y.shape[0])
    res = lsh.query(X[i], distance_func = 'hamming')
    n = len(res)
    fig = plt.figure()
    fig.suptitle('y=%d, found: %d' %(y[i][0]%10, n))
    n = min(n, limit)
    ax = fig.add_subplot(2, n, 1)
    ax.set_axis_off()
    ax.imshow(to_img(X[i]))
    ax.set_title('original img')
    for k,j in enumerate(res[:n]):
        ax = fig.add_subplot(2, n, k+1+n)
        ax.set_axis_off()
        ax.imshow(to_img(j[0]))
        ax.set_title('distance: %.2f' %j[-1])
    plt.show()
Example #44
from lshash import LSHash
lsh = LSHash(hash_size=6, input_dim=8, num_hashtables=1, storage_config={"lmdb": {'path': '/Users/christianburger/Downloads/testlmdb'}})
lsh.index([1,2,3,4,5,6,7,8], 'a')
lsh.index([2,3,4,5,6,7,8,9], 'b')
lsh.index([10,12,99,1,5,31,2,3], 'c')
print(lsh.query([1,2,3,4,5,6,7,7]))
Example #45
dim = len(Index) + 1  # one extra slot (index -1) collects excluded/unknown words

data = xlrd.open_workbook(fname)
sht = data.sheet_by_name(shtname)
head = sht.row_values(0)
tweets = sht.col_values(head.index(target), start_rowx)

hash_size = int(np.ceil(np.log2(len(tweets))))
print('hash_size: %d, dim: %d' % (hash_size, dim))
lsh=LSHash(hash_size, dim)

for tweet in tweets:
    # use the dense vector: the csr_matrix variant breaks both the single-index
    # assignment below and lsh.index(), which expects a flat vector
    x = np.zeros(dim, np.int8)
    ws = jieba.cut(tweet)
    try:
        for w in ws:
            x[Index.get(w, -1)] = 1
        lsh.index(x)
    except Exception, e:
        print e
        print tweet

sent = True
while sent:
    sent = raw_input('input sentence...\n')
    if not sent:
        break
    # vectorize the query the same way as the indexed tweets; hashing the raw
    # string would compare characters, not word dimensions
    q = np.zeros(dim, np.int8)
    for w in jieba.cut(sent):
        q[Index.get(w, -1)] = 1
    res = lsh.query(q, distance_func = 'hamming')
    for i in res:
        print i[0], i[-1]
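# A minimal sketch of the lookup assumed above: `Index` is taken to be a
# word -> dimension dict built elsewhere from the corpus vocabulary, with the
# spare last slot (reached via index -1) absorbing out-of-vocabulary words.
import numpy as np
Index = {u'hello': 0, u'world': 1}  # hypothetical two-word vocabulary
dim = len(Index) + 1
x = np.zeros(dim, np.int8)
for w in [u'hello', u'unseen']:
    x[Index.get(w, -1)] = 1
print x  # -> [1 0 1]: 'hello' hits dim 0, 'unseen' lands in the spare slot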

Exemple #46
0
from lshash import LSHash

"""
Use locality-sensitive hashing to look up approximate matches.
"""

lsh = LSHash(6, 8)
lsh.index([1,2,3,4,5,6,7,8])
lsh.index([2,3,4,5,6,7,8,9])
lsh.index([10,12,99,1,5,31,2,3])
print lsh.query([1,2,3,4,5,6,7,7])
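# Hedged reading of the output: in the lshash versions these examples target,
# the default 'euclidean' scorer returns the *squared* distance, so if both
# stored neighbors share the query's bucket the result looks like
#   [((1, 2, 3, 4, 5, 6, 7, 8), 1.0), ((2, 3, 4, 5, 6, 7, 8, 9), 11.0)]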
Exemple #47
0
import argparse
import sys

from lshash import LSHash

# load_features() is defined elsewhere in the file this example was taken from

def main():

    parser = argparse.ArgumentParser(
        description='Tools for hamming distance-based image retrieval by cuda')
    parser.add_argument('-f',
                        help='The filename of image raw features (SIFT).')
    parser.add_argument('-v',
                        default='fvecs',
                        help='The format of image raw features.')
    parser.add_argument('-s',
                        default='dict',
                        help='The method of indexing storage.')
    parser.add_argument('-d',
                        default='128',
                        help='Dimensions of raw image feature.')
    parser.add_argument('-o',
                        default='0',
                        help='Offset of accessing raw image features.')
    parser.add_argument('-n',
                        default='1',
                        help='Number of raw image features to read.')
    parser.add_argument('-i',
                        default='n',
                        help='Whether to perform indexing step.')
    parser.add_argument('-e', help='The dirname of indexing folder.')
    parser.add_argument('-k', default='10', help='Number of retrieved images.')
    parser.add_argument('-r',
                        default='32',
                        help='Number of dimensions randomly sampled.')
    parser.add_argument('-c',
                        default='n',
                        help='Whether to perform compressing step.')
    parser.add_argument('-q',
                        default='n',
                        help='Whether to sequentially sampling.')
    parser.add_argument(
        '-p',
        default='n',
        help='Whether to perform querying in compressed domain.')
    parser.add_argument('-g', default='y', help='GPU mode. default is "yes".')
    parser.add_argument(
        '-l',
        default='n',
        help='VLQ base64 mode. Load VLQ base64 encoding compressed dict.')
    parser.add_argument('-b',
                        default='1',
                        help='Expanding level of search buckets.')
    parser.add_argument('-t',
                        default='int32',
                        help='FastDict type (int32, int8, string).')
    parser.add_argument('-u',
                        default='local',
                        help='CUDA client type (local, net).')
    parser.add_argument('-host',
                        default='localhost',
                        help='CUDA server address.')

    args = parser.parse_args()

    d = int(args.d)
    nuse = int(args.n)
    off = int(args.o)
    random_dims = int(args.r)

    random_sampling = True
    if args.q == 'y':
        random_sampling = False

    lsh = LSHash(64,
                 d,
                 random_sampling,
                 args.t,
                 args.u,
                 args.host,
                 random_dims,
                 1,
                 storage_config=args.s,
                 matrices_filename='project_plane.npz')
    np_feature_vecs = load_features(args.f, args.v, nuse, d, lsh, args.e, off,
                                    args.i)

    if args.c == 'y':
        if args.e is not None and args.s == 'random':
            lsh.load_index(args.e)
            print "compressing index..."
            lsh.compress_index(args.e)
            print "compressing done."
        else:
            print "Please specify generated indexing file."
            sys.exit(0)

    if args.c != 'y' and args.i != 'y' and args.e is not None and args.s == 'random':
        if args.p == 'y':
            print "loading compressed index."
            lsh.load_compress_index(args.e, (args.l == 'y'))
            print "loading done."
        else:
            print "loading index."
            lsh.load_index(args.e)
            print "loading done."
        if args.p != 'y':
            retrieved = lsh.query(np_feature_vecs[1],
                                  num_results=int(args.k),
                                  expand_level=int(args.b),
                                  distance_func='hamming')
        else:
            retrieved = lsh.query_in_compressed_domain(np_feature_vecs[1],
                                                       num_results=int(args.k),
                                                       expand_level=int(args.b),
                                                       distance_func='hamming',
                                                       gpu_mode=args.g,
                                                       vlq_mode=args.l)
        print retrieved
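# A hypothetical invocation of the tool above (script and file names are
# placeholders): the first call builds the index, the second queries it.
#
#   python search.py -f sift.fvecs -v fvecs -d 128 -n 10000 -i y -e idx_dir -s random
#   python search.py -f sift.fvecs -v fvecs -d 128 -n 10000 -e idx_dir -s random -k 10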
Exemple #48
0
    def Mainfunc(self, mat_addr, base, result_folder):
        # all binary_func_name entries in the base data
        Total_binary_func = []  # binary:function#
        SelectDB = Date_Analysis()
        #  np.set_printoptions(suppress=True, precision=6, threshold=8)
        s = sio.loadmat(mat_addr)
        svec = s['X']
        datalen = len(svec)
        n1, n2, n3 = np.shape(svec)
        #test_dict = {'core':[0,12],'curl':[48,60],'libgmp':[60,72],'busybox':[72,84],'openssl':[84,96],'sqlite':[96,108]}
        test_dict = {
            'busybox': [0, 12],
            'core': [12, 60],
            'curl': [60, 72],
            'libgmp': [72, 84],
            'openssl': [84, 96],
            'sqlite': [96, 108]
        }
        compareDict = {
            'core_arm_o0': 4,
            'core_arm_o1': 5,
            'core_arm_o2': 6,
            'core_arm_o3': 7,
            'curl_arm_o0': 52,
            'curl_arm_o1': 53,
            'curl_arm_o2': 54,
            'curl_arm_o3': 55,
            'libgmp.so.10.3.2_arm_O0': 64,
            'libgmp.so.10.3.2_arm_O1': 65,
            'libgmp.so.10.3.2_arm_O2': 66,
            'libgmp.so.10.3.2_arm_O3': 67,
            'busybox_arm_o0': 73,
            'busybox_arm_o1': 74,
            'busybox_arm_o2': 75,
            'busybox_arm_o3': 76,
            'openssl_arm_o0': 84,
            'openssl_arm_o1': 85,
            'openssl_arm_o2': 86,
            'openssl_arm_o3': 87,
            'sqlite_arm_o0': 96,
            'sqlite_arm_o1': 97,
            'sqlite_arm_o2': 98,
            'sqlite_arm_o3': 99,
            'core_x86_o0': 0,
            'core_x86_o1': 1,
            'core_x86_o2': 2,
            'core_x86_o3': 3,
            'curl_x86_o0': 48,
            'curl_x86_o1': 49,
            'curl_x86_o2': 50,
            'curl_x86_o3': 51,
            'libgmp.so.10.3.2_x86_O0': 60,
            'libgmp.so.10.3.2_x86_O1': 61,
            'libgmp.so.10.3.2_x86_O2': 62,
            'libgmp.so.10.3.2_x86_O3': 63,
            'busybox_x86_o0': 80,
            'busybox_x86_o1': 81,
            'busybox_x86_o2': 82,
            'busybox_x86_o3': 83,
            'openssl_x86_o0': 92,
            'openssl_x86_o1': 93,
            'openssl_x86_o2': 94,
            'openssl_x86_o3': 95,
            'sqlite_x86_o0': 104,
            'sqlite_x86_o1': 105,
            'sqlite_x86_o2': 106,
            'sqlite_x86_o3': 107,
        }
        #         FUNCTIONNUMBER={'coreutils_dir_X86_O0':290,'coreutils_dir_X86_O1':239,
        #                         'coreutils_dir_X86_O2':291,'coreutils_dir_X86_O3':255,
        #                         'coreutils_dir_arm_O0':451,'coreutils_dir_arm_O1':368,'coreutils_dir_arm_O3':334,'coreutilsr_mips_O0':306,
        #                         'coreutils_dir_mips_O1':247,'coreutils_dir_mips_O2':242,'coreutils_dir_mips_O3':244,'coreutils_du_X86_O0':237,'coreutils_du_X86_O1':182,
        # 'coreutils_du_X86_O2':211,'coreutils_du_X86_O3':176,'coreutils_du_arm_O0':529,'coreutils_du_arm_O1':393,
        # 'coreutils_du_arm_O2':387,
        # 'coreutils_du_arm_O3':329,
        # 'coreutils_du_mips_O0':401,
        # 'coreutils_du_mips_O1':288,
        # 'coreutils_du_mips_O2':273,
        # 'coreutils_du_mips_O3':248,
        # 'coreutils_ls_X86_O0':290,
        # 'coreutils_ls_X86_O1':239,
        # 'coreutils_ls_X86_O2':291,
        # 'coreutils_ls_X86_O3':255,
        # 'coreutils_ls_arm_O0':451,
        # 'coreutils_ls_arm_O1':368,
        # 'coreutils_ls_arm_O2':377,
        # 'coreutils_ls_arm_O3':334,
        # 'coreutils_ls_mips_O0':306,
        # 'coreutils_ls_mips_O1':247,
        # 'coreutils_ls_mips_O2':242,
        # 'coreutils_ls_mips_O3':244,
        # 'coreutils_vdir_X86_O0':290,
        # 'coreutils_vdir_X86_O1':239,
        # 'coreutils_vdir_X86_O2':291,
        # 'coreutils_vdir_X86_O3':255,
        # 'coreutils_vdir_arm_O0':451,
        # 'coreutils_vdir_arm_O1':368,
        # 'coreutils_vdir_arm_O2':377,
        # 'coreutils_vdir_arm_O3':334,
        # 'coreutils_vdir_mips_O0':306,
        # 'coreutils_vdir_mips_O1':247,
        # 'coreutils_vdir_mips_O2':242,
        # 'coreutils_vdir_mips_O3':244,
        # 'curl_X86_O0':128,
        # 'curl_X86_O1':102,
        # 'curl_X86_O2':152,
        # 'curl_X86_O3':134,
        # 'curl_arm_O0':263,
        # 'curl_arm_O1':223,
        # 'curl_arm_O2':213,
        # 'curl_arm_O3':209,
        # 'curl_mips_O0':130,
        # 'curl_mips_O1':107,
        # 'curl_mips_O2':169,
        # 'curl_mips_O3':186,
        # 'libgmp.so.10.3.2_X86_O0': 621,
        # 'libgmp.so.10.3.2_X86__O1': 568,
        # 'libgmp.so.10.3.2_X86__O2': 591,
        # 'libgmp.so.10.3.2_X86__O3': 571,
        # 'libgmp.so.10.3.2_arm_O0':971,
        # 'libgmp.so.10.3.2_arm_O1':876,
        # 'libgmp.so.10.3.2_arm_O2':854,
        # 'libgmp.so.10.3.2_arm_O3':844,
        # 'libgmp.so.10.3.2_mips_O0':606,
        # 'libgmp.so.10.3.2_mips_O1':551,
        # 'libgmp.so.10.3.2_mips_O2':545,
        # 'libgmp.so.10.3.2_mipsO3':544,
        # 'busybox_arm_o0':3216,
        # 'busybox_arm_o1':2128,
        # 'busybox_arm_o2':2099,
        # 'busybox_arm_o3':1730,
        # 'busybox_mips_o0':2900,
        # 'busybox_mips_o1':2243,
        # 'busybox_mips_o2':1726,
        # 'busybox_mips_o3':1381,
        # 'busybox_x86_o0':3196,
        # 'busybox_x86_o1':2390,
        # 'busybox_x86_o2':2542,
        # 'busybox_x86_o3':2045,
        # 'openssl_arm_o0':1778,
        # 'openssl_arm_o1':1692,
        # 'openssl_arm_o2':1675,
        # 'openssl_arm_o3':1658,
        # 'openssl_mips_o0':414,
        # 'openssl_mips_o1':333,
        # 'openssl_mips_o2':333,
        # 'openssl_mips_o3':324,
        # 'openssl_x86_o0':414,
        # 'openssl_x86_o1':322,
        # 'openssl_x86_o2':350,
        # 'openssl_x86_o3':333,
        # 'sqlite_arm_o0':2876,
        # 'sqlite_arm_o1':2058,
        # 'sqlite_arm_o2':1972,
        # 'sqlite_arm_o3':1805,
        # 'sqlite_mips_o0':2701,
        # 'sqlite_mips_o1':1936,
        # 'sqlite_mips_o2':1830,
        # 'sqlite_mips_o3':1705,
        # 'sqlite_x86_o0':2693,
        # 'sqlite_x86_o1':1931,
        # 'sqlite_x86_o2':1967,
        # 'sqlite_x86_o3':1772,
        #                         }

        data = np.zeros((n1, 30000))
        test = np.zeros((n1, 3500))
        m = 0
        imodel = compareDict['core_arm_o0']
        itest = compareDict['core_arm_o3']
        for i in range(test_dict['core'][0], test_dict['core'][1]):
            for j in range(n3):
                if svec[:, i, j].all():
                    data[:, m] = svec[:, i, j]
                    m = m + 1

        dataves = np.transpose(data)

        #testves = np.transpose(test)

        ######## compare two sets: model vs. test
        # for j in range(n3):
        #     if svec[:, imodel, j].all() != 0:
        #         data[:, m] = svec[:, imodel, j]
        #         m = m + 1
        #     if svec[:, itest, j].all() != 0:
        #         test[:, mm] = svec[:, itest, j]
        #         mm = mm + 1
        #dataves = np.transpose(data)
        #testves=np.transpose(test)
        # modelindex = list(set(np.random.randint(0, m, size=10000)))

        #    output_total = open(result_folder + 'result_total.txt', 'w')

        lsh_model = LSHash(7, n1)

        for jj in range(m):
            # for jj in range(87212):
            lsh_model.index(dataves[jj, :])

        testindex = list(set(np.random.randint(
            0, m, size=base)))  # SIZE IS THE NUMBER OF TEST FUNCTIONS

        test = np.zeros((len(testindex), n1))

        for i in range(len(testindex)):
            test[i, :] = dataves[testindex[i], :]
        # output = open(result_folder + 'result_key' + str(key) + '_base' + str(base) + '.txt', 'w')


#        testindex=mm
##############################################################################

        timee = open(result_folder + 'coreutils_time.txt', 'a')
        target_list = []
        M_list = []

        for queryi in range(len(testindex)):
            target = test[queryi, :]
            temp_target = SelectDB.DataAccuray(target)
            str_target = temp_target.astype(str)
            feature_target = "-".join(str_target)
            rows = SelectDB.DatafromFeature(feature_target)
            target_data = self.Row2Str(rows)
            target_list.append(target_data)

            Global_M = self.GetGlobalM(rows[0][1])
            M_list.append(Global_M)

        print('Global_M get success\n')

        Totaltime = 0.0
        # SelectDB = Date_Analysis()
        for queryi in range(len(testindex)):
            flag_over = 0
            keylist = [i for i in range(1, 10001, 5)]
            #keylist=[5]
            target_data = target_list[queryi].split('#')[0]
            output = open(result_folder + 'coreutils_result_base' + str(base) + \
                          '_No' + str(queryi) + '.txt', 'w')
            output.write('Target:' + target_data + '\n')
            print(target_data + '\n')
            for key in keylist:
                if flag_over == 0:
                    msg = 'Key:' + str(key) + ' Base:' + str(base) + \
                          ' No:' + str(queryi) + ' M:' + str(M_list[queryi])
                    print(msg + '\n')
                    output.write(msg + '\n')
                    if test[queryi, :].all():
                        starttime = time.time()
                        Atemp = lsh_model.query(test[queryi, :], key,
                                                'euclidean')
                        endtime = time.time()
                        Totaltime = Totaltime + endtime - starttime

                        for i in range(0, key):
                            if i < len(Atemp):
                                try:
                                    flag_over = 0
                                    feature_str = str(
                                        Atemp[i]).split(')')[0].split('(')[2]
                                    feature_list = feature_str.split(',')
                                    feature_array = SelectDB.ListStr2ArrayFloat(
                                        feature_list)
                                    temp = SelectDB.DataAccuray(feature_array)
                                    str_data = temp.astype(str)
                                    feature = "-".join(str_data)
                                    rows = SelectDB.DatafromFeature(feature)
                                    select_data = self.Row2Str(rows)
                                except Exception as e:
                                    print(e)
                                    print(str(Atemp[i]))
                                    select_data = 'null:null#'
                            else:
                                print('AtempLen:', len(Atemp), ' ', 'key:',
                                      key, '\n')
                                select_data = 'null:null#'
                                flag_over = 1
                            output.write(select_data + '\n')
                            print(select_data + '\n')
                else:
                    break

                msg = 'Key:' + str(key) + ' Base:' + str(base) + \
                      ' No:' + str(queryi) + ' Time:' + str(float(Totaltime/base)) + '\n'
                timee.write(msg)
                print(msg)
            output.close()
        timee.close()
    for mid in (mid17, mid18, mid19, mid20):
        for note, name in note_from_midi(mid):
            lsh.index(note, extra_data=(name, 0.8))

    kk = []
    i = 0
    result = {}
    for note, name in nlsh('xml.wav'):
        q = note
        kk.extend(q)
        r = lsh.query(q)
        print '--------' + str(i) + '-----------'
        i += 1
        if len(r) > 0:
            print len(r)
            # keep at most 3 candidates
            nn = min(3, len(r))
            # let them vote, weighted by distance
            for k in range(nn):
                w = r[k][1]
                name = r[k][0][1][0]
                if name not in result:
                    result[name] = 0.0
                else:
                    w *= 0.93
                result[name] += w