Example #1
    def test_lshash_redis(self):
        """
        Test external lshash module
        """
        config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
        lsh = LSHash(self.hash_size, self.input_dim, 1, config)
        for i in range(self.nb_elements):
            lsh.index(list(self.els[i]))
            lsh.index(
                list(self.els[i])
            )  # multiple insertions should be prevented by the library

        hasht = lsh.hash_tables[0]
        itms = [hasht.get_list(k) for k in hasht.keys()]

        for itm in itms:
            for el in itm:
                assert itms.count(
                    itm) == 1  # have multiple insertions been prevented?
                assert el in self.els

        for el in self.els:
            res = lsh.query(list(el), num_results=1,
                            distance_func='euclidean')[0]
            el_v, el_dist = res
            assert el_v in self.els
            assert el_dist == 0
        del lsh
Example #2
def create_feature(list_author, net):
    global example_image_dir
    list_feature = list()
    image_paths = list()
    ## Locality Sensitive Hashing
    k = 10  # hash size
    L = 5  # number of tables
    d = 58  # Dimension of Feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)
    for subfolder in list_author.keys():
        subfolder_path = os.path.join(example_image_dir, subfolder)
        count_items = len([
            name for name in os.listdir(subfolder_path)
            if os.path.isfile(os.path.join(subfolder_path, name))
        ])
        # print(subfolder)
        sum_acc = 0
        sum_confiden = 0

        for img in os.listdir(subfolder_path):
            image_path = os.path.join(subfolder_path, img)
            author, confidence, feature = predict_author_single_img(
                net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)
    pickle.dump(lsh, open('lsh.p', "wb"))
    return lsh, image_paths, list_feature
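A possible follow-up sketch (not part of the original example) showing how the index pickled above to 'lsh.p' could be reloaded and queried; the random 58-dimensional query vector is only a stand-in for a real feature produced by the same network:

import pickle
import numpy as np

# Reload the index that create_feature() pickled to 'lsh.p'.
with open('lsh.p', 'rb') as fh:
    lsh = pickle.load(fh)

# Stand-in query vector; d = 58 matches the input_dim used above.
query_feature = np.random.rand(58)

# With extra_data set, each result is ((vector, image_path), distance).
for (vector, image_path), distance in lsh.query(query_feature, num_results=5,
                                                distance_func='euclidean'):
    print(image_path, distance)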
Example #3
def knn(data_array, data, hash_size_input, data_shape):

    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])

    # get a random pos
    vipno_pos = rd.randint(0, data_shape[1])

    # calculate and output
    for k in [1, 2, 3, 4, 5]:
        print 'hash size: %d' % hash_size_input
        print 'value k: %d' % k
        print 'target vipno: %d' % data.columns[vipno_pos]

        result = []
        for res in lsh.query(data_array[:, vipno_pos],
                             num_results=k + 1,
                             distance_func='euclidean'):
            result.append(res[0][1])

        print 'results: '
        print result[1:]
Example #4
def test_lshash_redis():
    """
    Test external lshash module
    """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    sr = StrictRedis(**config['redis'])
    sr.flushdb()

    lsh = LSHash(6, 8, 1, config)
    for i in xrange(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions should be prevented by the library
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert itms.count(itm) == 1  # have multiple insertions been prevented?
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
    sr.flushdb()
Example #5
def test_lshash_redis_extra_val():
    """
    Test external lshash module
    """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    sr = StrictRedis(**config['redis'])
    sr.flushdb()

    lsh = LSHash(6, 8, 1, config)
    for i in xrange(num_elements):
        lsh.index(list(els[i]), el_names[i])
        lsh.index(list(els[i]), el_names[i])  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
    sr.flushdb()
Example #6
 def test_lshash_extra_val(self):
     lsh = LSHash(self.hash_size,
                  self.input_dim,
                  1,
                  storage_config={'dict': None})
     for i in range(self.nb_elements):
         lsh.index(list(self.els[i]), self.el_names[i])
     hasht = lsh.hash_tables[0]
     itms = [hasht.get_list(k) for k in hasht.keys()]
     for itm in itms:
         for el in itm:
             self.assertIn(el[0], self.els)
             self.assertIn(el[1], self.el_names)
     for el in self.els:
         # res is a list, so we need to select the first entry only
         res = lsh.query(list(el), num_results=1,
                         distance_func='euclidean')[0]
         # vector and name are in the first element of the tuple res[0]
         el_v, el_name = res[0]
         # the distance is in the second element of the tuple
         el_dist = res[1]
         self.assertIn(el_v, self.els)
         self.assertIn(el_name, self.el_names)
         self.assertEqual(el_dist, 0)
     del lsh
def traceLSHash(queryName, hashSize):
    #queryName ="hamming_query_12_3"
    # indices of the trajectories to run the hash query on
    indexList = [14, 249, 479, 689, 899]

    XYMatrix = DateTransform()

    resultList = []
    nearList = []

    lsh = LSHash(hashSize, 44107)
    tid = 1

    for traceList in XYMatrix:
        lsh.index(input_point=traceList, extra_data=tid)
        tid += 1

    resultFile = open(queryName + '.txt', 'w')

    for index in indexList:
        queryList = lsh.query(XYMatrix[index], distance_func="hamming")
        for result in queryList:
            resultStr = str(index + 1) + " : " + str(result[0][1]) + " " + str(
                result[1]) + "\n"
            nearList.append(result[0][1])
            resultFile.write(resultStr)
        resultList.append(nearList)
        nearList = []

    resultFile.close()

    writeHTML(resultList, queryName, "hashQuerry")
    print resultList
Example #8
def write_json_lsh(hash_size, grid):
    '''
    Write the generated LSH route results to JSON and save them.
    :param hash_size: list of hash sizes
    :param grid: the processed grid array
    :return: none
    '''
    data_lsh = {}
    for size in hash_size:
        print size
        print 'list'
        data_lsh[size] = []
        lsh = LSHash(size, 44107)
        count = 0
        for line in grid:
            lsh.index(line, extra_data=count)
            count += 1
        for id in road_id:
            roads = []
            res = lsh.query(grid[id])
            print len(res)
            for r in res:
                roads.append(pack_data(r[0][1]))
            data_lsh[size].append({id: roads})

    with open('result_lsh.json', 'w') as f:
        f.write(str(data_lsh))
Example #9
 def test_lshash_redis_extra_val(self):
     """
     Test external lshash module
     """
     config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
     lsh = LSHash(self.hash_size, self.input_dim, 1, config)
     for i in range(self.nb_elements):
         lsh.index(list(self.els[i]), self.el_names[i])
         lsh.index(list(self.els[i]),
                   self.el_names[i])  # multiple insertions
     hasht = lsh.hash_tables[0]
     itms = [hasht.get_list(k) for k in hasht.keys()]
     for itm in itms:
         assert itms.count(itm) == 1
         for el in itm:
             assert el[0] in self.els
             assert el[1] in self.el_names
     for el in self.els:
         res = lsh.query(list(el), num_results=1,
                         distance_func='euclidean')[0]
         # vector and name are in the first element of the tuple res[0]
         el_v, el_name = res[0]
         # the distance is in the second element of the tuple
         el_dist = res[1]
         assert el_v in self.els
         assert el_name in self.el_names
         assert el_dist == 0
     del lsh
Example #10
def k_nn_lsh(k, word, decade_matrix, index_dict):
    index_dict = dict(map(reversed, index_dict.items()))
    num_rows = decade_matrix.get_shape()[0]
    lsh = LSHash(6, num_rows)
    for i in range(num_rows):
        print(i)
        lsh.index(decade_matrix.getrow(i).todense())
    return lsh.query(word)
Example #11
    def build_index(self, X):
        f = X.shape[1]
        n = X.shape[0]

        lsh = LSHash(hash_size=32, input_dim=f, num_hashtables=100)
        for i in range(n):
            lsh.index(X[i], i)

        return lsh
Example #13
    def Mainfunc(self, mat_addr):
        np.set_printoptions(suppress=True, precision=6, threshold=8)
        s = sio.loadmat(mat_addr)
        svec = s['FFE']
        datalen = len(svec)
        n1, n2, n3 = np.shape(svec)
        data = np.zeros((n1, 87212))
        m = 0
        for i in range(n2):
            for j in range(n3):
                if svec[:, i, j].all() != 0:
                    data[:, m] = svec[:, i, j]
                    m = m + 1
        # print data[:,0]
        dataves = np.transpose(data)
        modelindex = list(set(np.random.randint(1, 87212, size=10000)))

        lsh_model = LSHash(7, n1)
        for jj in modelindex:
            lsh_model.index(dataves[jj, :])

        # if you want to test a program
        starttest = 1  # start test index
        endtest = 5
        testindex = random.sample(modelindex,
                                  1)  # SIZE IS THE NUMBER OF TEST FUNCTIONS

        test = np.zeros((len(testindex), n1))
        for i in range(len(testindex)):
            #  print dataves[testindex[i],:]
            test[i, :] = dataves[testindex[i], :]
        # print len(test)
        output = open('result.txt', 'w')
        timee = open('time.txt', 'w')
        for queryi in range(len(testindex)):
            if test[queryi, :].all() != 0:
                starttime = time.time()
                Atemp = lsh_model.query(test[queryi, :], 5, 'cosine')
                print(str(Atemp[0]).split(')')[0]).replace('(', '')
                output.write((str(Atemp[0]).split(')')[0]).replace('(', '') +
                             '\n')
                output.write((str(Atemp[1]).split(')')[0]).replace('(', '') +
                             '\n')
                output.write((str(Atemp[2]).split(')')[0]).replace('(', '') +
                             '\n')
                output.write((str(Atemp[3]).split(')')[0]).replace('(', '') +
                             '\n')
                output.write((str(Atemp[4]).split(')')[0]).replace('(', '') +
                             '\n')

                endtime = time.time()
                timee.write(str(endtime - starttime) + '\n')
                # output.write(A)
                output.write('\n')

        output.close()
        timee.close()
def eventIdentification(dictionaryFile, corpusFile, outputFile):
	outputVector = []
	tempDict = {}
	
	corpus = corpora.MmCorpus(corpusFile)
	dictionary = corpora.Dictionary.load(dictionaryFile)
	#print "Unique Tokens:", dictionary.__len__()
	lsh = LSHash(20, dictionary.__len__())
	index = 0
	for index in range(len(corpus)):
		denseVector = getDenseVector(corpus[index], lsh.input_dim)
		result = lsh.query(denseVector)
		
		#print denseVector
		
		#no similar tweets
		if(result == []):
			#print "No Similar Tweets for: ", index
			tempDict[tuple(denseVector)] = len(outputVector)
			outputVector.append([index])
			lsh.index(denseVector)
			continue
		
		assigned = False
		for vector in result:
			if(getDistance(vector, denseVector) == True):
				ev = tempDict[tuple(vector[0])]
				outputVector[ev].append(index)
				tempDict[tuple(denseVector)] = ev
				#for ind in range(len(outputVector)):
					#done = False
					#for tweetNo in outputVector[ind]:
						#if (tweetNo == tempDict[tuple(vector[0])]):
							#outputVector[ind].append(index)
							#done = True
							#break
					#if done == True:
						#break
				assigned = True
				break
		
		if assigned == False:
			tempDict[tuple(denseVector)] = len(outputVector)
			outputVector.append([index])
			
		lsh.index(denseVector)
		
		
	with open(outputFile, 'w') as out:
		for vector in outputVector:
			line = ""
			for index in vector:
				line += "," + str(index)
			out.write(line[1:]+"\n")
	
	del outputVector
	del tempDict
Example #15
def get_lshash(filename):
    lsh = LSHash(30, 8)
    content = []  # fall back to an empty index if the file cannot be read
    try:
        with open(filename) as f:
            content = f.readlines()
            content = [x.strip('\n') for x in content]
    except Exception as e:
        print("Cannot find the file.")

    for row in content:
        row = row.split(",")
        row = list(map(int, row))
        tmp = row[:8]
        lsh.index(tmp, str(row[8]))

    return lsh
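A hedged usage sketch for get_lshash() (assuming a hypothetical 'data.csv' laid out as the loop above expects: nine comma-separated integers per line, eight features followed by a label):

# 'data.csv' is a made-up input file in the format described above.
lsh = get_lshash('data.csv')

# With the label stored as extra_data, each hit is ((vector, label), distance).
for (vector, label), distance in lsh.query([1, 2, 3, 4, 5, 6, 7, 8], num_results=3):
    print(label, distance)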
def test():
    import utils

    trueIds, testSet = utils.load_test_set('fc7', 'raw', 0)

    lsh = LSHash(128, np.shape(testSet[0])[0], matrices_filename='lsh_planes.data.npz', overwrite=True)

    for idx, input_point in enumerate(testSet):
        hashValue = lsh._hash(lsh.uniform_planes[0], input_point.tolist())
        print hashValue

        lsh.index(input_point, idx)

    print lsh.query(testSet[3], 3)

    return None
Example #17
def knn(data_array, data, hash_size_input, data_shape, vipno_pos, k):

    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])

    # calculate and output
    result = []
    for res in lsh.query(data_array[:, vipno_pos],
                         num_results=k + 1,
                         distance_func='euclidean'):
        result.append(res[0][1])

    return result[1:]
Example #18
def lshTOfind(path):
    lsh = LSHash(50,361)
    f = open('newindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [int(float(i)) for i in r[1:]]
        lsh.index(features)
        count += 1
    try:
        f_v = getfeatures(path)
        ans = lsh.query(f_v)
        if ans != []:
            return searchid(int(ans[0][0][360]/10000))
    except:
        return []
Example #19
def test_lshash():
    lsh = LSHash(6, 8, 1)
    for i in xrange(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # res is a tuple containing the vector and the distance
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
Example #20
class LshIndexer(Indexer):
	PARAMETERS = {'hash_size': 6,
				  'input_dim': 128,
				  'num_of_hashtables': 1,
				  'storage': {'redis': {'host':'localhost', 'port': 6379}}}

	def initialize_store(self, parameters):
		self.store = LSHash(parameters['hash_size'],
							parameters['input_dim'],
							parameters['num_of_hashtables'],
							parameters['storage'])

	def index(self, features):
		for feature in features:
			self.store.index(feature.data, feature.file_id)

	def query(self, feature, num_results=5):
		return self.store.query(feature, num_results)
def classify_nearest_neighbor_lsh(k):
    lsh = LSHash(3, 12)
    labels = load_labels()

    for genre, song_genres_ids in labels.groupby('category'):
        print('Indexing genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2)):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id),
                               header=None)
            for val in song.values:
                lsh.index(val, extra_data=genre)

    total_count = 0
    match_count = 0
    for genre, song_genres_ids in labels.groupby('category'):
        print('Expected genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2), num_values):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id),
                               header=None)
            genre_freqs = {}

            split_song = np.array_split(song, 5,
                                        axis=0)  # Split song into sections
            for s in split_song:
                avg_song_val = np.mean(s)  # Take average of each section
                neighbours = lsh.query(avg_song_val, num_results=k)
                for neighbour in neighbours:
                    genre = neighbour[0][1]
                    genre_freqs[genre] = genre_freqs.get(genre, 0) + 1

            actual_genre = max(genre_freqs, key=genre_freqs.get)
            print('Predicted genre: {}'.format(actual_genre))
            total_count += 1
            if genre == actual_genre:
                match_count += 1

    print('Matched {} out of {} songs: {}%'.format(
        match_count, total_count, (match_count / total_count) * 100))
def detect_subevent(filename):
	dictionaryFile = filename + ".dict"
	corpusFile = filename + ".mm"
	outputFile = filename + ".out"
	outputVector = []
	tempDict = {}
	outputdict={}
	corpus = corpora.MmCorpus(corpusFile)
	dictionary = corpora.Dictionary.load(dictionaryFile)
	lsh = LSHash(30, dictionary.__len__())
	index = 0
	for index in range(len(corpus)):
		#print str(index)+",",
		#print corpus[index]
		denseVector = getDenseVector(corpus[index], lsh.input_dim)
		#print getSparseVector(denseVector)
		result = lsh.query(denseVector, num_results = 50, distance_func = "euclidean")
		#print result
		#no similar tweets
		
		if(result == []):
			outputdict[index]=[]
			tempDict[getSparseVector(denseVector)] = index
			lsh.index(denseVector)
			#continue
		
		else:
			for r in result:
				if(outputdict.has_key(tempDict[getSparseVector(r[0])])):
					outputdict[tempDict[getSparseVector(r[0])]].append(index)
					break
			
		
		
	#print outputdict
	with open(outputFile, 'w') as out:
		for key in outputdict.iterkeys():
			line = str(key) 
			for i in outputdict[key]:
				line += ", " + str(i)
			out.write(line+"\n")
	
	print "Please check the output file:", outputFile
Example #23
def test():
    import utils

    trueIds, testSet = utils.load_test_set('fc7', 'raw', 0)

    lsh = LSHash(128,
                 np.shape(testSet[0])[0],
                 matrices_filename='lsh_planes.data.npz',
                 overwrite=True)

    for idx, input_point in enumerate(testSet):
        hashValue = lsh._hash(lsh.uniform_planes[0], input_point.tolist())
        print hashValue

        lsh.index(input_point, idx)

    print lsh.query(testSet[3], 3)

    return None
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    f = file(bits_tid_pickle, "rb")
    data = pickle.load(f)
    f.close()
    # These parameters can be tuned; see https://github.com/kayzh/LSHash for details
    lsh = LSHash(13, 128, num_hashtables=1)
    map(lambda x: lsh.index(np.array([int(tmp) for tmp in x])), data.keys())
    out = file(lsh_pickle, "wb")
    pickle.dump(lsh, out, -1)
    out.close()
Example #26
def create_feature(train_image_dir, classes, net):
    list_feature = list()
    image_paths = list()
    ## Locality Sensitive Hashing
    k = 10 # hash size
    L = 5  # number of tables
    d = 58 # Dimension of Feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)
    for each_object in classes:
        each_object_path = os.path.join(train_image_dir, each_object)
        list_img = next(os.walk(each_object_path))[2]
        print("hashing class: ", each_object, " which has: ", len(list_img))
        for img in list_img:
            image_path = os.path.join(each_object_path, img)
            feature = get_feature_single_img(net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)
    return lsh, image_paths, list_feature
def index_room():
    '''
    Index image features with the LSH algorithm.
    :return:
    '''
    files = glob("./data/features/*.csv")
    files_ids = [
        filename.split("\\")[-1].replace(".csv", "") for filename in files
    ]
    X = np.load("data/train.npy")
    X = X.reshape(X.shape[0], -1)
    encoder = load_model("data/encoder.h5")
    dimension = 100
    lsh_hash = LSHash(hash_size=32, input_dim=dimension)
    compress_feature = encoder.predict(X)
    for num, ele in enumerate(compress_feature.tolist()):
        lsh_hash.index(ele, extra_data=files_ids[num])
    with open("data/lsh.pkl", "wb") as fh:
        pickle.dump(lsh_hash, fh)
Example #28
def learn(routes):
    global global_training_route
    global next_hop_index
    
    extra_data_len = 2    #destination, next_hop
    ndims = len(routes[0]) - extra_data_len   #Number of dimensions
    hash_length = len(routes[0]) * 2   #arbitrarily chosen hash_length
    next_hop_index = len(routes[0]) - 1   #NextHop index at the last
    
    for i in range(0, len(routes) - 1):
        if(routes[i][next_hop_index] >= routes[i+1][next_hop_index]):
            routes[i][next_hop_index] = i+1
        else:
            routes[i][next_hop_index] = -1
    global_training_route = routes
    lsh = LSHash(hash_length, ndims)
    for entry in routes:
        lsh.index(entry[:-extra_data_len], extra_data = entry[-extra_data_len:])
    return lsh
Example #29
 def test_lshash(self):
     lsh = LSHash(self.hash_size, self.input_dim, 1)
     for i in range(self.nb_elements):
         lsh.index(list(self.els[i]))
         lsh.index(list(self.els[i]))  # multiple insertions
     hasht = lsh.hash_tables[0]
     itms = [hasht.get_list(k) for k in hasht.keys()]
     for itm in itms:
         self.assertEqual(itms.count(itm), 1)
         for el in itm:
             self.assertIn(el, self.els)
     for el in self.els:
         res = lsh.query(list(el), num_results=1,
                         distance_func='euclidean')[0]
         # res is a tuple containing the vector and the distance
         el_v, el_dist = res
         self.assertIn(el_v, self.els)
         self.assertEqual(el_dist, 0)
     del lsh
def detect_subevent(filename):
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    outputVector = []
    tempDict = {}
    outputdict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, dictionary.__len__())
    index = 0
    count = 0
    for index in range(len(corpus)):
        #print str(index)+",",
        #print corpus[index]
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        #print getSparseVector(denseVector)
        result = lsh.query(denseVector, num_results=5, distance_func="cosine")
        #print result
        #no similar tweets
        count += 1
        if (result == []):
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
            #continue

        else:
            for r in result:
                if (outputdict.has_key(tempDict[getSparseVector(r[0])])):
                    outputdict[tempDict[getSparseVector(r[0])]].append(index)
                    break
        #print count,

    #print outputdict
    with open(outputFile, 'w') as out:
        for key in outputdict.iterkeys():
            line = str(key)
            for i in outputdict[key]:
                line += ", " + str(i)
            out.write(line + "\n")

    print "Please check the output file:", outputFile
Example #31
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    f = file(bits_tid_pickle, "rb")
    data = pickle.load(f)
    f.close()

    # 10 is the bit length of the binary keys (see github.com/kayzh/LSHash)
    lsh = LSHash(13, 10, num_hashtables=1)
    map(lambda x: lsh.index(np.array([int(tmp) for tmp in x])), data.keys())
    out = file(lsh_pickle, "wb")
    pickle.dump(lsh, out, -1)
    out.close()
def getBuckets(fromFile):
    global nameDict
    global lsh
    nameDict = {}
    lsh = LSHash(bWidth,26, num_hashtables = num_ht)
    if fromFile:
        f = open(datafile, 'r')   
        nameList = f.readlines()
    else:
        nameList = surnames.dic.keys()
    for l in nameList:
        name = l.split(" ")[0].strip()
        nameArr = getvec(name)
        arrStr = toStr(nameArr)
        if arrStr in nameDict:
            nameDict[arrStr].append(name)
        else:
            nameDict[arrStr] = [name]
    for k in nameDict.keys():
        lsh.index(toArr(k))
Example #33
def test_lshash_extra_val():
    lsh = LSHash(6, 8, 1)
    for i in xrange(num_elements):
        lsh.index(list(els[i]), el_names[i])
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        # res is a list, so we need to select the first entry only
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
def subEventDetection(dictionaryFile, corpusFile, outputFile):
	outputVector = []
	tempDict = {}
	corpus = corpora.MmCorpus(corpusFile)
	dictionary = corpora.Dictionary.load(dictionaryFile)
	lsh = LSHash(30, dictionary.__len__())
	index = 0
	for index in range(len(corpus)):
		denseVector = getDenseVector(corpus[index], lsh.input_dim)
		result = lsh.query(denseVector, num_results = 50, distance_func = "cosine")
		#no similar tweets
		if(result == []):
			outputVector.append([index])
			continue
		assigned = False
		for vector in result:
			if(getDistance(vector, denseVector) == True):
				for ind in range(len(outputVector)):
					done = False
					for tweetNo in outputVector[ind]:
						if (tweetNo == tempDict[tuple(vector[0])]):
							outputVector[ind].append(index)
							done = True
							break
					if done == True:
						break
				assigned = True
				break
		if assigned == False:
			outputVector.append([index])
		lsh.index(denseVector)
		tempDict[tuple(denseVector)] = index
	with open(outputFile, 'w') as out:
		for vector in outputVector:
			line = ""
			for index in vector:
				line += ", " + str(index)
			out.write(line[2:]+"\n")
	print "Please check the output file:", outputFile
Example #35
def lshTOfind(path):
    lsh = LSHash(10, 360)
    f = open('copyindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [float(i) for i in r[1:]]
        lsh.index(features[:360], features[360])
        count += 1
    try:
        f_v = getfeatures(path)
        #print f_v
        ans = lsh.query(f_v[:360], 15)
        if ans != []:
            res = []
            for i in ans:
                res.append(int(i[0][1] / 10000))
            return res
        #searchid(int(ans[0][0][360]/10000))
    except:
        return []
Example #36
    def get_lshash(self):
        """Index all existing reactions based on specified headers into an lshash."""
        from lshash import LSHash
        headers = headers_to_use
        count = 0

        lsh = LSHash(1, len(headers))
        for i in PerformedReaction.objects.all().rows(True):
            to__index = []
            for header in headers:
                try:
                    to__index.append(i[header])
                except KeyError:
                    continue
            if len(to__index) == len(headers):
                lsh.index(to__index)
                count += 1
            else:
                pass

        print('count', count)
        self.lshash = lsh
        self.headers = headers
class Searcher:

    _DIST_FUNCTIONS = ["hamming", "euclidean", "true_euclidean", "centred_euclidean", "cosine", "l1norm"]
    index = None

    def __init__(self, dataset):
        self.create_index(dataset)

    def create_index(self, items, hash_size=6):
        input_dim = len(items.values()[0])
        self.index = LSHash(hash_size, input_dim)
        for key in items:
            self.index.index(items[key], extra_data=key)
        return True

    def query(self, query_item, num_results=10, distance_function='cosine'):
        if distance_function not in self._DIST_FUNCTIONS:
            raise Exception("{0} not supported".format(distance_function))
        results = self.index.query(query_item, num_results=num_results, distance_func=distance_function)
        return self.parse_results(results)

    def parse_results(self, results):
        return {x[0][1]:x[1] for x in results}
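A hypothetical usage sketch for the Searcher class above (it relies on Python 2 semantics, since create_index subscripts dict.values(); the toy vectors are made up):

# Toy dataset: item name -> 8-dimensional vector.
dataset = {
    'a': [1, 0, 0, 0, 0, 0, 0, 0],
    'b': [0, 1, 0, 0, 0, 0, 0, 0],
    'c': [1, 1, 0, 0, 0, 0, 0, 0],
}
searcher = Searcher(dataset)
# parse_results() maps extra_data to distance, so this prints {name: cosine_distance}.
print(searcher.query([1, 0, 0, 0, 0, 0, 0, 0], num_results=2))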
Example #38
def filterDataset(fileIn, fileOut, fileNodes, threshold):
    '''
    Reads filteredTaxiData.txt and filters out lines that are farther than a threshold
    (0.1 mile) from every node in the OSM graph. A data entry is kept if the point lies
    within this threshold of at least one OSM node.
    '''
    # Dimension of our vector space
    lsh = LSHash(hash_size=10, input_dim=2)

    nodes = GetNodes(fileNodes)
    for node in nodes:
        v = np.array(node, dtype=float)
        lsh.index(v)
    bunch = []
    bunch_size = 5000
    count_lines_read = 0
    count_lines_written = 0
    with open(fileIn, "r") as fin, open(fileOut, "w") as fout:
        for line in fin:
            [latitude, longitude] = dataToGraph.lineToPoint(line)
            query = np.array((latitude, longitude), dtype=float)
            result = lsh.query(query, num_results=1)
            closest_node = result[0][0]
            count_lines_read += 1
            if vin((latitude, longitude), closest_node).miles < threshold:
                line = replacePointByOSMnode(line, closest_node)
                bunch.append(line)
                if len(bunch) == bunch_size:
                    fout.writelines(bunch)
                    count_lines_written += len(bunch)
                    bunch = []
                    if (count_lines_written % 10 == 0):
                        print("%d written / %d read" %
                              (count_lines_written, count_lines_read))
        fout.writelines(bunch)
        count_lines_written += len(bunch)
        print("%d lines written" % count_lines_written)
Example #39
class feature_comparer():
    def __init__(self, fea_dim, compare_thresh):
        self.lsh = LSHash(bit_num, fea_dim, compare_kernel_num)
        self.fv_dict = {}
        self.compare_thresh = compare_thresh

    def load(self, filename):
        f = open(filename, 'r')
        while (1):
            line = f.readline()
            if not line:
                break

            fv = line.split(':')[0]
            id = line.split(':')[1]
            self.fv_dict[fv] = id

            fv_array = []
            s = fv[1:-1].split(',')
            for i in range(0, len(s)):
                fv_array.append(float(s[i]))
            self.lsh.index(fv_array)

    def insert(self, feature, id):
        self.fv_dict[str(feature)[1:-1]] = str(id)
        self.lsh.index(feature)

    def match(self, feature):
        q = self.lsh.query(feature, distance_func='cosine')
        if len(q) == 0:
            return False, -1
        mindis = q[0][1]
        if mindis < self.compare_thresh:
            return True, self.fv_dict[str(q[0][0])[1:-1]]
        else:
            return False, -1
def hash_item_pic_v1(pic_folder):
    """
    This version feeds the whole image to HardNet to produce a feature vector, and
    writes a file mapping each image to its hash code next to the images.

    A drawback of this approach is that translated (shifted) images produce noticeably
    different feature vectors.

    :param pic_folder:  folder containing all the images
    :return:    whether the operation succeeded
    """
    try:
        # compute feature vectors for all images
        desc = HardNetDescriptor()
        print(colored("HardNet model loaded", color='blue'))
        # use LSH
        lsh = LSHash(16, 128)
        img_feature_vector = {}
        with open(pic_folder + '_item_hash.txt', 'w') as to_write:
            img_file_list = glob(os.path.join(pic_folder, '*_[0-9].jpg'))
            for m_img_file in tqdm(img_file_list, desc='training'):
                fv = desc.describle([
                    np.array(
                        Image.open(m_img_file).convert('L').resize((32, 32))),
                ])[0]
                img_feature_vector[m_img_file] = fv
                lsh.index(fv, extra_data=m_img_file)
            for m_img_file in tqdm(img_file_list, desc='outputting'):
                res = lsh.query(img_feature_vector[m_img_file],
                                distance_func='centred_euclidean')
                # print all nearby (similar) images
                print(m_img_file, '|'.join(map(lambda x: x[0][1], res)))
        pass
        return True
    except Exception as e:
        print(colored("错误:%s" % str(e), color='red'))
        return False
Example #41
def main(argv):
    parser = argparse.ArgumentParser(prog='INDEX')
    parser.add_argument('source', help='path to the source metadata file')
    parser.add_argument('--hash-size', help='Hash size.', type=int, default=10)
    parser.add_argument('--num-tables',
                        help='Number of tables.',
                        type=int,
                        default=5)
    parser.add_argument('--query-index',
                        help='Index to use for query.',
                        type=int,
                        default=0)

    args = parser.parse_args(argv[1:])

    # read in the data file
    data = pandas.read_csv(args.source, sep='\t')

    # params
    k = args.hash_size  # hash size
    L = args.num_tables  # number of tables
    d = len(data['features'][0].split(','))

    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)

    # indexing
    for i in range(0, len(data)):
        lsh.index(np.asarray(data['features'][i].split(',')).astype('float64'),
                  extra_data=data['filename'][i])

    # query a vector q_vec
    response = lsh.query(
        np.asarray(
            data['features'][args.query_index].split(',')).astype('float64'))

    pprint(response)
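A hypothetical direct call to main() (the metadata file name is an assumption; the script expects a tab-separated file with a 'features' column of comma-separated floats and a 'filename' column):

# argv[0] is the program name, so the real arguments start at index 1.
main(['index.py', 'metadata.tsv', '--hash-size', '10', '--num-tables', '5'])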
Example #42
from lshash import LSHash
lsh = LSHash(hash_size=6, input_dim=8, num_hashtables=1, storage_config={"lmdb": {'path': '/Users/christianburger/Downloads/testlmdb'}})
lsh.index([1,2,3,4,5,6,7,8], 'a')
lsh.index([2,3,4,5,6,7,8,9], 'b')
lsh.index([10,12,99,1,5,31,2,3], 'c')
print lsh.query([1,2,3,4,5,6,7,7])
from __future__ import print_function
from __future__ import division

from scipy.spatial.distance import cosine
from tqdm import tqdm
import numpy
from lshash import LSHash
import time

start = time.time()
lsh = LSHash(8, 300)
sample_word_embeds = []
for i in tqdm(xrange(20000)):
    word_embed = numpy.random.rand(300)
    lsh.index(word_embed)

    if i % 500 == 0:
        sample_word_embeds.append(word_embed)

print("Indexing takes {} seconds".format(time.time() - start))

start = time.time()
for word_embed in sample_word_embeds:
    print('-' * 80)
    results = lsh.query(word_embed, num_results=None, distance_func='cosine')
    print("Num result: {}".format(len(results)))
    print('Nearest neighbor cosine distance:')
    print("    {} | {}".format(results[1][1], cosine(results[1][0], word_embed)))

print('Query takes average {} seconds'.format((time.time() - start) / len(sample_word_embeds)))
    mid14 = '00014.mid'
    mid15 = '00015.mid'
    mid16 = '00016.mid'
    mid17 = '00017.mid'
    mid18 = '00018.mid'
    mid19 = '00019.mid'
    mid20 = '00020.mid'
    
    s1 = 'yldw.mid'
    s2 = 'alphaville-forever_young.mid'
    s3 = 'counting_stars.mid'
    s4 = 'baba-go.mid'
    

    for note,name in note_from_midi(s1):
        lsh.index(note,extra_data=(name,0.8))
    for note,name in note_from_midi(s2):
        lsh.index(note,extra_data=(name,0.8))
    for note,name in note_from_midi(s3):
        lsh.index(note,extra_data=(name,0.8))
    for note,name in note_from_midi(s4):
        lsh.index(note,extra_data=(name,0.8))

    for note,name in note_from_midi(mid1):
        lsh.index(note,extra_data=(name,0.8))
    for note,name in note_from_midi(mid2):
        lsh.index(note,extra_data=(name,0.8))
    for note,name in note_from_midi(mid3):
        lsh.index(note,extra_data=(name,0.8))
    for note,name in note_from_midi(mid4):
        lsh.index(note,extra_data=(name,0.8))
Example #45
def run():
    initial = True
    size = 2000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 13      ## recompute the random vectors if this is changed
    dimension = 50000       ## recompute the random vectors if this is changed
    hash_size = 13          ## length of the LSHash of the tweets
    bucket_size = 100       ## size of the queue for each hash in the hash tables
    comparisons = 50       ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .7      ## threshold for the similarity of two tweets

    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df = 1, smooth_idf=True, stop_words='english', min_dict_size = dimension)
    ## initialize the hash tables, specifying the hash size, number of hash tables and the queue size
    lsh = LSHash(hash_size = hash_size, input_dim = dimension, num_hashtables=num_hashtables, max_queue_size= bucket_size)


    clusters = {}           ## maintain the clusters
    num_clusters = 0
    inv_index = {}          ## inverse mapping from tweet_id to clusters
    Y = None
    Y1 = None
    f_d = open("output.txt",'w')
    loc = "/Users/dilpreet/Documents/mtp_documents/markedData/data/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            with open(loc+f) as infile:
                for line in infile:

                    ## load 2000 tweets at a time 
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1
                    t2 = 0
                    if counter%size == 0:
                        t1 = time.clock()

                        ## X contains the tf-idf scores of the tweets in the "sparse row matrix" format
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        print X.get_shape()
                        print len(vectorizer.vocabulary_)

                        ## if the total number of keywords exceed the pre-specified dimension, raise error
                        if X.get_shape()[0] > dimension:
                            print X.get_shape()
                            print "dimension exceeded"
                            raise
                        for i in range(X.get_shape()[0]):
                            temp_tweet = X.getrow(i)

                            ## query for the nearest neighbor from the lshash tables
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2
                            scase = False

                            ## if nearest neighbor is not null and the cosine similarity is less than the threshold, add the tweet to the respective cluster

                            if nn is not None:
                                ((a, b),c) = nn
                                if c <= cos_threshold:
                                    inv_index[tweet_ids[i]] = inv_index[b]
                                    clusters.setdefault(inv_index[b],[]).append(tweet_ids[i])
                                #else:
                                #    scase = True

                            ## else, linearly search through the previous 2000 + i tweets to find the nearest neighbor
                            """ code to linearly search through the tweets"""
                            if (c > cos_threshold or nn is None or scase):
                                searchY = False

                                if (i==0 and not initial):
                                    searchY = True
                                if (i==0 and initial):
                                    inv_index[tweet_ids[i]] = num_clusters
                                    clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                    num_clusters = num_clusters + 1
                                if (i!=0):
                                    Z = X[:i]
                                    #print temp_tweet.shape
                                    t2 = temp_tweet.transpose()
                                    #print i
                                    a1 = Z.dot(t2).toarray()
                                    a2 = Z.multiply(Z).sum(axis = 1)
                                    a3 = sp.csr_matrix(t2.multiply(t2).sum()).toarray()
                                    a2 = sp.csc_matrix(a2).toarray()
                                    b = [j for j in range(Z.shape[0])]
                                
                                    a = min(b, key = lambda x: 1-float(a1[x][0])/((a2[x][0] + a3[0][0])**.5))
                                    #a = min(Z, key = lambda x: cosine_dist(x[0], temp_tweet))
                                    #print a
                                    t3 = tweet_ids[a]
                                    if (1-float(a1[a][0])/((a2[a][0] + a3[0][0])**.5))> cos_threshold:
                                        if not initial and i != size-1:
                                            searchY = True
                                        else:
                                            inv_index[tweet_ids[i]] = num_clusters
                                            clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                            num_clusters = num_clusters + 1
                                    else:
                                        inv_index[tweet_ids[i]] = inv_index[t3]
                                        clusters.setdefault(inv_index[t3], []).append(tweet_ids[i])
                                if searchY == True:
                                    Z = Y[i:]
                                    t2 = temp_tweet.transpose()
                                    #print i
                                    a1 = Z.dot(t2).toarray()
                                    a2 = Z.multiply(Z).sum(axis = 1)
                                    a3 = sp.csr_matrix(t2.multiply(t2).sum()).toarray()
                                    a2 = sp.csc_matrix(a2).toarray()
                                    b1 = [j for j in range(Z.shape[0])]
                                    a = min(b1, key = lambda x: 1-float(a1[x][0])/((a2[x][0] + a3[0][0])**.5))
                                    t3 = Y1[a + i]
                                    if (1-float(a1[a][0])/((a2[a][0] + a3[0][0])**.5))< cos_threshold:
                                        inv_index[tweet_ids[i]] = inv_index[t3]
                                    else:
                                        inv_index[tweet_ids[i]] = num_clusters
                                        clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                        num_clusters = num_clusters + 1

                            ### index the tweet into the hash tables
                            lsh.index(input_point = temp_tweet, extra_data = tweet_ids[i])
                        initial = False
                        Y = X
                        Y1 = tweet_ids[:]
                        tweet_ids = []
                        tweet_text = []
                        print counter
                        print time.clock() - t1
                        f2 = open('time.txt','a')
                        f2.write(str(time.clock()-t1) + '\n')
                        f2.close()
                        if counter%10000==0:
                            f2 = open('result.txt', 'a')
                            f2.write(json.dumps(clusters) + "\n")
                            f3 = open('vocab.txt', 'a')
                            f4 = open('vectorizer.txt', 'a')
                            f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
                            f4.write(json.dumps(vectorizer.idf_) + "\n")
                            #print clusters
                            #print vectorizer.vocabulary_
                            f2.close()
                            f3.close()
                            f4.close()

    f2 = open('result.txt', 'w')
    f2.write(json.dumps(clusters) + "\n")
    f3 = open('vocab.txt', 'w')
    f4 = open('vectorizer.txt', 'w')
    f5 = open('inv_index.txt', 'w')
    f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
    f4.write(json.dumps(vectorizer.idf_) + "\n")
    f5.write(json.dumps(inv_index))
    #print clusters
    #print vectorizer.vocabulary_
    f2.close()
    f3.close()
    f4.close()
    f5.close()
Example #46
dim = len(Index) + 1 # -1 for excluded

data = xlrd.open_workbook(fname)
sht = data.sheet_by_name(shtname)
head = sht.row_values(0)
tweets = sht.col_values(head.index(target), start_rowx)

hash_size = int(np.ceil(np.log2(len(tweets))))
print 'hash_size: %d, dim: %d' %(hash_size, dim)
lsh=LSHash(hash_size, dim)

for tweet in tweets:
    x = spar.csr_matrix((1,dim) ,dtype=np.int8)
#    x = np.zeros(dim, np.bool8)
    ws = jieba.cut(tweet)
    try:
        for w in ws:
            x[Index.get(w, -1)] = 1
        lsh.index(x)
    except Exception, e:
        print e
        print tweet

sent = True
while sent:
    sent = raw_input('input sentence...\n')
    res = lsh.query(sent, distance_func = 'hamming')
    for i in res:
        print i[0], i[-1]

Example #47
class LshManager(object):

    def __init__(self):
        self.lshIndexList = []


        # create a list of lsh indexes
        self.lsh = LSHash(NUMBER_OF_BITS_PER_HASH, NUM_TOPICS, num_hashtables=NUMBER_OF_LSH_INDEXES,
                          storage_config={"redis": {"host": "localhost", "port": 6379}})

    def clearIndex(self):
        redis.Redis().flushall()

    # adds a document to all lsh indexes
    def addDocument(self, document):
        lsa_vector = document.vectors["LSA"]

        dense_vector = self._sparseToDenseConverter(lsa_vector)

        if not hasattr(document, "timestamp"):
            document.timestamp = str(datetime.datetime.now())

        extra = json.dumps(str(document._id))

        # detect duplicates
        #result = self.lsh.query(dense_vector, num_results=1, distance_func="cosine")
        #if result:
        #    nearest = result[0]
        #    if nearest[1] > DUPLICATE_SIMILARITY_THRESHOLD:
        #        extra = ast.literal_eval(ast.literal_eval(nearest[0])[1])
        #        doctitle = getDatabaseConnection().holist.articles.find({"_id": extra}).next()["title"]
        #        ln.warn("Detected duplicate for %s (ID %s): %s.", document.title, document._id, extra)
        #        return

        self.lsh.index(dense_vector, extra_data=extra)  # extra MUST be hashable

    # takes a document and returns database ids of similar documents
    # uses cosine function to determine similarity
    def getSimilarDocuments(self, document, num_docs=7):
        if isinstance(document, Document):
            lsa_vector = document.vectors["LSA"]
        else:
            lsa_vector = document

        dense_vector = self._sparseToDenseConverter(lsa_vector)

        client = getDatabaseConnection()

        resultSet = set()
        results = []

        for result in self.lsh.query(dense_vector, num_results=num_docs, distance_func="cosine"):
            # example:
            # [
            #   (((1, 2, 3), "{'extra1':'data'}"), 0),
            #   (((1, 1, 3), "{'extra':'data'}"), 1)
            # ]
            extra = ast.literal_eval(ast.literal_eval(result[0])[1])

            clientDoc = bsonToClientBson(client.holist.articles.find({"_id": extra}).next())
            clientDoc['lsa'] = self._sparseToDenseConverter(clientDoc['lsa'])
            jsonstr = json.dumps(clientDoc)

            if not jsonstr in resultSet:
                resultSet.add(jsonstr)
                results.append(clientDoc)

        ln.debug("retrieved %s documents.", len(results))
        return results

    # converts a vector in sparse format to a vector in dense format
    def _sparseToDenseConverter(self, sparseVector):
        dense = {}
        for x in range(NUM_TOPICS):
            dense[x] = 0

        for dim, val in sparseVector:
            dense[dim] = val
        return [value for key, value in dense.items()]
#vector = np.matrix(onedarray)
big_array=[]
image_number=[]

#hist = cv2.calcHist([img],[0],None,[256],[0,256])
#hist,bins = np.histogram(pixel.ravel(),256,[0,256])

lsh = LSHash(3, 255)
for x in range(1, 100000):
    img = Image.open(dataset+str(x)+fileext)
    pixel = np.array(img)
    #onedarray = pixel.ravel()
    hist,bins = np.histogram(pixel.ravel(),256,[0,256])
    listing=list(hist[0:255])
    big_array.append(listing)
    lsh.index(listing)

input_array=np.array(big_array)

img = Image.open(queryset+"10"+fileext)
pixel = np.array(img)
#onedarray = pixel.ravel()
hist,bins = np.histogram(pixel.ravel(),256,[0,256])
listing=list(hist[0:255])
k=lsh.query(listing,distance_func="l1norm")
vector = np.matrix(k)
length=len(k)
if length > 0:
    for output in range(length):
        if (k[output][1] < 800):
            test=np.array(k[output][0])
				fileContainer.write(str( '\nTime after loading trajectory dataset : '+ time.asctime( time.localtime(time.time()) )))
				#------------------------------------------------------------------------------


				# indexing all trajectories
				print '\nStarting the indexing procedure ...'
				fileContainer.write(str( 'Time before indexing all trajectory points : '+ time.asctime( time.localtime(time.time()) )))
				queryDictionary = {}
				numberOfPoints = 0
				for i, trajectory in enumerate(trajectoriesContainer):
					for j, point in enumerate(trajectory[:-((dimensionNumber-2)/2)]):
						involvedPoints = [point[0],point[1]]
						for s in range(j+1, j+1+((dimensionNumber-2)/2)):
							involvedPoints.append(trajectory[s][0])
							involvedPoints.append(trajectory[s][1])
						hash = newLsh.index((involvedPoints), loadF=numberRadius)
						if queryDictionary.has_key(hash):
							queryDictionary[hash].add(i)
						else:
							queryDictionary[hash] = set()
							queryDictionary[hash].add(i)
						numberOfPoints += 1
				print len(trajectoriesContainer)
				fileContainer.write('\nThe following is the hash table used for querying or clustering ...')
				fileContainer.write(str(queryDictionary))
				fileContainer.write('\n')
				fileContainer.write(str( 'The number of generated buckets is : '+ str(len(queryDictionary.keys()))))
				fileContainer.write('\n')
				fileContainer.write(str( 'Time after indexing all trajectory points : '+ time.asctime( time.localtime(time.time()) )))
				fileContainer.write('\n')
				fileContainer.write(str( 'The number of points that have been indexed is : '+ str(numberOfPoints)))
def run():
    initial = True
    size = 200000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 4      ## recompute the random vectors if this is changed
    dimension = 5000000      ## recompute the random vectors if this is changed
    hash_size = 13          ## length of the LSHash of the tweets
    bucket_size = 100       ## size of the queue for each hash in the hash tables
    comparisons = 50       ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .7      ## threshold for the similarity of two tweets

    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df = 1, smooth_idf=True, stop_words='english', min_dict_size = dimension)
    ## initialize the hash tables, specifying the hash size, number of hash tables and the queue size
    lsh = LSHash(hash_size = hash_size, input_dim = dimension, num_hashtables=num_hashtables, max_queue_size= bucket_size)

    clusters = {}           ## maintain the clusters
    num_clusters = 0


    completed = open('/tmp/completed_tmp.txt')
    completed = completed.readlines()
    completed = set([x.replace('\n', '') for x in completed])

    while(True):
        clusters_size_prev = {}
        files = []
        for root, dirs, filenames in os.walk('/tmp/tweets_tmp/'):
            for fname in filenames:
                if fname != '.DS_Store':
                    files.append(fname)
        files = set(files)
        files = files - completed
        if len(files) == 0:
            print 'sleeping'
            time.sleep(3000)
            print 'checking'
            continue
        #print files
        tweets_dump = {}
        tweet_ids = []
        tweet_text = []
        time_sleep = time.time()
        for fn in files:
            print fn
            time_tmp2 = time.time()
            with open('/tmp/tweets_tmp/' + fn) as infile:
                for line in infile:
                    ## load 2000 tweets at a time 
                    
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['filtered_text'])
                    tweets_dump[str(tweet['id'])] = tweet['text']

                    counter = counter + 1
                    t2 = 0
                    if counter%size == 0:
                        t1 = time.clock()

                        ## X contains the tf-idf scores of the tweets in the "sparse row matrix" format
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        #print X.get_shape()
                        #print len(vectorizer.vocabulary_)

                        ## if the total number of keywords exceed the pre-specified dimension, raise error
                        if X.get_shape()[0] > dimension:
                            print X.get_shape()
                            print "dimension exceeded"
                            raise
                        for i in range(X.get_shape()[0]):

                            temp_tweet = X.getrow(i)

                            ## query for the nearest neighbor from the lshash tables
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2
                            scase = False

                            ## if nearest neighbor is not null and the cosine similarity is less than the threshold, add the tweet to the respective cluster
                            cluster_id = -1
                            if nn is not None:
                                ((a, (b,d)),c) = nn
                                if c <= cos_threshold:
                                    cluster_id = d
                                    clusters.setdefault(d,[]).append(tweet_ids[i])
                                #else:
                                #    scase = True

                            ## else, linearly search through the previous 2000 + i tweets to find the nearest neighbor
                            """ code to linearly search through the tweets"""
                            if (c > cos_threshold or nn is None or scase):
                                cluster_id = num_clusters
                                clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                num_clusters = num_clusters + 1

                            ### index the tweet into the hash tables
                            lsh.index(input_point = temp_tweet, extra_data = tuple([tweet_ids[i], cluster_id]))
                        initial = False
 
                        tweet_ids = []
                        tweet_text = []
                        #print counter
                        #print time.clock() - t1
                        f2 = open('time.txt','a')
                        f2.write(str(time.clock()-t1) + '\n')
                        f2.close()
                        """
                        if counter%100000==0:
                            f2 = open('result.txt', 'w')
                            f2.write(json.dumps(clusters) + "\n")
                            f3 = open('vocab.txt', 'w')
                            f4 = open('vectorizer.txt', 'w')
                            f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
                            f4.write(json.dumps(vectorizer.idf_) + "\n")
                            #print clusters
                            #print vectorizer.vocabulary_
                            f2.close()
                            f3.close()
                            f4.close()
                        """
            print 'done'
            print counter
            print str(time.time() - time_tmp2)
            f = open('/tmp/completed_tmp.txt', 'a')
            f.write(fn + '\n')
            f.close()
            completed.add(fn)
        print "all done"
        time_temp = time.time()
        if not os.path.exists('/home/y/share/htdocs/clusters/' + str(time_temp)):
            os.makedirs('/home/y/share/htdocs/clusters/' + str(time_temp))
        if not os.path.exists('/home/y/share/htdocs/clusters/current'):
            os.makedirs('/home/y/share/htdocs/clusters/current')
        
        clusters_size = {}

        for x in clusters:
            clusters_size[x] = len(clusters[x])
        f = open('/home/y/share/htdocs/clusters/' + str(time_temp) + '/sizes.txt', 'w')
        f.write(json.dumps(clusters_size))
        f.close()
        f = open('/home/y/share/htdocs/clusters/current/sizes.txt', 'w')
        f.write(json.dumps(clusters_size))
        f.close()
        cls = clusters_size.keys()
        cls.sort(key = lambda x : -1 * clusters_size[x])
        cl = []
        for x in cls:
            if clusters_size[x] >=10:
                cl.append(x)
        arr = []
        for i in range(len(cl)):
            write_clusters(i, cl, clusters, tweets_dump, time_temp, '/home/y/share/htdocs/clusters')
            arr.append(cl[i])
        f = open('/home/y/share/htdocs/clusters/' + str(time_temp) + '/list.txt', 'w')
        f.write(json.dumps(arr))
        f.close()
        f = open('/home/y/share/htdocs/clusters/current/list.txt', 'w')
        f.write(json.dumps(arr))
        f.close()
        f = open('/home/y/share/htdocs/clusters/list.txt', 'a')
        f.write(str(time_temp) + '\n')
        f.close()

        if not os.path.exists('/home/y/share/htdocs/ratio_clusters/' + str(time_temp)):
            os.makedirs('/home/y/share/htdocs/ratio_clusters/' + str(time_temp))
        if not os.path.exists('/home/y/share/htdocs/ratio_clusters/current'):
            os.makedirs('/home/y/share/htdocs/ratio_clusters/current')

        ratio = {}
        for x in clusters_size:
            if clusters_size[x]>=10:
                r = 1
                if (x in clusters_size_prev and clusters_size_prev[x] != 0):
                    r = clusters_size_prev[x]
                ratio[x] = clusters_size[x]*1.0/r
        ratio_keys = ratio.keys()
        ratio_keys.sort(key = lambda x : -1 * ratio[x])
        ratio_keys = ratio_keys[:300]
        arr = []
        for i in range(len(ratio_keys)):
            write_clusters(i, ratio_keys, clusters, tweets_dump, time_temp, '/home/y/share/htdocs/ratio_clusters')
            arr.append(ratio_keys[i])
        f = open('/home/y/share/htdocs/ratio_clusters/' + str(time_temp) + '/list.txt', 'w')
        f.write(json.dumps(arr))
        f.close()
        f = open('/home/y/share/htdocs/ratio_clusters/current/list.txt', 'w')
        f.write(json.dumps(arr))
        f.close()
        f = open('/home/y/share/htdocs/ratio_clusters/list.txt', 'a')
        f.write(str(time_temp) + '\n')
        f.close()

        clusters_size_prev = {}
        for x in clusters_size:
            clusters_size_prev[x] = clusters_size[x]

        clusters = {}
        time.sleep(max(0, 3600 - (time.time() - time_sleep)))
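
## A minimal, self-contained sketch of the cluster-assignment step used above, written
## against the standard lshash API (dense vectors, query with 'cosine' distance) rather
## than the modified LSHash used here (arpoxNN, max_queue_size, sparse-row input). The
## names assign_cluster, vec and threshold are illustrative, not part of the original code.
def assign_cluster(lsh, vec, tweet_id, clusters, num_clusters, threshold=0.5):
    ## look up the approximate nearest neighbour among previously indexed tweets
    candidates = lsh.query(vec, num_results=1, distance_func='cosine')
    if candidates and candidates[0][1] <= threshold:
        ## candidates[0] is ((point, (neighbour_id, neighbour_cluster)), distance)
        cluster_id = candidates[0][0][1][1]
    else:
        ## no sufficiently close neighbour: open a new cluster
        cluster_id = num_clusters
        num_clusters = num_clusters + 1
    clusters.setdefault(cluster_id, []).append(tweet_id)
    ## index the tweet so that later tweets can be matched against it
    lsh.index(vec, extra_data=(tweet_id, cluster_id))
    return num_clusters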
Exemple #51
0
def run():
    initial = True
    size = 2000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 5      ## recompute the random vectors if this is changed
    dimension = 5000000      ## recompute the random vectors if this is changed
    hash_size = 13          ## length of the LSHash of the tweets
    bucket_size = 100       ## size of the queue for each hash in the hash tables
    comparisons = 50       ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .5      ## threshold for the similarity of two tweets

    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df = 1, smooth_idf=True, stop_words='english', min_dict_size = dimension)
    ## initialize the hash tables, specifying the hash size, the number of hash tables and the queue size
    lsh = LSHash(hash_size = hash_size, input_dim = dimension, num_hashtables=num_hashtables, max_queue_size= bucket_size)


    clusters = {}           ## maintain the clusters
    num_clusters = 0
    Y = None
    Y1 = None
    f_d = open("output.txt",'w')
    loc = "processed_tweets/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            with open(loc+f) as infile:
                for line in infile:

                    ## load 2000 tweets at a time 
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1
                    t2 = 0
                    if counter%size == 0:
                        t1 = time.clock()

                        ## X contains the tf-idf scores of the tweets in "sparse row matrix" format
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        print X.get_shape()
                        print len(vectorizer.vocabulary_)

                        ## if the total number of keywords exceeds the pre-specified dimension, raise an error
                        if X.get_shape()[1] > dimension:
                            print X.get_shape()
                            raise ValueError("dimension exceeded")
                        for i in range(X.get_shape()[0]):

                            temp_tweet = X.getrow(i)

                            ## query for the nearest neighbor from the lshash tables
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2
                            scase = False

                            ## if a nearest neighbor exists and its cosine distance is within the threshold, add the tweet to that neighbor's cluster
                            cluster_id = -1
                            if nn is not None:
                                ((a, (b,d)),c) = nn
                                if c <= cos_threshold:
                                    cluster_id = d
                                    clusters.setdefault(d,[]).append(tweet_ids[i])
                                #else:
                                #    scase = True

                            ## otherwise start a new cluster; a linear search through the
                            ## previous 2000 + i tweets could be plugged in here instead
                            """ placeholder: code to linearly search through the tweets """
                            if (c > cos_threshold or nn is None or scase):
                                cluster_id = num_clusters
                                clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                num_clusters = num_clusters + 1

                            ### index the tweet into the hash tables
                            lsh.index(input_point = temp_tweet, extra_data = tuple([tweet_ids[i], cluster_id]))
                        initial = False
                        Y = X
                        Y1 = tweet_ids[:]
                        tweet_ids = []
                        tweet_text = []
                        print counter
                        print time.clock() - t1
                        f2 = open('time.txt','a')
                        f2.write(str(time.clock()-t1) + '\n')
                        f2.close()
                        if counter%100000==0:
                            f2 = open('result.txt', 'a')
                            f2.write(json.dumps(clusters) + "\n")
                            f3 = open('vocab.txt', 'a')
                            f4 = open('vectorizer.txt', 'a')
                            f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
                            f4.write(json.dumps(vectorizer.idf_) + "\n")
                            #print clusters
                            #print vectorizer.vocabulary_
                            f2.close()
                            f3.close()
                            f4.close()
Exemple #52
0
print 'extracting features ...'
t1 = time.time()
for i in files:
    responses.append(CHARS.index(i.split(".")[-2].decode("utf8")) + 1)
    samples.append(leargist.color_gist(Image.open(i), nblocks=BLOCKS, orientations=ORIENTATIONS))
t2 = time.time()
print 'done, %d files took %0.3f ms' % (len(files), (t2 - t1) * 1000.0)


train_n = int(len(files)*0.5)

lsh = LSHash(3, DIMENSION, num_hashtables=5)
print "indexing ..."
t1 = time.time()
for i, sample in enumerate(samples[:train_n]):
    lsh.index(sample, extra_data=responses[:train_n][i])
t2 = time.time()
print "done. %d files took %0.3f ms" % (train_n, (t2 - t1) * 1000.0)




################## test ##########################

print "testing ..."

#correct = 0
#total = 0
#t1 = time.time()
#for i, sample in enumerate(samples[:train_n]):
#    total = total + 1
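
## The commented-out test loop above is left unfinished; the block below is one possible
## completion, a hedged sketch that labels each held-out GIST descriptor with the
## extra_data of its nearest indexed neighbour and counts the hits. It reuses samples,
## responses, train_n and lsh from above and assumes the standard lshash query API.
correct = 0
total = 0
t1 = time.time()
for i, sample in enumerate(samples[train_n:]):
    total = total + 1
    matches = lsh.query(sample, num_results=1, distance_func='euclidean')
    if matches:
        ## matches[0] is ((point, label), distance); compare the stored label to the truth
        if matches[0][0][1] == responses[train_n:][i]:
            correct = correct + 1
t2 = time.time()
print "done. %d/%d correct, took %0.3f ms" % (correct, total, (t2 - t1) * 1000.0)
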
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 04 15:29:56 2015

@author: MaGesh
"""
import numpy as np
from scipy.ndimage import imread
from lshash import LSHash
lsh = LSHash(20, 32*32)  # 32*32 is the input dimension, 20 is the hash size in bits
resultSet=[]
for i in range(1,100001):
    print i
    X="F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\dataset\\"+str(i)+".bmp"
    im=imread(X,flatten=True)
    single_array=im.flatten()
    lsh.index(single_array)  # hash each image vector into a bucket
for i in range(1,11):
    print i,"for querying"    
    X1="F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\Query\\"+str(i)+".bmp"
    imQ=imread(X1,flatten=True) #converting to grey scale
    imFlatten=imQ.flatten()
    value = lsh.query(imFlatten, distance_func="euclidean")  # query for the nearest points
    resultSet.append(value)
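
## A hedged follow-up sketch: indexing with extra_data=i keeps the dataset image number
## alongside each stored vector, so every query result can be mapped back to the file it
## came from. The paths are the same as above; lsh_ids and query_vec are illustrative names.
lsh_ids = LSHash(20, 32*32)
for i in range(1, 100001):
    path = "F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\dataset\\" + str(i) + ".bmp"
    lsh_ids.index(imread(path, flatten=True).flatten(), extra_data=i)  # remember the image number
query_vec = imread("F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\Query\\1.bmp", flatten=True).flatten()
for (point, image_id), distance in lsh_ids.query(query_vec, num_results=5, distance_func="euclidean"):
    print image_id, distance  # dataset image number and its distance to the query
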
trajectoriesContainer = []
for i in range(datasetSize):
	trajectoriesContainer.append([(mat.values()[0][i][0][0][j], mat.values()[0][i][0][1][j]) for j in range(len(mat.values()[0][i][0][0]))])
allPoints = []
fileContainer.write(str( '\nTime after loading trajectory dataset : '+ time.asctime( time.localtime(time.time()) )))
#------------------------------------------------------------------------------


# indexing all trajectories
print '\nStarting the indexing procedure ...'
fileContainer.write(str( 'Time before indexing all trajectory points : '+ time.asctime( time.localtime(time.time()) ) ))
queryDictionary = {}
numberOfPoints = 0
for i, trajectory in enumerate(trajectoriesContainer):
	for point in trajectory:
		hash = newLsh.index(point, loadF=numberRadius)
		if queryDictionary.has_key(hash):
			queryDictionary[hash].add(i)
		else:
			queryDictionary[hash] = set()
			queryDictionary[hash].add(i)
		numberOfPoints += 1

fileContainer.write('\nThe following is the hash table used for querying or clustering ...')
fileContainer.write(str(queryDictionary))
fileContainer.write('\n')
fileContainer.write(str( 'The number of generated buckets is : '+ str(len(queryDictionary.keys()))))
fileContainer.write('\n')
fileContainer.write(str( 'Time after indexing all trajectory points : '+ time.asctime( time.localtime(time.time()) ) ))
fileContainer.write('\n')
fileContainer.write(str( 'The number of points that have been indexed is : '+ str(numberOfPoints)))
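
## A hedged sketch of how the bucket dictionary built above might be used: hashing a query
## point with the same custom newLsh.index call (assuming it can be reused to obtain the
## bucket key; depending on the implementation it may also insert the point) and returning
## every trajectory id stored under that bucket. candidate_trajectories is an illustrative name.
def candidate_trajectories(query_point):
    bucket = newLsh.index(query_point, loadF=numberRadius)  # same custom call used for indexing
    return queryDictionary.get(bucket, set())               # trajectory ids sharing this bucket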
Exemple #56
0
from lshash import LSHash
import numpy as np

s = LSHash(10, 8)
s.index([1,2,3,4,5,6,7,8])
print s.hash_tables[0].keys()[0]
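
## A hedged follow-up: querying the same object should return the stored point together
## with its distance from the query (0.0 here, since the query is the indexed point
## itself). This assumes the standard lshash query API.
point, dist = s.query([1,2,3,4,5,6,7,8], num_results=1, distance_func='euclidean')[0]
print point, dist  # expected: the stored tuple (1, 2, ..., 8) and a distance of 0.0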
Exemple #57
-1
def create_hash2img():
    img2gist = get_img2gist()
    lsh = LSHash(hash_len, 960, storage_config=redis_config,
                 matrices_filename=matrices_file)
    count = 0
    total_num = len(img2gist)
    for name, gist_v in img2gist.iteritems():
        count += 1
        lsh.index(gist_v, name)
        sys.stdout.write('%d/%d\r    ' % (count, total_num))
        sys.stdout.flush()

    print 'bucket ratio: %d/%d' % (len(lsh.hash_tables[0].keys()), 2 ** hash_len)
    return lsh
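
## A hedged companion sketch for create_hash2img above: querying the redis-backed index
## for the image names closest to a GIST descriptor. query_similar_images and k are
## illustrative names; it assumes the standard lshash query API and the (vector, name)
## pairs stored by lsh.index(gist_v, name) above.
def query_similar_images(lsh, gist_v, k=5):
    ## each result is ((stored_vector, image_name), distance), sorted by distance
    return [(extra, dist) for (vec, extra), dist in
            lsh.query(gist_v, num_results=k, distance_func='euclidean')]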