def bin_test(n_q=query_size, n_dts=dataset_size, K=10, L=128, S=30, prune=0):
    """Benchmark KGraph on a random binary (uint8) dataset.

    Builds an index over ``n_dts`` random ``bin_bytes``-wide uint8 vectors,
    saves it to 'tempIndex' to measure its on-disk size, then searches it
    with ``n_q`` queries from gen_query_bin().

    Returns:
        dict with keys 'request' (the parameters used) and 'response'
        (build/search wall-clock times, index size, and per-item units).
    """
    name = 'bin'
    d = np.random.randint(256, size=(n_dts, bin_bytes)).astype(np.uint8)
    i = pykgraph.KGraph()

    # BUG FIX: the original timed an empty region (start/end back to back) and
    # never built the index, so buildTime was ~0 and save/search ran on an
    # unbuilt graph. Build it inside the timed region, as gen_bin() does.
    start = time.time()
    i.build(d)
    end = time.time()
    buildTime = end - start

    i.save('tempIndex')
    indexSize = os.stat('tempIndex').st_size

    q = gen_query_bin(n_q)
    start = time.time()
    i.search(d, q, K=K)
    end = time.time()
    searchTime = end - start

    # BUG FIX: n_q previously defaulted to the list [query_size], which broke
    # both gen_query_bin(n_q) and the searchTime / n_q division below
    # (float / list raises TypeError). It is now a plain int.
    req = {
        'querySize': n_q,
        'datasetSize': n_dts,
        'K': K,
        'L': L,
        'S': S,
        'prune': prune,
    }
    # NOTE(review): L, S and prune are recorded in the request but never passed
    # to i.build()/i.search(); presumably they were meant to be build
    # parameters — confirm against the pykgraph API before relying on them.
    res = {
        'buildTime': buildTime,
        'unitBuildTime': buildTime / n_dts,
        'searchTime': searchTime,
        'unitSearchTime': searchTime / n_q,
        'indexSize': indexSize,
        'unitIndexSize': indexSize / n_dts,
    }
    return {'request': req, 'response': res}
def main(argv):
    """Build and save a kgraph index for a CSV data file.

    Usage: index.py -f <datafile>

    The CSV is expected to have an id in column 0 and feature scores in the
    remaining columns. The resulting index is saved next to the input as
    '<datafile>.index'.

    NOTE(review): this is Python 2 code (print statements).
    """
    # A csv filename
    datafile = ''
    try:
        opts, args = getopt.getopt(argv, "hf:", ["datafile="])
    except getopt.GetoptError:
        print 'index.py -f <datafile>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'index.py -f <datafile>'
            sys.exit()
        elif opt in ("-f", "--datafile"):
            datafile = arg
    data = np.genfromtxt(datafile, delimiter=',')
    (m, n) = data.shape
    # Column 0 carries the row ids; the remaining n-1 columns are the scores.
    pids = data[:, 0]
    scores = data[:, 1:n]
    # kgraph requires the number of columns to be multiple of 4
    # (when (n-1) is already a multiple of 4 this still pads 4 extra zero
    # columns, which is harmless for the index).
    needed = 4 - (n - 1) % 4
    padded = np.zeros((m, n - 1 + needed))
    padded[:, :-needed] = scores
    index = kg.KGraph()
    index.build(padded)
    index.save(datafile + '.index')
def build(self, ntrees):
    """Build a KGraph nearest-neighbour index over ``self.dataset``.

    ``ntrees`` is not used by kgraph itself; it is only recorded on the
    instance (kept for interface parity with tree-based backends).
    """
    # kgraph supports only float32/float64 and is not optimized for
    # float64, so force a float32 copy of the dataset.
    points = np.array(self.dataset, dtype=np.float32)
    self.index = pykgraph.KGraph(points, 'euclidean')
    self.index.build(reverse=-1, K=self.config.k)
    self.ntrees = ntrees
def construct_graph_kgraph(data, k):
    """Return the directed kNN graph of ``data`` as a sparse CSR 0/1 matrix.

    Row i has ones in the columns of its k nearest neighbours (self-match
    excluded), computed with pykgraph using the euclidean metric.
    """
    import pykgraph

    num_points = len(data)
    adjacency = lil_matrix((num_points, num_points))
    graph = pykgraph.KGraph(data, 'euclidean')
    graph.build(reverse=0, K=2 * k + 1, L=2 * k + 50)
    # Ask for k+1 neighbours and drop column 0: each point's nearest
    # neighbour is the point itself.
    neighbours = graph.search(data, K=k + 1)[:, 1:]
    row_idx = np.repeat(np.arange(num_points), k, 0)
    adjacency[row_idx, neighbours.ravel()] = 1
    return adjacency.tocsr()
def fit(self, X):
    """L2-normalise ``X`` row-wise and build the kgraph index over it."""
    normalized = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
    graph = pykgraph.KGraph()
    graph.build(normalized, iterations=30, L=100, delta=0.002, recall=0.99, K=25)
    self._kgraph = graph
    # Keep the normalised data around; the original author flagged this
    # line with "???" — purpose unclear, possibly needed at query time.
    self._X = normalized
def fit(self, X):
    """Build the kgraph index over ``X`` (converted to float32 if needed)."""
    # asarray is a no-op when X is already float32, otherwise it converts —
    # same behaviour as the dtype check + astype.
    X = numpy.asarray(X, dtype=numpy.float32)
    self._kgraph = pykgraph.KGraph(X, self._metric)
    # kgraph requires L > K, hence the candidate-pool size is count + L.
    self._kgraph.build(
        reverse=0,
        K=self._count,
        L=self._count + self._L,
        recall=self._recall,
    )
def gen_bin(n=dataset_size):
    """Generate a random binary (uint8) dataset, build and persist its index.

    The dataset is written to datasetPath + 'bin' (np.save appends '.npy')
    and the kgraph index to indexPath + 'bin'. Returns (index, dataset).

    NOTE(review): Python 2 code (print statements); user-facing strings are
    Italian and are left untouched.
    """
    name = 'bin'
    # n rows of bin_bytes random byte values in [0, 256).
    dataset = np.random.randint(256, size=(n, bin_bytes)).astype(np.uint8)
    print 'dataset {0} generato'.format(name)
    np.save(datasetPath + name, dataset)
    print 'dataset {0} salvato'.format(name)
    index = pykgraph.KGraph()
    index.build(dataset)
    index.save(indexPath + name)
    print 'indice {0} salvato'.format(name)
    return index, dataset
def gen_float(n=dataset_size, K=10, L=128, S=30, prune=0):
    """Generate a random float dataset, build and persist its kgraph index.

    The dataset (n x sift_bins uniform floats) is written to
    datasetPath + 'float' (np.save appends '.npy') and the index to
    indexPath + 'float'. Returns (index, dataset).

    NOTE(review): K, L, S and prune are accepted but never used — they are
    not forwarded to index.build(). Confirm whether they were meant to be
    build parameters. Python 2 code (print statements).
    """
    name = 'float'
    dataset = np.random.rand(n, sift_bins)
    print 'dataset {0} generato'.format(name)
    np.save(datasetPath + name, dataset)
    print 'dataset {0} salvato'.format(name)
    index = pykgraph.KGraph()
    index.build(dataset)
    index.save(indexPath + name)
    print 'indice {0} salvato'.format(name)
    return index, dataset
def fit(self, X):
    """Build the kgraph index for ``X``, reusing an on-disk copy if present.

    The cache path is keyed on the metric only, so a stale index from a
    different dataset with the same metric would be loaded silently.
    """
    # kgraph wants float32; asarray converts only when necessary.
    X = numpy.asarray(X, dtype=numpy.float32)
    self._kgraph = pykgraph.KGraph(X, self._metric)
    path = os.path.join(INDEX_DIR, 'kgraph-index-%s' % self._metric)
    if os.path.exists(path):
        self._kgraph.load(path)
        return
    # Typical parameters: iterations=30, L=100, delta=0.002, recall=0.99, K=25
    self._kgraph.build(**self._index_params)
    if not os.path.exists(INDEX_DIR):
        os.makedirs(INDEX_DIR)
    self._kgraph.save(path)
def fit(self, X):
    """Build (or load a cached copy of) the kgraph index over ``X``."""
    if X.dtype != numpy.float32:
        X = X.astype(numpy.float32)
    # (An L2-normalisation step for the angular metric was once here but is
    # disabled upstream.)
    self._kgraph = pykgraph.KGraph(X, self._metric)
    cache_file = os.path.join(INDEX_DIR, 'kgraph-index-%s' % self._metric)
    if not os.path.exists(cache_file):
        # Typical parameters: iterations=30, L=100, delta=0.002,
        # recall=0.99, K=25
        self._kgraph.build(**self._index_params)
        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
        self._kgraph.save(cache_file)
    else:
        self._kgraph.load(cache_file)
def load(entry, datafile):
    """Populate ``entry`` (a dict) with a CSV's data and its saved index.

    Reads ``datafile`` (id in column 0, scores in the rest), rebuilds the
    zero-padded score matrix that the index was built on, and loads the
    kgraph index from '<datafile>.index'. Mutates ``entry`` in place.
    """
    data = np.genfromtxt(datafile, delimiter=',')
    m, n = data.shape
    pids = data[:, 0]
    scores = data[:, 1:n]
    # kgraph requires the number of columns to be a multiple of 4, so the
    # scores are right-padded with zero columns.
    needed = 4 - (n - 1) % 4
    padded = np.zeros((m, n - 1 + needed))
    padded[:, :-needed] = scores
    index = kg.KGraph()
    index.load(datafile + ".index")
    entry.update(
        m=m,
        n=n,
        pids=pids,
        needed=needed,
        padded=padded,
        index=index,
    )
def load_bin():
    """Load the saved 'bin' dataset and its kgraph index.

    Counterpart of gen_bin(); mirrors load_float(). Returns (index, dataset).
    """
    name = 'bin'
    index = pykgraph.KGraph()
    # BUG FIX: the path was datasetPath+name+'npy' (missing the dot), which
    # can never match the '<name>.npy' file that np.save() writes in
    # gen_bin(); load_float() already uses '.npy'.
    dataset = np.load(datasetPath + name + '.npy')
    index.load(indexPath + name)
    return index, dataset
def load_float():
    """Load the saved 'float' dataset and its kgraph index.

    Counterpart of gen_float(). Returns (index, dataset).
    """
    name = 'float'
    dataset = np.load(datasetPath + name + '.npy')
    index = pykgraph.KGraph()
    index.load(indexPath + name)
    return index, dataset
USE_SKLEARN = True


def eval(gold, result):
    """Return the mean per-row recall of ``result`` against ``gold``.

    Both are (N, K) arrays of neighbour ids; the value is the fraction of
    gold neighbours recovered, in [0, 1].

    NOTE(review): shadows the builtin ``eval``.
    """
    assert gold.shape == result.shape
    N = gold.shape[0]
    K = gold.shape[1]
    total = 0
    for i in range(N):
        total += len(set(gold[i]).intersection(result[i]))
    return 1.0 * total / (N * K)


# Random dataset and queries; N, D, Q and TYPE are module-level settings
# defined elsewhere in the file.
dataset = random.rand(N, D).astype(TYPE)
query = random.rand(Q, D).astype(TYPE)
index = pykgraph.KGraph(dataset, "euclidean")
K = 10
#index.build(reverse=-1)
#index.save("index_file");
# load with index.load("index_file");
gold = None
if USE_SKLEARN:
    # Exact neighbours via brute force serve as the gold standard.
    print("Generating gold standard...")
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=K, algorithm='brute').fit(dataset)
    start = time.time()
    distances, gold = nbrs.kneighbors(query)
    print("Time:", time.time() - start)
# NOTE(review): source was flattened; this print may belong inside the
# if-branch above — confirm against the original layout.
print("Searching with brute force...")
def main():
    """Cross-camera image matching pipeline.

    Walks cam2_path (gallery) and cam1_path (query) for .jpg files, extracts
    features for both sets in 8 worker processes (extract_register /
    extract_query, defined elsewhere), builds a kgraph euclidean index over
    the gallery features, finds each query's nearest gallery image, and
    copies it into a per-identity candidate directory.

    NOTE(review): Python 2 code (mixed print statement / print() calls).
    Relies on module-level globals defined elsewhere: cam1_path, cam2_path,
    candidate_path, candidate_path_ids, multiprocessing_register,
    multiprocessing_match.
    """
    cam1_image_list = []
    cam2_image_list = []
    num = 0
    # --- collect gallery (cam2) image paths -------------------------------
    for dirpath, dirnames, filenames in os.walk(cam2_path):
        for f in filenames:
            # if num==1000:
            #     break
            #print f
            if '.jpg' in f:
                print(join(dirpath, f))
                cam2_image_list.append(join(dirpath, f))
                #num+=1
    print "cam2 images num:"
    print len(cam2_image_list)
    #save_obj(cam2_image_list,"gallery_list")
    # Split the gallery into 8 shards, the last one taking the remainder.
    average_register = len(cam2_image_list) // 8
    for i in range(8):
        if i==7:
            multiprocessing_register.append(cam2_image_list[i*average_register:])
        else:
            multiprocessing_register.append(cam2_image_list[i*average_register:(i+1)*average_register])
    del cam2_image_list
    num = 0
    # --- collect query (cam1) image paths ---------------------------------
    for dirpath, dirnames, filenames in os.walk(cam1_path):
        for f in filenames:
            # if num==200:
            #     break
            # print f
            if '.jpg' in f:
                cam1_image_list.append(join(dirpath, f))
                #num+=1
    print "cam1 images num:"
    print len(cam1_image_list)
    #save_obj(cam1_image_list, "query_list")
    # Create one candidate output directory per identity (parent dir name).
    # NOTE(review): `id` shadows the builtin of the same name.
    for i in range(len(cam1_image_list)):  # len(cam1_image_list)):
        #print(i)
        id = cam1_image_list[i].split('/')[-2]
        candidate_path_id = candidate_path + id
        # print("mkdir is ",candidate_path_id)
        if not os.path.exists(str(candidate_path_id)):
            os.system('mkdir ' + candidate_path_id)
        candidate_path_ids.append(candidate_path_id)
    # Split the queries into 8 shards as well.
    num_match = len(cam1_image_list) // 8
    #num_match = len(cam1_image_list) // image_batch
    print("batch size is:",num_match)
    for i in range(8):
        if i == 7:
            multiprocessing_match.append(cam1_image_list[i * num_match:])
        else:
            multiprocessing_match.append(cam1_image_list[i * num_match:(i + 1) * num_match])
    del cam1_image_list
    gc.collect()
    # for i in range(num_match):
    #     if i == num_match-1:
    #         multiprocessing_match.append(cam1_image_list[i * num_match:])
    #     else:
    #         multiprocessing_match.append(cam1_image_list[i * num_match:(i + 1) * num_match])
    # --- gallery feature extraction in 8 processes ------------------------
    # Manager().dict() is shared across the worker processes.
    register_feature = multiprocessing.Manager().dict()
    # # tic = timeit.default_timer()
    plist_register = []
    for count in range(8):
        #length=count*average_register
        # count % 4 presumably selects a GPU id — TODO confirm against
        # extract_register's signature.
        p = multiprocessing.Process(target=extract_register,args=(multiprocessing_register[count],count,register_feature,count%4))
        #,average_register,register_feature,count_register))
        p.start()
        plist_register.append(p)
    for p_register in plist_register:
        p_register.join()
    #register_feature=load_obj("register111")
    print("--------------------",len(register_feature))
    print("register feature is %d" % len(register_feature))
    # Snapshot the shared dict into a plain dict; keys are image paths.
    dict_register=dict(register_feature)
    cam2_image_list1=dict_register.keys()
    del register_feature
    gc.collect()
    # save_obj(dict_register,"gallery3")
    # save_obj(cam2_image_list1,"gallery_list3")
    gc.collect()
    toc = timeit.default_timer()
    # NOTE(review): `tic` appears to be used here before assignment — its
    # earlier `tic = timeit.default_timer()` line is commented out above,
    # which would raise NameError at runtime. Confirm and restore it.
    print('register time: %.2f' % ((toc - tic) * 1000))
    tic = timeit.default_timer()
    # --- query feature extraction in 8 processes --------------------------
    match_feature = multiprocessing.Manager().dict()
    plist_match = []
    for j in range(8):
        p = multiprocessing.Process(target=extract_query, args=(multiprocessing_match[j],j,match_feature,j%4))
        # ,match_feature)) #,average_match,count_match,register_out))
        p.start()
        plist_match.append(p)
    for p_match in plist_match:
        p_match.join()
    del plist_match
    # for j in range(0,num_match,4):
    #     print j
    #     plist_match = []
    #     process = num_match - j if (j + 4) > num_match else 4
    #     for i in range(process):
    #         p = multiprocessing.Process(target=extract_query, args=(multiprocessing_match[j+i],i,match_feature,i))  # ,match_feature)) #,average_match,count_match,register_out))
    #         p.start()
    #         plist_match.append(p)
    #     for p_match in plist_match:
    #         p_match.join()
    #     del plist_match
    toc = timeit.default_timer()
    print(len(match_feature))
    dict_match=dict(match_feature)
    cam1_image_list1=dict_match.keys()
    # save_obj(dict_match, "query3")
    # save_obj(cam1_image_list1, "query_list3")
    del match_feature
    gc.collect()
    #save_obj(dict_match,"query2")
    #exit(0)
    # match_feature=load_obj("query")
    # print(type(match_feature))
    # print(len(match_feature))
    #print(match_feature1)
    print('match time: %.2f' % ((toc - tic) * 1000))
    print("match feature is %d"%len(dict_match))
    # Feature matrices; in Python 2 dict.values()/keys() return lists, so
    # row order here matches cam2_image_list1 / cam1_image_list1 above.
    dataset =np.array(dict_register.values())
    del dict_register
    gc.collect()
    print("dataset is {},type is {}".format(len(dataset),type(dataset)))
    query =np.array(dict_match.values())
    del dict_match
    gc.collect()
    print("query is {},type is {}".format(len(query), type(query)))
    # --- kgraph nearest-neighbour search ----------------------------------
    index = pykgraph.KGraph(dataset, 'euclidean')  # another option is 'angular'
    del dataset
    gc.collect()
    index.build(reverse=-1,K=200,L=300,S=30)
    #
    #index.save("index_file.txt")
    tic = timeit.default_timer()
    knn = index.search(query,K=1)
    del query
    gc.collect()
    toc = timeit.default_timer()
    print('match time: %.2f' % ((toc - tic) * 1000))
    #print('match time: %.2f' % ((toc - tic) * 1000/len(query)))
    print(len(cam2_image_list1))
    print(cam1_image_list1[0])
    print(len(knn))
    print(len(knn[0]))
    # Copy each query's best gallery match into that query's identity dir.
    # NOTE(review): the loop variable `index` shadows the KGraph index above.
    for i,index in enumerate(knn):
        print(i,index[0])
        #print('cp ' + cam2_image_list[index[0]] + ' ' +candidate_path+cam1_image_list[i].split('/')[-2]+"/")
        os.system('cp ' + cam2_image_list1[index[0]] + ' ' +candidate_path+cam1_image_list1[i].split('/')[-2]+"/")
# Benchmark fragment: build a scikit-learn NN index, then optionally a
# kgraph index, and evaluate both against the gensim baseline.
# NOTE(review): the `global dataset` statement implies this code lives
# inside a function whose header is outside this view; helpers such as
# log_precision / sklearn_1by1 / kgraph_at_once are defined elsewhere.
index_sklearn = NearestNeighbors(n_neighbors=TOP_N, algorithm='auto').fit(clipped)
logger.info("built sklearn index %s" % index_sklearn._fit_method)
gensim.utils.pickle(index_sklearn, sim_prefix + '_sklearn')
# 32GB RAM not enough to store the scikit-learn model...
logger.info("finished sklearn index")
log_precision(sklearn_predictions, index_sklearn, queries, index_gensim)
sklearn_1by1(index_sklearn, queries)
sklearn_at_once(index_sklearn, queries)

if 'kgraph' in program:
    import pykgraph
    index_kgraph = pykgraph.KGraph()
    # Reuse a previously saved kgraph index when one exists on disk.
    if os.path.exists(sim_prefix + "_kgraph"):
        logger.info("loading kgraph index")
        index_kgraph.load(sim_prefix + "_kgraph")
    else:
        logger.info("building kgraph index")
        index_kgraph.build(clipped)
        logger.info("built kgraph index")
        index_kgraph.save(sim_prefix + "_kgraph")
    logger.info("finished kgraph index")
    # kgraph helpers read the dataset through a module-level global.
    global dataset
    dataset = clipped
    log_precision(kgraph_predictions, index_kgraph, queries, index_gensim)
    kgraph_1by1(index_kgraph, queries, clipped)
    kgraph_at_once(index_kgraph, queries, clipped)
USE_SKLEARN = True


def eval(gold, result):
    """Return the mean per-row recall of ``result`` against ``gold``.

    Both are (N, K) arrays of neighbour ids; the value is the fraction of
    gold neighbours recovered, in [0, 1].

    NOTE(review): shadows the builtin ``eval``. Python 2 code below.
    """
    assert gold.shape == result.shape
    N = gold.shape[0]
    K = gold.shape[1]
    total = 0
    for i in range(N):
        total += len(set(gold[i]).intersection(result[i]))
    return 1.0 * total / (N * K)


# Random dataset and queries; N, D, Q and TYPE are module-level settings
# defined elsewhere in the file.
dataset = random.rand(N, D).astype(TYPE)
query = random.rand(Q, D).astype(TYPE)
index = pykgraph.KGraph()
K = 10
#index.build(dataset)
#index.save("index_file");
# load with index.load("index_file");
gold = None
if USE_SKLEARN:
    # Exact neighbours via brute force serve as the gold standard.
    print "Generating gold standard..."
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=K, algorithm='brute').fit(dataset)
    start = time.time()
    distances, gold = nbrs.kneighbors(query)
    print "Time:", time.time() - start
# NOTE(review): source was flattened; this print may belong inside the
# if-branch above — confirm against the original layout.
print "Searching with brute force..."
from numpy import random
import pykgraph
import numpy as np

# Smoke test: index 100 random 16-d points, then query the index with ten
# copies of row 0 and print the 1-NN result.
dataset = random.rand(100, 16)
print(type(dataset))
row_ids = [0] * 10
query = np.array([dataset[i] for i in row_ids])
index = pykgraph.KGraph(dataset, 'euclidean')  # another option is 'angular'
index.build(reverse=-1)
#
# index.save("index_file")  # reload later with index.load("index_file")
knn = index.search(query, K=1)
print(type(knn))
print(knn)