Example #1
0
File: test.py Project: caomw/kgraph
def bin_test(n_q=query_size,n_dts=dataset_size,K=10,L=128,S=30,prune=0):
    """Benchmark KGraph on random binary data.

    Builds an index over a random uint8 dataset, saves it to measure its
    on-disk size, searches it with a generated query set, and returns the
    parameters plus timing/size statistics.

    Args:
        n_q: number of query vectors. (Was `[query_size]`, a one-element
            list; that broke `searchTime/n_q` below with a TypeError.)
        n_dts: number of dataset vectors.
        K, L, S, prune: KGraph build parameters; also echoed in the
            returned request dict.

    Returns:
        {'request': <parameters>, 'response': <timings and sizes>}
    """
    name='bin'
    d = np.random.randint(256,size=(n_dts,bin_bytes)).astype(np.uint8)
    i = pykgraph.KGraph()
    start= time.time()
    # The timed region was empty: the index was never built, so buildTime
    # always measured ~0 and 'tempIndex' held an unconstructed index.
    # NOTE(review): prune is forwarded too — confirm against the installed
    # pykgraph build() signature.
    i.build(d, K=K, L=L, S=S, prune=prune)
    end= time.time()
    buildTime=end-start
    i.save('tempIndex')
    stat = os.stat('tempIndex')
    indexSize=stat.st_size
    q=gen_query_bin(n_q)
    start= time.time()
    i.search(d,q,K=K)
    end= time.time()
    searchTime=end-start
    req={}
    req['querySize']=n_q
    req['datasetSize']=n_dts
    req['K']=K
    req['L']=L
    req['S']=S
    req['prune']=prune
    res={}
    res['buildTime']=buildTime
    res['unitBuildTime']=buildTime/n_dts
    res['searchTime']=searchTime
    res['unitSearchTime']=searchTime/n_q
    res['indexSize']=indexSize
    res['unitIndexSize']=indexSize/n_dts
    return {'request': req , 'response': res}
Example #2
0
def main(argv):
    # A csv filename
    datafile = ''
    try:
        opts, args = getopt.getopt(argv, "hf:", ["datafile="])
    except getopt.GetoptError:
        print 'index.py -f <datafile>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'index.py -f <datafile>'
            sys.exit()
        elif opt in ("-f", "--datafile"):
            datafile = arg

    data = np.genfromtxt(datafile, delimiter=',')
    (m, n) = data.shape

    pids = data[:, 0]
    scores = data[:, 1:n]

    # kgraph requires the number of columns to be multiple of 4
    needed = 4 - (n - 1) % 4
    padded = np.zeros((m, n - 1 + needed))
    padded[:, :-needed] = scores

    index = kg.KGraph()
    index.build(padded)
    index.save(datafile + '.index')
Example #3
0
 def build(self, ntrees):
     """Build a euclidean KGraph index over self.dataset.

     The data is converted to float32 first: kgraph supports only
     float32/float64 and is not optimized for float64.
     """
     as_float32 = np.array(self.dataset, dtype=np.float32)
     self.index = pykgraph.KGraph(as_float32, 'euclidean')
     self.index.build(reverse=-1, K=self.config.k)
     self.ntrees = ntrees
Example #4
0
def construct_graph_kgraph(data, k):
    """Build a sparse kNN adjacency matrix (CSR) for `data` via kgraph.

    Row i receives ones at the columns of its k nearest neighbours; the
    query point itself (returned in column 0 of the search result) is
    dropped.
    """
    import pykgraph

    n = len(data)
    index = pykgraph.KGraph(data, 'euclidean')
    index.build(reverse=0, K=2 * k + 1, L=2 * k + 50)
    neighbours = index.search(data, K=k + 1)[:, 1:]
    adjacency = lil_matrix((n, n))
    row_ids = np.repeat(np.arange(n), k, 0)
    adjacency[row_ids, neighbours.ravel()] = 1
    return adjacency.tocsr()
 def fit(self, X):
     """L2-normalise the rows of X, then build a kgraph index on them."""
     X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
     self._kgraph = pykgraph.KGraph()
     build_params = dict(iterations=30, L=100, delta=0.002, recall=0.99, K=25)
     self._kgraph.build(X, **build_params)
     # Retained for later use; the original author's '???' suggests it may
     # be unused — confirm against the rest of the class.
     self._X = X
Example #6
0
 def fit(self, X):
     """Cast X to float32 (kgraph requirement) and build the index."""
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)
     self._kgraph = pykgraph.KGraph(X, self._metric)
     # L must always be > count
     self._kgraph.build(reverse=0, K=self._count,
                        L=self._count + self._L,
                        recall=self._recall)
Example #7
0
File: test.py Project: caomw/kgraph
def gen_bin(n=dataset_size):
    name='bin'
    dataset = np.random.randint(256,size=(n,bin_bytes)).astype(np.uint8)
    print 'dataset {0} generato'.format(name)
    np.save(datasetPath+name,dataset)
    print 'dataset {0} salvato'.format(name)
    index = pykgraph.KGraph()
    index.build(dataset)
    index.save(indexPath+name)
    print 'indice {0} salvato'.format(name)
    return index,dataset
Example #8
0
File: test.py Project: caomw/kgraph
def gen_float(n=dataset_size,K=10,L=128,S=30,prune=0):
    name='float'
    dataset = np.random.rand(n, sift_bins)
    print 'dataset {0} generato'.format(name)
    np.save(datasetPath+name,dataset)
    print 'dataset {0} salvato'.format(name)
    index = pykgraph.KGraph()
    index.build(dataset)
    index.save(indexPath+name)
    print 'indice {0} salvato'.format(name)
    return index,dataset
Example #9
0
 def fit(self, X):
     """Build the kgraph index for X, reusing an on-disk cached index for
     this metric when one exists under INDEX_DIR."""
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)
     self._kgraph = pykgraph.KGraph(X, self._metric)
     path = os.path.join(INDEX_DIR, 'kgraph-index-%s' % self._metric)
     if not os.path.exists(path):
         # e.g. iterations=30, L=100, delta=0.002, recall=0.99, K=25
         self._kgraph.build(**self._index_params)
         if not os.path.exists(INDEX_DIR):
             os.makedirs(INDEX_DIR)
         self._kgraph.save(path)
     else:
         self._kgraph.load(path)
Example #10
0
 def fit(self, X):
     """Build (or load a cached) kgraph index for X under self._metric.

     X is cast to float32, which kgraph requires.
     """
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)
     # (Angular inputs were at one point L2-normalised here; left disabled.)
     self._kgraph = pykgraph.KGraph(X, self._metric)
     path = os.path.join(INDEX_DIR, 'kgraph-index-%s' % self._metric)
     if os.path.exists(path):
         self._kgraph.load(path)
         return
     # e.g. iterations=30, L=100, delta=0.002, recall=0.99, K=25
     self._kgraph.build(**self._index_params)
     if not os.path.exists(INDEX_DIR):
         os.makedirs(INDEX_DIR)
     self._kgraph.save(path)
Example #11
0
def load(entry, datafile):
    """Populate `entry` with the CSV contents and the prebuilt kgraph
    index saved as '<datafile>.index'.

    Keys written: m, n, pids, needed, padded, index.
    """
    data = np.genfromtxt(datafile, delimiter=',')
    m, n = data.shape

    pids = data[:, 0]
    scores = data[:, 1:n]

    # kgraph requires the column count to be a multiple of 4.
    # NOTE(review): `needed` is always 1..4, so an already-aligned width
    # still gains 4 extra zero columns; this must stay in sync with the
    # formula used when the index was built.
    needed = 4 - (n - 1) % 4
    padded = np.zeros((m, n - 1 + needed))
    padded[:, :n - 1] = scores

    index = kg.KGraph()
    index.load(datafile + ".index")

    entry.update(m=m, n=n, pids=pids, needed=needed,
                 padded=padded, index=index)
Example #12
0
File: test.py Project: caomw/kgraph
def load_bin():
    """Load the persisted binary dataset and its saved KGraph index.

    Returns:
        (index, dataset) — the KGraph index restored from indexPath and
        the uint8 dataset restored from datasetPath.
    """
    name='bin'
    index = pykgraph.KGraph()
    # Fix: np.save appends '.npy' to its target, so the dataset lives at
    # '<datasetPath>bin.npy'; the load path was missing the dot ('binnpy')
    # and could never find the file (compare load_float).
    dataset = np.load(datasetPath+name+'.npy')
    index.load(indexPath+name)
    return index,dataset
Example #13
0
File: test.py Project: caomw/kgraph
def load_float():
    """Load the persisted float dataset and its saved KGraph index.

    Returns the (index, dataset) pair.
    """
    name = 'float'
    graph = pykgraph.KGraph()
    graph.load(indexPath + name)
    data = np.load(datasetPath + name + '.npy')
    return graph, data
Example #14
0
USE_SKLEARN = True


def eval(gold, result):
    """Return the fraction of gold neighbours recovered in `result`.

    Both arrays must share the same (N, K) shape; the score is the total
    per-row set overlap divided by N*K (1.0 means perfect recall).
    """
    assert gold.shape == result.shape
    rows, cols = gold.shape
    hits = sum(len(set(g).intersection(r)) for g, r in zip(gold, result))
    return 1.0 * hits / (rows * cols)


# Build random dataset/query matrices and a KGraph index over the dataset.
# NOTE(review): N, D, Q, TYPE, pykgraph and time come from earlier in the
# file (not visible in this chunk) — confirm they are defined before this
# runs.
dataset = random.rand(N, D).astype(TYPE)
query = random.rand(Q, D).astype(TYPE)
index = pykgraph.KGraph(dataset, "euclidean")
K = 10
#index.build(reverse=-1)
#index.save("index_file");
# load with index.load("index_file");

gold = None
if USE_SKLEARN:
    print("Generating gold standard...")
    from sklearn.neighbors import NearestNeighbors
    # Exact brute-force neighbours serve as the gold standard against
    # which the approximate index can be scored.
    nbrs = NearestNeighbors(n_neighbors=K, algorithm='brute').fit(dataset)
    start = time.time()
    distances, gold = nbrs.kneighbors(query)
    print("Time:", time.time() - start)

print("Searching with brute force...")
def main():
    """Cross-camera image matching pipeline.

    Walks two image directories (cam2 = gallery/register set, cam1 =
    query set), extracts features for each in 8 worker processes, builds
    a kgraph euclidean index over the gallery features, finds each
    query's nearest gallery neighbour (K=1), and copies the matched
    gallery image into a per-query-id candidate directory.

    NOTE(review): relies on module-level names not visible in this chunk
    (cam1_path, cam2_path, candidate_path, candidate_path_ids,
    multiprocessing_register, multiprocessing_match, extract_register,
    extract_query, join, load_obj/save_obj) — confirm they are defined.
    Mixes Python 2 print statements with print() calls, so this only
    runs under Python 2.
    """
    cam1_image_list = []
    cam2_image_list = []
    num = 0
    # Collect every .jpg under cam2_path (the gallery / register set).
    for dirpath, dirnames, filenames in os.walk(cam2_path):
        for f in filenames:
            # if num==1000:
            #     break
            #print f
            if '.jpg' in f:
                print(join(dirpath, f))
                cam2_image_list.append(join(dirpath, f))
                #num+=1
    print "cam2 images num:"
    print len(cam2_image_list)
    #save_obj(cam2_image_list,"gallery_list")
    # Split the gallery list into 8 roughly equal chunks, one per worker
    # process; the last chunk absorbs the remainder.
    average_register = len(cam2_image_list) // 8

    for i in range(8):
        if i==7:
            multiprocessing_register.append(cam2_image_list[i*average_register:])
        else:
            multiprocessing_register.append(cam2_image_list[i*average_register:(i+1)*average_register])
    del cam2_image_list
    num = 0
    # Collect every .jpg under cam1_path (the query set).
    for dirpath, dirnames, filenames in os.walk(cam1_path):
        for f in filenames:
            # if num==200:
            #     break
            # print f
            if '.jpg' in f:
                cam1_image_list.append(join(dirpath, f))
                #num+=1
    print "cam1 images num:"
    print len(cam1_image_list)
    #save_obj(cam1_image_list, "query_list")
    # Create one candidate output directory per query id (the id is the
    # parent directory name of each query image path).
    for i in range(len(cam1_image_list)):  # len(cam1_image_list)):
        #print(i)
        id = cam1_image_list[i].split('/')[-2]
        candidate_path_id = candidate_path + id
        # print("mkdir is ",candidate_path_id)
        if not os.path.exists(str(candidate_path_id)):
            os.system('mkdir ' + candidate_path_id)
        candidate_path_ids.append(candidate_path_id)
    # Split the query list into 8 chunks, mirroring the gallery split.
    num_match = len(cam1_image_list) // 8
    #num_match = len(cam1_image_list) // image_batch


    print("batch size is:",num_match)
    for i in range(8):
        if i == 7:
            multiprocessing_match.append(cam1_image_list[i * num_match:])
        else:
            multiprocessing_match.append(cam1_image_list[i * num_match:(i + 1) * num_match])
    del cam1_image_list
    gc.collect()
    # for i in range(num_match):
    #     if i == num_match-1:
    #         multiprocessing_match.append(cam1_image_list[i * num_match:])
    #     else:
    #         multiprocessing_match.append(cam1_image_list[i * num_match:(i + 1) * num_match])
    # Shared dict: worker processes write gallery features keyed by image
    # path. NOTE(review): the 4th worker arg count%4 presumably selects a
    # device — confirm in extract_register.
    register_feature = multiprocessing.Manager().dict()
    #
    #
    tic = timeit.default_timer()
    plist_register = []
    for count in range(8):
        #length=count*average_register
        p = multiprocessing.Process(target=extract_register,args=(multiprocessing_register[count],count,register_feature,count%4)) #,average_register,register_feature,count_register))
        p.start()
        plist_register.append(p)

    for p_register in plist_register:
        p_register.join()
    #register_feature=load_obj("register111")
    print("--------------------",len(register_feature))
    print("register feature is %d" % len(register_feature))
    # Snapshot the proxied Manager dict into a plain dict, then drop the
    # proxy to release the Manager's resources.
    dict_register=dict(register_feature)
    cam2_image_list1=dict_register.keys()
    del register_feature
    gc.collect()
    # save_obj(dict_register,"gallery3")
    # save_obj(cam2_image_list1,"gallery_list3")
    gc.collect()
    toc = timeit.default_timer()
    print('register time: %.2f' % ((toc - tic) * 1000))
    tic = timeit.default_timer()
    # Same fan-out for the query features.
    match_feature = multiprocessing.Manager().dict()
    plist_match = []
    for j in range(8):
        p = multiprocessing.Process(target=extract_query, args=(multiprocessing_match[j],j,match_feature,j%4))  # ,match_feature)) #,average_match,count_match,register_out))
        p.start()
        plist_match.append(p)

    for p_match in plist_match:
        p_match.join()
    del plist_match
    # for j in range(0,num_match,4):
    #     print j
    #     plist_match = []
    #     process = num_match - j if (j + 4) > num_match else 4
    #     for i in range(process):
    #         p = multiprocessing.Process(target=extract_query, args=(multiprocessing_match[j+i],i,match_feature,i))  # ,match_feature)) #,average_match,count_match,register_out))
    #         p.start()
    #         plist_match.append(p)
    #
    #     for p_match in plist_match:
    #         p_match.join()
    #     del plist_match
    toc = timeit.default_timer()
    print(len(match_feature))
    dict_match=dict(match_feature)
    cam1_image_list1=dict_match.keys()
    # save_obj(dict_match, "query3")
    # save_obj(cam1_image_list1, "query_list3")
    del match_feature
    gc.collect()
    #save_obj(dict_match,"query2")
    #exit(0)
    # match_feature=load_obj("query")
    # print(type(match_feature))
    # print(len(match_feature))
    #print(match_feature1)

    print('match time: %.2f' % ((toc - tic) * 1000))
    print("match feature is %d"%len(dict_match))
    # Materialise the feature matrices. NOTE(review): .keys()/.values()
    # iteration order must match for the path lookup at the bottom to be
    # valid — true in Python 2 as long as the dicts are not mutated.
    dataset =np.array(dict_register.values())
    del dict_register
    gc.collect()
    print("dataset is {},type is {}".format(len(dataset),type(dataset)))
    query =np.array(dict_match.values())
    del dict_match
    gc.collect()
    print("query is {},type is {}".format(len(query), type(query)))

    # Approximate nearest-neighbour match: gallery index, one neighbour
    # per query.
    index = pykgraph.KGraph(dataset, 'euclidean')  # another option is 'angular'
    del dataset
    gc.collect()
    index.build(reverse=-1,K=200,L=300,S=30)  #
    #index.save("index_file.txt")
    tic = timeit.default_timer()
    knn = index.search(query,K=1)
    del query
    gc.collect()
    toc = timeit.default_timer()
    print('match time: %.2f' % ((toc - tic) * 1000))
    #print('match time: %.2f' % ((toc - tic) * 1000/len(query)))
    print(len(cam2_image_list1))
    print(cam1_image_list1[0])
    print(len(knn))
    print(len(knn[0]))
    # Copy each query's best gallery match into that query id's candidate
    # directory. (The loop variable shadows the kgraph `index` above,
    # which is not used afterwards.)
    for i,index in enumerate(knn):
        print(i,index[0])
        #print('cp ' + cam2_image_list[index[0]] + ' ' +candidate_path+cam1_image_list[i].split('/')[-2]+"/")
        os.system('cp ' + cam2_image_list1[index[0]] + ' ' +candidate_path+cam1_image_list1[i].split('/')[-2]+"/")
Example #16
0
            index_sklearn = NearestNeighbors(n_neighbors=TOP_N,
                                             algorithm='auto').fit(clipped)
            logger.info("built sklearn index %s" % index_sklearn._fit_method)
            gensim.utils.pickle(
                index_sklearn, sim_prefix + '_sklearn'
            )  # 32GB RAM not enough to store the scikit-learn model...
        logger.info("finished sklearn index")

        log_precision(sklearn_predictions, index_sklearn, queries,
                      index_gensim)
        sklearn_1by1(index_sklearn, queries)
        sklearn_at_once(index_sklearn, queries)

    if 'kgraph' in program:
        import pykgraph
        index_kgraph = pykgraph.KGraph()
        if os.path.exists(sim_prefix + "_kgraph"):
            logger.info("loading kgraph index")
            index_kgraph.load(sim_prefix + "_kgraph")
        else:
            logger.info("building kgraph index")
            index_kgraph.build(clipped)
            logger.info("built kgraph index")
            index_kgraph.save(sim_prefix + "_kgraph")
        logger.info("finished kgraph index")

        global dataset
        dataset = clipped
        log_precision(kgraph_predictions, index_kgraph, queries, index_gensim)
        kgraph_1by1(index_kgraph, queries, clipped)
        kgraph_at_once(index_kgraph, queries, clipped)
Example #17
0
USE_SKLEARN = True


def eval(gold, result):
    """Return the fraction of gold neighbours recovered in `result`.

    Both arrays must share the same (N, K) shape; the score is the total
    per-row set overlap divided by N*K (1.0 means perfect recall).
    """
    assert gold.shape == result.shape
    rows, cols = gold.shape
    hits = sum(len(set(g).intersection(r)) for g, r in zip(gold, result))
    return 1.0 * hits / (rows * cols)


# Build random dataset/query matrices and an (empty) KGraph index.
# NOTE(review): N, D, Q, TYPE, pykgraph and time come from earlier in the
# file (not visible in this chunk) — confirm they are defined before this
# runs. Python 2 print statements: py2-only.
dataset = random.rand(N, D).astype(TYPE)
query = random.rand(Q, D).astype(TYPE)
index = pykgraph.KGraph()
K = 10
#index.build(dataset)
#index.save("index_file");
# load with index.load("index_file");

gold = None
if USE_SKLEARN:
    print "Generating gold standard..."
    from sklearn.neighbors import NearestNeighbors
    # Exact brute-force neighbours serve as the gold standard against
    # which the approximate index can be scored.
    nbrs = NearestNeighbors(n_neighbors=K, algorithm='brute').fit(dataset)
    start = time.time()
    distances, gold = nbrs.kneighbors(query)
    print "Time:", time.time() - start

print "Searching with brute force..."
Example #18
0
from numpy import random
import pykgraph
import numpy as np
# Demo: index 100 random 16-d points with kgraph, then query with ten
# copies of row 0 and print the K=1 search result.
dataset = random.rand(100, 16)
print(type(dataset))
row_ids = [0] * 10
query = np.array([dataset[row] for row in row_ids])
index = pykgraph.KGraph(dataset, 'euclidean')  # another option is 'angular'
index.build(reverse=-1)
#index.save("index_file")
# load with index.load("index_file");

knn = index.search(query, K=1)
print(type(knn))
print(knn)