Example 1
 def fit(self, X):
     print('PANNG: start indexing...')
     t0 = time.time()
     dim = len(X[0])
     print('PANNG: # of data=' + str(len(X)))
     print('PANNG: Dimensionality=' + str(dim))
     index_dir = 'indexes'
     if not os.path.exists(index_dir):
         os.makedirs(index_dir)
     index = os.path.join(index_dir, 'NGT-' + str(self._edge_size))
     print(index)
     if os.path.exists(index):
         print('PANNG: index already exists! ' + str(index))
         self.index = ngtpy.Index(index)
         opentime = time.time() - t0
         print('PANNG: open time(sec)=' + str(opentime))
     else:
         ngtpy.create(path=index,
                      dimension=dim,
                      edge_size_for_creation=self._edge_size,
                      distance_type=self._metric,
                      object_type=self._object_type)
         idx = ngtpy.Index(path=index)
         idx.batch_insert(X, num_threads=24, debug=False)
         idx.save(index)
         idx.close()
         if self._pathadj_size > 0:
             print('PANNG: path adjustment')
             args = ['ngt', 'prune', '-s ' + str(self._pathadj_size), index]
             subprocess.call(args)
         self.index = ngtpy.Index(path=index)
         indexingtime = time.time() - t0
         print('PANNG: indexing, adjustment and saving time(sec)=' +
               str(indexingtime))
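The fit method above only builds or opens the PANNG index; the search side of the wrapper is not shown. A minimal sketch of what a query step could look like, using only the ngtpy.Index.search call that appears in the later examples (the method name, the n argument and the self._epsilon attribute are assumptions, not part of the original code):

 def query(self, v, n):
     # Hypothetical query helper: return the ids of the n nearest
     # neighbours of v from the index opened in fit() above.
     results = self.index.search(v, size=n, epsilon=self._epsilon)
     return [ngt_id for ngt_id, dist in results]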
Example 2
 def fit(self, X):
     print('ONNG: start indexing...')
     dim = len(X[0])
     print('ONNG: # of data=' + str(len(X)))
     print('ONNG: dimensionality=' + str(dim))
     index_dir = 'indexes'
     if not os.path.exists(index_dir):
         os.makedirs(index_dir)
     index = os.path.join(
         index_dir,
         'ONNG-{}-{}-{}'.format(self._edge_size, self._outdegree,
                                self._indegree))
     anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size))
     print('ONNG: index=' + index)
     if (not os.path.exists(index)) and (not os.path.exists(anngIndex)):
         print('ONNG: create ANNG')
         t = time.time()
         args = ['ngt', 'create', '-it', '-p8', '-b500', '-ga', '-of',
                 '-D' + self._metric, '-d' + str(dim),
                 '-E' + str(self._edge_size),
                 '-S' + str(self._edge_size_for_search),
                 '-e' + str(self._epsilon), '-P0', '-B30',
                 '-T' + str(self._build_time_limit), anngIndex]
         subprocess.call(args)
         idx = ngtpy.Index(path=anngIndex)
         idx.batch_insert(X, num_threads=24, debug=False)
         print('ONNG: ANNG construction time(sec)=' + str(time.time() - t))
         t = time.time()
         if self._refine_enabled:
             idx.refine_anng(epsilon=self._epsilon, num_of_edges=self._edge_size,
                             num_of_explored_edges=self._edge_size_for_search)
         print('ONNG: RNNG construction time(sec)=' + str(time.time() - t))
         idx.save()
         idx.close()
     if not os.path.exists(index):
         print('ONNG: degree adjustment')
         t = time.time()
         args = ['ngt', 'reconstruct-graph', '-mS',
                 '-o ' + str(self._outdegree),
                 '-i ' + str(self._indegree), anngIndex, index]
         subprocess.call(args)
         print('ONNG: degree adjustment time(sec)=' + str(time.time() - t))
     if os.path.exists(index):
         print('ONNG: index already exists! ' + str(index))
         t = time.time()
         print(self._tree_disabled)
         self.index = ngtpy.Index(index, read_only=True, tree_disabled=self._tree_disabled)
         self.indexName = index
         print('ONNG: open time(sec)=' + str(time.time() - t))
     else:
         print('ONNG: something wrong.')
     print('ONNG: end of fit')
Example 3
def create_ANN(e_id, pos_data, neg_data, is_extend):
    if is_extend:
        ngtpy.create(path=str(e_id) + '_extend.anng', dimension=768, distance_type="L2")
        index = ngtpy.Index(str(e_id) + '_extend.anng')
    else:
        ngtpy.create(path=str(e_id) + '.anng', dimension=768, distance_type="L2")
        index = ngtpy.Index(str(e_id) + '.anng')
    nX1 = np.array(list(pos_data['vec']))
    nX2 = np.array(list(neg_data['vec']))
    objects = np.concatenate((nX1, nX2))
    index.batch_insert(objects)
    index.build_index()
    y = np.concatenate((np.ones(len(nX1), dtype=int), np.zeros(len(nX2), dtype=int)))
    return index, y
Example 4
def search_NN_ngt(test_emb, train_emb_flat, NN=1):
    import ngtpy

    Ntest, I, J, D = test_emb.shape
    closest_inds = np.empty((Ntest, I, J, NN), dtype=np.int32)
    l2_maps = np.empty((Ntest, I, J, NN), dtype=np.float32)

    # os.makedirs('tmp', exist_ok=True)
    dpath = f'/tmp/{os.getpid()}'
    ngtpy.create(dpath, D)
    index = ngtpy.Index(dpath)
    index.batch_insert(train_emb_flat)

    for n in range(Ntest):
        for i in range(I):
            for j in range(J):
                query = test_emb[n, i, j, :]
                results = index.search(query, NN)
                inds = [result[0] for result in results]

                closest_inds[n, i, j, :] = inds
                vecs = np.asarray(
                    [index.get_object(inds[nn]) for nn in range(NN)])
                dists = np.linalg.norm(query - vecs, axis=-1)
                l2_maps[n, i, j, :] = dists
    shutil.rmtree(dpath)

    return l2_maps, closest_inds
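A quick, self-contained way to exercise search_NN_ngt on synthetic data; the shapes below are purely illustrative, and the function additionally relies on numpy, os and shutil being imported at module level:

import os
import shutil

import numpy as np

# 2 test images, an 8x8 grid of 64-d patch embeddings per image,
# and 500 flattened training embeddings to search against.
test_emb = np.random.rand(2, 8, 8, 64).astype(np.float32)
train_emb_flat = np.random.rand(500, 64).astype(np.float32)

l2_maps, closest_inds = search_NN_ngt(test_emb, train_emb_flat, NN=3)
print(l2_maps.shape, closest_inds.shape)  # (2, 8, 8, 3) (2, 8, 8, 3)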
Example 5
    def ngt(self, train_data, train_labels, test_data, test_labels):
        """
    Run NGT. Compute training and testing times

    [1] https://github.com/yahoojapan/NGT
    """
        train_time = 0
        time0 = time.time()

        ngtpy.create(b"data",
                     len(train_data[0]),
                     distance_type=self.ngt_distance)
        model = ngtpy.Index(b"data")
        model.batch_insert(train_data)
        time1 = time.time()
        model.save()
        predicted_labels = self.ngt_predict(model, test_data, train_labels)

        if self.test_time_ms is None:
            time2 = time.time()
            self.ngt_predict(model, train_data[0:self.test_time_samples],
                             train_labels)
            time3 = time.time()
            self.test_time_ms = 1000.0 * (time3 -
                                          time2) / (self.test_time_samples)

        train_time = time1 - time0
        return predicted_labels, train_time
Example 6
def test_simple_ngt(tmpdir):
    import ngtpy
    path = os.path.join(tmpdir, 'ngt-index')
    dimension, queries, top_k, batch_size, num_batch = 10, 3, 5, 8, 3

    ngtpy.create(path=path, dimension=dimension, distance_type='L2')
    _index = ngtpy.Index(path=path)
    for i in range(num_batch):
        _index.batch_insert(np.random.random((batch_size, dimension)),
                            num_threads=4)
    assert os.path.exists(path)

    idx = []
    dist = []
    for key in np.random.random((queries, dimension)):
        results = _index.search(key, size=top_k, epsilon=0.1)
        index_k = []
        distance_k = []
        for result in results:
            index_k.append(result[0])
            distance_k.append(result[1])
        idx.append(index_k)
        dist.append(distance_k)

    idx = np.array(idx)
    dist = np.array(dist)

    assert idx.shape == dist.shape
    assert idx.shape == (queries, top_k)
Example 7
    def __init__(self, db, f_class=None, d_type='L1'):
        self.NGT_dir = 'NGT_{}_{}'.format(f_class,d_type)
        self.NGT_path = b''
        self.feature = f_class
        self.SQLdb = SQLite()

        if f_class == 'daisy':
            self.f_c = Daisy()
            self.NGT_path = b'NGT/NGT_daisy_'+d_type.encode()
        elif f_class == 'edge':
            self.f_c = Edge()
            self.NGT_path = b'NGT/NGT_edge_'+d_type.encode()
        elif f_class == 'hog':
            self.f_c = HOG()
            self.NGT_path = b'NGT/NGT_hog_'+d_type.encode()
        elif f_class == 'vgg':
            self.f_c = VGGNetFeat()
            self.NGT_path = b'NGT/NGT_vgg_'+d_type.encode()
        elif f_class == 'res':
            self.f_c = ResNetFeat()
            self.NGT_path = b'NGT/NGT_res_'+d_type.encode()
        if not os.path.exists(os.path.join(NGT_dir,self.NGT_dir)):
                samples = self.f_c.make_samples(db, verbose=False)
                dim = 0
                try: 
                    dim = samples[0]['hist'].shape[0]
                except:
                    pass
                images= []
                objects = []
                for i, row in enumerate(samples):
                    vector  = row['hist']
                    link    = row['img']
                    lable   = row['cls']
                    data = {'index':i,'link':link,'lable':lable}
                    images.append(data)
                    objects.append(vector)
                self.SQLdb.updateMuti(f_class,images)

                # cPickle.dump(images, open(os.path.join(NGT_dir, sample_cache), "wb", True))
                ngtpy.create(path=self.NGT_path, dimension=dim, distance_type=d_type)
                self.index = ngtpy.Index(self.NGT_path)
                self.index.batch_insert(objects)
                self.index.save()

        self.index = ngtpy.Index(self.NGT_path)
Example 8
 def __init__(self, dims, edge_size_for_search=40, epsilon=0.1):
   self.dims = dims
   self.vectors_map = {}
   index_path = 'livemint.anng'
   ngtpy.create(index_path, dims, edge_size_for_search=edge_size_for_search)  # create an empty index
   self.index = ngtpy.Index(index_path)  # open the index
   self.content_id_to_ngt_id = {}
   self.epsilon = epsilon
Example 9
 def build_advanced_index(self, vecs: 'np.ndarray'):
     import ngtpy
     ngtpy.create(path=self.index_path,
                  dimension=self.num_dim,
                  distance_type=self.metric)
     _index = ngtpy.Index(self.index_path)
     _index.batch_insert(vecs, num_threads=1)
     return _index
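The handle returned by build_advanced_index can be queried directly with ngtpy's search API. A small sketch, assuming `indexer` is an instance of the surrounding class with index_path, num_dim and metric already set (both the instance and the data are illustrative):

import numpy as np

vecs = np.random.rand(1000, 128).astype(np.float32)  # illustrative vectors
_index = indexer.build_advanced_index(vecs)           # 'indexer' is an assumed instance
neighbours = _index.search(vecs[0], size=10)          # list of (id, distance) pairs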
Example 10
 def fit(self, X):
     print('QG: start indexing...')
     dim = len(X[0])
     print('QG: # of data=' + str(len(X)))
     print('QG: dimensionality=' + str(dim))
     index_dir = 'indexes'
     if not os.path.exists(index_dir):
         os.makedirs(index_dir)
     index = os.path.join(
         index_dir, 'ONNG-{}-{}-{}'.format(self._edge_size, self._outdegree,
                                           self._indegree))
     anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size))
     print('QG: index=' + index)
     if (not os.path.exists(index)) and (not os.path.exists(anngIndex)):
         print('QG: create ANNG')
         t = time.time()
         args = [
             'ngt', 'create', '-it', '-p8', '-b500', '-ga', '-of',
             '-D' + self._metric, '-d' + str(dim),
             '-E' + str(self._edge_size), '-S40', '-e' + str(self._epsilon),
             '-P0', '-B30', '-T' + str(self._build_time_limit), anngIndex
         ]
         subprocess.call(args)
         idx = ngtpy.Index(path=anngIndex)
         idx.batch_insert(X, num_threads=24, debug=False)
         idx.save()
         idx.close()
         print('QG: ANNG construction time(sec)=' + str(time.time() - t))
     if not os.path.exists(index):
         print('QG: degree adjustment')
         t = time.time()
         args = [
             'ngt', 'reconstruct-graph', '-mS',
             '-E ' + str(self._outdegree), '-o ' + str(self._outdegree),
             '-i ' + str(self._indegree), anngIndex, index
         ]
         subprocess.call(args)
         print('QG: degree adjustment time(sec)=' + str(time.time() - t))
     if not os.path.exists(index + '/qg'):
         print('QG: quantization')
         t = time.time()
         args = [
             'ngtqg', 'quantize', '-C1', '-c16', '-N ' + str(dim), '-Ms',
             '-lk', index
         ]
         subprocess.call(args)
         print('QG: quantization time(sec)=' + str(time.time() - t))
     if os.path.exists(index):
         print('QG: index already exists! ' + str(index))
         t = time.time()
         self.index = ngtpy.QuantizedIndex(index, self._max_edge_size)
         self.index.set_with_distance(False)
         self.indexName = index
         print('QG: open time(sec)=' + str(time.time() - t))
     else:
         print('QG: something wrong.')
     print('QG: end of fit')
Example 11
    def _load_index(self):
        """Create or open the NGT index."""

        path = os.path.join(self.data_directory, "ngt")

        if not os.path.exists(path):
            ngtpy.create(path, dimension=300)  # Spacy word vectors are 300D

        return ngtpy.Index(path)
Example 12
    def get_query_handler(self):
        """Index all vectors , if already indexed return NGT Index handle """

        import ngtpy
        vecs = super().get_query_handler()
        if vecs is not None:
            ngtpy.create(path=self.index_path, dimension=self.num_dim, distance_type=self.metric)
            _index = ngtpy.Index(self.index_path)
            _index.batch_insert(vecs, num_threads=self.num_threads)
            return _index
        else:
            return None
Example 13
 def __init__(self, data):
     import tempfile, ngtpy
     data = np.asarray(data)
     self.index_directory = tempfile.TemporaryDirectory()
     self.index_name = bytes(self.index_directory.name, 'ascii')
     # Can manually delete the index file with:
     #   tree = NGT( <data> )
     #   tree.index_directory.cleanup()
     ngtpy.create(self.index_name, data.shape[1])
     self.index = ngtpy.Index(self.index_name)
     self.index.batch_insert(data)
     self.index.save()
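A usage sketch for this small wrapper; the data values are illustrative and the cleanup call mirrors the comment in the constructor:

import numpy as np

data = np.random.rand(100, 16)
tree = NGT(data)                                  # builds a temporary ngtpy index
neighbours = tree.index.search(data[0], size=5)   # list of (id, distance) pairs
tree.index_directory.cleanup()                    # delete the temporary index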
Example 14
def main():
    dim = len(idolvecs[1][0])
    ngtpy.create(b"tmp", dim)
    index = ngtpy.Index(b"tmp")
    index.batch_insert(idolvecs[1])
    index.save()

    i = int(sys.argv[2])
    target_name = idolvecs[0][i]
    target_vec = idolvecs[1][i]
    print(target_name)

    similar_v(index, target_vec)
Example 15
 def fit(self, X):
     print('ONNG: start indexing...')
     dim = len(X[0])
     print('ONNG: # of data=' + str(len(X)))
     print('ONNG: dimensionality=' + str(dim))
     index_dir = 'indexes'
     if not os.path.exists(index_dir):
         os.makedirs(index_dir)
     index = os.path.join(index_dir, 'ONNG-' + str(self._edge_size) + '-' + str(self._outdegree) + '-' + str(self._indegree))
     anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size))
     print('ONNG: index=' + index)
     if (not os.path.exists(index)) and (not os.path.exists(anngIndex)):
         print('ONNG: create ANNG')
         t = time.time()
         args = ['ngt', 'create', '-it', '-p8', '-b500', '-ga', '-of', '-D' + self._metric, '-d' + str(dim), '-E' + str(self._edge_size), '-S0', '-e' + str(self._epsilon), '-P0', '-B30', anngIndex]
         subprocess.call(args)
         idx = ngtpy.Index(path=anngIndex)
         idx.batch_insert(X, num_threads=1, debug=False)
         idx.save()
         idx.close()
         print('ONNG: ANNG construction time(sec)=' + str(time.time() - t))
     if not os.path.exists(index):
         print('ONNG: degree adjustment')
         t = time.time()
         args = ['ngt', 'reconstruct-graph', '-mS', '-o ' + str(self._outdegree), '-i ' + str(self._indegree), anngIndex, index]
         subprocess.check_call(args)
         print('ONNG: degree adjustment time(sec)=' + str(time.time() -t))
     if os.path.exists(index):
         print('ONNG: index already exists! ' + str(index))
         t = time.time()
         self.index = ngtpy.Index(index, read_only=True)
         self.indexName = index
         print('ONNG: open time(sec)=' + str(time.time() - t))
     else:
         print('ONNG: something wrong.')
     print('ONNG: end of fit')
Example 16
    def build_advanced_index(self, vecs: 'np.ndarray'):
        """
        Build an advanced index structure from a numpy array.

        :param vecs: numpy array containing the vectors to index
        :return: advanced NGT index
        """

        import ngtpy
        ngtpy.create(path=self.index_path,
                     dimension=self.num_dim,
                     distance_type=self._metric)
        _index = ngtpy.Index(self.index_path)
        _index.batch_insert(vecs, num_threads=self._num_threads)
        return _index
Example 17
def ANN_cal(e_id, vec, y):
    index = ngtpy.Index(str(e_id) + '.anng')
    label = []
    for i in vec:
        results = index.search(i, size=8)
        dist_sum = 0
        for j in results:
            dist_sum += j[1]
        if dist_sum == 0:
            pos = 0
            neg = 1
        else:
            pos = 0
            neg = 0
            for j in results:
                if y[j[0]] == 1:
                    pos += 1 - j[1] / dist_sum
                else:
                    neg += 1 - j[1] / dist_sum
        if pos > neg:
            label.append(1)
        else:
            label.append(0)
    return label
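ANN_cal is the query-side counterpart of create_ANN from Example 3: it reopens the '<e_id>.anng' index and assigns each query vector a 0/1 label by distance-weighted voting over its 8 nearest neighbours. A hedged end-to-end sketch, assuming pos_data and neg_data are pandas DataFrames whose 'vec' column holds 768-d vectors (the DataFrames and the e_id value are illustrative):

import numpy as np
import pandas as pd

pos_data = pd.DataFrame({'vec': list(np.random.rand(50, 768))})
neg_data = pd.DataFrame({'vec': list(np.random.rand(50, 768))})

index, y = create_ANN('42', pos_data, neg_data, is_extend=False)  # writes 42.anng
queries = np.random.rand(5, 768)
labels = ANN_cal('42', queries, y)  # one 0/1 label per query vector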
Example 18
f = open(q_embedding_file, "rb+")
all_question_embedding = list((pickle.load(f)).values())
f.close()

train_question_embedding = all_question_embedding[:-5000]
eval_question_embedding = all_question_embedding[-5000:]

f = open(qa_embedding_file, "rb+")
all_question_answer_embedding = list((pickle.load(f)).values())
f.close()

train_qa_embedding = all_question_answer_embedding[:-5000]
eval_qa_embedding = all_question_answer_embedding[-5000:]

ngtpy.create(b"evaluation_questions_index", len(all_question_embedding[0]))
index = ngtpy.Index(b"evaluation_questions_index")
index.batch_insert(train_question_embedding)
index.save()
total_cosine_similarity = 0.0

QA = open(qa_file, "r").readlines()
eval_qa_pairs = open(eval_qa_file, "r").readlines()

eval_questions = open(eval_q_file, "r").readlines()

print("Start evaluation")

for i in tqdm.tqdm(range(len(eval_question_embedding))):
    question = eval_questions[i]
    q_embedding = eval_question_embedding[i]
    result = index.search(q_embedding, opt.beam)
Example 19
    def dedupe(self, args):
        if not self.load_hashcache():
            self.dump_hashcache()

        # check num_proc
        if args.num_proc is None:
            num_proc = max(cpu_count() - 1, 1)
        else:
            num_proc = args.num_proc

        if self.ngt:
            try:
                import ngtpy
            except ImportError:
                logger.error(colored("Error: Unable to load NGT. Please install NGT and python binding first.", 'red'))
                sys.exit(1)
            index_path = self.get_ngt_index_path()
            logger.warning("Building NGT index (dimension={}, num_proc={})".format(self.hash_bits, num_proc))
            ngtpy.create(path=index_path.encode(),
                dimension=self.hash_bits,
                edge_size_for_creation=args.ngt_edges,
                edge_size_for_search=args.ngt_edges_for_search,
                object_type="Byte",
                distance_type="Hamming")
            ngt_index = ngtpy.Index(index_path.encode())
            ngt_index.batch_insert(self.hashcache.hshs(), num_proc)

            # NGT Approximate neighbor search
            logger.warning("Approximate neighbor searching using NGT")
            hshs = self.hashcache.hshs()
            check_list = [0] * len(hshs)
            current_group_num = 1
            if not args.query:
                for i in tqdm(range(len(hshs))):
                    new_group_found = False
                    if check_list[i] != 0:
                        # already grouped image
                        continue
                    for res in ngt_index.search(hshs[i], size=args.ngt_k, epsilon=args.ngt_epsilon):
                        if res[0] == i:
                            continue
                        else:
                            if res[1] <= self.hamming_distance:
                                if check_list[res[0]] == 0:
                                    if check_list[i] == 0:
                                        # new group
                                        new_group_found = True
                                        check_list[i] = current_group_num
                                        check_list[res[0]] = current_group_num
                                        self.group[current_group_num] = [self.image_filenames[i]]
                                        self.group[current_group_num].extend([self.image_filenames[res[0]]])
                                    else:
                                        # exists group
                                        exists_group_num = check_list[i]
                                        check_list[res[0]] = exists_group_num
                                        self.group[exists_group_num].extend([self.image_filenames[res[0]]])
                    if new_group_found:
                        current_group_num += 1
            else: # query image
                new_group_found = False
                hsh = self.hashcache.gen_hash(args.query)
                self.group[current_group_num] = []
                for res in ngt_index.search(hsh, size=args.ngt_k, epsilon=args.ngt_epsilon):
                    if res[1] <= self.hamming_distance:
                        new_group_found = True
                        self.group[current_group_num].extend([self.image_filenames[res[0]]])
                if new_group_found:
                    current_group_num += 1

            # remove ngt index
            if index_path:
                os.system("rm -rf {}".format(index_path))


        elif self.hnsw:
            try:
                import hnswlib
            except ImportError:
                logger.error(colored("Error: Unable to load hnsw. Please install hnsw python binding first.", 'red'))
                sys.exit(1)
            hshs = self.hashcache.hshs()
            num_elements = len(hshs)
            hshs_labels = np.arange(num_elements)
            hnsw_index = hnswlib.Index(space='l2', dim=self.hash_bits) # Squared L2
            hnsw_index.init_index(max_elements=num_elements, ef_construction=args.hnsw_ef_construction, M=args.hnsw_m)
            hnsw_index.set_ef(max(args.hnsw_ef, args.hnsw_k - 1)) # ef should always be > k
            hnsw_index.set_num_threads(num_proc)
            logger.warning("Building hnsw index (dimension={}, num_proc={})".format(self.hash_bits, num_proc))
            hnsw_index.add_items(hshs, hshs_labels, num_proc)

            # hnsw Approximate neighbor search
            logger.warning("Approximate neighbor searching using hnsw")
            check_list = [0] * num_elements
            current_group_num = 1
            if not args.query:
                for i in tqdm(range(num_elements)):
                    new_group_found = False
                    if check_list[i] != 0:
                        # already grouped image
                        continue
                    labels, distances = hnsw_index.knn_query(hshs[i], k=args.hnsw_k, num_threads=num_proc)
                    for label, distance in zip(labels[0], distances[0]):
                        if label == i:
                            continue
                        else:
                            if distance <= self.hamming_distance:
                                if check_list[label] == 0:
                                    if check_list[i] == 0:
                                        # new group
                                        new_group_found = True
                                        check_list[i] = current_group_num
                                        check_list[label] = current_group_num
                                        self.group[current_group_num] = [self.image_filenames[i]]
                                        self.group[current_group_num].extend([self.image_filenames[label]])
                                    else:
                                        # exists group
                                        exists_group_num = check_list[i]
                                        check_list[label] = exists_group_num
                                        self.group[exists_group_num].extend([self.image_filenames[label]])
                    if new_group_found:
                        current_group_num += 1
            else: # query image
                new_group_found = False
                hsh = self.hashcache.gen_hash(args.query)
                self.group[current_group_num] = []
                labels, distances = hnsw_index.knn_query(hsh, k=args.hnsw_k, num_threads=num_proc)
                for label, distance in zip(labels[0], distances[0]):
                    if distance <= self.hamming_distance:
                        new_group_found = True
                        self.group[current_group_num].extend([self.image_filenames[label]])
                if new_group_found:
                    current_group_num += 1


        elif self.faiss_flat:
            try:
                import faiss
            except ImportError:
                logger.error(colored("Error: Unable to load faiss. Please install faiss python binding first.", 'red'))
                sys.exit(1)
            hshs = self.hashcache.hshs()
            faiss.omp_set_num_threads(num_proc)
            logger.warning("Building faiss index (dimension={}, num_proc={})".format(self.hash_bits, num_proc))
            data = np.array(hshs).astype('float32')
            index = faiss.IndexFlatL2(self.hash_bits) # Exact search
            index.add(data)

            # faiss Exact neighbor search
            logger.warning("Exact neighbor searching using faiss")
            check_list = [0] * index.ntotal
            current_group_num = 1
            if not args.query:
                for i in tqdm(range(index.ntotal)):
                    new_group_found = False
                    if check_list[i] != 0:
                        # already grouped image
                        continue
                    distances, labels = index.search(data[[i]], args.faiss_flat_k)
                    for label, distance in zip(labels[0], distances[0]):
                        if label == i:
                            continue
                        else:
                            if distance <= self.hamming_distance:
                                if check_list[label] == 0:
                                    if check_list[i] == 0:
                                        # new group
                                        new_group_found = True
                                        check_list[i] = current_group_num
                                        check_list[label] = current_group_num
                                        self.group[current_group_num] = [self.image_filenames[i]]
                                        self.group[current_group_num].extend([self.image_filenames[label]])
                                    else:
                                        # exists group
                                        exists_group_num = check_list[i]
                                        check_list[label] = exists_group_num
                                        self.group[exists_group_num].extend([self.image_filenames[label]])
                    if new_group_found:
                        current_group_num += 1
            else: # query image
                new_group_found = False
                hsh = np.array([self.hashcache.gen_hash(args.query)]).astype('float32')
                self.group[current_group_num] = []
                distances, labels = index.search(hsh, args.faiss_flat_k)
                for label, distance in zip(labels[0], distances[0]):
                    if distance <= self.hamming_distance:
                        new_group_found = True
                        self.group[current_group_num].extend([self.image_filenames[label]])
                if new_group_found:
                    current_group_num += 1


        else:
            logger.warning("Searching similar images")
            hshs = self.hashcache.hshs()
            check_list = [0] * len(hshs)
            current_group_num = 1
            if not args.query:
                for i in tqdm(range(len(hshs))):
                    new_group_found = False
                    hshi = hshs[i]
                    for j in range(i+1, len(hshs)):
                        hshj = hshs[j]
                        hamming_distance = np.count_nonzero(hshi != hshj)
                        if hamming_distance <= self.hamming_distance:
                            if check_list[j] == 0:
                                if check_list[i] == 0:
                                    # new group
                                    new_group_found = True
                                    check_list[i] = current_group_num
                                    check_list[j] = current_group_num
                                    self.group[current_group_num] = [self.image_filenames[i]]
                                    self.group[current_group_num].extend([self.image_filenames[j]])
                                else:
                                    # exists group
                                    exists_group_num = check_list[i]
                                    # check hamming distances of exists group
                                    for filename in self.group[exists_group_num]:
                                        h = hshs[self.image_filenames.index(filename)]
                                        hamming_distance = np.count_nonzero(h != hshj)
                                        if not hamming_distance <= self.hamming_distance:
                                            continue
                                    check_list[j] = exists_group_num
                                    self.group[exists_group_num].extend([self.image_filenames[j]])

                    if new_group_found:
                        current_group_num += 1
            else: # query image
                new_group_found = False
                hsh = self.hashcache.gen_hash(args.query)
                self.group[current_group_num] = []
                for i in tqdm(range(len(hshs))):
                    hshi = hshs[i]
                    hamming_distance = np.count_nonzero(hshi != hsh)
                    if hamming_distance <= self.hamming_distance:
                        new_group_found = True
                        self.group[current_group_num].extend([self.image_filenames[i]])
                if new_group_found:
                    current_group_num += 1


        # sort self.group
        self.sort_group()

        # write duplicate log file
        self.num_duplicate_set = current_group_num - 1
        if self.num_duplicate_set > 0 and args.log:
            now = datetime.now().strftime('%Y%m%d%H%M%S')
            duplicate_log_file = "{}_{}".format(now, self.get_duplicate_log_name())
            with open(duplicate_log_file, 'w') as f:
                if args.query:
                    f.write("Query: {}\n".format(args.query))
                for k in range(1, self.num_duplicate_set + 1):
                    img_list = self.group[k]
                    if len(img_list) > 1:
                        sorted_img_list, _, _, _ = self.sort_image_list(img_list)
                        if args.sameline:
                            f.write(" ".join(sorted_img_list) + "\n")
                        else:
                            f.write("\n".join(sorted_img_list) + "\n")
                            if k != len(self.group):
                                f.write("\n")
Example 20
    def kneighbors(
            self,
            X=None,
            n_candidates=None,
            return_distance=True
    ) -> Union[Tuple[np.array, np.array], np.array]:
        """ Retrieve k nearest neighbors.

        Parameters
        ----------
        X: np.array or None, optional, default = None
            Query objects. If None, search among the indexed objects.
        n_candidates: int or None, optional, default = None
            Number of neighbors to retrieve.
            If None, use the value passed during construction.
        return_distance: bool, default = True
            If return_distance, will return distances and indices to neighbors.
            Else, only return the indices.
        """
        check_is_fitted(self, 'index_')
        if X is not None:
            X = check_array(X)

        n_test = self.n_samples_fit_ if X is None else X.shape[0]
        dtype = self.X_dtype_ if X is None else X.dtype

        if n_candidates is None:
            n_candidates = self.n_candidates
        n_candidates = check_n_candidates(n_candidates)

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        if X is None:
            n_neighbors = n_candidates + 1
            start = 1
        else:
            n_neighbors = n_candidates
            start = 0

        # If fewer candidates than required are found for a query,
        # we save index=-1 and distance=NaN
        neigh_ind = -np.ones((n_test, n_candidates), dtype=np.int32)
        if return_distance:
            neigh_dist = np.empty_like(neigh_ind, dtype=dtype) * np.nan

        if isinstance(self.index_, str):
            index = ngtpy.Index(self.index_)
        else:
            index = self.index_

        disable_tqdm = False if self.verbose else True
        if X is None:
            for i in tqdm(
                    range(n_test),
                    desc='Query NNG',
                    disable=disable_tqdm,
            ):
                query = index.get_object(i)
                response = index.search(
                    query=query,
                    size=n_neighbors,
                    with_distance=return_distance,
                    epsilon=self.epsilon,
                )
                if return_distance:
                    ind, dist = [np.array(arr) for arr in zip(*response)]
                else:
                    ind = response
                ind = ind[start:]
                neigh_ind[i, :len(ind)] = ind
                if return_distance:
                    dist = dist[start:]
                    neigh_dist[i, :len(dist)] = dist
        else:  # if X was provided
            for i, x in tqdm(
                    enumerate(X),
                    desc='Query NNG',
                    disable=disable_tqdm,
            ):
                response = index.search(
                    query=x,
                    size=n_neighbors,
                    with_distance=return_distance,
                    epsilon=self.epsilon,
                )
                if return_distance:
                    ind, dist = [np.array(arr) for arr in zip(*response)]
                else:
                    ind = response
                ind = ind[start:]
                neigh_ind[i, :len(ind)] = ind
                if return_distance:
                    dist = dist[start:]
                    neigh_dist[i, :len(dist)] = dist

        if return_distance and self.metric == 'sqeuclidean':
            neigh_dist **= 2

        if return_distance:
            return neigh_dist, neigh_ind
        else:
            return neigh_ind
Example 21
    def fit(self, X, y=None):
        """ Build the ngtpy.Index and insert data from X.

        Parameters
        ----------
        X: np.array
            Data to be indexed
        y: any
            Ignored

        Returns
        -------
        self: NNG
            An instance of NNG with a built index
        """
        if y is None:
            X = check_array(X)
        else:
            X, y = check_X_y(X, y)
            self.y_train_ = y

        self.n_samples_fit_ = X.shape[0]
        self.n_features_ = X.shape[1]
        self.X_dtype_ = X.dtype

        # Map common distance names to names used by ngt
        try:
            self.effective_metric_ = NNG.internal_distance_type[self.metric]
        except KeyError:
            self.effective_metric_ = self.metric
        if self.effective_metric_ not in NNG.valid_metrics:
            raise ValueError(
                f'Unknown distance/similarity measure: {self.effective_metric_}. '
                f'Please use one of: {NNG.valid_metrics}.')

        # Set up a directory to save the index to
        prefix = 'skhubness_'
        suffix = '.anng'
        if self.index_dir in ['auto']:
            index_path = create_tempfile_preferably_in_dir(
                prefix=prefix, suffix=suffix, directory='/dev/shm')
            logging.warning(
                f'The index will be stored in {index_path}. '
                f'It will NOT be deleted automatically, when this instance is destructed.'
            )
        elif isinstance(self.index_dir, str):
            index_path = create_tempfile_preferably_in_dir(
                prefix=prefix, suffix=suffix, directory=self.index_dir)
        elif self.index_dir is None:
            index_path = create_tempfile_preferably_in_dir(prefix=prefix,
                                                           suffix=suffix)
        else:
            raise TypeError(
                f'NNG requires to write an index to the filesystem. '
                f'Please provide a valid path with parameter `index_dir`.')

        # Create the ANNG index, insert data
        ngtpy.create(
            path=index_path,
            dimension=self.n_features_,
            edge_size_for_creation=self.edge_size_for_creation,
            edge_size_for_search=self.edge_size_for_search,
            distance_type=self.effective_metric_,
        )
        index_obj = ngtpy.Index(index_path)
        index_obj.batch_insert(X, num_threads=self.n_jobs)
        index_obj.save()

        # Convert ANNG to ONNG
        if self.optimize:
            optimizer = ngtpy.Optimizer()
            optimizer.set(num_of_outgoings=self.num_outgoing,
                          num_of_incomings=self.num_incoming)
            index_path_onng = str(
                pathlib.Path(index_path).with_suffix('.onng'))
            optimizer.execute(index_path, index_path_onng)
            index_path = index_path_onng

        # Keep index in memory or store in path
        if self.index_dir is None:
            self.index_ = index_obj
        else:
            # index_obj.save()
            self.index_ = index_path

        return self
Example 22
def build_similarity_structure(model_file,
                               viable_lines,
                               n_items,
                               strategy,
                               n_top=10,
                               n_candidates=1,
                               num_trees=None,
                               epsilon=None,
                               df=None):
    t_start = dt.datetime.now()
    most_similar = {}
    c = 1

    nodes = [_.split(' ', maxsplit=1)[0] for _ in viable_lines]
    # viable_lines = []

    if strategy == 'annoy' and ANNOY_NOT_FOUND:
        warnings.warn(
            'Chosen strategy = \'annoy\', but the module is not installed. Falling back to basic.'
        )
        strategy = 'basic'
    if strategy == 'ngt' and NGT_NOT_FOUND:
        warnings.warn(
            'Chosen strategy = \'NGT\', but the module is not installed. Falling back to basic.'
        )
        strategy = 'basic'
    if strategy == 'faiss' and FAISS_NOT_FOUND:
        warnings.warn(
            'Chosen strategy = \'faiss\', but the module is not installed. Falling back to basic.'
        )
        strategy = 'basic'

    if strategy == 'basic':
        model = models.KeyedVectors.load_word2vec_format(
            model_file, unicode_errors='ignore')

        for n in tqdm(nodes):
            ms = model.most_similar(str(n), topn=n_top)
            mm = [item[0] for item in ms]
            idx = int(n.split('__')[1])
            if idx < n_items:
                candidates = [
                    _ for _ in mm if int(_.split('__')[1]) >= n_items
                ]
            else:
                candidates = [_ for _ in mm if int(_.split('__')[1]) < n_items]

            candidates = candidates[:n_candidates]
            most_similar[n] = candidates
            c += 1
        print('')

    elif strategy == 'annoy':
        assert num_trees is not None
        assert type(num_trees) == int
        assert num_trees > 0

        print('Using ANNOY indexing.')
        model = models.KeyedVectors.load_word2vec_format(
            model_file, unicode_errors='ignore')
        annoy_index = AnnoyIndexer(model, num_trees=num_trees)
        for n in tqdm(nodes):
            ms = model.most_similar(str(n), topn=n_top, indexer=annoy_index)
            mm = [item[0] for item in ms]
            idx = int(n.split('__')[1])
            if idx < n_items:
                candidates = [
                    _ for _ in mm if int(_.split('__')[1]) >= n_items
                ]
            else:
                candidates = [_ for _ in mm if int(_.split('__')[1]) < n_items]

            candidates = candidates[:n_candidates]
            most_similar[n] = candidates
            print('\rBuilding similarity structure: {:0.1f} - {}/{} tuples'.
                  format(c / len(nodes) * 100, c, len(nodes)),
                  end='')
            c += 1
        print('')

    elif strategy == 'lsh':
        print('Using DeepER LSH blocking.')
        blocking_candidates = execute_blocking(model_file)
        model = models.KeyedVectors.load_word2vec_format(
            model_file, unicode_errors='ignore')
        for n in blocking_candidates:
            ms = []
            bucket = blocking_candidates[n]
            for cand in bucket:
                ms.append((cand, model.similarity(n, cand)))
            ms.sort(key=itemgetter(1), reverse=True)

            mm = [item[0] for item in ms]
            idx = int(n.split('_')[1])
            if idx < n_items:
                candidates = [_ for _ in mm if idx >= n_items]
            else:
                candidates = [_ for _ in mm if idx < n_items]

            candidates = candidates[:n_candidates]
            most_similar[n] = candidates
            print('\rBuilding similarity structure: {:0.1f} - {}/{} tuples'.
                  format(c / len(nodes) * 100, c, len(nodes)),
                  end='')
            c += 1
        print('')

    elif strategy == 'ngt':
        assert epsilon is not None
        assert type(epsilon) == float
        assert 0 <= epsilon <= 1

        print('Using NGT indexing.')
        ngt_index_path = 'pipeline/dump/ngt_index.nn'
        words = []
        with open(model_file, 'r') as fp:
            n, dim = map(int, fp.readline().split())
            ngtpy.create(ngt_index_path, dim, distance_type='Cosine')
            index = ngtpy.Index(ngt_index_path)

            for idx, line in enumerate(fp):
                k, v = line.rstrip().split(' ', maxsplit=1)
                vector = list(map(float, v.split(' ')))
                index.insert(vector)  # insert objects
                words.append(k)

        index.build_index()
        index.save()
        most_similar = {}

        for n in tqdm(nodes):
            query = index.get_object(words.index(n))
            ms = index.search(query, size=n_top, epsilon=epsilon)
            mm = [item[0] for item in ms[1:]]
            mm = list(map(words.__getitem__, mm))
            idx = int(n.split('_')[1])
            if idx < n_items:
                candidates = [_ for _ in mm if idx >= n_items]
            else:
                candidates = [_ for _ in mm if idx < n_items]

            candidates = candidates[:n_candidates]
            most_similar[n] = candidates
            print('\rBuilding similarity structure: {:0.1f} - {}/{} tuples'.
                  format(c / len(nodes) * 100, c, len(nodes)),
                  end='')
            c += 1
        print('')

    elif strategy == 'faiss':
        print('Using faiss indexing.')
        # ngt_index_path = 'pipeline/dump/ngt_index.nn'
        words = []
        with open(model_file, 'r') as fp:
            n, dim = map(int, fp.readline().split())
            mat = []
            index = faiss.IndexFlatL2(dim)
            for idx, line in enumerate(fp):
                k, v = line.rstrip().split(' ', maxsplit=1)
                vector = np.array(list(map(float, v.split(' '))),
                                  ndmin=1).astype('float32')
                mat.append(vector)
                words.append(k)

        mat = np.array(mat)
        index.add(mat)

        most_similar = {}

        D, I = index.search(mat, k=n_top + 1)
        # D, I = index.search(query, size=n_top, epsilon=epsilon)
        # mm = [item[0] for item in ms[1:]]
        # mm = list(map(words.__getitem__, mm))
        for n in tqdm(nodes):
            idx = int(n.split('__')[1])
            mm = I[idx]
            if idx < n_items:
                candidates = [_ for _ in mm if idx >= n_items]
            else:
                candidates = [_ for _ in mm if idx < n_items]

            candidates = candidates[:n_candidates]
            most_similar[n] = ['idx__{}'.format(_) for _ in candidates]
        # print('\rBuilding similarity structure: {:0.1f} - {}/{} tuples'.format(c / len(nodes) * 100, c, len(nodes)),
        #       end='')
        c += 1
        print('')

    else:
        raise ValueError('Unknown strategy {0}'.format(strategy))

    t_end = dt.datetime.now()
    diff = t_end - t_start
    print('Time required to build sim struct: {}'.format(diff.total_seconds()))
    pickle.dump(most_similar, open('most_similar.pickle', 'wb'))

    return most_similar
Example 23
corpus = os.path.join(corpus_name)
a_file = os.path.join(corpus, "preprocessed_a.txt")
a_embedding_file = os.path.join("embeddings_a.pkl")
model_path = os.path.join("best-model.pt")

answer_embedding_dict = pickle.load(open(a_embedding_file, "rb+"))

dataset = CornellMovieDialogsDataset(
    question_text_filepath=
    'cornell movie-dialogs corpus/preprocessed_q_train.txt',
    answer_embedding_dict=answer_embedding_dict,
    maxlen=30)
tokenizer = dataset.tokenizer
max_len = dataset.maxlen
ngtpy.create(b"answers_index", 768)
index = ngtpy.Index(b"answers_index")
all_answer_embeddings = list(answer_embedding_dict.values())
index.batch_insert(all_answer_embeddings)
index.save()


def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?,])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?,\']+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s


answers = open(a_file, "r").readlines()
Example 24
            if tokens[0].startswith('idx_'):
                fow.write(tokens[0] + '\n')
                fov.write('{0}\n'.format('\t'.join(tokens[1:])))


if __name__ == '__main__':
    emb_path = 'pipeline/embeddings/movies-ER-golden.emb'
    # prepare_files(emb_path)

    index_path = 'index.anng'

    with open('objects.tsv', 'r') as fin:
        n, dim = map(int, fin.readline().split())
        ngtpy.create(index_path, dim,
                     distance_type='Cosine')  # create an empty index
        index = ngtpy.Index(index_path)  # open the index
        for line in fin:
            object = list(map(float, line.rstrip().split('\t')))
            index.insert(object)  # insert objects
    index.build_index()  # build the index
    index.save()  # save the index

    with open('words.tsv', 'r') as fin:
        words = list(map(lambda x: x.rstrip('\n'), fin.readlines()))

    index = ngtpy.Index('index.anng')  # open the index
    query_id = 31
    query_object = index.get_object(query_id)  # get the object

    result = index.search(query_object,
                          epsilon=0.10)  # approximate nearest neighbor search
Example 25
import ngtpy
import random

dim = 10
objects = []
for i in range(100):
    vector = random.sample(range(100), dim)
    objects.append(vector)

query = objects[0]

ngtpy.create(b"tmp", dim)
index = ngtpy.Index(b"tmp")
index.batch_insert(objects)
index.save()

result = index.search(query, 3)

for i, o in enumerate(result):
    print(str(i) + ": " + str(o[0]) + ", " + str(o[1]))
    object = index.get_object(o[0])
    print(object)
Example 26
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join(corpus_name)
q_file = os.path.join(corpus, "preprocessed_q.txt")
qa_file = os.path.join(corpus, "preprocessed_qa.txt")
q_embedding_file = os.path.join("embeddings_q.pkl")
qa_embedding_file = os.path.join("embeddings_qa.pkl")

bc = BertClient(check_length=False)
f = open(q_embedding_file, "rb+")
all_question_embedding = list((pickle.load(f)).values())
f = open(qa_embedding_file, "rb+")
all_question_answer_embedding = list((pickle.load(f)).values())

ngtpy.create(b"questions_index", len(all_question_embedding[0]))
index = ngtpy.Index(b"questions_index")
index.batch_insert(all_question_embedding)
index.save()


def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?,])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?,\']+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s


QA = open(qa_file, "r").readlines()

while (True):
Example 27
import csv
import ngtpy

# create an index framework in the filesystem.
ngtpy.create(path=b'index', dimension=128, distance_type="L2")

# load objects.
objects = []
with open(b'../../data/sift-dataset-5k.tsv', 'r') as fp:
    for object in csv.reader(fp, delimiter = '\t'):
        objects.append(object[0:128])

# open index.
index = ngtpy.Index(b'index')

# insert the objects.
index.batch_insert(objects)

# save the index.
index.save()

# close the index.
index.close()

# open the index.
index = ngtpy.Index(b'index')

# load query data.
with open(b'../../data/sift-query-3.tsv', 'r') as fp:
    query = list(csv.reader(fp, delimiter = '\t'))
Example 28
        path[i] = re.sub(r"\)", "", path[i])
        path[i] = re.sub('[0-9]', '', path[i])
        path[i] = re.sub(':', '', path[i])
        path[i] = path[i].lower()
    path = " ||| ".join(path[1:])
    embedding = bc.encode([path])

    all_embeddings.append(embedding.tolist())

with open("full_path_embeddings.pkl", "wb") as write_file:
    pickle.dump(all_embeddings, write_file)

all_embeddings = pickle.load(open("full_path_embeddings.pkl", "rb"))
print(len(all_embeddings[0][0]))
ngtpy.create(b"event_index", len(all_embeddings[0][0]))
index = ngtpy.Index(b"event_index")
for i in range(len(all_embeddings)):
    all_embeddings[i] = all_embeddings[i][0]
index.batch_insert(all_embeddings)
index.save()

all_event_mappings = open("all_event_mappings_bert_base_uncased.txt", "w")
all_events_ntsb = open("all_events.txt").readlines()
for event in all_events_ntsb:
    event_embedding = bc.encode([event.lower()])
    result = index.search(event_embedding, 1)
    for i, o in enumerate(result):
        event_mapping = " --> ".join(paths_to_leaves[int(str(o[0]))])
        all_event_mappings.write(event + "===" + event_mapping + '\n')

event_path_mappings_non_leaf = dict()