Example #1
import caffe
import numpy
from PIL import Image
from annoy import AnnoyIndex

class spherefaceAnnoyDatabase:
    def __init__(self):
        self.network = caffe.Net("pretrainedModels/sphereface_deploy.prototxt",
                                 "pretrainedModels/sphereface_model.caffemodel", 0)
        # 512 is the number of neurons in the last layer of the net.
        self.index = AnnoyIndex(512, metric='angular')
        self.indexToName = {}
        self.nameToIndex = {}

    def getEmbedding(self, imgPath):
        net = self.network
        img = Image.open(imgPath)
        sampleImage = numpy.array(img.resize((net.blobs['data'].data.shape[3],
                                              net.blobs['data'].data.shape[2])))
        sampleImage = numpy.reshape(sampleImage, (1,) + sampleImage.shape).transpose(0, 3, 1, 2).astype(numpy.float32)
        net.blobs['data'].data[...] = sampleImage
        net.forward()
        return net.blobs['fc5'].data[0].copy()

    def addFaceWithName(self, imgPath, name):
        self.addEmbeddingWithName(self.getEmbedding(imgPath), name)

    def addEmbeddingWithName(self, embedding, name):
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = name
        self.nameToIndex[name] = length

    def addFaceWithoutName(self, imgPath):
        # Without a name, the image path doubles as the label.
        self.addEmbeddingWithName(self.getEmbedding(imgPath), imgPath)

    def freeze(self, nTrees=20):
        self.index.build(nTrees)

    def lookupByFace(self, imgPath, numberOfNeighbours):
        return self.lookupByEmbedding(self.getEmbedding(imgPath), numberOfNeighbours)

    def lookupByEmbedding(self, embedding, numberOfNeighbours):
        if numberOfNeighbours == -1:
            numberOfNeighbours = self.index.get_n_items()
        ids, distances = self.index.get_nns_by_vector(embedding, numberOfNeighbours, search_k=-1, include_distances=True)
        return [self.indexToName[i] for i in ids], distances

    def lookupByName(self, name, numberOfNeighbours):
        if numberOfNeighbours == -1:
            numberOfNeighbours = self.index.get_n_items()
        ids, distances = self.index.get_nns_by_item(self.nameToIndex[name], numberOfNeighbours, search_k=-1, include_distances=True)
        return [self.indexToName[i] for i in ids], distances
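A minimal usage sketch for the class above; the image paths are placeholders and the printed distances are illustrative:

db = spherefaceAnnoyDatabase()
db.addFaceWithName("faces/alice.jpg", "alice")
db.addFaceWithName("faces/bob.jpg", "bob")
db.freeze(nTrees=20)  # build the forest; the Annoy index is read-only afterwards
names, distances = db.lookupByFace("faces/query.jpg", 2)
print(names, distances)  # e.g. (['alice', 'bob'], [0.41, 0.87])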
Example #2
import csv
from annoy import AnnoyIndex

def create_walks(df, index_file, patient_dict_file, index_dict_file,
                 n_neighbors=25, walks_per_patient=10, walk_size=50, out_dir="./"):
    index = AnnoyIndex(df.shape[1])
    index.load(index_file)
    patient_dict = {}
    for key, val in csv.reader(open(patient_dict_file)):
        patient_dict[key] = int(val)
    index_dict = {}
    for key, val in csv.reader(open(index_dict_file)):
        index_dict[int(key)] = val
    print("Computing nearest-neighbors...")
    neighbor_dict = {}
    for i in range(index.get_n_items()):
        if i % 1000 == 0:
            print(i)
        patient_id = index_dict[i]
        neighbors = index.get_nns_by_item(i=i, n=n_neighbors, search_k=-1, include_distances=False)
        neighbor_dict[patient_id] = [index_dict[x] for x in neighbors]
    with open(out_dir + "patient_walks.txt", 'w') as f:
        for i in range(index.get_n_items()):
            if i % 1000 == 0:
                print(i)
            patient_id = index_dict[i]
            patient_sentences = ""
            for j in range(walks_per_patient):
                sentence = generate_sentence(start=patient_id, neighbor_dict=neighbor_dict,
                                             n_neighbors=n_neighbors, walk_size=walk_size)
                patient_sentences += sentence + "\n"  # accumulate every walk, not just the last one
            f.write(patient_sentences)
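create_walks calls a generate_sentence helper that is not shown. A plausible sketch, assuming it performs a plain random walk over the precomputed neighbor lists (the signature is taken from the call above; the body is an assumption):

import random

def generate_sentence(start, neighbor_dict, n_neighbors, walk_size):
    # Random walk over the k-NN graph, emitting a space-separated id sequence.
    walk = [start]
    current = start
    for _ in range(walk_size - 1):
        current = random.choice(neighbor_dict[current][:n_neighbors])
        walk.append(current)
    return " ".join(str(node) for node in walk)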
Example #3
def build_annoy_indices(input_words, input_vectors):
    print("Building Annoy Indices: {0}".format(datetime.now().time()))
    sem = AnnoyIndex(99, metric="euclidean")
    phon = AnnoyIndex(100, metric="euclidean")

    index = 0
    print("Reading Data for Semantic Index: {0}".format(datetime.now().time()))
    for row in open("semantic_vectors_weighted82.txt"):
        spl = row.find("@@@")
        line = row[0:spl - 1].lower()
        vec = row[spl + 3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if line not in lookup:
            sem.add_item(index, vals)
            slines[index] = line
            lookup[line] = [index]
            index += 1
        if index % 100000 == 0:
            print("......{0} vectors loaded.".format(index))

    last_index = index  # `index` is already the next free slot; `index + 1` would leave an unused id
    for i in range(len(input_words)):
        # Add each input vector so its neighbors can be calculated.
        sem.add_item(last_index, input_vectors[i])
        lookup[input_words[i]] = [last_index]
        slines[last_index] = input_words[i]
        last_index += 1

    print("Building Semantic Index: {0}".format(datetime.now().time()))
    sem.build(150)
    print("Built: {0}".format(datetime.now().time()))
    print("Num items in semantic index: {0}".format(sem.get_n_items()))

    print("Reading Data for Phonetic Index: {0}".format(datetime.now().time()))
    pindex = 0
    for row in open("phonetic_vectors_every2_d100_reformatted.txt"):
        spl = row.find("@@@")
        line = row[0:spl - 1]
        stripped_line = line[2:-1].lower()  #skip the b''
        vec = row[spl + 3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if stripped_line in lookup:
            phon.add_item(pindex, vals)
            lookup[stripped_line].append(pindex)
            plines[pindex] = stripped_line
            pindex += 1
        if pindex % 100000 == 0:
            print("......{0} vectors loaded.".format(pindex))

    print("Building Phonetic Index: {0}".format(datetime.now().time()))
    phon.build(150)
    print("Built: {0}".format(datetime.now().time()))
    print("Num items in phonetic index: {0}".format(phon.get_n_items()))

    print("Done Building Annoy Indices: {0}".format(datetime.now().time()))
    return sem, phon
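Several examples on this page (#3, #9, #30, #38) re-implement the same parsing of "@@@"-delimited vector files. A hedged sketch of that shared pattern as a helper; the file layout is inferred from the loops above:

import numpy as np

def iter_labeled_vectors(path):
    # Yield (label, vector) pairs from lines of the form "label @@@v1, v2, ...".
    for row in open(path):
        spl = row.find("@@@")
        label = row[:spl - 1].lower()
        vals = np.array([float(v) for v in row[spl + 3:-1].split(", ")])
        yield label, vals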
Example #4
    def test_item_vector_after_save(self):
        # Issue #279
        a = AnnoyIndex(3)
        a.verbose(True)
        a.add_item(1, [1, 0, 0])
        a.add_item(2, [0, 1, 0])
        a.add_item(3, [0, 0, 1])
        a.build(-1)
        # Ids start at 1, so Annoy also counts the unused slot 0: n_items == 4.
        self.assertEqual(a.get_n_items(), 4)
        a.get_item_vector(3)
        a.save('something.annoy')
        self.assertEqual(a.get_n_items(), 4)
        a.get_item_vector(3)
Example #5
    def test_item_vector_after_save(self):
        # Issue #279
        a = AnnoyIndex(3, 'angular')
        a.verbose(True)
        a.add_item(1, [1, 0, 0])
        a.add_item(2, [0, 1, 0])
        a.add_item(3, [0, 0, 1])
        a.build(-1)
        self.assertEqual(a.get_n_items(), 4)
        self.assertEqual(a.get_item_vector(3), [0, 0, 1])
        self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
        a.save('something.annoy')
        self.assertEqual(a.get_n_items(), 4)
        self.assertEqual(a.get_item_vector(3), [0, 0, 1])
        self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
Example #6
    def test_get_n_items(self):
        print "test_get_n_items"
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 3
        i = AnnoyIndex(f, 2, "test_db", 10, 1000, 3048576000, 0)
        #i.verbose(True)
        i.create()

        i.add_item(0, [0, 0, 1])
        self.assertEqual(i.get_n_items(), 1)
        i.add_item(1, [0, 1, 0])
        self.assertEqual(i.get_n_items(), 2)
        i.add_item(2, [1, 0, 0])
        self.assertEqual(i.get_n_items(), 3)
Example #8
    def test_item_vector_after_save(self):
        # Issue #279
        a = AnnoyIndex(3)
        a.verbose(True)
        a.add_item(1, [1, 0, 0])
        a.add_item(2, [0, 1, 0])
        a.add_item(3, [0, 0, 1])
        a.build(-1)
        self.assertEqual(a.get_n_items(), 4)
        self.assertEqual(a.get_item_vector(3), [0, 0, 1])
        self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
        a.save('something.annoy')
        self.assertEqual(a.get_n_items(), 4)
        self.assertEqual(a.get_item_vector(3), [0, 0, 1])
        self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
Example #9
import random
import numpy as np
from annoy import AnnoyIndex

def main():
    t = AnnoyIndex(99, metric='euclidean')
    lines = dict()
    lookup = dict()

    print("loading...")
    index = 0
    for row in open("semantic_vectors_weighted91.txt"):
        spl = row.find("@@@")
        line = row[0:spl-1].lower()
        vec = row[spl+3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if line in lookup:
            continue
        t.add_item(index, vals)
        lines[index] = line
        lookup[line] = [index]
        index += 1
        if index % 50000 == 0:
            print(line)
            print("{0} vectors loaded".format(index))
    print("building")
    t.build(100)
    print("done.")

    # Item ids run from 0 to get_n_items() - 1.
    nums1 = [random.randint(0, t.get_n_items() - 1) for i in range(5)]
    nums2 = [random.randint(0, t.get_n_items() - 1) for i in range(5)]
    
    poem = [nums1, nums2]

    for s in poem:
        for line in s:
            print(lines[line])
        print("\n")
Example #10
def k_neighbors(shape_features: dict, db_features: dict, k=s.KNN_SIZE) -> tuple:
    """
    Determines the shapes closest to the query shape by running K-Nearest Neighbors on an
    N-dimensional Approximate Nearest Neighbors feature mapping.
    ----------------------------
    Args:
        shape_features (obj: 'dict'): The dictionary containing the feature metrics of the query shape
        db_features (obj: 'dict'): The dictionary containing the feature metrics of the database shapes
        k (int): The number of neighbors to return; the default value is specified in Settings

    Returns:
        neighbors (obj: 'tuple'): A pair of lists, the ids of the closest shapes and their respective
                                  distances to the query shape
    """
    ann = AnnoyIndex(56, 'euclidean')  # 56 features
    for id, featureList in db_features.items():
        features_flatten = flatten_features_array(featureList)
        ann.add_item(id, features_flatten)

    shape_features_flat = flatten_features_array(shape_features)

    # To get the neighbors, it is necessary to add the new item to the mapping first
    shape_id = ann.get_n_items()
    ann.add_item(shape_id, shape_features_flat)

    ann.build(s.CATEGORIES)

    neighbors = ann.get_nns_by_item(shape_id, k, include_distances=True)

    return neighbors
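flatten_features_array is defined elsewhere; a hypothetical version that turns the per-shape feature dictionary into the flat 56-value list the index expects (the exact feature layout is an assumption):

def flatten_features_array(features):
    # Hypothetical: concatenate scalar metrics and histogram bins into one flat list.
    flat = []
    for value in features.values():
        if isinstance(value, (list, tuple)):
            flat.extend(value)  # e.g. a distribution histogram
        else:
            flat.append(value)  # e.g. a single scalar metric
    return flat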
Example #11
def AddToTrain(individual):
    # `set` here is an external settings module (it shadows the builtin).
    global annoy_train
    global test_db
    global IND_SIZE
    global config

    max_memory = 5
    if set.get_master_volume() == 1:
        print(set.get_master_volume())
        set.set_master_volume(0.85)

        test_db.append(individual)
        print("SAVING TO TRAINING SET. TestDB Size: " + str(len(test_db)))

        annoy_train = AnnoyIndex(IND_SIZE)
        annoy_train.add_item(annoy_train.get_n_items(), individual)
        annoy_train.build(config["annoy_tree"])  # tree count comes from the config

        if len(test_db) > max_memory:
            test_db.pop(0)
            print("delete old memory entry")

    if set.get_master_volume() == 0:
        test_db = []
        # gen_record = []
        annoy_train = AnnoyIndex(IND_SIZE)
        annoy_train.build(config["annoy_tree"])  # build an empty index with the configured tree count
        print("clean set")
        set.set_master_volume(0.85)
Example #12
def build_index(embedding_fun, batch_size, sentences):
    ann = AnnoyIndex(D)
    batch_sentences = []
    batch_indexes = []
    last_indexed = 0
    num_batches = 0
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        with open('wiki.txt.uniq', 'r') as fr:
            for sindex, sentence in enumerate(fr):
                batch_sentences.append(sentence)
                batch_indexes.append(sindex)

                if len(batch_sentences) == batch_size:
                    context_embed = sess.run(
                        embedding_fun, feed_dict={sentences: batch_sentences})
                    for index in batch_indexes:
                        ann.add_item(index,
                                     context_embed[index - last_indexed])
                    # Reset the batch once, after all of its items have been indexed.
                    batch_sentences = []
                    batch_indexes = []
                    last_indexed += batch_size
                    if num_batches % 10000 == 0:
                        print_with_time('sindex: {} annoy_size: {}'.format(
                            sindex, ann.get_n_items()))
                    num_batches += 1
            if batch_sentences:
                context_embed = sess.run(
                    embedding_fun, feed_dict={sentences: batch_sentences})
                for index in batch_indexes:
                    ann.add_item(index, context_embed[index - last_indexed])
    return ann
Example #13
    def test_no_items(self):
        idx = AnnoyIndex(100)
        idx.build(n_trees=10)
        idx.save('foo.idx')
        idx = AnnoyIndex(100)
        idx.load('foo.idx')
        self.assertEqual(idx.get_n_items(), 0)
        self.assertEqual(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [])
Example #15
    def test_get_n_item(self):
        print "test_get_n_item"
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 3
        i = AnnoyIndex(f, 2, "test_db", 10, 1000, 3048576000, 0)
        i.create()

        i.add_item(0, [0, 0, 1])
        i.add_item(1, [0, 1, 0])
        i.add_item(2, [1, 0, 0])
        i1 = i.get_n_items(0)
        self.assertEqual(i1, [0, 0, 1])
        i2 = i.get_n_items(1)
        self.assertEqual(i2, [0, 1, 0])
        i3 = i.get_n_items(2)
        self.assertEqual(i3, [1, 0, 0])
Example #17
    def test_only_one_item(self):
        # reported to annoy-user by Kireet Reddy
        idx = AnnoyIndex(100)
        idx.add_item(0, numpy.random.randn(100))
        idx.build(n_trees=10)
        idx.save('foo.idx')
        idx = AnnoyIndex(100)
        idx.load('foo.idx')
        self.assertEqual(idx.get_n_items(), 1)
        self.assertEqual(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [0])
Example #19
def main():
    args = setup_args()
    print_with_time(args)

    start_time = time.time()
    ann = AnnoyIndex(args.vector_size, metric='angular')
    ann.load(args.ann)
    end_time = time.time()
    print('Load Time: {}'.format(end_time - start_time))

    print_with_time('Annoy Index: {}'.format(ann.get_n_items()))

    start_time = time.time()
    df = read_data(args.csv_file_path, args.filter_data)
    content_array = df.to_numpy()
    end_time = time.time()
    print_with_time('Sentences: {} Time: {}'.format(len(content_array), end_time - start_time))

    # start_time = time.time()
    # embed_fn = hub.load(args.use_model)
    # end_time = time.time()
    # print_with_time('Model loaded time: {}'.format(end_time - start_time))

    random_projection_matrix = None

    if args.random_projection:
        if os.path.exists('random_projection_matrix'):
            print("Loading random projection matrix...")
            with open('random_projection_matrix', 'rb') as handle:
                random_projection_matrix = pickle.load(handle)
            print('random projection matrix is loaded.')

    while True:
        input_sentence_id = input('Enter sentence id: ').strip()

        if input_sentence_id == 'q':
            return

        print_with_time('Input Sentence: {}'.format(input_sentence_id))
        query_filter = 'GUID == "' + input_sentence_id + '"'
        input_data_object = df.query(query_filter)
        input_sentence = input_data_object['CONTENT']

        start_time = time.time()
        query_sentence_vector = generate_embeddings(input_sentence.values[0], args.use_model, random_projection_matrix)
        print_with_time('vec done')
        similar_sentences = find_similar_items(ann, query_sentence_vector, content_array, args.k)
        end_time = time.time()
        print_with_time('nns done: Time: {}'.format(end_time - start_time))
        for sentence in similar_sentences[1:]:
            if args.filter_data:
                if sentence[2] in ['country-related', 'person-related']:
                    print(sentence[0])
            else:
                print(sentence[0])
Example #20
def test_dense_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    index = build_annoy_index(data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5)
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)
Example #21
    def test_build_unbuild(self):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        for j in range(1000):
            i.add_item(j, [random.gauss(0, 1) for x in range(f)])
        i.build(10)

        # Repeated unbuild/build cycles must not lose any items.
        for j in range(100):
            i.unbuild()
            i.build(10)

        self.assertEqual(i.get_n_items(), 1000)
Example #23
def get_similar_sentences(query):
    embeddings = load_embeddings()
    sentence_ids = load_sentence_ids()
    index = AnnoyIndex(get_embeddings_dim(embeddings), "angular")
    index.load("index.ann")
    print("Found {} items in the index.".format(index.get_n_items()))
    print("The index uses {} trees.".format(index.get_n_trees()))
    print("")
    closest, dists = index.get_nns_by_vector(
        embeddings[query], 10, include_distances=True)  # noqa: E501
    assert (len(closest) == len(dists))
    closest = map(lambda sid: sentence_ids[sid], closest)
    return zip(closest, dists)
Example #24
def generate_extra_pair_basis(basis,
                              X,
                              n_neighbors,
                              tree: AnnoyIndex,
                              distance='euclidean',
                              verbose=True):
    '''Generate pairs that connects the extra set of data to the fitted basis.
    '''
    npr, dimp = X.shape

    assert (
        basis is not None or tree is not None
    ), "If the annoyindex is not cached, the original dataset must be provided."

    # Build the tree again if not cached
    if tree is None:
        n, dim = basis.shape
        assert dimp == dim, "The dimension of the original dataset is different from the new one's."
        tree = AnnoyIndex(dim, metric=distance)
        if _RANDOM_STATE is not None:
            tree.set_seed(_RANDOM_STATE)
        for i in range(n):
            tree.add_item(i, basis[i, :])
        tree.build(20)
    else:
        n = tree.get_n_items()

    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    nbrs = np.zeros((npr, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((npr, n_neighbors_extra), dtype=np.float32)

    for i in range(npr):
        nbrs[i, :], knn_distances[i, :] = tree.get_nns_by_vector(
            X[i, :], n_neighbors_extra, include_distances=True)

    print_verbose("Found nearest neighbor", verbose)
    # sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    # print_verbose("Calculated sigma", verbose)

    # Debug
    # print_verbose(f"Sigma is of the scale of {sig.shape}", verbose)
    # print_verbose(f"KNN dist is of shape scale of {knn_distances.shape}", verbose)
    # print_verbose(f"nbrs max: {nbrs.max()}", verbose)

    # scaling the distances is not possible since we don't always track the basis
    # scaled_dist = scale_dist(knn_distances, sig, nbrs)
    print_verbose("Found scaled dist", verbose)

    pair_neighbors = sample_neighbors_pair_basis(n, X, knn_distances, nbrs,
                                                 n_neighbors)
    return pair_neighbors
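A small usage sketch with random stand-in data, assuming the module-level helpers (sample_neighbors_pair_basis, print_verbose, _RANDOM_STATE) behave as the code above implies:

import numpy as np

basis = np.random.randn(1000, 50).astype(np.float32)  # the fitted dataset
X_new = np.random.randn(20, 50).astype(np.float32)    # extra points to connect to it
pairs = generate_extra_pair_basis(basis, X_new, n_neighbors=10, tree=None)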
Example #25
    def merge_indicies(self, index_file_a, index_file_b, sender_urn):
        logger.info("Merging {0} and {1} for {2} index".format(index_file_a, index_file_b, sender_urn))
        index_a = AnnoyIndex(self.feat_size, metric='euclidean')
        index_b = AnnoyIndex(self.feat_size, metric='euclidean')
        new_index = AnnoyIndex(self.feat_size, metric='euclidean')

        index_a.load(index_file_a)
        index_b.load(index_file_b)

        cnt = 0
        for i in range(index_a.get_n_items()):
            new_index.add_item(cnt, index_a.get_item_vector(i))
            cnt += 1

        for i in range(index_b.get_n_items()):
            new_index.add_item(cnt, index_b.get_item_vector(i))
            cnt += 1


        new_index_file = index_file_a + ".merged"

        index_a.unload()
        index_b.unload()

        new_index.build(self.n_trees)
        new_index.save(new_index_file)
        logger.info("Merging {0} and {1} for {2} index, total number of items: {3}".format(
                index_file_a,
                index_file_b,
                sender_urn,
                cnt))

        new_index.unload()
        pykka.ActorRegistry.get_by_urn(sender_urn).proxy().complete_compaction(
                new_index_file=new_index_file,
                index_file_a=index_file_a,
                index_file_b=index_file_b
        )
Example #26
def make_pairs(id):
    index = AnnoyIndex(dim, 'euclidean')
    index.load(model_path)
    # Clamp k between 50 and 200 (scaled to a fifth of the index), without exceeding the item count.
    k = min(index.get_n_items() - 1,
            max(50, min(index.get_n_items() // 5, 200)))
    last_k = 0
    found = 0
    i = 0
    # Loop until we find a satisfactory number of pairs (15 for now), or for 5 iterations.
    while found < 15 and i < 5 and k < index.get_n_items():
        i += 1
        neighbors = get_neighbors(id, k)[last_k:]
        for neighbor in neighbors:
            if neighbor == id:
                continue
            p = Pair.query.filter_by(hash=str(
                hash(f'{min(id, neighbor)}-{max(id, neighbor)}'))).first()
            print(p)
            if not p:
                found += 1
                new_pair = Pair(min(id, neighbor), max(id, neighbor))
                db_session.add(new_pair)
                db_session.commit()
Example #27
def test_build_sparse_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    sparse_data = csr_matrix(data)

    index = build_annoy_index(sparse_data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
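Both this test and Example #20 assume a build_annoy_index helper. A minimal sketch consistent with their assertions (dense or scipy-sparse input, angular metric, index saved to disk); the default tree count is an arbitrary choice:

from annoy import AnnoyIndex
from scipy.sparse import issparse

def build_annoy_index(data, index_file, n_trees=10):
    index = AnnoyIndex(data.shape[1], metric='angular')
    for i in range(data.shape[0]):
        # Sparse rows are densified one at a time to keep memory flat.
        row = data[i].toarray().ravel() if issparse(data) else data[i]
        index.add_item(i, row)
    index.build(n_trees)
    index.save(index_file)
    return index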
Example #28
def baseline_train(olddata, f, trees):
    """olddata to train with, using f number of features of the data and building an index with trees number of trees."""
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if os.path.isfile(saving_model):
        print("Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data...")
        t.load(saving_model)
    else:
        print("Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data...")
        for i in olddata.index:
            v = list(olddata.loc[i, ['latitude', 'longitude', 'time_period']])
            t.add_item(i, v)
        print("Building the trees...")
        t.build(trees)
        assert t.get_n_items() == olddata.shape[0]
        print("Saving the model...")
        t.save(saving_model)  # Can easily be loaded into memory later.
    return t
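Once baseline_train returns, the index can answer nearest-neighbor queries for holdout records; a sketch with a made-up query point, assuming olddata is the training DataFrame from the surrounding script:

t = baseline_train(olddata, 3, trees=10)
# The 5 training rows nearest to a hypothetical (latitude, longitude, time_period) query.
ids, dists = t.get_nns_by_vector([40.7, -74.0, 3], 5, include_distances=True)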
Example #30
def main():
    t = AnnoyIndex(99, metric='euclidean')
    lines = dict()
    lookup = dict()

    prompt_word = input("Get the nearest semantic neighbors of: ")
    prompt_vec = find_glove_vector(prompt_word)

    print("loading...")
    index = 0
    for row in open("semantic_vectors_weighted91.txt"):
        spl = row.find("@@@")
        line = row[0:spl - 1].lower()
        vec = row[spl + 3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if line in lookup:
            continue
        t.add_item(index, vals)
        lines[index] = line
        lookup[line] = [index]
        index += 1
        if index % 50000 == 0:
            print(line)
            print("{0} vectors loaded".format(index))

    last_index = index  # `index` is already the next free slot; `index + 1` would leave a hole
    # Add the prompt vector so its neighbors can be calculated.
    t.add_item(last_index, prompt_vec)
    lookup[prompt_word] = [last_index]
    lines[last_index] = prompt_word

    t.build(100)
    print("done.")

    print("Num dict items: {0}".format(len(lookup)))
    print("Num list items: {0}".format(len(lines)))
    print("Num index items: {0}".format(t.get_n_items()))

    try:
        vec = prompt_vec
        print(nn_lookup(t, vec))
        print([lines[i[0]] for i in nn_lookup(t, vec)])
    except KeyError:
        print("not found")
Example #31
def build_index(annoy_vector_dimension, embedding_fun, batch_size, sentences,
                content_array, stop_words, content_index):
    ann = AnnoyIndex(annoy_vector_dimension, metric='angular')
    batch_sentences = []
    batch_indexes = []
    last_indexed = 0
    num_batches = 0
    content = ''

    with tf.compat.v1.Session() as sess:
        sess.run([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        for sindex, sentence in enumerate(content_array):
            content = sentence[content_index]
            if stop_words:
                content = remove_stopwords(content)

            batch_sentences.append(content)
            batch_indexes.append(sindex)

            if len(batch_sentences) == batch_size:
                context_embed = sess.run(
                    embedding_fun, feed_dict={sentences: batch_sentences})

                for index in batch_indexes:
                    ann.add_item(index, context_embed[index - last_indexed])
                # Reset the batch once, after all of its items have been indexed.
                batch_sentences = []
                batch_indexes = []

                last_indexed += batch_size
                if num_batches % 10000 == 0:
                    print_with_time('sindex: {} annoy_size: {}'.format(
                        sindex, ann.get_n_items()))

                num_batches += 1

        if batch_sentences:
            context_embed = sess.run(embedding_fun,
                                     feed_dict={sentences: batch_sentences})
            for index in batch_indexes:
                ann.add_item(index, context_embed[index - last_indexed])

    return ann
Example #32
    def load(self):
        logger.info("Loading index {0}".format(self.actor_urn))
        # Unload any previously loaded indexes before reloading them from disk.
        for index in getattr(self, "indexes", []):
            index.unload()
        self.prev_id = -1
        self.indexes = []

        for f in sorted(listdir(self.index_dir)):
            if f.endswith(".ann"):
                self.index_files.append(join(self.index_dir,f))
                index = AnnoyIndex(self.feat_size, metric='euclidean')
                index.load(join(self.index_dir, f))
                self.indexes.append(index)
                self.prev_id += index.get_n_items()
            elif f.endswith('saved_state'):
                self.mem_store = np.load(join(self.index_dir, f)).tolist()
        logger.info("Loaded {0} files with total {1} records for index {2}"
                    .format(len(self.indexes), self.prev_id + 1, self.actor_urn))
Example #33
def annotate_all_questions():
    embeddings = load_embeddings()
    sentence_ids = load_sentence_ids()
    index = AnnoyIndex(get_embeddings_dim(embeddings), "angular")
    index.load("index.ann")
    print("Found {} items in the index.".format(index.get_n_items()))
    print("The index uses {} trees.".format(index.get_n_trees()))
    print("")

    df = pd.concat(map(dm.ALLEN_AI_OBQA, list(OBQAType)))
    annotations = {}
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        for answer in row.answers:
            sent = row.question + " " + answer
            closest = index.get_nns_by_vector(embeddings[sent], 75)
            closest = list(map(lambda sid: sentence_ids[sid], closest))
            annotations[sent] = closest
    pickle.dump(annotations, open("annotations.pkl", "wb"))
    print("Annotations written to annotations.pkl")
Example #34
def main():
    args = setup_args()
    print_with_time(args)

    start_time = time.time()
    ann = AnnoyIndex(D)
    ann.load(args.ann)
    end_time = time.time()
    print('Load Time: {}'.format(end_time - start_time))

    print_with_time('Annoy Index: {}'.format(ann.get_n_items()))

    start_time = time.time()
    sentences = load_sentences(args.sentences)
    end_time = time.time()
    print_with_time('Sentences: {} Time: {}'.format(len(sentences), end_time - start_time))

    start_time = time.time()
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/1")
    sentences_ph = tf.placeholder(dtype=tf.string, shape=[None])
    embedding_fun = embed(sentences_ph)

    sess = tf.Session()
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    end_time = time.time()

    print_with_time('Ready! TF setup time: {}'.format(end_time - start_time))
    while True:
        input_sentence = input('Enter sentence: ').strip()

        if input_sentence == 'q':
            return
        print_with_time('Input Sentence: {}'.format(input_sentence))
        start_time = time.time()
        sentence_vector = sess.run(embedding_fun, feed_dict={sentences_ph:[input_sentence]})
        print_with_time('vec done')
        nns = ann.get_nns_by_vector(sentence_vector[0], args.k)
        end_time = time.time()
        print_with_time('nns done: Time: {}'.format(end_time-start_time))
        similar_sentences = [sentences[nn] for nn in nns]
        for sentence in similar_sentences:
            print(sentence)
Example #35
class AnnoyClient:
    DIMENSION = 100

    def __init__(self, index_file: str, id_list: List[str]):
        print('Initializing AnnoyIndex...')
        self.index = AnnoyIndex(self.DIMENSION, 'angular')
        self.index.load(index_file)
        self.id_list = id_list
        print('Done')

    def search(self, query: List[float], n: int = 100) -> List[dict]:
        items = self.index.get_nns_by_vector(query, n, include_distances=False)
        print(items)
        return [{
            'id': self.id_list[i],
            'rank': r + 1
        } for (r, i) in enumerate(items)]

    def get_total_count(self) -> int:
        return self.index.get_n_items()
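Typical construction and search, with a placeholder index file and ids (the id_list must cover every item stored in the index):

client = AnnoyClient('vectors.ann', id_list=['doc-0', 'doc-1', 'doc-2'])
hits = client.search([0.1] * AnnoyClient.DIMENSION, n=2)
print(hits)  # e.g. [{'id': 'doc-1', 'rank': 1}, {'id': 'doc-0', 'rank': 2}]
print(client.get_total_count())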
Example #36
def build_save_ann_from_iter_lookup(sentence_id_iter,
                                    lookup_fun,
                                    ann_file,
                                    num_trees=10,
                                    log_freq=1000,
                                    batch_size=32,
                                    encoder=None):
    if not encoder:
        encoder = USEEncoder()

    ann = AnnoyIndex(encoder.dim())

    sentences = []
    sentences_ids = []
    for sentence_id in sentence_id_iter:
        sentence = lookup_fun[sentence_id]
        sentence = sentence.strip()
        sentences.append(sentence)
        sentences_ids.append(sentence_id)

        if len(sentences) == batch_size:
            vectors = encoder.encode(sentences)
            for vector, sid in zip(vectors, sentences_ids):
                ann.add_item(sid, vector)
            sentences = []
            sentences_ids = []

            if ann.get_n_items() % (batch_size * log_freq) == 0:
                logging.info(f'Indexed: {ann.get_n_items()}')

    if sentences:
        vectors = encoder.encode(sentences)
        for vector, sid in zip(vectors, sentences_ids):
            ann.add_item(sid, vector)

    logging.info(f'Final Indexed: {ann.get_n_items()}')
    ann.build(num_trees)
    ann.save(ann_file)
    return ann
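build_save_ann_from_iter_lookup falls back to a USEEncoder with dim() and encode(sentences) methods; that class is not shown. A hypothetical TF2 wrapper around the Universal Sentence Encoder (the hub URL and the fixed 512-dim width are assumptions):

import tensorflow_hub as hub

class USEEncoder:
    def __init__(self, url="https://tfhub.dev/google/universal-sentence-encoder/4"):
        self._embed = hub.load(url)

    def dim(self):
        return 512  # output width of this encoder

    def encode(self, sentences):
        return self._embed(sentences).numpy()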
Example #37
def build_index(batch_size, content_array, model_url, random_projection_matrix):
    VECTOR_LENGTH = 512

    if random_projection_matrix is not None:
        VECTOR_LENGTH = 64

    ann = AnnoyIndex(VECTOR_LENGTH, metric=METRIC)

    batch_sentences = []
    batch_indexes = []
    last_indexed = 0
    num_batches = 0

    for sindex, sentence in enumerate(content_array):
        # sentence_embedding = generate_embeddings(sentence[1], model_url, random_projection_matrix)
        # ann.add_item(sindex, sentence_embedding[0])

        batch_sentences.append(sentence[1])
        batch_indexes.append(sindex)

        if len(batch_sentences) == batch_size:
            context_embed = generate_embeddings(batch_sentences, model_url, random_projection_matrix)
            for index in batch_indexes:
                ann.add_item(index, context_embed[index - last_indexed])
            # Reset the batch once, after all of its items have been indexed.
            batch_sentences = []
            batch_indexes = []
            last_indexed += batch_size
            if num_batches % 10000 == 0:
                print_with_time('sindex: {} annoy_size: {}'.format(sindex, ann.get_n_items()))
            num_batches += 1

    if batch_sentences:
        context_embed = generate_embeddings(batch_sentences, model_url, random_projection_matrix)
        for index in batch_indexes:
            ann.add_item(index, context_embed[index - last_indexed])

    return ann
Example #38
def main():
    t = AnnoyIndex(200, metric='euclidean')
    lines = list()
    lookup = dict()

    print("loading...")
    index = 0
    for row in open("phonetic_vectors_every2_d200_reformatted.txt"):
        spl = row.find("@@@")
        line = row[0:spl - 1]
        stripped_line = line[2:-1].lower()  #skip the b''
        vec = row[spl + 3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if stripped_line in lookup:
            continue
        lookup[stripped_line] = index
        lines.append(stripped_line)
        t.add_item(index, vals)
        index += 1
        if index % 50000 == 0:
            print(stripped_line.lower())
            print("{0} vectors loaded".format(index))
    t.build(100)
    print("done.")

    print("Num dict items: {0}".format(len(lookup)))
    print("Num list items: {0}".format(len(lines)))
    print("Num index items: {0}".format(t.get_n_items()))

    try:
        vec = lookup["skating on thin ice"]
        print(vec)
        print(t.get_item_vector(vec))
        print(nn_lookup(t, t.get_item_vector(vec)))
        print([lines[i[0]] for i in nn_lookup(t, t.get_item_vector(vec))])
    except KeyError:
        print("not found")
Example #39
def r_neighbors(shape_features: dict, db_features: dict, r=s.RNN_RANGE) -> tuple:
    """
    Determines the shapes closest to the query shape by running R-Nearest Neighbors on an
    N-dimensional Approximate Nearest Neighbors feature mapping.
    ----------------------------
    Args:
        shape_features (obj: 'dict'): The dictionary containing the feature metrics of the query shape
        db_features (obj: 'dict'): The dictionary containing the feature metrics of the database shapes
        r (float): The distance range; the default value is specified in Settings

    Returns:
        range_neighbors (obj: 'tuple'): A pair of lists, the ids of the shapes within range r and their
                                        respective distances to the query shape
    """
    ann = AnnoyIndex(56, 'euclidean')  # 56 features
    for id, featureList in db_features.items():
        features_flatten = flatten_features_array(featureList)
        ann.add_item(id, features_flatten)

    shape_features_flat = flatten_features_array(shape_features)

    # To get the neighbors, it is necessary to add the new item to the mapping first
    shape_id = ann.get_n_items()
    ann.add_item(shape_id, shape_features_flat)

    ann.build(s.CATEGORIES)

    neighbors = ann.get_nns_by_item(shape_id, 200, include_distances=True)

    range_neighbors = ([], [])
    for i, distance in enumerate(neighbors[1]):
        if distance < r:
            range_neighbors[0].append(neighbors[0][i])
            range_neighbors[1].append(distance)

    return range_neighbors
Example #40
class Memory:
    def __init__(self, capacity, state_dim, value_dim):
        self.capacity = capacity
        print("state_dim:", state_dim)
        self.states = np.zeros((capacity, state_dim))
        self.values = np.zeros((capacity, value_dim))

        self.curr_capacity = 0
        self.curr_ = 0
        self.lru = np.zeros(capacity)
        self.tm = 0

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index = AnnoyIndex(state_dim)
        self.index.set_seed(123)
        self.update_size = 1
        self.build_capacity = 0

    def sample_knn_test(self, state, k):
        inds, dists = self.index.get_nns_by_vector(state,
                                                   k,
                                                   include_distances=True)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample_knn(self, states, k):
        dists = []
        inds = []
        for state in states:
            ind, dist = self.index.get_nns_by_vector(state,
                                                     k,
                                                     include_distances=True)
            inds.append(ind)
            dists.append(dist)
        # inds = np.reshape(np.array(inds), -1)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample(self, n_samples):
        if self.curr_capacity < n_samples or n_samples == 0:
            idx = np.random.choice(np.arange(len(self.states)),
                                   n_samples,
                                   replace=False)
        else:
            idx = np.random.choice(np.arange(self.curr_capacity),
                                   n_samples,
                                   replace=False)
        self.tm += 0.01
        self.lru[idx] = self.tm
        embs = self.states[idx]
        values = self.values[idx]

        return embs, values

    def add_knn(self, states, values):
        self._add_knn(states, values)

    def add_knn_lru(self, states, values):
        self._add_knn(states, values, lru=True)

    def add(self, states, values):
        self._add(states, values)

    def add_lru(self, states, values):
        self._add(states, values, lru=True)

    def add_rand(self, states, values):
        self._add(states, values, rand=True)

    def _insert(self, states, values, indices):
        self.cached_states = self.cached_states + states
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices
        if len(self.cached_states) >= self.update_size:
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            self.states[ind] = self.cached_states[i]
            self.values[ind] = self.cached_values[i]
            self.index.add_item(ind, self.cached_states[i])

        self.index.build(50)
        self.build_capacity = self.curr_capacity

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _add_knn(self, states, values, lru=False):
        # print(states)
        indices = []
        states_ = []
        values_ = []
        for i, _ in enumerate(states):
            if lru:
                if self.curr_capacity >= self.capacity:
                    ind = np.argmin(self.lru)
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1
            else:
                if self.curr_capacity >= self.capacity:
                    self.curr_ = (self.curr_ + 1) % self.capacity
                    ind = self.curr_
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1

            self.lru[ind] = self.tm
            indices.append(ind)
            states_.append(states[i])
            values_.append(values[i])
        self._insert(states_, values_, indices)

    def _add(self, states, values, rand=False, lru=False):
        for i, state in enumerate(states):
            if self.curr_capacity < self.capacity:
                self.curr_ = (self.curr_ + 1) % self.capacity
                # self.states[self.curr_] = state
                # self.values[self.curr_] = values[i]
                self.curr_capacity += 1
            else:
                if lru:
                    self.curr_ = np.argmin(self.lru)
                if rand:
                    self.curr_ = np.random.choice(np.arange(
                        self.curr_capacity),
                                                  1,
                                                  replace=False)

                if not lru and not rand:
                    self.curr_ = (self.curr_ + 1) % self.capacity
            self.states[self.curr_] = state
            self.values[self.curr_] = values[i]

    @property
    def length(self):
        # assert self.index.get_n_items() == self.curr_capacity
        # return self.curr_capacity
        return self.index.get_n_items()
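A short usage sketch of the Memory class with random data; the dimensions and counts are arbitrary, and the Annoy index is rebuilt on insert because update_size is 1:

import numpy as np

mem = Memory(capacity=1000, state_dim=8, value_dim=1)
states = [np.random.randn(8) for _ in range(32)]
values = [np.random.randn(1) for _ in range(32)]
mem.add_knn(states, values)  # inserts the batch and rebuilds the Annoy index
nn_states, nn_values, dists = mem.sample_knn(states[:4], k=5)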
Example #41
class NearSentence(object):
    def __init__(self, fn_word, model_name, model_path):
        self.model = QueryModel(fn_word, model_name, model_path)
        self.queries = []
        self.titles = []

        self.query_index = 0
        self.title_index = 0
        self.query_ann = AnnoyIndex(self.model.dim, metric='euclidean')
        self.title_ann = AnnoyIndex(self.model.dim, metric='euclidean')

    def load_queries(self, fn_query, column):
        print('[In load_queries] Load candidate queries')
        sentences = []
        chunk = []

        vecs = []
        with open(fn_query, encoding='utf8') as fin:
            for line in fin:
                ll = line.strip().split('\t')
                if len(ll) < column:
                    continue
                chunk.append(ll[column - 1])
                if len(chunk) == 1000:
                    vec, valid_sentence = self.model.get_query_vec(chunk)
                    vec = vec / np.sqrt(np.sum(vec**2, 1, keepdims=True))
                    vecs.extend(list(vec))
                    sentences.extend(valid_sentence)
                    chunk = []
        if len(chunk) > 0:
            vec, valid_sentence = self.model.get_query_vec(chunk)
            vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True))  # normalize the final partial chunk too
            vecs.extend(list(vec))
            sentences.extend(valid_sentence)

        print('[In load_queries] Build query annoy tree')
        for s, v in zip(sentences, vecs):
            self.queries.append(s)
            # if vecs == [0] * self.vectorizer.dim:
            #     continue
            self.query_ann.add_item(self.query_index, v)
            self.query_index += 1

        self.query_ann.build(10)
        print('[In load_queries] Size of tree = {}'.format(self.query_ann.get_n_items()))

    def load_titles(self, fn_title, column):
        print('[In load_titles] Load candidate titles')
        sentences = []

        chunk = []
        vecs = []
        with open(fn_title, encoding='utf8') as fin:
            for line in fin:
                ll = line.strip().split('\t')
                if len(ll) < column:
                    continue
                chunk.append(ll[column - 1])
                if len(chunk) == 1000:
                    vec, valid_sentence = self.model.get_title_vec(chunk)
                    vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True))
                    vecs.extend(list(vec))
                    sentences.extend(valid_sentence)
                    chunk = []
            if len(chunk) > 0:
                vec, valid_sentence = self.model.get_title_vec(chunk)
                vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True))
                vecs.extend(list(vec))
                sentences.extend(valid_sentence)

        print('[In load_titles] Build titles annoy tree, size = {}'.format(len(vecs)))

        for s, v in zip(sentences, vecs):
            self.titles.append(s)
            self.title_ann.add_item(self.title_index, v)     # v is one embedding row
            self.title_index += 1
        self.title_ann.build(10)
        print('[In load_titles] Size of tree = {}'.format(self.title_ann.get_n_items()))

    def get_k_nearest_query(self, query, k):
        # text_cutter is an external segmenter, assumed here to accept and return str.
        cut_data = text_cutter.process({'title': query})
        cut_query = cut_data['cut_title']
        vecs, valid_queries = self.model.get_query_vec([cut_query])
        if len(valid_queries) == 0:
            return []
        vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        vec = list(vecs)[0]

        k_neighbors, scores = self.query_ann.get_nns_by_vector(vec, n=k, include_distances=True)
        neighbors = []
        for i in k_neighbors:
            neighbors.append(self.queries[i])
        return sorted(zip(neighbors, scores), key=lambda x: x[-1])

    # def sim(self, u, v):
    #     norm_u = u / np.sqrt(np.sum(u ** 2, keepdims=True))
    #     norm_v = u /np.sqrt(np.sum(v ** 2, keepdims=True))
    #     return np.dot(norm_u, norm_v)

    def get_k_nearest_title(self, title, k):
        # text_cutter is assumed to accept and return str.
        cut_data = text_cutter.process({'title': title})
        title = cut_data['cut_title']
        vecs, valid_titles = self.model.get_title_vec([title])
        if len(valid_titles) == 0:
            return []
        vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        vec = list(vecs)[0]
        k_neighbors, scores = self.title_ann.get_nns_by_vector(vec, n=k, include_distances=True)
        neighbors = []
        for i in k_neighbors:
            neighbors.append(self.titles[i])
        return sorted(zip(neighbors, scores), key=lambda x: x[-1])

    def get_answers(self, query, k):
        # text_cutter is assumed to accept and return str.
        cut_data = text_cutter.process({'title': query})
        cut_query = cut_data['cut_title']
        vecs, valid_queries = self.model.get_query_vec([cut_query])
        if len(valid_queries) == 0:
            return []

        vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        vec = list(vecs)[0]
        # recall titles according to cosine similarity
        candidate_titles_index, scores = self.title_ann.get_nns_by_vector(vec, n=k*10, include_distances=True)

        # rank candidate titles using model
        candidate_titles = []
        for i in candidate_titles_index:
            candidate_titles.append(self.titles[i])

        ranks = self.model.rank_titles(cut_query, candidate_titles)[:k]
        return ranks

    def process(self, data):
        res = {}
        if 'titles' in data:
            res['title_nns'] = self.get_k_nearest_title(data['titles'], 10)
        if 'queries' in data:
            res['query_nns'] = self.get_k_nearest_query(data['queries'], 10)
        return json.dumps(res, ensure_ascii=False).encode('utf8')