Example #1
class ImageSearchAnnoyCombo:
    '''
    load an Annoy index for approximate nearest neighbor computation
    Annoy's angular distance is dist(u,v) = sqrt(2(1-cos(u,v)))
    '''
    def __init__(self,h5fname = 'X_ILSVRC2015.hdf5',annf='ILSVRC2015.ann',imageListPath = '/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt',dset = 'fc6fc7'):
        #load h5 data
        h5f = h5py.File(h5fname,'r')
        self.X = h5f[dset]
        #load filenames
        with open(imageListPath,'r') as f:
            self.line_to_file = {i:line.rstrip() for i,line in enumerate(f)}
        self.A = AnnoyIndex(self.X.shape[1],'angular')
        self.A.load(annf)

    def run_query_approx(self,query,n=100,accuracy_factor = 5):
        nearest,scores = self.A.get_nns_by_vector(query, n, search_k=n*int(accuracy_factor)*128, include_distances=True)
        return zip((self.line_to_file[i] for i in nearest),scores)

    def run_query_exact(self,query,n=1000,nsmall=100):
        #retrieve approximate nearest neighbors using Annoy, then do exact ranking by loading from h5 into memory
        #use Annoy
        if n < nsmall:
            n = nsmall
        indexes = self.A.get_nns_by_vector(query, n, search_k=-1, include_distances=False)
        indexes_sorted = sorted(indexes)
        #use scipy cdist (or normalize first and do dot product for faster computation)
        #getting X by index from disc is very slow. 
        distance = (cdist(self.X[indexes_sorted], query.reshape((1,query.shape[0])), 'cosine'))[:,0]
        ind = np.argpartition(distance, nsmall)[:nsmall]#partial sort, indices for top n,
        s_ind = np.argsort(distance[ind])#sort 
        nearest = ind[s_ind]
        scoresorted = distance[ind][s_ind]
        return zip((self.line_to_file[indexes_sorted[i]] for i in nearest),scoresorted)
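Example #1's docstring relies on the relation between Annoy's angular distance and cosine similarity. A minimal sketch of converting the returned distances back to cosine similarities, assuming an ImageSearchAnnoyCombo instance as above (results_with_cosine is an illustrative name, not part of the original):

# Since dist(u, v) = sqrt(2 * (1 - cos(u, v))), we have cos(u, v) = 1 - dist**2 / 2.
def results_with_cosine(searcher, query_vector, n=100):
    results = searcher.run_query_approx(query_vector, n=n)
    # each result is a (filename, angular_distance) pair
    return [(fname, 1.0 - d ** 2 / 2.0) for fname, d in results]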
Example #2
    def test1_add_item_1(self):
        print "test_set_root"
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 3
        i = AnnoyIndex(f, 2, "test_db", 10, 1000, 3048576000, 0)
        #i.verbose(True)
        i.create()
        for k in range(10):
            i.display_node(k)

        i.add_item(0, [0, 0, 1])
        print "after adding 1 data"
        for k in range(10):
            i.display_node(k)

        i.add_item(1, [0, 1, 0])
        print "after adding 2 data"
        for k in range(10):
            i.display_node(k)

        i.add_item(2, [1, 0, 0])
        print "after adding 3 data"
        for k in range(100):
            print "node %d" % k
            i.display_node(k)

       

        print "get nns by vector [3,2,1]"
        self.assertEqual(i.get_nns_by_vector([3, 2, 1], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 2, 3], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([2, 0, 1], 3), [2, 0, 1])
Example #3
    def test_get_nns_by_vector(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [2, 2])
        i.add_item(1, [3, 2])
        i.add_item(2, [3, 3])
        i.build(10)

        self.assertEqual(i.get_nns_by_vector([4, 4], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 1], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([4, 2], 3), [1, 2, 0])
Example #4
    def test_get_nns_by_vector(self):
        f = 3
        i = AnnoyIndex(f)
        i.add_item(0, [0, 0, 1])
        i.add_item(1, [0, 1, 0])
        i.add_item(2, [1, 0, 0])
        i.build(10)

        self.assertEqual(i.get_nns_by_vector([3, 2, 1], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 2, 3], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([2, 0, 1], 3), [2, 0, 1])
Example #5
    def test_get_nns_by_vector(self):
        print "test_get_nns_by_vector "
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 3
        i = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 0)
        i.add_item(0, [0, 0, 1])
        i.add_item(1, [0, 1, 0])
        i.add_item(2, [1, 0, 0])
      

        self.assertEqual(i.get_nns_by_vector([3, 2, 1], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 2, 3], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([2, 0, 1], 3), [2, 0, 1])
Example #6
    def recall_at(self, n, n_trees=10, n_points=1000, n_rounds=5):
        # the best movie/variable name
        total_recall = 0.

        for r in range(n_rounds):
            # create random points at distance x
            f = 10
            idx = AnnoyIndex(f, 'dot')

            data = numpy.array([
                [random.gauss(0, 1) for z in range(f)]
                for j in range(n_points)
            ])

            expected_results = [
                sorted(
                    range(n_points),
                    key=lambda j: dot_metric(data[i], data[j])
                )[:n]
                for i in range(n_points)
            ]

            for i, vec in enumerate(data):
                idx.add_item(i, vec)

            idx.build(n_trees)

            for i in range(n_points):
                nns = idx.get_nns_by_vector(data[i], n)
                total_recall += recall(nns, expected_results[i])

        return total_recall / float(n_rounds * n_points)
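Example #6 calls two helpers, dot_metric and recall, that are not shown in the snippet. A plausible minimal sketch, under the assumption that dot_metric must rank larger dot products as closer (so the ascending sort puts true neighbors first) and that recall is the retrieved fraction of the expected neighbors:

import numpy

def dot_metric(a, b):
    # negated so that sorting ascending puts the largest dot products first
    return -numpy.dot(a, b)

def recall(retrieved, expected):
    # fraction of the expected neighbors that were actually retrieved
    return len(set(retrieved) & set(expected)) / float(len(expected))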
Example #7
    def test_single_vector(self):
        # https://github.com/spotify/annoy/issues/194
        a = AnnoyIndex(3)
        a.add_item(0, [1, 0, 0])
        a.build(10)
        a.save('1.ann')
        self.assertEquals(a.get_nns_by_vector([1, 0, 0], 3, include_distances=True), ([0], [0.0]))
Example #8
    def test_overwrite_index(self):
        # Issue #335
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)
        t.save('test.ann')

        # Load index file
        t2 = AnnoyIndex(f)
        t2.load('test.ann')

        # Overwrite index file
        t3 = AnnoyIndex(f)
        for i in range(500):
            v = [random.gauss(0, 1) for z in range(f)]
            t3.add_item(i, v)
        t3.build(10)
        if os.name == 'nt':
            # Can't overwrite on Windows
            with self.assertRaises(IOError):
                t3.save('test.ann')
        else:
            t3.save('test.ann')
            # Get nearest neighbors
            v = [random.gauss(0, 1) for z in range(f)]
            nns = t2.get_nns_by_vector(v, 1000)  # Should not crash
Example #9
    def retrieve(self):

        print 'Loading necessary files..'
        u = AnnoyIndex(self.dim, metric='angular')
        u.load(index_file)

        print 'ANN Retrieval..'
        for n_neighbors in knns:
            print 'Number of neighbors: ' + str(n_neighbors)
            for mult in self.multipliers:
                print 'Multiplier: ' + str(mult)
                search_k = self.n_trees * n_neighbors * mult
                filename = '.'.join((self.test_file.split('/')[-1].split('.')[:-1]))
                with open(self.test_file, 'r') as data_file:
                    data = json.load(data_file)
                    qArray = []
                    for i in range(len(data["questions"])):
                        question_body = data["questions"][i]["body"]
                        question_id = data["questions"][i]["id"]
                        qcentroid = np.transpose(np.array(get_centroid_idf(question_body, self.emb, self.idf, self.stopwords, self.dim)))

                        anns = u.get_nns_by_vector(qcentroid, n_neighbors, search_k)
                        doc_anns = []
                        for n in anns:
                            doc_anns.append(self.idmap[n])
                        q = Question(question_body, question_id, doc_anns)
                        qArray.append(q)
                    directory = "system_results/"
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    with open(str(directory)+"/"+"CentIDF_annoy_"+str(self.n_trees)+"_"+str(n_neighbors)+"_"+str(mult)+".json", "w+") as outfile:
                        outfile.write(json.dumps({"questions":[ob.__dict__ for ob in qArray]}, indent=2))
Example #10
    def test_many_vectors(self):
        f = 10
        i = AnnoyIndex(f, 'hamming')
        for x in range(100000):
            i.add_item(x, numpy.random.binomial(1, 0.5, f))
        i.build(10)

        rs, ds = i.get_nns_by_vector([0]*f, 10000, include_distances=True)
        self.assertGreaterEqual(min(ds), 0)
        self.assertLessEqual(max(ds), f)

        dists = []
        for x in range(1000):
            rs, ds = i.get_nns_by_vector(numpy.random.binomial(1, 0.5, f), 1, search_k=1000, include_distances=True)
            dists.append(ds[0])
        avg_dist = 1.0 * sum(dists) / len(dists)
        self.assertLessEqual(avg_dist, 0.42)
Example #11
    def test_get_nns_by_vector(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [2,2])
        i.add_item(1, [3,2])
        i.build(10)

        self.assertEquals(i.get_nns_by_vector([3,3], 2), [1, 0])
Example #12
    def test_no_items(self):
        idx = AnnoyIndex(100)
        idx.build(n_trees=10)
        idx.save('foo.idx')
        idx = AnnoyIndex(100)
        idx.load('foo.idx')
        self.assertEquals(idx.get_n_items(), 0)
        self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [])
Example #13
class FeatureNN:
    tree = None

    def __init__(self, features, tree_file):
        self.tree = AnnoyIndex(features, metric='euclidean')
        self.tree.load(str(tree_file))

    def nn(self, x):
        return self.tree.get_nns_by_vector(x.tolist(), 1)[0]
Example #14
def get_rank(uid):
    """Returns a list of the 10 best-ranked items for a user.

    This function generates a ranking of items for a given user using
    approximate nearest neighbors. The algorithm is imported from the
    Annoy library (developed by Spotify).

    Todo: The index is built from scratch every time the function is called,
    which should be changed in the future for better performance. It should
    be fairly easy to do, as Annoy can store indexes in files that can easily
    be shared between processes (see the sketch after this example). However,
    it works well with a few hundred items as it is now.

    item_queue: A list of item ids for each user. It acts as a circular queue
    that keeps track of which items the user has seen so far. When two new
    items are shown to the user, they are placed at the back of the queue.

    Args:
        uid (int): User ID

    Returns:
        List of item ids (str)
    """
    ann = AnnoyIndex(data_dimension)
    try:
        items = db.items.find()
        q = db.users.find({"uid": uid}, {"item_queue" : 1, "_id": 0})[0]["item_queue"]
    except TypeError:
        print "Unable to fetch user from DB"
    ids = [i["vid"] for i in q ]
    # Following line can be deleted or modified. 
    # It removes the last 15 items from the ANN tree, so they will never be recommended
    # for the user. This is done to make sure the user only sees new items in the 
    # recommended list (assuming 15 is the number of comparisons the user has made). 
    # This is sort of a hack and can be removed/modified later on if necessary.
    ids[-15:] = []
    print ids
    id_dict = {}
    # Add items to ANN tree
    for i,item in enumerate(items):
        if item["vid"] in ids:
            # Store all ids in a dictionary
            id_dict[str(i)] = item["vid"]
            ann.add_item(i, item["vals"])
    # Erik Bernhardsson (author of Annoy) suggests using twice the data
    # dimension as the number of trees to build.
    ann.build(data_dimension*2)
    try: 
        user = db.users.find({"uid": uid})[0]
    except TypeError:
        print "Unable to fetch user from DB"
    # Get 10 highest ranked items for that user
    nns_tmp = ann.get_nns_by_vector(user["vals"],10)
    nns = [id_dict[str(k)] for k in nns_tmp]
    print nns
    return nns
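The Todo in the docstring above asks for persisting the index instead of rebuilding it on every call, which Annoy's save/load supports directly. A minimal sketch, assuming the same default metric as get_rank and an illustrative file name 'rank_items.ann':

from annoy import AnnoyIndex

def build_and_save(items, data_dimension, path='rank_items.ann'):
    # build once, e.g. in a background job, and write the index to disk
    ann = AnnoyIndex(data_dimension)
    for i, item in enumerate(items):
        ann.add_item(i, item["vals"])
    ann.build(data_dimension * 2)
    ann.save(path)

def load_index(data_dimension, path='rank_items.ann'):
    # loading mmaps the file, so many processes can share one index cheaply
    ann = AnnoyIndex(data_dimension)
    ann.load(path)
    return ann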
Example #15
    def test_only_one_item(self):
        # reported to annoy-user by Kireet Reddy
        idx = AnnoyIndex(100)
        idx.add_item(0, numpy.random.randn(100))
        idx.build(n_trees=10)
        idx.save('foo.idx')
        idx = AnnoyIndex(100)
        idx.load('foo.idx')
        self.assertEquals(idx.get_n_items(), 1)
        self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [0])
Example #16
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    f = data.base.shape[1]

    for ntrees in args.ntrees:
        t = AnnoyIndex(f)   # Length of item vector that will be indexed
        idxpath = os.path.join(args.exp_dir, 'sift_annoy_ntrees%d.idx' % ntrees)
        if not os.path.exists(idxpath):
            logging.info("Adding items ...")
            for i in xrange(data.nbae):
                t.add_item(i, data.base[i])
                if i % 100000 == 0:
                    logging.info("\t%d/%d" % (i, data.nbae))
            logging.info("\tDone!")
            logging.info("Building indexes ...")
            t.build(ntrees)
            logging.info("\tDone!")
            t.save(idxpath)
        else:
            logging.info("Loading indexes ...")
            t.load(idxpath)
            logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            ids[i, :] = np.array(t.get_nns_by_vector(data.query[i], args.topk))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-ntrees_%s\n" % ("Annoy", ntrees))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example #17
class ANN:
    def __init__(self, dimension):
        self.ann = AnnoyIndex(dimension)
    def addVectors(self,vectors):
        for idx,v in enumerate(vectors):
            self.ann.add_item(idx,v)
        self.ann.build(10)
    def query(self,vector):
        match = self.ann.get_nns_by_vector(vector,1)[0]
        # return self.ann.get_item_vector(match),match
        return match
    def save(self):
        self.ann.save("analogies.ann")
    def load(self,filename):
        self.ann.load(filename)
Example #18
class ImageSearchAnnoy:
    '''
    load an Annoy index for approximate nearest neighbor computation
    Annoy's angular distance is dist(u,v) = sqrt(2(1-cos(u,v)))
    '''
    def __init__(self,dimensions,annf='ILSVRC2015.ann',imageListPath = '/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt'):
        #load filenames
        with open(imageListPath,'r') as f:
            #self.line_to_file = {i:line.split('/')[-1].rstrip() for i,line in enumerate(f)}
            self.line_to_file = {i:line.rstrip() for i,line in enumerate(f)}
        self.A = AnnoyIndex(dimensions,'angular')
        self.A.load(annf)

    def run_query(self,query,n=100,accuracy_factor = 2):
        nearest,scores = self.A.get_nns_by_vector(query, n, search_k=n*int(accuracy_factor)*128, include_distances=True)
        return zip((self.line_to_file[i] for i in nearest),scores)
Example #19
    def test_random_holes(self):
        f = 10
        index = AnnoyIndex(f)
        valid_indices = random.sample(range(2000), 1000)  # leave holes
        for i in valid_indices:
            v = numpy.random.normal(size=(f,))
            index.add_item(i, v)
        index.build(10)
        for i in valid_indices:
            js = index.get_nns_by_item(i, 10000)
            for j in js:
                self.assertTrue(j in valid_indices)
        for i in range(1000):
            v = numpy.random.normal(size=(f,))
            js = index.get_nns_by_vector(v, 10000)
            for j in js:
                self.assertTrue(j in valid_indices)
Example #20
    def precision(self, n, n_trees=10, n_points=10000):
        # create random points at distance x
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        for j in xrange(n_points):
            p = [random.gauss(0, 1) for z in xrange(f)]
            norm = sum([pi ** 2 for pi in p]) ** 0.5
            x = [pi / norm * j for pi in p]
            i.add_item(j, x)

        i.build(n_trees)

        nns = i.get_nns_by_vector([0] * f, n)
        self.assertEquals(nns, sorted(nns)) # should be in order
        # The number of gaps should be equal to the last item minus n-1
        found = len([x for x in nns if x < n])
        return 1.0 * found / n
Example #21
    def RunAnnAnnoy():
      totalTimer = Timer()

      # Load input dataset.
      Log.Info("Loading dataset", self.verbose)
      referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
      queryData = np.genfromtxt(self.dataset[1], delimiter=',')
      train, label = SplitTrainData(self.dataset)

      # Parse options.
      if not "k" in options:
        Log.Fatal("Required option: Number of nearest neighbors to find.")
        return -1
      else:
        k = int(options.pop("k"))
        if (k < 1 or k > referenceData.shape[0]):
          Log.Fatal("Invalid k: " + str(k) + "; must be greater than 0"
              + " and less than or equal to " + str(referenceData.shape[0]))
          return -1
      if not "num_trees" in options:
        Log.Fatal("Required option: Number of trees to build")
        return -1
      else:
        n = int(options.pop("num_trees"))

      if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

      with totalTimer:
        # Get all the parameters.
        try:
          # Perform Approximate Nearest-Neighbors
          acc = 0
          t = AnnoyIndex(train.shape[1])
          for i in range(len(train)):
              t.add_item(i,train[i])
          t.build(n)
          for i in range(len(queryData)):
              v = t.get_nns_by_vector(queryData[i],k)
        except Exception as e:
          Log.Info(e)
          return -1
      time = totalTimer.ElapsedTime()
      return time
Example #22
class SimilarStringStore:

    def __init__(self, **kwargs):

        self.transformer = FeatureGenerator(k=1)

        print(self.transformer.n_features)

        self.store = AnnoyIndex(self.transformer.n_features)

    def vectorize(self, s):
        return self.transformer.transform(s)

    def add(self, id, s):
        ''' add a string to index '''

        vector = self.transformer.transform(s)
        self.store.add_item(int(id), vector)
        return vector

    def build(self):
        self.store.build(500)

    def save(self, filename='store.knn'):
        self.store.save(filename)

    def build_and_save(self, filename='store.knn'):
        self.build()
        self.save(filename)

    def load(self, filename='store.knn'):
        self.store.load(filename)


    def query(self, s):
        ''' query index '''
        vector = self.transformer.transform(s)
        neighbors = self.store.get_nns_by_vector(vector, 40)
        return neighbors


    def remove(self, id):
        ''' remove a string from the index '''
        pass
Example #23
    def test_get_nns_with_distances(self):
        f = 3
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [0, 0, 2])
        i.add_item(1, [0, 1, 1])
        i.add_item(2, [1, 0, 0])
        i.build(10)

        l, d = i.get_nns_by_item(0, 3, -1, True)
        self.assertEqual(l, [0, 1, 2])
        self.assertAlmostEqual(d[0]**2, 0.0)
        self.assertAlmostEqual(d[1]**2, 2.0)
        self.assertAlmostEqual(d[2]**2, 5.0)

        l, d = i.get_nns_by_vector([2, 2, 2], 3, -1, True)
        self.assertEqual(l, [1, 0, 2])
        self.assertAlmostEqual(d[0]**2, 6.0)
        self.assertAlmostEqual(d[1]**2, 8.0)
        self.assertAlmostEqual(d[2]**2, 9.0)
Example #24
    def precision(self, n, n_trees=10, n_points=10000, n_rounds=10):
        found = 0
        for r in xrange(n_rounds):
            # create random points at distance x
            f = 10
            i = AnnoyIndex(f, 'euclidean')
            for j in xrange(n_points):
                p = [random.gauss(0, 1) for z in xrange(f)]
                norm = sum([pi**2 for pi in p])**0.5
                x = [pi / norm * j for pi in p]
                i.add_item(j, x)

            i.build(n_trees)

            nns = i.get_nns_by_vector([0] * f, n)
            self.assertEqual(nns, sorted(nns))  # should be in order
            # The number of gaps should be equal to the last item minus n-1
            found += len([x for x in nns if x < n])

        return 1.0 * found / (n * n_rounds)
Example #25
def nn_approx(ds1, ds2, knn=KNN, metric='manhattan', n_trees=10):
    # Build index.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match.
    match = set()
    for a, b in zip(range(ds1.shape[0]), ind):
        for b_i in b:
            match.add((a, b_i))

    return match
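nn_approx returns directed (query index, neighbor index) pairs from ds1 into ds2. Mutual nearest neighbors then fall out by running it in both directions and intersecting, as in this sketch (mutual_nn is an illustrative helper, not part of the original):

def mutual_nn(ds1, ds2, knn=20):
    match1 = nn_approx(ds1, ds2, knn=knn)  # ds1 -> ds2
    match2 = nn_approx(ds2, ds1, knn=knn)  # ds2 -> ds1
    # flip one direction so both sets share the same orientation
    flipped = set((b, a) for a, b in match2)
    return match1 & flipped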
Example #26
    def precision(self, n, n_trees=10, n_points=10000, n_rounds=10):
        found = 0
        for r in xrange(n_rounds):
            os.system("rm -rf test_db")
            os.system("mkdir test_db")
            # create random points at distance x from (1000, 0, 0, ...)
            f = 10
            i = AnnoyIndex(f, 10, "test_db", n_trees, 1000, 3048576000, 0)
            for j in xrange(n_points):
                p = [random.gauss(0, 1) for z in xrange(f - 1)]
                norm = sum([pi ** 2 for pi in p]) ** 0.5
                x = [1000] + [pi / norm * j for pi in p]
                i.add_item(j, x)

            nns = i.get_nns_by_vector([1000] + [0] * (f-1), n)
            self.assertEqual(nns, sorted(nns))  # should be in order
            # The number of gaps should be equal to the last item minus n-1
            found += len([x for x in nns if x < n])

        return 1.0 * found / (n * n_rounds)
Example #27
def predict_annoy(descriptors):
    u = AnnoyIndex(config.index_descriptor_length, config.index_annoydist)
    u.load(config.reference_index_path)  # super fast, will just mmap the file
    from annoytest import get_sheet_for_id, sheets

    votes = {k: 0 for k in sheets}
    for desc in descriptors:
        # will find the k nearest neighbors
        NN_ids = u.get_nns_by_vector(
            desc, config.index_k_nearest_neighbours,
            include_distances=True)  # will find the n nearest neighbors
        distances = NN_ids[1]
        NN_ids = NN_ids[0]

        if config.index_lowes_test_ratio:
            if min(distances) < config.index_lowes_test_ratio * max(distances):
                # good match
                NN_ids = [NN_ids[0]]
            else:
                continue

        NN_names = [get_sheet_for_id(i) for i in NN_ids]
        # vote for the nearest neighbours (codebook response)
        for name in NN_names:
            if config.index_voting_scheme == "antiprop":
                votes[name] += 1 / (NN_names.index(name) + 1
                                    )  # antiproportional weighting
            else:
                # todo: allow other voting schemes in config
                raise NotImplementedError(
                    "voting scheme '%s' not implemented" %
                    config.index_voting_scheme)

    if votes == {}:
        print("truth not in index")
        return -1

    votes = sorted(votes.items(), key=lambda x: x[1], reverse=True)
    # print("truth:",class_label_truth,"index:",[x[0] for x in votes].index(class_label_truth))

    return votes  # most similar prediction is in 0th position
Example #28
def AnnoyInfer(filename, ids, vec_len):

  u = AnnoyIndex(vec_len, metric='euclidean')
  u.load('item_title_vec.ann') 
  for line in open(filename):
    sp = line.strip().split('\t')
    vec = vec_len * [0]
    idx = 0
    for f in sp[2].strip().split(','):
      vec[idx] = float(f)
      idx = idx + 1

    nn_ids, scores = u.get_nns_by_vector(vec, 100, 1000, include_distances=True)
    res = []
    for j in range(len(nn_ids)):
      res.append(ids[nn_ids[j]] + "," + str(scores[j]))
    print(sp[0] + "\t" + ",".join(res))
Example #29
def SMOTE(X, k, oversample_times, aknn_positive):
    # load AnnoyIndex
    feature_dim = X.shape[1]
    index = AnnoyIndex(feature_dim)
    index.load(aknn_positive)
    # generate synthetic examples
    X_new = []
    for i in range(X.shape[0]):
        x = X[i]
        knn = list( set(index.get_nns_by_vector(x, k+1)) - set([i]) ) # get NNs excluding the element itself
        for j in range(int(oversample_times)):
            x2 = X[knn[np.random.randint(k)]]
            x_new = x + (x2-x)*np.random.rand()
            X_new.append(x_new)
        if np.random.rand()<=(oversample_times-int(oversample_times)): # dealing with fractions
            x2 = X[knn[np.random.randint(k)]]
            x_new = x + (x2-x)*np.random.rand()
            X_new.append(x_new)
    X_new = np.asarray(X_new, dtype=np.float64)
    X = np.concatenate((X, X_new))
    return X
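SMOTE above interpolates each sample toward one of its Annoy-retrieved neighbors via x_new = x + (x2 - x) * rand. A hedged usage sketch: the shapes, the oversampling factor, and the file name 'aknn_positive.ann' are illustrative assumptions, and the index is built over the same X that is later oversampled, as the k+1/self-exclusion logic expects:

import numpy as np
from annoy import AnnoyIndex

X_pos = np.random.rand(200, 32)
idx = AnnoyIndex(32)  # default metric, matching the AnnoyIndex(feature_dim) call in SMOTE
for i in range(X_pos.shape[0]):
    idx.add_item(i, X_pos[i])
idx.build(10)
idx.save('aknn_positive.ann')

# 2.5 means two guaranteed synthetic samples per row plus a 50% chance of a third
X_aug = SMOTE(X_pos, k=5, oversample_times=2.5, aknn_positive='aknn_positive.ann')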
Example #30
def testTrain(testL, testI, trainL, trainI):
    u = AnnoyIndex(784)
    u.load('test.ann')
    sumCorrect = 0
    for x in xrange(len(testI)):
        if x % 100 == 0:
            print(x)
        vec = []
        for y in testI[x]:
            for z in y:
                vec.append(z)
        guess = u.get_nns_by_vector(vec, 1)
        while isinstance(guess, list):
            guess = guess[0]

        if trainL[guess] == testL[x]:
            sumCorrect = sumCorrect + 1
        else:
            print("wrong! {} != {}".format(trainL[guess], testL[x]))
    print("{}/10000 correct!".format(sumCorrect))
    return sumCorrect
Example #31
def predict_type_embed(types_embed_array: np.array,
                       types_embed_labels: np.array, indexed_knn: AnnoyIndex,
                       k: int) -> List[dict]:
    """
    Predict type of given type embedding vectors
    """

    pred_types_embed = []
    pred_types_score = []
    for i, embed_vec in enumerate(
            tqdm(types_embed_array,
                 total=len(types_embed_array),
                 desc="Finding KNNs & Prediction")):
        idx, dist = indexed_knn.get_nns_by_vector(embed_vec,
                                                  k,
                                                  include_distances=True)
        pred_idx_scores = compute_types_score(dist, idx, types_embed_labels)
        pred_types_embed.append([i for i, s in pred_idx_scores])
        pred_types_score.append(pred_idx_scores)

    return pred_types_embed, pred_types_score
Example #32
def label_approx(X, sites, site_labels):
    from annoy import AnnoyIndex

    assert (X.shape[1] == sites.shape[1])

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='euclidean')
    for i in range(sites.shape[0]):
        aindex.add_item(i, sites[i, :])
    aindex.build(10)

    labels = []
    for i in range(X.shape[0]):
        # Find nearest site point.
        nearest_site = aindex.get_nns_by_vector(X[i, :], 1)
        if len(nearest_site) < 1:
            labels.append(None)
            continue
        labels.append(site_labels[nearest_site[0]])

    return np.array(labels)
Example #33
class face_annoy:

    def __init__(self):
        self.f                = int(face_comm.get_conf('annoy','face_vector'))
        self.annoy_index_path = os.path.abspath(face_comm.get_conf('annoy','index_path'))
        self.lmdb_file        =os.path.abspath(face_comm.get_conf('lmdb','lmdb_path'))
        self.num_trees        =int(face_comm.get_conf('annoy','num_trees'))

        self.annoy = AnnoyIndex(self.f)
        if os.path.isfile(self.annoy_index_path):
            self.annoy.load(self.annoy_index_path)

    # build the Annoy index from the lmdb file
    def create_index_from_lmdb(self):
        # iterate over the lmdb entries
        lmdb_file = self.lmdb_file
        if os.path.isdir(lmdb_file):
            evn = lmdb.open(lmdb_file)
            wfp = evn.begin()
            annoy = AnnoyIndex(self.f)
            for key, value in wfp.cursor():
                key = int(key)
                print(type(value))
                value = np.frombuffer(value, dtype=np.float32)
                print(value.shape)
                annoy.add_item(key,value)

            annoy.build(self.num_trees)
            annoy.save(self.annoy_index_path)

    # reload the index
    def reload(self):
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # find similar faces for a given face feature vector
    def query_vector(self,face_vector):
        n=int(face_comm.get_conf('annoy','num_nn_nearst'))
        print(face_vector.shape)
        return self.annoy.get_nns_by_vector(face_vector,n,include_distances=True)
Example #34
class AnnoyIndexing:
    """
        Index features with AnnoyIndex.

        Parameters
        ----------
        path_index : str
            path to the index file (.ann)
        distance_type : str
            distance type for the index, e.g. euclidean
        length : int
            length of the feature vector
        """
    def __init__(self, **kwargs):
        self.length = kwargs.pop('length', False)
        if not self.length:
            raise Exception(
                "AnnoyIndexing() missing 1 required positional argument: length"
            )
        self.distance_type = kwargs.pop('distance_type', False)
        if not self.distance_type:
            raise Exception(
                "AnnoyIndexing() missing 1 required positional argument: distance_type"
            )
        self.path_index = kwargs.pop('path_index', False)
        if not self.path_index:
            raise Exception(
                "AnnoyIndexing() missing 1 required positional argument: path_index"
            )
        self.index = AnnoyIndex(self.length, self.distance_type)
        self.index.load(self.path_index)

    def get_knn(self, feature, k):
        '''
            Get k results from index
        '''
        vector_k = self.index.get_nns_by_vector(feature,
                                                k,
                                                include_distances=True)
        return vector_k
Example #35
def create_collage(input_image, profile_name, version_count):
    """
    given an input image and an existing profile, create a set of new collages
    """
    profile_folder = PROFILES_DIRECTORY + profile_name + "/"
    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)
    # todo: load feature dimensions from profile
    nns_index = AnnoyIndex(SAMPLE_DIMENSION[0]*SAMPLE_DIMENSION[1], metric="euclidean")
    print("loading trees...")
    nns_index.load(profile_folder + profile_name + ".tree")
    print("done.")
    subimage_index = pickle.load(
        open(profile_folder + profile_name + ".p", "rb"))
    template_image = Image.open(input_image)
    image_width, image_height = template_image.size[0], template_image.size[1]
    crop_width, crop_height = subimage_index[-1]["crop_width"], subimage_index[-1]["crop_height"]
    for i in xrange(version_count):
        print("Creating collage {}/{}...".format(i+1, version_count))
        output_image = template_image.copy()
        for x in xrange(0, image_width-crop_width, crop_width):
            for y in xrange(0, image_height-crop_height, crop_height):
                box = (x, y, x + crop_width, y + crop_height)
                crop_box = output_image.crop(box)
                crop_sample = crop_box.convert("LA").resize(SAMPLE_DIMENSION)
                gs_pixeldata = []
                for pixel in list(crop_sample.getdata()):
                    gs_pixeldata.append(pixel[0])
                image_neighbor = nns_index.get_nns_by_vector(gs_pixeldata, version_count)[i]
                substitute_image = Image.open(subimage_index[image_neighbor]["image"])
                substitute_crop = substitute_image.crop(
                    subimage_index[image_neighbor]["box"])
                output_image.paste(substitute_crop, box)
        output_path = OUTPUT_DIRECTORY + str(i) + ".png"
        output_image.save(output_path, "PNG")
        print("done.")
    print("{} image(s) saved in {}".format(version_count, OUTPUT_DIRECTORY))
    return
Example #36
def predict_type_embed_task(types_embed_array: np.array,
                            types_embed_labels: np.array,
                            type_space_labels: np.array, pred_task_idx: tuple,
                            indexed_knn: AnnoyIndex, k: int) -> List[dict]:
    def find_pred_task(i: int):
        if i < pred_task_idx[0]:
            return 'Parameter'
        elif i < pred_task_idx[1]:
            return 'Return'
        else:
            return 'Variable'

    pred_types: List[dict] = []
    # pred_types_embed = []
    # pred_types_score = []
    for i, embed_vec in enumerate(
            tqdm(types_embed_array,
                 total=len(types_embed_array),
                 desc="Finding KNNs & Prediction")):
        idx, dist = indexed_knn.get_nns_by_vector(embed_vec,
                                                  k,
                                                  include_distances=True)
        pred_idx_scores = compute_types_score(dist, idx, type_space_labels)

        pred_types.append({
            'original_type': types_embed_labels[i],
            'predictions': pred_idx_scores,
            'task': find_pred_task(i),
            'is_parametric': bool(re.match(r'(.+)\[(.+)\]', types_embed_labels[i]))
        })

        # pred_types_embed.append([i for i, s in pred_idx_scores])
        # pred_types_score.append(pred_idx_scores)

    return pred_types
Example #37
    def log_topk_retrieval_acc(self, engine):
        """
        For tracking the performance during training top K Precision
        """
        train_embs, train_labels = extract_embeddings(self.model,
                                                      self.train_loader)
        val_embs, val_labels = extract_embeddings(self.model, self.val_loader)
        emb_dim = train_embs.shape[1]

        # ----------------------------------
        t = AnnoyIndex(emb_dim, metric='euclidean')
        n_trees = 100
        for i, emb_vec in enumerate(train_embs):
            t.add_item(i, emb_vec)
        # build a forest of trees
        t.build(n_trees)
        # ----------------------------------
        top_k_corrects = dict()
        # Measure Prec@[5, 10, 20, 30]
        for i, emb_vec in enumerate(val_embs):
            correct_cls = val_labels[i]
            for k in [5, 10, 20, 30]:
                idx = t.get_nns_by_vector(emb_vec, k)
                top_k_classes = train_labels[idx]
                correct = np.sum(top_k_classes == correct_cls)
                accum_corr = top_k_corrects.get(k, 0)
                top_k_corrects[k] = accum_corr + correct
        # -------------------------------------------------
        # calculate back the acc
        top_k_acc = dict()
        for k in [5, 10, 20, 30]:
            top_k_acc[k] = top_k_corrects[k] / k / val_embs.shape[0]

        tqdm.write(
            "Top K Retrieval Results - Epoch: {}  Avg top-k accuracy:".format(
                engine.state.epoch))

        for k in [5, 10, 20, 30]:
            tqdm.write("  Prec@{} = {:.2f}".format(k, top_k_acc[k]))
Example #38
def search_csv(csv_path='/midata/private/journal/files.csv',
               query='Misima island port harbor derelict ship PNG Papua New Guinnea Australis harbor storm sailing cliffs anchor drag',
               num_results=10, num_dims=300):
    df = pd.read_csv(csv_path, index_col=0)
    index_path = os.path.join(os.path.dirname(csv_path), 'files_index.ann')
    index = AnnoyIndex(f=num_dims)
    index.load(index_path)
    vec = nlp(query).vector
    paths = []
    for i in index.get_nns_by_vector(vec, num_results):
        path = df.iloc[i]['path']
        paths.append(path)
        print(path)
        with open(path, 'rb') as fin:
            bintext = b''.join(fin.readlines()[:10])
        try:
            text = bintext.decode()
        except UnicodeDecodeError:
            text = bintext.decode('latin')
        print(text)
        print('-' * 120)
    return
Example #39
def nearest_approx(X, sites):
    from annoy import AnnoyIndex

    assert (X.shape[1] == sites.shape[1])

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='manhattan')
    for i in range(sites.shape[0]):
        aindex.add_item(i, sites[i, :])
    aindex.build(max(10, int(np.log2(X.shape[0]))))

    site_to_idx = {site_idx: [] for site_idx in range(sites.shape[0])}

    for idx in range(X.shape[0]):
        # Find nearest site point.
        nearest_sites = aindex.get_nns_by_vector(X[idx, :], 1)
        if len(nearest_sites) < 1:
            continue
        site_idx = nearest_sites[0]
        site_to_idx[site_idx].append(idx)

    return site_to_idx
Example #40
def feat_match(descs1, descs2):
    # Your Code Here
    n1, points1 = descs1.shape[:2]
    n2, points2 = descs2.shape[:2]
    t = AnnoyIndex(n1, metric="euclidean")

    for i in range(points2):
        t.add_item(i, descs2[:, i])
    t.build(50)

    matches = np.zeros((points1), dtype=int)

    for i in range(points1):
        p_index, dist = t.get_nns_by_vector(descs1[:, i],
                                            2,
                                            include_distances=True)
        if dist[0] / dist[1] < 0.6:
            matches[i] = p_index[0]
        else:
            matches[i] = -1

    return matches
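The 0.6 threshold in feat_match is Lowe's ratio test: a match is kept only if the nearest neighbor is clearly closer than the second nearest. A usage sketch with made-up shapes; note that feat_match expects descriptors stored column-wise:

import numpy as np

descs1 = np.random.rand(128, 40)  # 40 query descriptors, one per column
descs2 = np.random.rand(128, 60)  # 60 reference descriptors
matches = feat_match(descs1, descs2)
# matches[i] is the index into descs2 matched to column i of descs1,
# or -1 where the ratio test rejected the match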
Example #41
class KNNIndex(object):
    annoy = None
    vec_len = -1
    metric = 'euclidean'
    is_loaded = False

    def __init__(self, vec_len, metric='euclidean', index_file=None):
        self.vec_len = vec_len
        self.metric = metric
        self.annoy = AnnoyIndex(self.vec_len, self.metric)
        if index_file:
            self.load(index_file)

    def get_nns_by_item(self, i, n, search_k=-1, include_distances=False):
        if self.is_loaded:
            return self.annoy.get_nns_by_item(i, n, search_k,
                                              include_distances)
        else:
            raise RuntimeError("Annoy index file is not loaded!")

    def get_nns_by_vector(self,
                          v,
                          n,
                          search_k=-1,
                          include_distances=False,
                          n_propagation=0):
        if self.is_loaded:
            return self.annoy.get_nns_by_vector(v, n, search_k,
                                                include_distances)
        else:
            raise RuntimeError("Annoy index file is not loaded!")

    def load(self, index_file):
        self.annoy.load(index_file)
        self.is_loaded = True

    def unload(self):
        self.annoy.unload()
        self.is_loaded = False
Example #42
class VisBotTextBrain:
    def __init__(self, model_file, annoy_file):
        print("Loading w2v model...")
        self.model = KeyedVectors.load("./data/" + model_file)  # Word2Vec.load
        self.annoy = AnnoyIndex(self.model.wv.vector_size)
        print("Loading Annoy...")
        self.annoy.load("./data/" + annoy_file)

    def run(self, request=None):
        """
        :param: request: str value
        :return: list of indexes in DB
        """
        request_list = normalize_text(request).split(' ')

        if (len(request_list) == 1) and (request_list[0] == ''):
            raise ValueError('Incorrect request')

        vect_repr = self._get_vect_representation(request_list)

        self.request = request
        self.request_vector = vect_repr

        return self.annoy.get_nns_by_vector(vect_repr, n=100)

    def _get_vect_representation(self, request_list):
        vect_repr = []
        for word in request_list:
            try:
                vect_repr.append(self.model.wv[word])
            except KeyError:
                vect_repr.append([0] * 300)

        if vect_repr:
            vect_repr = np.mean(np.array(vect_repr), axis=0)
        else:
            vect_repr = np.array([0] * 300)

        return vect_repr
Example #43
    def get_best_match(self, pred_vects, lbl_features, labels, k, expand=True, num_trees=100):
        '''
        Goal: get the two closest matches for every label.
        If expand is True, avoid reusing the closest image-vector index for another label;
        otherwise the same index can be returned for multiple labels.

        Method to get matches; a little faster than the original code.
        '''
        t = AnnoyIndex(len(lbl_features[0]))
        for i in range(len(pred_vects)):
            t.add_item(i, pred_vects[i])
        t.build(num_trees)
        image_feature_inds = []
        final_lbl_featues = []
        temp = 0
        used_inds = {}
        for x in range(len(lbl_features)):
            if expand == False:
                temp = 2
            else:
                temp += k
            indices = t.get_nns_by_vector(lbl_features[x], temp, include_distances=False)
            add = 0
            for ind in indices:
                if (add > k): break
                if temp == len(pred_vects): # Edge case. If every index is used.
                    final_lbl_featues.append(lbl_features[x])
                    image_feature_inds.append(indices[0])
                    return image_feature_inds, final_lbl_featues
                if ind in used_inds and expand:
                    continue
                else:
                    if ind not in used_inds: used_inds[ind] = 1
                    #label_inds.append(x)
                    final_lbl_featues.append(lbl_features[x])
                    image_feature_inds.append(ind)
                    add += 1
        return image_feature_inds, final_lbl_featues
Example #44
class face_annoy:
    def __init__(self):
        self.f = 512
        self.annoy_index_path = os.path.abspath(
            os.path.expanduser('~') + "/acs/data/face_vector.nn")
        self.num_trees = 100

        self.annoy = AnnoyIndex(self.f)
        if os.path.isfile(self.annoy_index_path):
            self.annoy.load(self.annoy_index_path)

    # build the Annoy index from the lmdb file
    def create_index_from_lmdb(self):
        # iterate
        # lmdb_file = self.lmdb_file
        rows = dbsql.getallem()
        if len(rows) > 0:

            annoy = AnnoyIndex(self.f)
            for row in rows:
                key = row[0]
                value = str2embed(row[1])
                annoy.add_item(key, value)

            annoy.build(self.num_trees)
            annoy.save(self.annoy_index_path)

    # reload the index
    def reload(self):
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # find similar faces for a given face feature vector
    def query_vector(self, face_vector):
        n = 1
        return self.annoy.get_nns_by_vector(face_vector,
                                            n,
                                            include_distances=True)
Example #45
    def find_article_by_text(self, title, annotation, key_words, checkbox):
        data_storage = {
            i[0]: i[1]['title'] + ' ' + i[1]['annotation']
            for i in self.data.iterrows()
        }
        map_id_2_prod_hash = pkl.load(
            open('map_id_to_hash_products.dict', 'rb'))

        index_title_emb = AnnoyIndex(100)
        index_title_emb.load('./annoy')
        model = Word2Vec.load('./w2v_products.w2v_gensim')

        app.logger.info('Request: ' + title)

        listik = (self.normalize_text(title) + ' ' +
                  self.normalize_annotation(annotation) + ' ' +
                  self.normalize_key_words(key_words)).split(' ')

        vec = np.zeros(100)
        for i in listik:
            part_of_vec = None
            try:
                part_of_vec = model[i]
            except KeyError:
                pass
            if part_of_vec is not None:
                vec += part_of_vec

        annoy_res = list(
            index_title_emb.get_nns_by_vector(vec, 13, include_distances=True))

        app.logger.info('Neighbors:')
        listik = []
        for annoy_id, annoy_sim in itertools.islice(zip(*annoy_res), 13):
            image_id = map_id_2_prod_hash[annoy_id]
            listik.append(image_id)
            app.logger.info(data_storage[image_id], 1 - annoy_sim**2 / 2)
        return self.data.loc[listik]
Example #46
def classify_cells(args, data_pt, all_sims_timepoints, ann_dir):
    n_neighbors = 10
    meta = data_pt["meta"]
    yc = data_pt["celltype"]
    xp_df = pd.DataFrame(data_pt["xp"], yc)
    u = AnnoyIndex(
        all_sims_timepoints[0][0].shape[1],
        'euclidean')  # all_sims_timepoints[0][0][0].shape[1], 'euclidean')
    u.load(ann_dir)
    yp_all = []
    for timepoint in all_sims_timepoints:
        yp = []
        for i in range(len(timepoint)):
            yt = []
            for j in range(len(timepoint[0])):
                nn = xp_df.iloc[u.get_nns_by_vector(timepoint[i][j],
                                                    n_neighbors)]
                nn = Counter(nn.index).most_common(2)
                label, num = nn[0]
                yt.append(label)
            yp.append(yt)
        yp_all.append(yp)
    return yp_all
Example #47
class Annoy(AnnBase):
    def __init__(self, vector_len: int,
                 metric: str = 'angular', **kwargs):
        super().__init__(**kwargs)
        self.index = AnnoyIndex(vector_len, metric=metric)

    def build_index(self, num_trees: int = 30):
        for i, embed in enumerate(self.data):
            self.index.add_item(i, embed)
        self.index.build(num_trees)

    def search_vec_top_n(self, vector, n: int = 5):
        neighbours = self.index.get_nns_by_vector(vector, n)
        result = []
        for idx in neighbours:
            result.append(self.mapping[idx])
        return result

    def _load_file(self, path: str, **kwargs):
        self.index.load(path)

    def _save_file(self, path: str):
        self.index.save(path)
Example #48
def label_approx(X, sites, site_labels, k=1):
    from annoy import AnnoyIndex

    assert (X.shape[1] == sites.shape[1])

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='euclidean')
    for i in range(sites.shape[0]):
        aindex.add_item(i, sites[i, :])
    aindex.build(10)

    labels = []
    for i in range(X.shape[0]):
        # Find nearest site point.
        nearest_sites = aindex.get_nns_by_vector(X[i, :], k)
        if len(nearest_sites) < 1:
            labels.append(None)
            continue
        label = Counter([site_labels[ns]
                         for ns in nearest_sites]).most_common(1)[0][0]
        labels.append(label)

    return np.array(labels)
Example #49
def thingR():
    a = (request.args.get('url')
         + "?REQUEST=" + request.args.get('REQUEST')
         + "&TIME=" + request.args.get('TIME')
         + "&BBOX=" + request.args.get('BBOX')
         + "&CRS=" + request.args.get('CRS')
         + "&LAYERS=" + request.args.get('LAYERS')
         + "&WRAP=" + request.args.get('WRAP')
         + "&FORMAT=" + request.args.get('FORMAT')
         + "&WIDTH=" + request.args.get('WIDTH')
         + "&HEIGHT=" + request.args.get('HEIGHT')
         + "&ts=" + request.args.get('ts'))

    u = AnnoyIndex(128)
    u.load('imageFeatSuhas.ann')
    indexes = u.get_nns_by_vector(eF(a, model2), 30, include_distances=True)
    j = ""
    for a in indexes[0]:
        p = a // 320
        o = a % 320
        j += (
            "https://gibs.earthdata.nasa.gov/wmts/epsg4326/best/MODIS_Terra_CorrectedReflectance_TrueColor/default/2012-07-09/250m/8/"
            + str(p + 100) + "/" + str(o) + ".jpg###")
    return j
Example #50
def nn_annoy(ds1, ds2, names1, names2, knn = 20, metric='euclidean', n_trees = 50, save_on_disk = True):
    """ Assumes that Y is zero-indexed. """
    # Build index.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    if(save_on_disk):
        a.on_disk_build('annoy.index')
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match.
    match = set()
    for a, b in zip(range(ds1.shape[0]), ind):
        for b_i in b:
            match.add((names1[a], names2[b_i]))

    return match
Example #51
def ranking():
        with open('./docs/index_sen_20000.json','r') as file:
                index_sen = json.load(file)
        
        f = 768

        t = AnnoyIndex(f, 'euclidean')
        t.load('./docs/sentence_embedding_20000_200.ann')

        query = input('Enter your sentence here: ')

        print('Top 3 Related Questions:')
        encoder = SentenceTransformer('bert-base-nli-mean-tokens')
        query_vector = encoder.encode([query])
        query_vector_ls = query_vector[0].tolist()
        output = t.get_nns_by_vector(query_vector_ls, 3, search_k=3, include_distances=True)
        output_sen1 = index_sen[str(output[0][0])]
        print(output_sen1)
        output_sen2 = index_sen[str(output[0][1])]
        print(output_sen2)
        output_sen3 = index_sen[str(output[0][2])]
        print(output_sen3)
Example #52
    def find_nn(self, near_neigh=50):
        dim = joint_poses_btw_posture_in_plane_by_joint_pos(
            self.motions[0][0], self.motions[0][0]).shape[0]

        size = sum(map(len, self.motions))

        self.distance = np.zeros((size, size))
        data = []
        for i in range(len(self.motions)):
            for j in range(len(self.motions[i])):
                data.append(
                    joint_poses_btw_posture_in_plane_by_joint_pos(
                        self.motions[i][j], self.motions[0][0]))

        t = AnnoyIndex(dim, metric='euclidean')
        for i in range(size):
            t.add_item(i, data[i])

        t.build(20)

        for i in range(size):
            res, dist = t.get_nns_by_vector(data[i],
                                            near_neigh,
                                            include_distances=True)

            for j in range(near_neigh):
                if abs(i - res[j]) > 10:
                    self.distance[i, res[j]] = dist[j]
                    # TODO:
                    self.add_transition(
                        MotionTransition(0, i, 0, res[j], dist[j]))
                    # print(i, res[j], dist[j])

        self.transition.sort()
        for i in range(len(self.transition)):
            t = self.transition[i]
            print('check: ', t.motion_from_idx, t.motion_to_idx)
Example #53
def get_mind_recall_res(user_embs, doc_embs, user_idx_2_rawid, doc_idx_2_rawid,
                        topk):
    """Nearest-neighbor retrieval, using an Annoy tree here."""
    # build doc_embs into an index tree
    f = user_embs.shape[1]
    t = AnnoyIndex(f, 'angular')
    for i, v in enumerate(doc_embs):
        t.add_item(i, v)
    t.build(10)
    # the index tree can be saved with t.save('annoy.ann')

    # for each user vector, return the top-K nearest items
    user_recall_items_dict = collections.defaultdict(dict)
    for i, u in enumerate(user_embs):
        recall_doc_scores = t.get_nns_by_vector(u,
                                                topk,
                                                include_distances=True)
        # recall_doc_scores is ([doc_idx], [scores]); convert the indices to raw doc ids
        raw_doc_scores = list(recall_doc_scores)
        raw_doc_scores[0] = [doc_idx_2_rawid[i] for i in raw_doc_scores[0]]
        # convert to the actual user id
        try:
            user_recall_items_dict[user_idx_2_rawid[i]] = dict(
                zip(*raw_doc_scores))
        except:
            continue

    # scores are sorted ascending by default; we need descending here
    user_recall_items_dict = {
        k: sorted(v.items(), key=lambda x: x[1], reverse=True)
        for k, v in user_recall_items_dict.items()
    }

    # save a copy
    pickle.dump(user_recall_items_dict, open('mind_u2i_dict.pkl', 'wb'))

    return user_recall_items_dict
Example #54
class Annoy(ANN):
    """
    Builds an ANN model using the Annoy library.
    """
    def load(self, path):
        # Load index
        self.model = AnnoyIndex(self.config["dimensions"],
                                self.config["metric"])
        self.model.load(path)

    def index(self, embeddings):
        # Inner product is equal to cosine similarity on normalized vectors
        self.config["metric"] = "dot"

        # Create index
        self.model = AnnoyIndex(self.config["dimensions"],
                                self.config["metric"])

        # Add items
        for x in range(embeddings.shape[0]):
            self.model.add_item(x, embeddings[x])

        # Build index
        self.model.build(10)

    def search(self, query, limit):
        # Run the query
        ids, scores = self.model.get_nns_by_vector(query,
                                                   n=limit,
                                                   include_distances=True)

        # Map results to [(id, score)]
        return list(zip(ids, scores))

    def save(self, path):
        # Write index
        self.model.save(path)
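Example #54's index() relies on the inner product equaling cosine similarity on normalized vectors. A minimal sketch of the normalization that assumption requires (the helper name is illustrative):

import numpy as np

def normalize(embeddings):
    # L2-normalize rows so the 'dot' metric behaves like cosine similarity
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # guard against zero vectors
    return embeddings / norms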
Example #55
def get_celeb_prediction(img, ann_filepath, celeb_mapping_path):
    ann_index = AnnoyIndex(2048, 'angular')
    _ = ann_index.load(ann_filepath)
    encs, bbox = get_encoding_new(img)
    data = []
    if encs is not None:
        for index, enc in enumerate(encs):
            cv2.rectangle(img, bbox[index], (255, 0, 0), 2)
            temp_data = {}
            temp_data["bbox"] = bbox[index]
            results = ann_index.get_nns_by_vector(enc[0],
                                                  10,
                                                  search_k=-1,
                                                  include_distances=True)
            dist_threshold = 0.9
            celeb_count_dict = get_celeb_name_from_id(results,
                                                      celeb_mapping_path,
                                                      dist_threshold)
            distance = results[1][0]
            if len(celeb_count_dict) != 0 and max(
                    celeb_count_dict.values()) > 3:
                celeb_name = max(celeb_count_dict, key=celeb_count_dict.get)
                cv2.putText(img, celeb_name.upper(),
                            (bbox[index][0] - 5, bbox[index][1] - 5),
                            cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)
                temp_data["celeb_name"] = celeb_name
                temp_data["confidence"] = face_distance_to_conf(distance)
            else:
                temp_data["celeb_name"] = "unknown"
                temp_data["confidence"] = 0.0
            data.append(temp_data)
        img = imutils.resize(img, width=400)
        # display(img)
        return data, img

    else:
        return None, None
Example #56
print 'building index...'
ai.build(10)

print 'building up data points'
lons = np.arange(-180, 180, 0.25)
lats = np.arange(-90, 90, 0.25)
X, Y = np.meshgrid(lons, lats)
Z = np.zeros(X.shape)

count = 0
for i, _ in np.ndenumerate(Z):
    lon, lat = X[i], Y[i]

    v = ll_to_3d(lat, lon)

    js = ai.get_nns_by_vector(v, 50)
    all_ts = [ts[j] for j in js]
    cutoff = np.percentile(all_ts, 90)
    p = np.mean([t for t in all_ts if t < cutoff])
    p = np.clip(p, vmin, vmax)
    Z[i] = p
    count += 1
    if count % 1000 == 0:
        print count, np.prod(Z.shape)

print 'plotting'
maps = [
    ('nyc', (20, 20), basemap.Basemap(projection='ortho',lat_0=30,lon_0=-30,resolution='l')),
    ('asia', (20, 20), basemap.Basemap(projection='ortho',lat_0=23,lon_0=105,resolution='l')),
    ('world', (20, 10), basemap.Basemap(projection='cyl', llcrnrlat=-60,urcrnrlat=80,\
                                           llcrnrlon=-180,urcrnrlon=180,resolution='c'))
Example #57
class AnnoyIndexer(object):

    def __init__(self, model=None, num_trees=None):
        self.index = None
        self.labels = None
        self.model = model
        self.num_trees = num_trees

        if model and num_trees:
            if isinstance(self.model, Doc2Vec):
                self.build_from_doc2vec()
            elif isinstance(self.model, Word2Vec):
                self.build_from_word2vec()
            else:
                raise ValueError("Only a Word2Vec or Doc2Vec instance can be used")

    def save(self, fname, protocol=2):
        fname_dict = fname + '.d'
        self.index.save(fname)
        d = {'f': self.model.vector_size, 'num_trees': self.num_trees, 'labels': self.labels}
        with smart_open(fname_dict, 'wb') as fout:
            _pickle.dump(d, fout, protocol=protocol)

    def load(self, fname):
        fname_dict = fname+'.d'
        if not (os.path.exists(fname) and os.path.exists(fname_dict)):
            raise IOError(
                "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." % (fname, fname_dict))
        else:
            with smart_open(fname_dict) as f:
                d = _pickle.loads(f.read())
            self.num_trees = d['num_trees']
            self.index = AnnoyIndex(d['f'])
            self.index.load(fname)
            self.labels = d['labels']

    def build_from_word2vec(self):
        """Build an Annoy index using word vectors from a Word2Vec model"""

        self.model.init_sims()
        return self._build_from_model(self.model.wv.syn0norm, self.model.wv.index2word,
                                      self.model.vector_size)

    def build_from_doc2vec(self):
        """Build an Annoy index using document vectors from a Doc2Vec model"""

        docvecs = self.model.docvecs
        docvecs.init_sims()
        labels = [docvecs.index_to_doctag(i) for i in range(0, docvecs.count)]
        return self._build_from_model(docvecs.doctag_syn0norm, labels, self.model.vector_size)

    def _build_from_model(self, vectors, labels, num_features):
        index = AnnoyIndex(num_features)

        for vector_num, vector in enumerate(vectors):
            index.add_item(vector_num, vector)

        index.build(self.num_trees)
        self.index = index
        self.labels = labels

    def most_similar(self, vector, num_neighbors):
        """Find the top-N most similar items"""

        ids, distances = self.index.get_nns_by_vector(
            vector, num_neighbors, include_distances=True)

        return [(self.labels[ids[i]], 1 - distances[i] / 2) for i in range(len(ids))]
Example #58
    print 'layer_size=',layer_size
    t.build(ntrees)
    t.save('index.ann')
    s.close()

if args.search:
    s = shelve.open('data.bin')
    u = AnnoyIndex(s['layer_size'],metric)
    u.load('index.ann')
    data = [args.search]
    classif = dd.post_predict(sname,data,parameters_input,parameters_mllib,parameters_output)
    # search for every roi
    res = classif['body']['predictions'][0]['rois']
    print('number of ROI in query: ' + str(len(res)))
    for roi in res:
        near = u.get_nns_by_vector(roi['vals'],args.search_size,include_distances=True)
        near_data = []
        near_distance = []
        for n in near[1]:
            near_distance.append(n)
        print('distances: ')
        print(near_distance)
        for n in near[0]:
            near_data.append(s[str(n)])
        # print query bbox
        img = cv2.imread(args.search)
        bbox = roi['bbox']
        cat = roi['cat']
        cv2.rectangle(img, (int(bbox['xmin']),int(bbox['ymax'])),(int(bbox['xmax']),int(bbox['ymin'])),(255,0,0),2)

        cv2.putText(img,cat,(int(bbox['xmin']),int(bbox['ymax'])),cv2.FONT_HERSHEY_PLAIN,1,255)
Example #59
from annoy import AnnoyIndex
import random

f = 40
t = AnnoyIndex(f)  # Length of item vector that will be indexed
for i in xrange(1000):
    v = [random.gauss(0, 1) for z in xrange(f)]
    t.add_item(i, v)

t.build(10) # 10 trees
t.save('test.ann')

# ...

u = AnnoyIndex(f)
u.load('test.ann') # super fast, will just mmap the file
print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors
item = u.get_item_vector(0)
print(u.get_nns_by_vector(item, 1000)) # will find the 1000 nearest neighbors
#print(len(u.get_nns_by_vector(item, 1000)))
#print(len(set(u.get_nns_by_vector(item, 1000))))
#print(len(u.get_nns_by_item(0, 1000)))
#print(len(set(u.get_nns_by_item(0, 1000))))
#if u.get_nns_by_vector(item, 1000) == u.get_nns_by_item(0, 1000):
#    print("SAME\n")
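As several examples above show, the same query interface also takes a search_k budget (more tree nodes inspected means better accuracy at the cost of speed) and can return distances. A small follow-up on the index from Example #59; the search_k value is illustrative:

ids, dists = u.get_nns_by_vector(item, 10, search_k=100000, include_distances=True)
print(ids)    # item 0 itself comes first at distance 0.0
print(dists)  # angular distances, i.e. sqrt(2*(1-cos))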
Example #60
from annoy import AnnoyIndex

a = AnnoyIndex(3)
a.add_item(0, [1, 0, 0])
a.add_item(1, [0, 1, 0])
a.add_item(2, [0, 0, 1])
a.build(-1)
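# n_trees=-1 lets Annoy pick the number of trees automatically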

print a.get_nns_by_item(0, 100)
print a.get_nns_by_vector([1.0, 0.5, 0.5], 100)