Example #1
class ImageSearchAnnoyCombo:
    '''
    load an Annoy index for approximate nearest neighbor computation
    Annoy's angular distance is dist(u,v) = sqrt(2(1-cos(u,v)))
    '''
    def __init__(self,h5fname = 'X_ILSVRC2015.hdf5',annf='ILSVRC2015.ann',imageListPath = '/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt',dset = 'fc6fc7'):
        #load h5 data
        h5f = h5py.File(h5fname,'r')
        self.X = h5f[dset]
        #load filenames
        with open(imageListPath,'r') as f:
            self.line_to_file = {i:line.rstrip() for i,line in enumerate(f)}
        self.A = AnnoyIndex(self.X.shape[1],'angular')
        self.A.load(annf)

    def run_query_approx(self,query,n=100,accuracy_factor = 5):
        nearest,scores = self.A.get_nns_by_vector(query, n, search_k=n*int(accuracy_factor)*128, include_distances=True)
        return zip((self.line_to_file[i] for i in nearest),scores)

    def run_query_exact(self,query,n=1000,nsmall=100):
        #retrieve approximate nearest neighbors using Annoy, then do exact ranking by loading from h5 into memory
        #use Annoy
        if n < nsmall:
            n = nsmall
        indexes = self.A.get_nns_by_vector(query, n, search_k=-1, include_distances=False)
        indexes_sorted = sorted(indexes)
        #use scipy cdist (or normalize first and do dot product for faster computation)
        #getting X by index from disc is very slow. 
        distance = (cdist(self.X[indexes_sorted], query.reshape((1,query.shape[0])), 'cosine'))[:,0]
        ind = np.argpartition(distance, nsmall)[:nsmall]#partial sort, indices for top n,
        s_ind = np.argsort(distance[ind])#sort 
        nearest = ind[s_ind]
        scoresorted = distance[ind][s_ind]
        return zip((self.line_to_file[indexes_sorted[i]] for i in nearest),scoresorted)
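The exact re-ranking step above pulls the candidate rows out of HDF5 and scores them with scipy's cosine cdist. As the inline comment notes, the same ordering can be obtained from a plain dot product once the vectors are L2-normalized; the sketch below is illustrative only (array names are assumptions) and also shows how an Annoy angular distance converts back to a cosine similarity, since Annoy reports dist(u,v) = sqrt(2(1-cos(u,v))).

import numpy as np

def rerank_by_dot(X_candidates, query):
    # Same ranking as cdist(X_candidates, query, 'cosine') once rows are L2-normalized
    Xn = X_candidates / np.linalg.norm(X_candidates, axis=1, keepdims=True)
    qn = query / np.linalg.norm(query)
    distances = 1.0 - Xn.dot(qn)
    order = np.argsort(distances)
    return order, distances[order]

def angular_to_cosine(d):
    # Annoy's angular distance d = sqrt(2 * (1 - cos)), hence cos = 1 - d**2 / 2
    return 1.0 - np.square(np.asarray(d)) / 2.0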
Example #2
def do(indextype):
    a = AnnoyIndex(8, indextype[0])
    a.load('points.%s.annoy' % indextype)
    with open('points.%s.ann.txt' % indextype, 'w') as out:
        for q_index in [1443, 1240, 818, 1725, 1290, 2031, 1117, 1211, 1902, 603]:
            nns = a.get_nns_by_item(q_index, 10)
            print >> out, '%s\t%s' % (q_index, ','.join([str(n) for n in nns]))
Example #3
    def test_overwrite_index(self):
        # Issue #335
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)
        t.save('test.ann')

        # Load index file
        t2 = AnnoyIndex(f)
        t2.load('test.ann')

        # Overwrite index file
        t3 = AnnoyIndex(f)
        for i in range(500):
            v = [random.gauss(0, 1) for z in range(f)]
            t3.add_item(i, v)
        t3.build(10)
        if os.name == 'nt':
            # Can't overwrite on Windows
            with self.assertRaises(IOError):
                t3.save('test.ann')
        else:
            t3.save('test.ann')
            # Get nearest neighbors
            v = [random.gauss(0, 1) for z in range(f)]
            nns = t2.get_nns_by_vector(v, 1000)  # Should not crash
Example #4
    def _get_index(self, dataset):
        url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
        vectors_fn = os.path.join('test', dataset + '.hdf5')
        index_fn = os.path.join('test', dataset + '.annoy')

        if not os.path.exists(vectors_fn):
            print('downloading', url, '->', vectors_fn)
            urlretrieve(url, vectors_fn)

        dataset_f = h5py.File(vectors_fn)
        distance = dataset_f.attrs['distance']
        f = dataset_f['train'].shape[1]
        annoy = AnnoyIndex(f, distance)

        if not os.path.exists(index_fn):
            print('adding items', distance, f)
            for i, v in enumerate(dataset_f['train']):
                annoy.add_item(i, v)

            print('building index')
            annoy.build(10)
            annoy.save(index_fn)
        else:
            annoy.load(index_fn)
        return annoy, dataset_f
Example #5
    def retrieve(self):

        print 'Loading necessary files..'
        u = AnnoyIndex(self.dim, metric='angular')
        u.load(index_file)

        print 'ANN Retrieval..'
        for n_neighbors in knns:
            print 'Number of neighbors: ' + str(n_neighbors)
            for mult in self.multipliers:
                print 'Multiplier: ' + str(mult)
                search_k = self.n_trees * n_neighbors * mult
                filename = '.'.join((self.test_file.split('/')[-1].split('.')[:-1]))
                with open(self.test_file, 'r') as data_file:
                    data = json.load(data_file)
                    qArray = []
                    for i in range(len(data["questions"])):
                        question_body = data["questions"][i]["body"]
                        question_id = data["questions"][i]["id"]
                        qcentroid = np.transpose(np.array(get_centroid_idf(question_body, self.emb, self.idf, self.stopwords, self.dim)))

                        anns = u.get_nns_by_vector(qcentroid, n_neighbors, search_k)
                        doc_anns = []
                        for n in anns:
                            doc_anns.append(self.idmap[n])
                        q = Question(question_body, question_id, doc_anns)
                        qArray.append(q)
                    directory = "system_results/"
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    with open(str(directory)+"/"+"CentIDF_annoy_"+str(n_trees)+"_"+str(n_neighbors)+"_"+str(mult)+".json", "w+") as outfile:
                        outfile.write(json.dumps({"questions":[ob.__dict__ for ob in qArray]}, indent=2))
Example #6
def build_annoy_index(corpus, dimension, winlen, winstep):
    print "Adding to Annoy index"
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    i = 0
    for filename, frames in corpus:
#        print filename, frames.shape
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            index.add_item(i, mfcc.tolist())
            assert mfcc_list[i] == (filename, index_in_file)
            i += 1

    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES
            }
    cache_filename = "annoy_index_" + hashlib.md5(str([filename for filename, frames in corpus])).hexdigest() + "." + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items())) + ".tree"
    
    if not os.path.exists(cache_filename):
        print "Building Annoy index with %d trees" % ANN_NTREES
    #    index.build(-1)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print "\tWrote cache to %s" % cache_filename
    else:
        print "\tReading cache from %s" % cache_filename
        index.load(cache_filename)
    return index, mfcc_list
Example #7
    def test_zero_vectors(self):
        # Mentioned on the annoy-user list
        bitstrings = [
            '0000000000011000001110000011111000101110111110000100000100000000',
            '0000000000011000001110000011111000101110111110000100000100000001',
            '0000000000011000001110000011111000101110111110000100000100000010',
            '0010010100011001001000010001100101011110000000110000011110001100',
            '1001011010000110100101101001111010001110100001101000111000001110',
            '0111100101111001011110010010001100010111000111100001101100011111',
            '0011000010011101000011010010111000101110100101111000011101001011',
            '0011000010011100000011010010111000101110100101111000011101001011',
            '1001100000111010001010000010110000111100100101001001010000000111',
            '0000000000111101010100010001000101101001000000011000001101000000',
            '1000101001010001011100010111001100110011001100110011001111001100',
            '1110011001001111100110010001100100001011000011010010111100100111',
        ]
        vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

        f = 64
        idx = AnnoyIndex(f, 'hamming')
        for i, v in enumerate(vectors):
            idx.add_item(i, v)

        idx.build(10)
        idx.save('idx.ann')
        idx = AnnoyIndex(f, 'hamming')
        idx.load('idx.ann')
        js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
        self.assertEquals(js[0], 0)
        self.assertEquals(ds[:4], [0, 1, 1, 22])
Example #8
def create_walks(df,index_file,patient_dict_file,index_dict_file,n_neighbors = 25,walks_per_patient=10,walk_size=50,out_dir="./"):
    index = AnnoyIndex(df.shape[1])
    index.load(index_file)
    patient_dict = {}
    for key, val in csv.reader(open(patient_dict_file)):
        patient_dict[key] = int(val)
    index_dict = {}
    for key, val in csv.reader(open(index_dict_file)):
        index_dict[int(key)] = val
    print("Computing nearest-neighbors...")
    neighbor_dict = {}
    for i in range(index.get_n_items()):
        if i % 1000 == 0:
            print str(i)
        patient_id = index_dict[i]
        neighbors = index.get_nns_by_item(i=i, n=n_neighbors, search_k=-1, include_distances=False)
        neighbor_ids = [index_dict[x] for x in neighbors]
        neighbor_dict[patient_id] = neighbor_ids
    f = open(out_dir+"patient_walks.txt", 'wb')
    for i in range(index.get_n_items()):
        if i % 1000 == 0:
            print str(i)
        patient_id = index_dict[i]
        patient_sentences = ""
        for j in range(walks_per_patient):
            sentence = generate_sentence(start=patient_id,neighbor_dict=neighbor_dict,
                                        n_neighbors=n_neighbors,walk_size=walk_size)
            patient_sentences += sentence + "\n"
            ## Write it ##
        f.write(patient_sentences)
Example #9
 def test_save_without_build(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.add_item(1000, [random.gauss(0, 1) for z in xrange(10)])
     i.save('x.tree')
     j = AnnoyIndex(10)
     j.load('x.tree')
     j.build(10)
Example #10
 def test_no_items(self):
     idx = AnnoyIndex(100)
     idx.build(n_trees=10)
     idx.save('foo.idx')
     idx = AnnoyIndex(100)
     idx.load('foo.idx')
     self.assertEquals(idx.get_n_items(), 0)
     self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [])
Example #11
class FeatureNN:
    tree = None

    def __init__(self, features, tree_file):
        self.tree = AnnoyIndex(features, metric='euclidean')
        self.tree.load(str(tree_file))

    def nn(self, x):
        return self.tree.get_nns_by_vector(x.tolist(), 1)[0]
Example #12
def main():

    # Annoy Vector Dimension
    vec_dimension = 100

    models_dir = "/raid/ankit/ann_models/"
    start = time.time()
    print "Starting: Loading of memory mapped models ... "
    # Load all models - memory mapped - quick
    ann1 = AnnoyIndex(vec_dimension)
    ann1.load(models_dir+"model10_split1.ann")

    ann2 = AnnoyIndex(vec_dimension)
    ann2.load(models_dir+"model10_split2.ann")

    ann3 = AnnoyIndex(vec_dimension)
    ann3.load(models_dir+"model10_split3.ann")

    ann4 = AnnoyIndex(vec_dimension)
    ann4.load(models_dir+"model10_split4.ann")

    ann5 = AnnoyIndex(vec_dimension)
    ann5.load(models_dir+"model10_split5.ann")
    end = time.time()

    print "All annoy-lsh models loaded! Time Taken: "+str((end-start)/60)+ " minutes."


    print "\nSimilar Queries - LSH Interface [All Top Queries]"
    print "----------------------------------------------------"

    flag = "True"
    while (flag == "True"):
        testquery = raw_input("Enter Query: ")
        nearest_num = raw_input("Number of similar queries: ")
        if nearest_num in ("", "0"):
            nearest_num = 10
        nearest_num = int(nearest_num)
        if not testquery.strip() =="":
            lsh_list_n = get_similar_queries(testquery.strip(), nearest_num, ann1, ann2, ann3, ann4, ann5)

            # Return and Print the Top 10 nearest Queries to the Original Query
            print "\nCandidate Nearest Queries [TOP 10]: "
            count = 0
            for query,distance in lsh_list_n:
                if count == nearest_num:
                    break
                print str(query)+"\t"+str(distance)
                count+=1

            user_input = raw_input("\nDo you wish to continue again? (Type 'no' to quit): ")
            if user_input == "no":
                print "\nGoodbye!"
                break
            else:
                print "\n"
                continue
Example #13
 def test_only_one_item(self):
     # reported to annoy-user by Kireet Reddy
     idx = AnnoyIndex(100)
     idx.add_item(0, numpy.random.randn(100))
     idx.build(n_trees=10)
     idx.save('foo.idx')
     idx = AnnoyIndex(100)
     idx.load('foo.idx')
     self.assertEquals(idx.get_n_items(), 1)
     self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [0])
Example #14
 def test_load_save(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     u = i.get_item_vector(99)
     i.save('x.tree')
     v = i.get_item_vector(99)
     self.assertEqual(u, v)
     j = AnnoyIndex(10)
     j.load('test/test.tree')
     w = i.get_item_vector(99)
     self.assertEqual(u, w)
Example #15
def get_tree_index(metric='angular', size=4096):
    '''
    INPUT: Optional parameters for the metric space and size of AnnoyIndex 
    OUTPUT: AnnoyIndex tree, dictionary of node assignment to image names
    '''
    tree = AnnoyIndex(size, metric=metric)
    tree.load(DATA_DIR + 'tree_' + metric + '.ann')

    with open(DATA_DIR + 'indexes_' + metric, 'rb') as f:
        indexes = pickle.load(f)

    return tree, indexes
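A hypothetical usage sketch of the loader above (the queried item id and neighbor count are assumptions; the .ann and pickle files must already exist under DATA_DIR):

tree, indexes = get_tree_index(metric='angular', size=4096)
neighbor_ids = tree.get_nns_by_item(0, 10)            # 10 approximate neighbors of item 0
neighbor_images = [indexes[i] for i in neighbor_ids]  # map node ids back to image names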
Example #16
 def test_on_disk(self):
     f = 2
     i = AnnoyIndex(f, 'euclidean')
     i.on_disk_build('on_disk.ann')
     self.add_items(i)
     i.build(10)
     self.check_nns(i)
     i.unload()
     i.load('on_disk.ann')
     self.check_nns(i)
     j = AnnoyIndex(f, 'euclidean')
     j.load('on_disk.ann')
     self.check_nns(j)
Example #17
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    f = data.base.shape[1]

    for ntrees in args.ntrees:
        t = AnnoyIndex(f)   # Length of item vector that will be indexed
        idxpath = os.path.join(args.exp_dir, 'sift_annoy_ntrees%d.idx' % ntrees)
        if not os.path.exists(idxpath):
            logging.info("Adding items ...")
            for i in xrange(data.nbae):
                t.add_item(i, data.base[i])
                if i % 100000 == 0:
                    logging.info("\t%d/%d" % (i, data.nbae))
            logging.info("\tDone!")
            logging.info("Building indexes ...")
            t.build(ntrees)
            logging.info("\tDone!")
            t.save(idxpath)
        else:
            logging.info("Loading indexes ...")
            t.load(idxpath)
            logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            ids[i, :] = np.array(t.get_nns_by_vector(data.query[i], args.topk))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-ntrees_%s\n" % ("Annoy", ntrees))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example #18
def load_index(path_index: PathType,
               meta_d: Dict) \
        -> AnnoyIndex:
    """ We rely on ANNOY's usage of mmap to be fast loading
    (fast enough that we can load it on every single call)
    """
    n_dim = meta_d['n_dim']
    metric = meta_d['metric']
    u = AnnoyIndex(
        n_dim,
        metric=metric,
    )
    u.load(str(path_index))
    u.set_seed(SEED)
    return u
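Since load() only mmaps the index file, reloading it on every call stays cheap; recent Annoy versions also accept a prefault flag to page the whole file in eagerly. A minimal sketch, with an assumed dimension and path:

import time
from annoy import AnnoyIndex

idx = AnnoyIndex(256, metric='angular')      # dimension/metric assumed
start = time.time()
idx.load('/tmp/example.ann')                 # mmap only; returns almost immediately
print('load took %.4fs' % (time.time() - start))
idx.unload()
idx.load('/tmp/example.ann', prefault=True)  # optionally fault all pages in up front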
Example #19
class AnnoyLookup(object):
    def __init__(self, metadata_path):
        with open(os.path.join(metadata_path, "metadata.json")) as f:
            self._data = json.load(f)
        self._index = AnnoyIndex(self._data["feature_length"],
                                 metric="angular")
        self._index.load(os.path.join(metadata_path, "index.ann"))

    def get_neighbours(self, embedding, max_neigh=3):
        items, distances = self._index.get_nns_by_vector(
            embedding, max_neigh, include_distances=True)
        zipped = zip(items, distances)
        sorted_list = sorted(zipped, key=lambda t: t[1])
        return [(self._data["filenames"][idx], distance)
                for idx, distance in sorted_list]
Example #20
 def test_save_load(self):
     f = 100
     i = AnnoyIndex(f, 'hamming')
     u = numpy.random.binomial(1, 0.5, f)
     v = numpy.random.binomial(1, 0.5, f)
     i.add_item(0, u)
     i.add_item(1, v)
     i.build(10)
     i.save('blah.ann')
     j = AnnoyIndex(f, 'hamming')
     j.load('blah.ann')
     rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
     self.assertEquals(rs, [0, 1])
     self.assertAlmostEqual(ds[0], 0)
     self.assertAlmostEqual(ds[1], numpy.dot(u-v, u-v))
Example #21
class _Annoy(object):
    def __init__(self, feature):
        model_path = 'model.ann'
        n_dim = feature.shape[1] * feature.shape[2]
        feature = feature.reshape(feature.shape[0], n_dim)
        self.t = AnnoyIndex(n_dim, 'angular')
        if not os.path.exists(model_path):
            for i, f in enumerate(tqdm(feature)):
                # normalize
                v = f / np.sum(f)
                self.t.add_item(i, v)
            self.t.build(10)
            self.t.save(model_path)
        else:
            self.t.load(model_path)
Example #22
def createAnnoyIndex(d, targetPoints, n_trees):
    #create AnnoyIndex in R^(2*d)
    targetIndex = AnnoyIndex(2 * d, metric='euclidean')
    #add each of the projected target points
    for i in range(targetPoints.shape[0]):
        targetIndex.add_item(i, projectToTorus(targetPoints[i]))

    #build the LSH-forest with the target points
    targetIndex.build(n_trees)

    #save and load with memory map
    targetIndex.save("LSHForest.ann")
    loadedIndex = AnnoyIndex(2 * d, metric='euclidean')
    loadedIndex.load("LSHForest.ann")
    return loadedIndex
Example #23
 def test_save_load(self):
     f = 100
     i = AnnoyIndex(f, 'hamming')
     u = numpy.random.binomial(1, 0.5, f)
     v = numpy.random.binomial(1, 0.5, f)
     i.add_item(0, u)
     i.add_item(1, v)
     i.build(10)
     i.save('blah.ann')
     j = AnnoyIndex(f, 'hamming')
     j.load('blah.ann')
     rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
     self.assertEquals(rs, [0, 1])
     self.assertAlmostEqual(ds[0], 0)
     self.assertAlmostEqual(ds[1], numpy.dot(u - v, u - v))
Example #24
def get_nn_by_name(name):
    Session = sessionmaker(bind=engine)
    session = Session()
    # This command should be used to
    name_string = "%{0}%".format(name)
    #     print(name_string)
    result = session.query(annoy_table).filter(
        annoy_table.c.name.like(name_string)).first()
    u = AnnoyIndex(f)
    u.load('../../test.ann')  # super fast, will just mmap the file
    list_of_near = u.get_nns_by_item(result[1],
                                     4)  # will find the 5 nearest neighbors
    # remove current node
    list_of_near.remove(result[1])
    return list_of_near
Example #25
class ANN:
    def __init__(self, dimension):
        self.ann = AnnoyIndex(dimension)
    def addVectors(self,vectors):
        for idx,v in enumerate(vectors):
            self.ann.add_item(idx,v)
        self.ann.build(10)
    def query(self,vector):
        match = self.ann.get_nns_by_vector(vector,1)[0]
        # return self.ann.get_item_vector(match),match
        return match
    def save(self):
        self.ann.save("analogies.ann")
    def load(self,filename):
        self.ann.load(filename)
Example #26
def main():
    parser = argparse.ArgumentParser(description='recommend system')
    parser.add_argument('--query',
                        '-q',
                        type=str,
                        default="",
                        help='query image path')
    parser.add_argument('--bbox',
                        '-b',
                        type=str,
                        default="",
                        help='bbox image')
    parser.add_argument('--genre',
                        '-g',
                        type=str,
                        default="tops",
                        help='genre')
    args = parser.parse_args()

    if args.query == "":
        raise ValueError("query image path is required")
    genre = args.genre

    data_path = []
    base = os.path.dirname(os.path.abspath(__file__))
    list_path = os.path.normpath(os.path.join(base, './img_list.txt'))
    with open(list_path, "r") as f:
        for line in f.readlines():
            data_path.append(line.rstrip())
    annoy_model = AnnoyIndex(256)
    annoy_model.load(base + "/{}.ann".format(genre))

    query_path = args.query
    bbox = [int(item) for item in args.bbox.split(",")]

    query_img = utils.read_image(query_path, color=True)
    croped_query_img = crop_img(query_img, bbox)
    comparing_hist = cv2.calcHist([croped_query_img], [0], None, [256],
                                  [0, 256])
    predict_indexes = annoy_model.get_nns_by_vector(comparing_hist,
                                                    5,
                                                    search_k=-1)
    predict_indexes = [
        data_path[idx].split("\\")[-1] for idx in predict_indexes
    ]
    #with open("recommend_image.json", "w") as f:
    json_data = json.dumps(predict_indexes)
    print(json_data)
Example #27
def closest_topK(unseen_event, concept_embedding, concept_mapping, dim, topK=10, unseen_id=None):
    """
    unseen_event: (title: str, description: str)
    concept_embedding: {word_id : [emb]}
    concept_mapping: {word_id : word_string}
    """
    unseen_event_title_tags = jieba.analyse.extract_tags(unseen_event[0])

    # Switch textrank or embedrank
    if ARGS.embedrank:
        unseen_event_description_words = embedrank_getkeywords(unseen_event[1])
    elif ARGS.tfidf:
        unseen_event_description_words = tfidf_getkeywords(unseen_event[1])
    else:
        unseen_event_description_words = textrank_getkeywords(unseen_event[1])

    print('title words:', unseen_event_title_tags)
    print('description words:', unseen_event_description_words)
    keywords = [*unseen_event_title_tags, *unseen_event_description_words]

    # INVOLVE GENERE
    # try:
    #     for word in GENERE_TO_KEYWORDS[ID_TO_GENERE[unseen_id]]:
    #         if word not in keywords:
    #             keywords.append(word)
    # except KeyError:
    #     pass
    ### END OF INVOLVING GENERE

    print("keywords", keywords)
    # Generate the label embedding for a new item
    event_concept_embeddings = []
    for word in keywords:
        try:
            event_concept_embeddings.append(concept_embedding[concept_mapping[word]])
        except KeyError:
            continue
    unseen_event_vector = [ sum(value) / len(value) for value in  zip(*event_concept_embeddings)]
    if unseen_event_vector == []:
        unseen_event_vector = [0] * dim
    annoy_index = AnnoyIndex(dim)
    annoy_index.load('cc2vec_textrank.ann')
    # Find the topK closest items according to the label embedding
    ranking_list = annoy_index.get_nns_by_vector(unseen_event_vector, topK, search_k=-1, include_distances=True)
    propagation_list = []
    for id_, score in zip(ranking_list[0], ranking_list[1]):
        propagation_list.append((id_, score))
    return unseen_event_vector, propagation_list
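The keyword-centroid above is averaged with a zip-based list comprehension; an equivalent numpy formulation (an illustrative helper, not part of the original code) that falls back to a zero vector when no keyword has an embedding:

import numpy as np

def keyword_centroid(keywords, concept_embedding, concept_mapping, dim):
    # Average the embeddings of the keywords we can resolve; zeros otherwise
    vecs = [concept_embedding[concept_mapping[w]] for w in keywords
            if w in concept_mapping and concept_mapping[w] in concept_embedding]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)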
Example #28
def get_top_k_tables(sample_info_dict, id_to_index, index_file, dim, k):
    u = AnnoyIndex(dim, 'angular')
    u.load(index_file)
    ranks, top_k = [], {}
    for sentence, info in sample_info_dict.items():
        table_id, embedding = info['table_id'], info['embedding']
        table_index = id_to_index[table_id]
        closest_tables = u.get_nns_by_vector(embedding, 1000000)
        rank = closest_tables.index(table_index)
        if rank < k:
            label = [0 for _ in range(k)]
            label[rank] = 1
            info['top_k'] = closest_tables[:k]
            info['labels'] = label
        ranks.append(rank)
    return ranks
Example #29
    def debug():

        f = 40
        t = AnnoyIndex(f)  # Length of item vector that will be indexed
        for i in xrange(1000):
            v = [random.gauss(0, 1) for z in xrange(f)]
            t.add_item(i, v)

        t.build(10)  # 10 trees
        t.save('test.ann')

        # ...
        u = AnnoyIndex(f)
        u.load('test.ann')  # super fast, will just mmap the file
        print(u.get_nns_by_item(0,
                                1000))  # will find the 1000 nearest neighbors
Example #30
    def load_index(self, index_id):
        if self.annoy_index is None:
            log.info("loading initial index with id {}", self.current_index)
        else:
            log.info("switching index from {} to {}", self.current_index,
                     index_id)

        newindex = AnnoyIndex(108, metric='euclidean')
        newindex.load(config.index_config['index_path'] + 'index_' +
                      str(index_id) + '.ann')
        if self.annoy_index is not None:
            self.annoy_index.unload()
        self.annoy_index = newindex
        self.current_index = index_id
        log.info("finished switching index. now using index {}",
                 self.current_index)
Example #31
def test_build_sparse_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    sparse_data = csr_matrix(data)

    index = build_annoy_index(sparse_data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Example #32
    def _load_index(self, wherefrom, index_key):
        """Load an AnnoyIndex from disk"""
        est = self.estimator_

        # I can't think of anything more clever because I've been up for
        # hours and hours and hours, so this is the kludgiest solution:
        if index_key == "similar_items_index":
            n_index = est.item_factors.shape[1]
        # Otherwise, "recommend_index"
        else:
            # This assumes approximate_recommend, since it's the only way
            # it will ever get to this code
            n_index = est.extra_
        index = AnnoyIndex(n_index, "angular")
        index.load(join(wherefrom, index_key))
        return index
Example #33
 def run(self):
     try:
         index = AnnoyIndex(self.n_dims, metric='euclidean')
         index.load(self.index_filepath)
         for i in range(self.data_indices[0], self.data_indices[1]):
             neighbour_indexes = index.get_nns_by_vector(self.X[i,:]
                 , self.k, search_k=self.search_k, include_distances=False)
             neighbour_indexes = np.array(neighbour_indexes,
                                             dtype=np.uint32)
             self.results_queue.put(
                 IndexNeighbours(row_index=i,
                                 neighbour_list=neighbour_indexes))
     except Exception as e:
         self.exception = e
     finally:
         self.results_queue.close()
Example #34
    def test_on_disk(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.on_disk_build('test.ann')
        i.add_item(0, [2, 2])
        i.add_item(1, [3, 2])
        i.add_item(2, [3, 3])

        i.build(10)
        i.unload()

        i.load('test.ann')

        self.assertEqual(i.get_nns_by_vector([4, 4], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 1], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([4, 2], 3), [1, 2, 0])
Example #35
    def test_celeba_embedding(self):
        PATHS_JSON = os.getenv('PATHS_JSON', abspath(join(__file__, '..', '..', 'data', 'paths_celeba.json')))

        EMBEDDING_JSON = os.getenv('EMBEDDING_JSON', abspath(join(__file__, '..', '..', 'data', 'embeddings_celeba.json')))


        INDEX_FILENAME = os.getenv('INDEX_FILENAME', os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'index_celeba.ann')))

        NSW_INDEX_FILENAME = os.getenv('NSW_INDEX_FILENAME', os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'index_celeba_nsw')))

        TEST_CASES_FILENAME = os.getenv('TEST_CASES_FILENAME',
            os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'index_celeba_test_cases.json')))

        with open(PATHS_JSON, 'r') as fp:
            print('Loading paths')
            paths = np.array(json.load(fp))
        with open(EMBEDDING_JSON, 'r') as fp:
            print('Loading embeddings')
            embeddings = json.load(fp)

        with open(TEST_CASES_FILENAME, 'r') as fp:
            print('Loading test_cases')
            test_cases = json.load(fp)


        annoy = AnnoyIndex(len(embeddings[0]))    
        annoy_index = annoy.load(INDEX_FILENAME)

        print('building nsw index')
        nsw_index = PyNSW('l2')
        print('Creating nodes')
        nodes = [create_node(path, vector) for path, vector in zip(paths, embeddings)]
        print('Inserting nodes')
        for node in tqdm(nodes):
            nsw_index.nn_insert(node, 5, 1000)

        n, k_annoy, k_nsw = 0, 0, 0

        print('Calculating accuracy on CelebA')

        for tk in test_cases:
            vector = embeddings[int(tk['embedding_index'])]
            
            closest_paths_real = tk['closest_paths_real']

            closest_paths_annoy = paths[annoy.get_nns_by_vector(vector, 10, 1000)]

            closest_paths_nsw = [n[1] for n in nsw_index.nn_search(create_node('kek', vector), 5, 10)]

            assert len(closest_paths_real) == 10
            assert len(closest_paths_annoy) == 10
            assert len(closest_paths_nsw) == 10

            n += 10
            k_annoy += len(set(closest_paths_annoy).intersection(closest_paths_real))
            k_nsw += len(set(closest_paths_nsw).intersection(closest_paths_real))


        print('Annoy accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_annoy / n))
        print('NSW accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_nsw / n))
Example #36
class ImageSearchAnnoy:
    '''
    load an Annoy index for approximate nearest neighbor computation
    Annoy's angular distance is dist(u,v) = sqrt(2(1-cos(u,v)))
    '''
    def __init__(self,dimensions,annf='ILSVRC2015.ann',imageListPath = '/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt'):
        #load filenames
        with open(imageListPath,'r') as f:
            #self.line_to_file = {i:line.split('/')[-1].rstrip() for i,line in enumerate(f)}
            self.line_to_file = {i:line.rstrip() for i,line in enumerate(f)}
        self.A = AnnoyIndex(dimensions,'angular')
        self.A.load(annf)

    def run_query(self,query,n=100,accuracy_factor = 2):
        nearest,scores = self.A.get_nns_by_vector(query, n, search_k=n*int(accuracy_factor)*128, include_distances=True)
        return zip((self.line_to_file[i] for i in nearest),scores)
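The hard-coded 128 in search_k presumably corresponds to the number of trees the index was built with, since Annoy's default is search_k = n * n_trees. A small sketch (illustrative helper) that makes that assumption explicit:

def make_search_k(n, n_trees, accuracy_factor=2):
    # Annoy inspects roughly search_k nodes; n * n_trees is its default,
    # so scale it by an accuracy factor to trade latency for recall
    return n * n_trees * int(accuracy_factor)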
Example #37
 def test(self):
     # feat=np.random.random((100000,4096))
     # annoyIndex = AnnoyIndex(4096)
     # annoyIndex.on_disk_build('a')
     # for i,v in enumerate(feat):
     #     annoyIndex.add_item(i,v)
     # for i,v in enumerate(feat):
     #     annoyIndex.add_item(i,v)
     t = time.time()
     #
     # annoyIndex.build(100)
     # print(time.time()-t)
     annoyIndex = AnnoyIndex(4096)
     annoyIndex.load('a')
     print(annoyIndex.get_nns_by_item(0, 5))
     print(time.time() - t)
Example #38
def init():
    global indices
    indices = defaultdict(lambda: defaultdict(dict))
    for dim in ['matrix', 'tensor']:
        for size in [500, 5000]:#, 1000, 5000, 10000]:
            folder = 'data/' + dim + '/200x' + str(size)
            # loading the index
            t = AnnoyIndex(20, 'angular')
            t.load(folder + '/embd.ann')
            indices[dim][size]['index'] = t
            # loading the extractions
            exts = pd.read_csv(folder + '/extr_index.csv')
            ext2idx = dict(zip((x['modifier'] + ';' + x['aspect'] \
                                for _, x in exts.iterrows()), range(len(exts))))
            indices[dim][size]['exts'] = exts
            indices[dim][size]['ext2idx'] = ext2idx
Example #39
def load_indexes(ann_filepath=None, celeb_mapping_path=None):
    home = expanduser("~")
    if ann_filepath is None:
        ann_filepath = os.path.join(home, 'celeb_index_60.ann')
        celeb_ann_id = '1-3Wb7fiINbrk9FSagTxjLdSjp7KzrMp7'
        if not os.path.exists(ann_filepath):
            download_file_from_google_drive(celeb_ann_id, ann_filepath)

    if celeb_mapping_path is None:
        celeb_mapping_path = os.path.join(home, 'celeb_mapping.json')
        celeb_mapping_file_id = '1wDaaSQ6NjxLkxpzYyTRknefizZUKnKDj'
        if not os.path.exists(celeb_mapping_path):
            download_file_from_google_drive(celeb_mapping_file_id,
                                            celeb_mapping_path)

    ann_index = AnnoyIndex(2048, 'angular')
    _ = ann_index.load(ann_filepath)

    with open(celeb_mapping_path) as json_file:
        celeb_mapping_temp = json.load(json_file)
    celeb_mapping_dict = {}
    for key, value_list in celeb_mapping_temp.items():
        for each_id in value_list:
            celeb_mapping_dict[each_id] = str(key)

    return ann_index, celeb_mapping_dict
Example #40
def fetch_topK_similar(items_vec_file, ann_model_file, dim, topK, item_idx_map, items_list_batch, ddb_table, company_label):
    b_time = time.time()
    log.debug("[fetch_topK_similar] Start to get topK items")
    ann_model = AnnoyIndex(dim, 'angular')
    ann_model.load(ann_model_file)
    update_data = {}
    items_set = set([item for sublist in items_list_batch for item in sublist])
    print(items_list_batch)
    print(items_set)
    with open(items_vec_file, 'r') as in_f:
        num_items, dim = in_f.readline().strip().split()
        for idx, line in enumerate(in_f):
            tmp = line.split()
            item_id = tmp[0]
            if item_id in items_set:
                action, content_id = item_id.split(':', 1)
                item_emb = list(map(float, tmp[1:]))
                if content_id not in update_data:
                    update_data[content_id] = {'item_id': content_id, 'label': company_label}

                res_dict = OrderedDict()
                topK_item, topK_dist = ann_model.get_nns_by_vector(item_emb, topK*3, include_distances=True)
                for item_idx, dist in zip(topK_item, topK_dist):
                    try:
                        item = item_idx_map[item_idx].split(':', 1)[1].strip()
                        if item not in res_dict:
                            res_dict[item] = Decimal(f"{1-dist:.4f}")
                            # Todo: maybe do score normalize here
                    except Exception as err:
                        log.error(err)
                        log.warning(f"Couldn't find item name : {item_idx_map[item_idx]}")
                    if len(res_dict) == topK:
                        break

                if action == Action.View.value:
                    update_data[content_id]['view_similar'] = res_dict
                elif action == Action.AddToCart.value:
                    update_data[content_id]['add_cart_similar'] = res_dict
                elif action == Action.Purchase.value:
                    update_data[content_id]['purchase_similar'] = res_dict
                else:
                    log.warning(f"{action} is not a valid action...")
                    continue

    log.debug(f"[Time|fetch_topK_similar] Cost : {time.time() - b_time}")
    if len(update_data) > 0:
        insert_ddb(ddb_table, company_label, update_data)
Example #41
class ChexSearch(object):
    """ Searches Chex index for game states and associated games. """

    #TODO: Combine results of board transforms with binary search algo.

    def __init__(self, chex_index, results=10, search_k=40):
        self.chex_index = chex_index
        self.results = results
        self.search_k = search_k
        self.annoy_index = AnnoyIndex(_bitboard_length, metric='angular')
        self.annoy_index.load(os.path.join(self.chex_index, 'annoy.idx'))
        self.chex_sql = SqliteDict(os.path.join(self.chex_index, 'sqlite.idx'))

    def search(self, board):
        """ Searches for board.

            board: game object of type chess.Board

            Return value: [
                (board, similarity score, [(game_id, move number), ...]), ...]
        """

        symmetrical_boards = [
            board_to_bitboard(board),
            invert_board(board),
            flip_board(board),
            reverse_and_flip(board)
        ]
        results = []
        for bitboard in symmetrical_boards:
            for annoy_id, similarity in zip(
                    *self.annoy_index.get_nns_by_vector(
                        bitboard, self.results, include_distances=True)):
                # Recompute ASCII key
                bitboard = self.annoy_index.get_item_vector(annoy_id)
                to_unhexlify = '%x' % int(
                    ''.join(map(str, map(int, bitboard))), 2)
                try:
                    key = binascii.unhexlify(to_unhexlify)
                except TypeError:
                    key = binascii.unhexlify('0' + to_unhexlify)
                results.append((bitboard_to_board(bitboard), similarity,
                                self.chex_sql[key]))
        return results

    def close(self):
        del self.annoy_index
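A hypothetical usage sketch of the class above (the index path is a placeholder; chess.Board comes from the python-chess package):

import chess

searcher = ChexSearch('my_chex_index', results=5)
board = chess.Board()                          # starting position
for found_board, similarity, games in searcher.search(board):
    print(similarity, games[:3])               # (game_id, move number) tuples
searcher.close()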
Example #42
def main():
    # Load the MNIST images
    train_imgs, train_lbls, test_imgs, test_lbls = load_mnist()
    print(train_imgs.shape, train_lbls.shape, test_imgs.shape, test_lbls.shape)

    if not os.path.isfile('./static/mnist_db.ann'):
        make_annoy_db(train_imgs)  # build the Annoy database
    annoy_db = AnnoyIndex((28 * 28), metric='euclidean')
    annoy_db.load('./static/mnist_db.ann')  # load the Annoy database

    # Feed each test image in, take its nearest neighbor's label, and compare with the true labels for a rough accuracy check
    y_pred = [
        train_lbls[annoy_db.get_nns_by_vector(test_img.flatten(), 1)[0]]
        for test_img in test_imgs
    ]
    score = accuracy_score(test_lbls, y_pred)
    print('acc:', score)
Example #43
def baseline_train(olddata, f, trees):
    """" olddata to train with using f number of features of the data and building an index with trees number of trees """
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if (os.path.isfile(saving_model)):
        print "Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data..."
        t.load(saving_model)
    else:
        print "Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data..."
        for i in olddata.index:
            v = list(olddata.ix[i, ['latitude', 'longitude', 'time_period']])
            t.add_item(i, v)
        print "Building the trees..."
        t.build(trees)
        assert t.get_n_items() == olddata.shape[0]
        print "Saving the model..."
        t.save(saving_model)  # Can easily be loaded into memory later.
    return (t)
Example #44
    def test1(self):
        rows = self.query_country_name('%')
        annoyIndex = AnnoyIndex(768)
        # for i,row in enumerate(rows):
        #     encode=self.bc.encode([row[1]])
        #     annoyIndex.add_item(i,encode[0])
        # annoyIndex.build(10)
        # annoyIndex.save('articles')
        annoyIndex.load('articles')
        result, distances = annoyIndex.get_nns_by_item(10,
                                                       5,
                                                       include_distances=True)
        print(rows[10])
        # Annoy's angular distance d = sqrt(2*(1 - cos)), so recover the cosine similarity
        print(1 - np.square(distances) / 2)
        for i in result:

            print(rows[i])
Example #45
    def get_similar_items(self, product_id: int,
                          rec_type: int) -> pd.DataFrame:
        '''
        Function that creates recommendation lists.

        The intuition behind using less components is reducing the number of latent factors
        that can be inferred. And, by excluding item features for the CAB model, recommendations
        will be less based off explicit features such as `aisle` and `department`.
        -------------------
        type:
        1 - Similar Items [DEFAULT_PARAMS]
        2 - Complement Items [CAB_PARAMS]
        '''
        logging.info(
            f'Logging recommendations for {self.model.config.ANNOY_PARAMS[rec_type]}'
        )
        if rec_type == 1:
            annoy_model = AnnoyIndex(
                self.model.config.LIGHTFM_PARAMS['no_components'])
            annoy_model.load(self.config.PATHS.models + '/item.ann')
        elif rec_type == 2:
            annoy_model = AnnoyIndex(
                self.model.config.LIGHTFM_CAB_PARAMS['no_components'])
            annoy_model.load(self.config.PATHS.models + '/item_cab.ann')
        similar_variants = annoy_model.get_nns_by_item(
            product_id,
            self.model.config.ANNOY_PARAMS['nn_count'],
            search_k=-1,
            include_distances=False)

        logging.info(type(similar_variants))
        logging.info(similar_variants)
        similar_variants_df = self.item_df.iloc[similar_variants, :]

        similarVariantsTable = PrettyTable(
            ['product_id', 'product_name', 'aisle', 'department', 'num'])
        similarVariantsTable.add_row([
            similar_variants_df['product_id'],
            similar_variants_df['product_name'], similar_variants_df['aisle'],
            similar_variants_df['department'], similar_variants_df['num']
        ])
        logging.info(
            f'{self.model.config.ANNOY_PARAMS[rec_type]} Data: \n{similarVariantsTable}'
        )

        return similar_variants_df
Example #46
def baseline_train(olddata, f, trees):
    """" olddata to train with using f number of features of the data and building an index with trees number of trees """
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if os.path.isfile(saving_model):
        print "Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data..."
        t.load(saving_model)
    else:
        print "Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data..."
        for i in olddata.index:
            v = list(olddata.ix[i, ["latitude", "longitude", "time_period"]])
            t.add_item(i, v)
        print "Building the trees..."
        t.build(trees)
        assert t.get_n_items() == olddata.shape[0]
        print "Saving the model..."
        t.save(saving_model)  # Can easily be loaded into memory later.
    return t
Example #47
def read_annoy(embedding):
    f = 64  # Embedding size
    u = AnnoyIndex(f, 'euclidean')
    u.load('./neural_networks/models/saved_annoy.ann'
           )  # super fast, will just mmap the file
    n = 1  # Num neighbors
    neighbors = []
    distances = []
    for emb in embedding:
        neighbor, dist = u.get_nns_by_vector(emb,
                                             n,
                                             search_k=-1,
                                             include_distances=True)
        neighbors.append(neighbor)
        distances.append(dist)

    return neighbors, distances
Example #48
 def similarity_search(image, num_closest_items, hash_table_file_path,
                       image_hash_file_path):
     """Input: image  Output: a list of images similar to the input
        Get the feature set associated with the image.
        Use feature set to query the ANNoy hashmap.
     """
     graph = create_graph(model_path)
     features = get_features_from_graph(graph, image)
     hash_table = AnnoyIndex(len(features))
     hash_table.load(hash_table_file_path)
     with open(image_hash_file_path, 'rb') as f:
         image_hash_table = pickle.load(f)
     neighbor_ids, distances = hash_table.get_nns_by_vector(
         features, num_closest_items, include_distances=True)
     # Translate integer ids into image paths.
     file_paths = [image_hash_table[i] for i in neighbor_ids]
     print(file_paths, distances)
Example #49
class Annoy(ANN):
    """
    Builds an ANN model using the Annoy library.
    """
    def load(self, path):
        # Load index
        self.model = AnnoyIndex(self.config["dimensions"],
                                self.config["metric"])
        self.model.load(path)

    def index(self, embeddings):
        # Inner product is equal to cosine similarity on normalized vectors
        self.config["metric"] = "dot"

        # Create index
        self.model = AnnoyIndex(self.config["dimensions"],
                                self.config["metric"])

        # Add items
        for x in range(embeddings.shape[0]):
            self.model.add_item(x, embeddings[x])

        # Build index
        self.model.build(self.setting("ntrees", 10))

    def search(self, queries, limit):
        # Lookup search k setting
        searchk = self.setting("searchk", -1)

        # Annoy doesn't have a built in batch query method
        results = []
        for query in queries:
            # Run the query
            ids, scores = self.model.get_nns_by_vector(query,
                                                       n=limit,
                                                       search_k=searchk,
                                                       include_distances=True)

            # Map results to [(id, score)]
            results.append(list(zip(ids, scores)))

        return results

    def save(self, path):
        # Write index
        self.model.save(path)
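The index() method above relies on the embeddings being L2-normalized so that the "dot" metric behaves like cosine similarity. A minimal sketch of that normalization step (shapes and tree count are assumptions):

import numpy as np
from annoy import AnnoyIndex

embeddings = np.random.rand(1000, 128).astype('float32')
# L2-normalize rows so the inner product equals cosine similarity
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

index = AnnoyIndex(128, 'dot')
for i, v in enumerate(embeddings):
    index.add_item(i, v)
index.build(10)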
Example #50
class FoodGetter:
    def __init__(self):
        pass

    def load(self, data_path, doc2vec_path, annoy_path):
        """
            This cannot train the models itself; it can only
            load ready-made models from disk.

            Loading takes about 5-7 seconds, which is normal.
        """
        self.data = pd.read_csv(data_path)
        self.data.ingredients = self.data.ingredients.apply(eval)
        self.data.steps = self.data.steps.apply(eval)

        self.doc2vec_model = gensim.models.doc2vec.Doc2Vec.load(doc2vec_path)
        self.length = len(self.doc2vec_model.infer_vector([" "]))

        self.annoy_model = AnnoyIndex(self.length, 'angular')
        self.annoy_model.load(annoy_path)

    def find(self, _input, N=5):
        """
            Input: a string with ingredients
            Output: tuples of (name, ingredient string, index)
            If nothing is found, an empty list is returned
        """
        _input = _input.split(" ")
        _res = set(_input)
        idx = self.annoy_model.get_nns_by_vector(
            self.doc2vec_model.infer_vector(_input), 1000, search_k=2000)
        res = filter(
            lambda index: (lambda x: len(_res & x) / len(x) > 0.55)
            (set(self.data.ingredients[index])), idx)
        ans = []
        for i, index in enumerate(res):
            if i == N:
                return ans
            temp = self.data.loc[index]
            ans.append((temp["name"], " ".join(temp.ingredients), index))
        return ans

    def get_steps(self, idx):
        "For a chosen recipe index, return the cooking instructions with their step numbers"
        return enumerate(self.data.loc[idx].steps)
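A hypothetical usage sketch of FoodGetter (all three file paths are placeholders):

fg = FoodGetter()
fg.load('recipes.csv', 'doc2vec.model', 'recipes.ann')
for name, ingredients, idx in fg.find('chicken garlic onion', N=3):
    print(name, '|', ingredients)
    for step_no, step in fg.get_steps(idx):
        print(step_no, step)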
Example #51
    def load(self):
        self.prev_id = -1
        self.indexes = []
        logger.info("Loading index {0}".format(self.actor_urn))
        for index in self.indexes:
            index.unload()

        for f in sorted(listdir(self.index_dir)):
            if f.endswith(".ann"):
                self.index_files.append(join(self.index_dir,f))
                index = AnnoyIndex(self.feat_size, metric='euclidean')
                index.load(join(self.index_dir, f))
                self.indexes.append(index)
                self.prev_id += index.get_n_items()
            elif f.endswith('saved_state'):
                self.mem_store = np.load(join(self.index_dir, f)).tolist()
        logger.info("Loaded {0} files with total {1} records for index {2}"
                    .format(len(self.indexes), self.prev_id + 1, self.actor_urn))
Example #52
class SimilarStringStore:

    def __init__(self, **kwargs):

        self.transformer = FeatureGenerator(k=1)

        print(self.transformer.n_features)

        self.store = AnnoyIndex(self.transformer.n_features)

    def vectorize(self, s):
        return self.transformer.transform(s)

    def add(self, id, s):
        ''' add a string to index '''

        vector = self.transformer.transform(s)
        self.store.add_item(int(id), vector)
        return vector

    def build(self):
        self.store.build(500)

    def save(self, filename='store.knn'):
        self.store.save(filename)

    def build_and_save(self, filename='store.knn'):
        self.build()
        self.save(filename)

    def load(self, filename='store.knn'):
        self.store.load(filename)


    def query(self, s):
        ''' query index '''
        vector = self.transformer.transform(s)
        neighbors = self.store.get_nns_by_vector(vector, 40)
        return neighbors


    def remove(self, id):
        ''' remove a string from the index '''
        pass
Example #53
def create_collage(input_image, profile_name, version_count):
    """
    given an input image and an existing profile, create a set of new collages
    """
    profile_folder = PROFILES_DIRECTORY + profile_name + "/"
    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)
    # todo: load feature dimensions from profile
    nns_index = AnnoyIndex(SAMPLE_DIMENSION[0]*SAMPLE_DIMENSION[1], metric="euclidean")
    print("loading trees...")
    nns_index.load(profile_folder + profile_name + ".tree")
    print("done.")
    subimage_index = pickle.load(
        open(profile_folder + profile_name + ".p", "rb"))
    template_image = Image.open(input_image)
    image_width, image_height = template_image.size[0], template_image.size[1]
    crop_width, crop_height = subimage_index[-1]["crop_width"], subimage_index[-1]["crop_height"]
    for i in xrange(version_count):
        print("Creating collage {}/{}...".format(i+1, version_count))
        output_image = template_image.copy()
        for x in xrange(0, image_width-crop_width, crop_width):
            for y in xrange(0, image_height-crop_height, crop_height):
                box = (x, y, x + crop_width, y + crop_height)
                crop_box = output_image.crop(box)
                crop_sample = crop_box.convert("LA").resize(SAMPLE_DIMENSION)
                gs_pixeldata = []
                for pixel in list(crop_sample.getdata()):
                    gs_pixeldata.append(pixel[0])
                image_neighbor = nns_index.get_nns_by_vector(gs_pixeldata, version_count)[i]
                substitute_image = Image.open(subimage_index[image_neighbor]["image"])
                substitute_crop = substitute_image.crop(
                subimage_index[image_neighbor]["box"])
                output_image.paste(substitute_crop, box)
        output_path = OUTPUT_DIRECTORY + str(i) + ".png"
        output_image.save(output_path, "PNG")
        print("done.")
    print("{} image(s) saved in {}".format(
            version_count, OUTPUT_DIRECTORY))
    return
Example #54
    def merge_indicies(self, index_file_a, index_file_b, sender_urn):
        logger.info("Merging {0} and {1} for {2} index".format(index_file_a, index_file_b, sender_urn))
        index_a = AnnoyIndex(self.feat_size, metric='euclidean')
        index_b = AnnoyIndex(self.feat_size, metric='euclidean')
        new_index = AnnoyIndex(self.feat_size, metric='euclidean')

        index_a.load(index_file_a)
        index_b.load(index_file_b)

        cnt = 0
        for i in range(index_a.get_n_items()):
            new_index.add_item(cnt, index_a.get_item_vector(i))
            cnt += 1

        for i in range(index_b.get_n_items()):
            new_index.add_item(cnt, index_b.get_item_vector(i))
            cnt += 1


        new_index_file = index_file_a + ".merged"

        index_a.unload()
        index_b.unload()

        new_index.build(self.n_trees)
        new_index.save(new_index_file)
        logger.info("Merging {0} and {1} for {2} index, total number of items: {3}".format(
                index_file_a,
                index_file_b,
                sender_urn,
                cnt))

        new_index.unload()
        pykka.ActorRegistry.get_by_urn(sender_urn).proxy().complete_compaction(
                new_index_file=new_index_file,
                index_file_a=index_file_a,
                index_file_b=index_file_b
        )
Example #55
    def _get_index(self, f, distance):
        input = 'test/glove.twitter.27B.%dd.txt.gz' % f
        output = 'test/glove.%d.%s.annoy' % (f, distance)
        
        if not os.path.exists(output):
            if not os.path.exists(input):
                # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
                url = 'http://www-nlp.stanford.edu/data/glove.twitter.27B.%dd.txt.gz' % f
                print('downloading', url, '->', input)
                urlretrieve(url, input)

            print('building index', distance, f)
            annoy = AnnoyIndex(f, distance)
            for i, line in enumerate(gzip.open(input, 'rb')):
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v);
                
            annoy.build(10)
            annoy.save(output)

        annoy = AnnoyIndex(f, distance)
        annoy.load(output)
        return annoy
Example #56
    def test_load_save_get_item_vector(self):
        f = 3
        i = AnnoyIndex(f)
        i.add_item(0, [1.1, 2.2, 3.3])
        i.add_item(1, [4.4, 5.5, 6.6])
        i.add_item(2, [7.7, 8.8, 9.9])
 
        numpy.testing.assert_array_almost_equal(i.get_item_vector(0), [1.1, 2.2, 3.3])
        self.assertTrue(i.build(10))
        self.assertTrue(i.save('blah.ann'))
        numpy.testing.assert_array_almost_equal(i.get_item_vector(1), [4.4, 5.5, 6.6])
        j = AnnoyIndex(f)
        self.assertTrue(j.load('blah.ann'))
        numpy.testing.assert_array_almost_equal(j.get_item_vector(2), [7.7, 8.8, 9.9])
Example #57
    def _get_index(self, f, distance):
        input = "test/glove.twitter.27B.%dd.txt.gz" % f
        output = "test/glove.%d.%s.annoy" % (f, distance)
        output_correct = "test/glove.%d.%s.correct" % (f, distance)

        if not os.path.exists(output):
            if not os.path.exists(input):
                # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
                # Hosting them on my own S3 bucket since the original files changed format
                url = "https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz" % f
                print("downloading", url, "->", input)
                urlretrieve(url, input)

            print("adding items", distance, f)
            annoy = AnnoyIndex(f, distance)
            for i, line in enumerate(gzip.open(input, "rb")):
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v)

            print("building index")
            annoy.build(10)
            annoy.save(output)

        annoy = AnnoyIndex(f, distance)
        annoy.load(output)

        if not os.path.exists(output_correct):
            print("finding correct answers")
            f_output = open(output_correct, "w")
            for i in range(10000):
                js_slow = annoy.get_nns_by_item(i, 11, 100000)[1:]
                assert len(js_slow) == 10
                f_output.write(" ".join(map(str, js_slow)) + "\n")
            f_output.close()

        return annoy, open(output_correct)
Example #58
    def _get_index(self, f, distance):
        input = 'test/glove.twitter.27B.%dd.txt.gz' % f
        output = 'test/glove.%d.%s.annoy' % (f, distance)
        
        if not os.path.exists(output):
            if not os.path.exists(input):
                # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
                # Hosting them on my own S3 bucket since the original files changed format
                url = 'https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz' % f
                print('downloading', url, '->', input)
                urlretrieve(url, input)

            print('building index', distance, f)
            annoy = AnnoyIndex(f, distance)
            for i, line in enumerate(gzip.open(input, 'rb')):
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v);
                
            annoy.build(10)
            annoy.save(output)

        annoy = AnnoyIndex(f, distance)
        annoy.load(output)
        return annoy
Example #59
 def test_load_save(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     u = i.get_item_vector(99)
     i.save('i.tree')
     v = i.get_item_vector(99)
     self.assertEqual(u, v)
     j = AnnoyIndex(10)
     j.load('test/test.tree')
     w = i.get_item_vector(99)
     self.assertEqual(u, w)
     # Ensure that specifying the prefault flag on save/load does not change the result
     j.save('j.tree', True)
     k = AnnoyIndex(10)
     k.load('j.tree', True)
     x = k.get_item_vector(99)
     self.assertEqual(u, x)
     k.save('k.tree', False)
     l = AnnoyIndex(10)
     l.load('k.tree', False)
     y = l.get_item_vector(99)
     self.assertEqual(u, y)