Example #1
class ImageSearchAnnoyCombo:
    '''
    load an Annoy index for approximate nearest neighbor computation
    Annoy's angular distance is dist(u,v) = sqrt(2(1-cos(u,v)))
    '''
    def __init__(self,h5fname = 'X_ILSVRC2015.hdf5',annf='ILSVRC2015.ann',imageListPath = '/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt',dset = 'fc6fc7'):
        #load h5 data
        h5f = h5py.File(h5fname,'r')
        self.X = h5f[dset]
        #load filenames
        with open(imageListPath,'r') as f:
            self.line_to_file = {i:line.rstrip() for i,line in enumerate(f)}
        self.A = AnnoyIndex(self.X.shape[1],'angular')
        self.A.load(annf)

    def run_query_approx(self,query,n=100,accuracy_factor = 5):
        nearest,scores = self.A.get_nns_by_vector(query, n, search_k=n*int(accuracy_factor)*128, include_distances=True)
        return zip((self.line_to_file[i] for i in nearest),scores)

    def run_query_exact(self,query,n=1000,nsmall=100):
        #retrieve approximate nearest neighbors using Annoy, then do exact ranking by loading from h5 into memory
        #use Annoy
        if n < nsmall:
            n = nsmall
        indexes = self.A.get_nns_by_vector(query, n, search_k=-1, include_distances=False)
        indexes_sorted = sorted(indexes)
        #use scipy cdist (or normalize first and do dot product for faster computation)
        #getting X by index from disc is very slow. 
        distance = (cdist(self.X[indexes_sorted], query.reshape((1,query.shape[0])), 'cosine'))[:,0]
        ind = np.argpartition(distance, nsmall)[:nsmall]#partial sort, indices for top n,
        s_ind = np.argsort(distance[ind])#sort 
        nearest = ind[s_ind]
        scoresorted = distance[ind][s_ind]
        return zip((self.line_to_file[indexes_sorted[i]] for i in nearest),scoresorted)
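The docstring above refers to Annoy's angular metric; since the reported angular distance is sqrt(2(1 - cos(u, v))), the cosine similarity can be recovered from the returned score. Below is a minimal, hypothetical sketch of that conversion (not taken from any of the projects listed here; it only assumes the annoy package is installed):

from annoy import AnnoyIndex

f = 3
index = AnnoyIndex(f, 'angular')
index.add_item(0, [1.0, 0.0, 0.0])
index.add_item(1, [1.0, 1.0, 0.0])
index.build(10)

ids, dists = index.get_nns_by_item(0, 2, include_distances=True)
for i, d in zip(ids, dists):
    # angular distance d = sqrt(2 * (1 - cos)), so cos = 1 - d**2 / 2
    print(i, d, 1.0 - d ** 2 / 2.0)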
Example #2
def do(indextype):
    a = AnnoyIndex(8, indextype[0])
    a.load('points.%s.annoy' % indextype)
    with open('points.%s.ann.txt' % indextype, 'w') as out:
        for q_index in [1443, 1240, 818, 1725, 1290, 2031, 1117, 1211, 1902, 603]:
            nns = a.get_nns_by_item(q_index, 10)
            print >> out, '%s\t%s' % (q_index, ','.join([str(n) for n in nns]))
Example #3
File: index_test.py Project: spotify/annoy
    def test_overwrite_index(self):
        # Issue #335
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)
        t.save('test.ann')

        # Load index file
        t2 = AnnoyIndex(f)
        t2.load('test.ann')

        # Overwrite index file
        t3 = AnnoyIndex(f)
        for i in range(500):
            v = [random.gauss(0, 1) for z in range(f)]
            t3.add_item(i, v)
        t3.build(10)
        if os.name == 'nt':
            # Can't overwrite on Windows
            with self.assertRaises(IOError):
                t3.save('test.ann')
        else:
            t3.save('test.ann')
            # Get nearest neighbors
            v = [random.gauss(0, 1) for z in range(f)]
            nns = t2.get_nns_by_vector(v, 1000)  # Should not crash
Example #4
    def _get_index(self, dataset):
        url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
        vectors_fn = os.path.join('test', dataset + '.hdf5')
        index_fn = os.path.join('test', dataset + '.annoy')

        if not os.path.exists(vectors_fn):
            print('downloading', url, '->', vectors_fn)
            urlretrieve(url, vectors_fn)

        dataset_f = h5py.File(vectors_fn)
        distance = dataset_f.attrs['distance']
        f = dataset_f['train'].shape[1]
        annoy = AnnoyIndex(f, distance)

        if not os.path.exists(index_fn):
            print('adding items', distance, f)
            for i, v in enumerate(dataset_f['train']):
                annoy.add_item(i, v)

            print('building index')
            annoy.build(10)
            annoy.save(index_fn)
        else:
            annoy.load(index_fn)
        return annoy, dataset_f
Example #5
    def retrieve(self):

        print 'Loading necessary files..'
        u = AnnoyIndex(self.dim, metric='angular')
        u.load(index_file)

        print 'ANN Retrieval..'
        for n_neighbors in knns:
            print 'Number of neighbors: ' + str(n_neighbors)
            for mult in self.multipliers:
                print 'Multiplier: ' + str(mult)
                search_k = self.n_trees * n_neighbors * mult
                filename = '.'.join((self.test_file.split('/')[-1].split('.')[:-1]))
                with open(self.test_file, 'r') as data_file:
                    data = json.load(data_file)
                    qArray = []
                    for i in range(len(data["questions"])):
                        question_body = data["questions"][i]["body"]
                        question_id = data["questions"][i]["id"]
                        qcentroid = np.transpose(np.array(get_centroid_idf(question_body, self.emb, self.idf, self.stopwords, self.dim)))

                        anns = u.get_nns_by_vector(qcentroid, n_neighbors, search_k)
                        doc_anns = []
                        for n in anns:
                            doc_anns.append(self.idmap[n])
                        q = Question(question_body, question_id, doc_anns)
                        qArray.append(q)
                    directory = "system_results/"
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    with open(str(directory)+"/"+"CentIDF_annoy_"+str(self.n_trees)+"_"+str(n_neighbors)+"_"+str(mult)+".json", "w+") as outfile:
                        outfile.write(json.dumps({"questions":[ob.__dict__ for ob in qArray]}, indent=2))
Example #6
def build_annoy_index(corpus, dimension, winlen, winstep):
    print "Adding to Annoy index"
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    i = 0
    for filename, frames in corpus:
#        print filename, frames.shape
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            index.add_item(i, mfcc.tolist())
            assert mfcc_list[i] == (filename, index_in_file)
            i += 1

    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES
            }
    cache_filename = "annoy_index_" + hashlib.md5(str([filename for filename, frames in corpus])).hexdigest() + "." + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items())) + ".tree"
    
    if not os.path.exists(cache_filename):
        print "Building Annoy index with %d trees" % ANN_NTREES
    #    index.build(-1)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print "\tWrote cache to %s" % cache_filename
    else:
        print "\tReading cache from %s" % cache_filename
        index.load(cache_filename)
    return index, mfcc_list
Example #7
    def test_zero_vectors(self):
        # Mentioned on the annoy-user list
        bitstrings = [
            '0000000000011000001110000011111000101110111110000100000100000000',
            '0000000000011000001110000011111000101110111110000100000100000001',
            '0000000000011000001110000011111000101110111110000100000100000010',
            '0010010100011001001000010001100101011110000000110000011110001100',
            '1001011010000110100101101001111010001110100001101000111000001110',
            '0111100101111001011110010010001100010111000111100001101100011111',
            '0011000010011101000011010010111000101110100101111000011101001011',
            '0011000010011100000011010010111000101110100101111000011101001011',
            '1001100000111010001010000010110000111100100101001001010000000111',
            '0000000000111101010100010001000101101001000000011000001101000000',
            '1000101001010001011100010111001100110011001100110011001111001100',
            '1110011001001111100110010001100100001011000011010010111100100111',
        ]
        vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

        f = 64
        idx = AnnoyIndex(f, 'hamming')
        for i, v in enumerate(vectors):
            idx.add_item(i, v)

        idx.build(10)
        idx.save('idx.ann')
        idx = AnnoyIndex(f, 'hamming')
        idx.load('idx.ann')
        js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
        self.assertEquals(js[0], 0)
        self.assertEquals(ds[:4], [0, 1, 1, 22])
Example #8
def create_walks(df,index_file,patient_dict_file,index_dict_file,n_neighbors = 25,walks_per_patient=10,walk_size=50,out_dir="./"):
    index = AnnoyIndex(df.shape[1])
    index.load(index_file)
    patient_dict = {}
    for key, val in csv.reader(open(patient_dict_file)):
        patient_dict[key] = int(val)
    index_dict = {}
    for key, val in csv.reader(open(index_dict_file)):
        index_dict[int(key)] = val
    print("Computing nearest-neighbors...")
    neighbor_dict = {}
    for i in range(index.get_n_items()):
        if i % 1000 == 0:
            print str(i)
        patient_id = index_dict[i]
        neighbors = index.get_nns_by_item(i=i, n=n_neighbors, search_k=-1, include_distances=False)
        neighbor_ids = [index_dict[x] for x in neighbors]
        neighbor_dict[patient_id] = neighbor_ids
    f = open(out_dir+"patient_walks.txt", 'wb')
    for i in range(index.get_n_items()):
        if i % 1000 == 0:
            print str(i)
        patient_id = index_dict[i]
        patient_sentences = ""
        for j in range(walks_per_patient):
            sentence = generate_sentence(start=patient_id,neighbor_dict=neighbor_dict,
                                        n_neighbors=n_neighbors,walk_size=walk_size)
            patient_sentences += sentence + "\n"
            ## Write it ##
        f.write(patient_sentences)
Example #9
 def test_save_without_build(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.add_item(1000, [random.gauss(0, 1) for z in xrange(10)])
     i.save('x.tree')
     j = AnnoyIndex(10)
     j.load('x.tree')
     j.build(10)
Example #10
 def test_no_items(self):
     idx = AnnoyIndex(100)
     idx.build(n_trees=10)
     idx.save('foo.idx')
     idx = AnnoyIndex(100)
     idx.load('foo.idx')
     self.assertEquals(idx.get_n_items(), 0)
     self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [])
Example #11
class FeatureNN:
    tree = None

    def __init__(self, features, tree_file):
        self.tree = AnnoyIndex(features, metric='euclidean')
        self.tree.load(str(tree_file))

    def nn(self, x):
        return self.tree.get_nns_by_vector(x.tolist(), 1)[0]
Example #12
def main():

    # Annoy Vector Dimension
    vec_dimension =100

    models_dir = "/raid/ankit/ann_models/"
    start = time.time()
    print "Starting: Loading of memory mapped models ... "
    # Load all models - memory mapped - quick
    ann1 = AnnoyIndex(vec_dimension)
    ann1.load(models_dir+"model10_split1.ann")

    ann2 = AnnoyIndex(vec_dimension)
    ann2.load(models_dir+"model10_split2.ann")

    ann3 = AnnoyIndex(vec_dimension)
    ann3.load(models_dir+"model10_split3.ann")

    ann4 = AnnoyIndex(vec_dimension)
    ann4.load(models_dir+"model10_split4.ann")

    ann5 = AnnoyIndex(vec_dimension)
    ann5.load(models_dir+"model10_split5.ann")
    end =time.time()

    print "All annoy-lsh models loaded! Time Taken: "+str((end-start)/60)+ " minutes."


    print "\nSimilar Queries - LSH Interface [All Top Queries]"
    print "----------------------------------------------------"

    flag = "True"
    while (flag == "True"):
        testquery = raw_input("Enter Query: ")
        nearest_num = raw_input("Number of similar queries: ")
        if nearest_num == "" or nearest_num == "0":
            nearest_num = 10
        nearest_num = int(nearest_num)
        if not testquery.strip() =="":
            lsh_list_n = get_similar_queries(testquery.strip(), nearest_num, ann1, ann2, ann3, ann4, ann5)

            # Return and Print the Top 10 nearest Queries to the Original Query
            print "\nCandidate Nearest Queries [TOP 10]: "
            count = 0
            for query,distance in lsh_list_n:
                if count == nearest_num:
                    break
                print str(query)+"\t"+str(distance)
                count+=1

            user_input = raw_input("\nDo you wish to continue again? (Type 'no' to quit): ")
            if user_input == "no":
                print "\nGoodbye!"
                break
            else:
                print "\n"
                continue
Example #13
 def test_only_one_item(self):
     # reported to annoy-user by Kireet Reddy
     idx = AnnoyIndex(100)
     idx.add_item(0, numpy.random.randn(100))
     idx.build(n_trees=10)
     idx.save('foo.idx')
     idx = AnnoyIndex(100)
     idx.load('foo.idx')
     self.assertEquals(idx.get_n_items(), 1)
     self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [0])
Example #14
 def test_load_save(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     u = i.get_item_vector(99)
     i.save('x.tree')
     v = i.get_item_vector(99)
     self.assertEqual(u, v)
     j = AnnoyIndex(10)
     j.load('test/test.tree')
     w = i.get_item_vector(99)
     self.assertEqual(u, w)
Example #15
def get_tree_index(metric='angular', size=4096):
    '''
    INPUT: Optional parameters for the metric space and size of AnnoyIndex 
    OUTPUT: AnnoyIndex tree, dictionary of node assignment to image names
    '''
    tree = AnnoyIndex(size, metric=metric)
    tree.load(DATA_DIR + 'tree_' + metric + '.ann')

    with open(DATA_DIR + 'indexes_' + metric, 'rb') as f:
        indexes = pickle.load(f)

    return tree, indexes
Example #16
 def test_on_disk(self):
     f = 2
     i = AnnoyIndex(f, 'euclidean')
     i.on_disk_build('on_disk.ann')
     self.add_items(i)
     i.build(10)
     self.check_nns(i)
     i.unload()
     i.load('on_disk.ann')
     self.check_nns(i)
     j = AnnoyIndex(f, 'euclidean')
     j.load('on_disk.ann')
     self.check_nns(j)
Example #17
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    f = data.base.shape[1]

    for ntrees in args.ntrees:
        t = AnnoyIndex(f)   # Length of item vector that will be indexed
        idxpath = os.path.join(args.exp_dir, 'sift_annoy_ntrees%d.idx' % ntrees)
        if not os.path.exists(idxpath):
            logging.info("Adding items ...")
            for i in xrange(data.nbae):
                t.add_item(i, data.base[i])
                if i % 100000 == 0:
                    logging.info("\t%d/%d" % (i, data.nbae))
            logging.info("\tDone!")
            logging.info("Building indexes ...")
            t.build(ntrees)
            logging.info("\tDone!")
            t.save(idxpath)
        else:
            logging.info("Loading indexes ...")
            t.load(idxpath)
            logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            ids[i, :] = np.array(t.get_nns_by_vector(data.query[i], args.topk))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-ntrees_%s\n" % ("Annoy", ntrees))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example #18
File: io.py Project: kiminh/ann-server
def load_index(path_index: PathType,
               meta_d: Dict) \
        -> AnnoyIndex:
    """ We rely on ANNOY's usage of mmap to be fast loading
    (fast enough that we can load it on every single call)
    """
    n_dim = meta_d['n_dim']
    metric = meta_d['metric']
    u = AnnoyIndex(
        n_dim,
        metric=metric,
    )
    u.load(str(path_index))
    u.set_seed(SEED)
    return u
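The docstring above depends on AnnoyIndex.load() memory-mapping the index file instead of reading it fully into memory, which is why re-loading on every call stays cheap. The following short sketch only illustrates that idea and assumes a recent Annoy version (the optional prefault flag asks the OS to page the whole file in eagerly):

from annoy import AnnoyIndex

def load_index_sketch(path, n_dim, metric='angular', prefault=False):
    # load() mmaps the file; prefault=True touches all pages up front,
    # trading a slower load for faster first queries
    index = AnnoyIndex(n_dim, metric)
    index.load(str(path), prefault=prefault)
    return index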
Example #19
class AnnoyLookup(object):
    def __init__(self, metadata_path):
        with open(os.path.join(metadata_path, "metadata.json")) as f:
            self._data = json.load(f)
        self._index = AnnoyIndex(self._data["feature_length"],
                                 metric="angular")
        self._index.load(os.path.join(metadata_path, "index.ann"))

    def get_neighbours(self, embedding, max_neigh=3):
        items, distances = self._index.get_nns_by_vector(
            embedding, max_neigh, include_distances=True)
        zipped = zip(items, distances)
        sorted_list = sorted(zipped, key=lambda t: t[1])
        return [(self._data["filenames"][idx], distance)
                for idx, distance in sorted_list]
Example #20
 def test_save_load(self):
     f = 100
     i = AnnoyIndex(f, 'hamming')
     u = numpy.random.binomial(1, 0.5, f)
     v = numpy.random.binomial(1, 0.5, f)
     i.add_item(0, u)
     i.add_item(1, v)
     i.build(10)
     i.save('blah.ann')
     j = AnnoyIndex(f, 'hamming')
     j.load('blah.ann')
     rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
     self.assertEquals(rs, [0, 1])
     self.assertAlmostEqual(ds[0], 0)
     self.assertAlmostEqual(ds[1], numpy.dot(u-v, u-v))
Example #21
class _Annoy(object):
    def __init__(self, feature):
        model_path = 'model.ann'
        n_dim = feature.shape[1] * feature.shape[2]
        feature = feature.reshape(feature.shape[0], n_dim)
        self.t = AnnoyIndex(n_dim, 'angular')
        if not os.path.exists(model_path):
            for i, f in enumerate(tqdm(feature)):
                # normalize
                v = f / np.sum(f)
                self.t.add_item(i, v)
            self.t.build(10)
            self.t.save(model_path)
        else:
            self.t.load(model_path)
Example #22
def createAnnoyIndex(d, targetPoints, n_trees):
    #create AnnoyIndex in R^(2*d)
    targetIndex = AnnoyIndex(2 * d, metric='euclidean')
    #add each of the projected target points
    for i in range(targetPoints.shape[0]):
        targetIndex.add_item(i, projectToTorus(targetPoints[i]))

    #build the LSH-forest with the target points
    targetIndex.build(n_trees)

    #save and load with memory map
    targetIndex.save("LSHForest.ann")
    loadedIndex = AnnoyIndex(2 * d, metric='euclidean')
    loadedIndex.load("LSHForest.ann")
    return loadedIndex
Example #23
 def test_save_load(self):
     f = 100
     i = AnnoyIndex(f, 'hamming')
     u = numpy.random.binomial(1, 0.5, f)
     v = numpy.random.binomial(1, 0.5, f)
     i.add_item(0, u)
     i.add_item(1, v)
     i.build(10)
     i.save('blah.ann')
     j = AnnoyIndex(f, 'hamming')
     j.load('blah.ann')
     rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
     self.assertEquals(rs, [0, 1])
     self.assertAlmostEqual(ds[0], 0)
     self.assertAlmostEqual(ds[1], numpy.dot(u - v, u - v))
Example #24
def get_nn_by_name(name):
    Session = sessionmaker(bind=engine)
    session = Session()
    # This command should be used to
    name_string = "%{0}%".format(name)
    #     print(name_string)
    result = session.query(annoy_table).filter(
        annoy_table.c.name.like(name_string)).first()
    u = AnnoyIndex(f)
    u.load('../../test.ann')  # super fast, will just mmap the file
    list_of_near = u.get_nns_by_item(result[1],
                                     4)  # will find the 4 nearest neighbors
    # remove current node
    list_of_near.remove(result[1])
    return list_of_near
Example #25
class ANN:
    def __init__(self, dimension):
        self.ann = AnnoyIndex(dimension)
    def addVectors(self,vectors):
        for idx,v in enumerate(vectors):
            self.ann.add_item(idx,v)
        self.ann.build(10)
    def query(self,vector):
        match = self.ann.get_nns_by_vector(vector,1)[0]
        # return self.ann.get_item_vector(match),match
        return match
    def save(self):
        self.ann.save("analogies.ann")
    def load(self,filename):
        self.ann.load(filename)
Example #26
def main():
    parser = argparse.ArgumentParser(description='recommend system')
    parser.add_argument('--query',
                        '-q',
                        type=str,
                        default="",
                        help='query image path')
    parser.add_argument('--bbox',
                        '-b',
                        type=str,
                        default="",
                        help='bbox image')
    parser.add_argument('--genre',
                        '-g',
                        type=str,
                        default="tops",
                        help='genre')
    args = parser.parse_args()

    if args.query == "":
        raise ("")
    genre = args.genre

    data_path = []
    base = os.path.dirname(os.path.abspath(__file__))
    list_path = os.path.normpath(os.path.join(base, './img_list.txt'))
    with open(list_path, "r") as f:
        for line in f.readlines():
            data_path.append(line.rstrip())
    annoy_model = AnnoyIndex(256)
    annoy_model.load(base + "/{}.ann".format(genre))

    query_path = args.query
    bbox = [int(item) for item in args.bbox.split(",")]

    query_img = utils.read_image(query_path, color=True)
    croped_query_img = crop_img(query_img, bbox)
    comparing_hist = cv2.calcHist([croped_query_img], [0], None, [256],
                                  [0, 256])
    predict_indexes = annoy_model.get_nns_by_vector(comparing_hist,
                                                    5,
                                                    search_k=-1)
    predict_indexes = [
        data_path[idx].split("\\")[-1] for idx in predict_indexes
    ]
    #with open("recommend_image.json", "w") as f:
    json_data = json.dumps(predict_indexes)
    print(json_data)
Example #27
def closest_topK(unseen_event, concept_embedding, concept_mapping, dim, topK=10, unseen_id=None):
    """
    unseen_event: (title: str, description: str)
    concept_embedding: {word_id : [emb]}
    concept_mapping: {word_id : word_string}
    """
    unseen_event_title_tags = jieba.analyse.extract_tags(unseen_event[0])

    # Switch textrank or embedrank
    if ARGS.embedrank:
        unseen_event_description_words = embedrank_getkeywords(unseen_event[1])
    elif ARGS.tfidf:
        unseen_event_description_words = tfidf_getkeywords(unseen_event[1])
    else:
        unseen_event_description_words = textrank_getkeywords(unseen_event[1])

    print('title words:', unseen_event_title_tags)
    print('description words:', unseen_event_description_words)
    keywords = [*unseen_event_title_tags, *unseen_event_description_words]

    # INVOLVE GENERE
    # try:
    #     for word in GENERE_TO_KEYWORDS[ID_TO_GENERE[unseen_id]]:
    #         if word not in keywords:
    #             keywords.append(word)
    # except KeyError:
    #     pass
    ### END OF INVOLVING GENERE

    print("keywords", keywords)
    # Generate the label embedding for a new item
    event_concept_embeddings = []
    for word in keywords:
        try:
            event_concept_embeddings.append(concept_embedding[concept_mapping[word]])
        except KeyError:
            continue
    unseen_event_vector = [ sum(value) / len(value) for value in  zip(*event_concept_embeddings)]
    if unseen_event_vector == []:
        unseen_event_vector = [0] * dim
    annoy_index = AnnoyIndex(dim)
    annoy_index.load('cc2vec_textrank.ann')
    # Find topK colest item according to the label embedding
    ranking_list = annoy_index.get_nns_by_vector(unseen_event_vector, 10, search_k=-1, include_distances=True)
    propgation_list = []
    for id_, score in zip(ranking_list[0], ranking_list[1]):
        propgation_list.append((id_, score))
    return unseen_event_vector, propgation_list
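The list comprehension above computes the element-wise mean of the keyword embeddings before querying the Annoy index. A short numpy equivalent, shown only as a sketch (it assumes event_concept_embeddings is a list of equal-length vectors, as in the code above):

import numpy as np

def mean_embedding(event_concept_embeddings, dim):
    # element-wise mean of the keyword vectors; falls back to a zero
    # vector when no keyword had an embedding, mirroring the code above
    if not event_concept_embeddings:
        return [0.0] * dim
    return np.mean(np.asarray(event_concept_embeddings, dtype='float64'), axis=0).tolist()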
Example #28
def get_top_k_tables(sample_info_dict, id_to_index, index_file, dim, k):
    u = AnnoyIndex(dim, 'angular')
    u.load(index_file)
    ranks, top_k = [], {}
    for sentence, info in sample_info_dict.items():
        table_id, embedding = info['table_id'], info['embedding']
        table_index = id_to_index[table_id]
        closest_tables = u.get_nns_by_vector(embedding, 1000000)
        rank = closest_tables.index(table_index)
        if rank < k:
            label = [0 for _ in range(k)]
            label[rank] = 1
            info['top_k'] = closest_tables[:k]
            info['labels'] = label
        ranks.append(rank)
    return ranks
Example #29
    def debug():

        f = 40
        t = AnnoyIndex(f)  # Length of item vector that will be indexed
        for i in xrange(1000):
            v = [random.gauss(0, 1) for z in xrange(f)]
            t.add_item(i, v)

        t.build(10)  # 10 trees
        t.save('test.ann')

        # ...
        u = AnnoyIndex(f)
        u.load('test.ann')  # super fast, will just mmap the file
        print(u.get_nns_by_item(0,
                                1000))  # will find the 1000 nearest neighbors
Example #30
    def load_index(self, index_id):
        if self.annoy_index is None:
            log.info("loading initial index with id {}", self.current_index)
        else:
            log.info("switching index from {} to {}", self.current_index,
                     index_id)

        newindex = AnnoyIndex(108, metric='euclidean')
        newindex.load(config.index_config['index_path'] + 'index_' +
                      str(index_id) + '.ann')
        if self.annoy_index is not None:
            self.annoy_index.unload()
        self.annoy_index = newindex
        self.current_index = index_id
        log.info("finished switching index. now using index {}",
                 self.current_index)
Example #31
def test_build_sparse_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    sparse_data = csr_matrix(data)

    index = build_annoy_index(sparse_data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Example #32
    def _load_index(self, wherefrom, index_key):
        """Load an AnnoyIndex from disk"""
        est = self.estimator_

        # I can't think of anything more clever because I've been up for
        # hours and hours and hours, so this is the kludgiest solution:
        if index_key == "similar_items_index":
            n_index = est.item_factors.shape[1]
        # Otherwise, "recommend_index"
        else:
            # This assumes approximate_recommend, since it's the only way
            # it will ever get to this code
            n_index = est.extra_
        index = AnnoyIndex(n_index, "angular")
        index.load(join(wherefrom, index_key))
        return index
Example #33
File: tnn.py Project: lkmklsmn/bbtnn
 def run(self):
     try:
         index = AnnoyIndex(self.n_dims, metric='euclidean')
         index.load(self.index_filepath)
         for i in range(self.data_indices[0], self.data_indices[1]):
             neighbour_indexes = index.get_nns_by_vector(self.X[i,:]
                 , self.k, search_k=self.search_k, include_distances=False)
             neighbour_indexes = np.array(neighbour_indexes,
                                             dtype=np.uint32)
             self.results_queue.put(
                 IndexNeighbours(row_index=i,
                                 neighbour_list=neighbour_indexes))
     except Exception as e:
         self.exception = e
     finally:
         self.results_queue.close()
Example #34
    def test_on_disk(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.on_disk_build('test.ann')
        i.add_item(0, [2, 2])
        i.add_item(1, [3, 2])
        i.add_item(2, [3, 3])

        i.build(10)
        i.unload()

        i.load('test.ann')

        self.assertEqual(i.get_nns_by_vector([4, 4], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 1], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([4, 2], 3), [1, 2, 0])
Example #35
    def test_celeba_embedding(self):
        PATHS_JSON = os.getenv('PATHS_JSON', abspath(join(__file__, '..', '..', 'data', 'paths_celeba.json')))

        EMBEDDING_JSON = os.getenv('EMBEDDING_JSON', abspath(join(__file__, '..', '..', 'data', 'embeddings_celeba.json')))


        INDEX_FILENAME = os.getenv('INDEX_FILENAME', os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'index_celeba.ann')))

        NSW_INDEX_FILENAME = os.getenv('NSW_INDEX_FILENAME', os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'index_celeba_nsw')))

        TEST_CASES_FILENAME = os.getenv('TEST_CASES_FILENAME',
            os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'index_celeba_test_cases.json')))

        with open(PATHS_JSON, 'r') as fp:
            print('Loading paths')
            paths = np.array(json.load(fp))
        with open(EMBEDDING_JSON, 'r') as fp:
            print('Loading embeddings')
            embeddings = json.load(fp)

        with open(TEST_CASES_FILENAME, 'r') as fp:
            print('Loading test_cases')
            test_cases = json.load(fp)


        annoy = AnnoyIndex(len(embeddings[0]))    
        annoy_index = annoy.load(INDEX_FILENAME)

        print('building nsw index')
        nsw_index = PyNSW('l2')
        print('Creating nodes')
        nodes = [create_node(path, vector) for path, vector in zip(paths, embeddings)]
        print('Inserting nodes')
        for node in tqdm(nodes):
            nsw_index.nn_insert(node, 5, 1000)

        n, k_annoy, k_nsw = 0, 0, 0

        print('Calculating accuracy on CelebA')

        for tk in test_cases:
            vector = embeddings[int(tk['embedding_index'])]
            
            closest_paths_real = tk['closest_paths_real']

            closest_paths_annoy = paths[annoy.get_nns_by_vector(vector, 10, 1000)]

            closest_paths_nsw = [n[1] for n in nsw_index.nn_search(create_node('kek', vector), 5, 10)]

            assert len(closest_paths_real) == 10
            assert len(closest_paths_annoy) == 10
            assert len(closest_paths_nsw) == 10

            n += 10
            k_annoy += len(set(closest_paths_annoy).intersection(closest_paths_real))
            k_nsw += len(set(closest_paths_nsw).intersection(closest_paths_real))


        print('Annoy accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_annoy / n))
        print('NSW accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_nsw / n))
Example #36
class ImageSearchAnnoy:
    '''
    load an Annoy index for approximate nearest neighbor computation
    Annoy's angular distance is dist(u,v) = sqrt(2(1-cos(u,v)))
    '''
    def __init__(self,dimensions,annf='ILSVRC2015.ann',imageListPath = '/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt'):
        #load filenames
        with open(imageListPath,'r') as f:
            #self.line_to_file = {i:line.split('/')[-1].rstrip() for i,line in enumerate(f)}
            self.line_to_file = {i:line.rstrip() for i,line in enumerate(f)}
        self.A = AnnoyIndex(dimensions,'angular')
        self.A.load(annf)

    def run_query(self,query,n=100,accuracy_factor = 2):
        nearest,scores = self.A.get_nns_by_vector(query, n, search_k=n*int(accuracy_factor)*128, include_distances=True)
        return zip((self.line_to_file[i] for i in nearest),scores)
Example #37
File: ninteenth.py Project: shishi11/nlp
 def test(self):
     # feat=np.random.random((100000,4096))
     # annoyIndex = AnnoyIndex(4096)
     # annoyIndex.on_disk_build('a')
     # for i,v in enumerate(feat):
     #     annoyIndex.add_item(i,v)
     # for i,v in enumerate(feat):
     #     annoyIndex.add_item(i,v)
     t = time.time()
     #
     # annoyIndex.build(100)
     # print(time.time()-t)
     annoyIndex = AnnoyIndex(4096)
     annoyIndex.load('a')
     print(annoyIndex.get_nns_by_item(0, 5))
     print(time.time() - t)
Example #38
def init():
    global indices
    indices = defaultdict(lambda: defaultdict(dict))
    for dim in ['matrix', 'tensor']:
        for size in [500, 5000]:#, 1000, 5000, 10000]:
            folder = 'data/' + dim + '/200x' + str(size)
            # loading the index
            t = AnnoyIndex(20, 'angular')
            t.load(folder + '/embd.ann')
            indices[dim][size]['index'] = t
            # loading the extractions
            exts = pd.read_csv(folder + '/extr_index.csv')
            ext2idx = dict(zip((x['modifier'] + ';' + x['aspect'] \
                                for _, x in exts.iterrows()), range(len(exts))))
            indices[dim][size]['exts'] = exts
            indices[dim][size]['ext2idx'] = ext2idx
Example #39
def load_indexes(ann_filepath=None, celeb_mapping_path=None):
    home = expanduser("~")
    if ann_filepath is None:
        ann_filepath = os.path.join(home, 'celeb_index_60.ann')
        celeb_ann_id = '1-3Wb7fiINbrk9FSagTxjLdSjp7KzrMp7'
        if not os.path.exists(ann_filepath):
            download_file_from_google_drive(celeb_ann_id, ann_filepath)

    if celeb_mapping_path is None:
        celeb_mapping_path = os.path.join(home, 'celeb_mapping.json')
        celeb_mapping_file_id = '1wDaaSQ6NjxLkxpzYyTRknefizZUKnKDj'
        if not os.path.exists(celeb_mapping_path):
            download_file_from_google_drive(celeb_mapping_file_id,
                                            celeb_mapping_path)

    ann_index = AnnoyIndex(2048, 'angular')
    _ = ann_index.load(ann_filepath)

    with open(celeb_mapping_path) as json_file:
        celeb_mapping_temp = json.load(json_file)
    celeb_mapping_dict = {}
    for key, value_list in celeb_mapping_temp.items():
        for each_id in value_list:
            celeb_mapping_dict[each_id] = str(key)

    return ann_index, celeb_mapping_dict
Example #40
def fetch_topK_similar(items_vec_file, ann_model_file, dim, topK, item_idx_map, items_list_batch, ddb_table, company_label):
    b_time = time.time()
    log.debug("[fetch_topK_similar] Start to get topK items")
    ann_model = AnnoyIndex(dim, 'angular')
    ann_model.load(ann_model_file)
    update_data = {}
    items_set = set([item for sublist in items_list_batch for item in sublist])
    print(items_list_batch)
    print(items_set)
    with open(items_vec_file, 'r') as in_f:
        num_items, dim = in_f.readline().strip().split()
        for idx, line in enumerate(in_f):
            tmp = line.split()
            item_id = tmp[0]
            if item_id in items_set:
                action, content_id = item_id.split(':', 1)
                item_emb = list(map(float, tmp[1:]))
                if content_id not in update_data:
                    update_data[content_id] = {'item_id': content_id, 'label': company_label}

                res_dict = OrderedDict()
                topK_item, topK_dist = ann_model.get_nns_by_vector(item_emb, topK*3, include_distances=True)
                for item_idx, dist in zip(topK_item, topK_dist):
                    try:
                        item = item_idx_map[item_idx].split(':', 1)[1].strip()
                        if item not in res_dict:
                            res_dict[item] = Decimal(f"{1-dist:.4f}")
                            # Todo: maybe do score normalize here
                    except Exception as err:
                        log.error(err)
                        log.warning(f"Couldn't find item name : {item_idx_map[item_idx]}")
                    if len(res_dict) == topK:
                        break

                if action == Action.View.value:
                    update_data[content_id]['view_similar'] = res_dict
                elif action == Action.AddToCart.value:
                    update_data[content_id]['add_cart_similar'] = res_dict
                elif action == Action.Purchase.value:
                    update_data[content_id]['purchase_similar'] = res_dict
                else:
                    log.warning(f"{content_id} -> {action} is not a valid action...")
                    continue

    log.debug(f"[Time|fetch_topK_similar] Cost : {time.time() - b_time}")
    if len(update_data) > 0:
        insert_ddb(ddb_table, company_label, update_data)
Example #41
class ChexSearch(object):
    """ Searches Chex index for game states and associated games. """

    #TODO: Combine results of board transforms with binary search algo.

    def __init__(self, chex_index, results=10, search_k=40):
        self.chex_index = chex_index
        self.results = results
        self.search_k = search_k
        self.annoy_index = AnnoyIndex(_bitboard_length, metric='angular')
        self.annoy_index.load(os.path.join(self.chex_index, 'annoy.idx'))
        self.chex_sql = SqliteDict(os.path.join(self.chex_index, 'sqlite.idx'))

    def search(self, board):
        """ Searches for board.

            board: game object of type chess.Board

            Return value: [
                (board, similarity score, [(game_id, move number), ...]), ...]
        """

        symmetrical_boards = [
            board_to_bitboard(board),
            invert_board(board),
            flip_board(board),
            reverse_and_flip(board)
        ]
        results = []
        for bitboard in symmetrical_boards:
            for annoy_id, similarity in zip(
                    *self.annoy_index.get_nns_by_vector(
                        bitboard, self.results, include_distances=True)):
                # Recompute ASCII key
                bitboard = self.annoy_index.get_item_vector(annoy_id)
                to_unhexlify = '%x' % int(
                    ''.join(map(str, map(int, bitboard))), 2)
                try:
                    key = binascii.unhexlify(to_unhexlify)
                except TypeError:
                    key = binascii.unhexlify('0' + to_unhexlify)
                results.append((bitboard_to_board(bitboard), similarity,
                                self.chex_sql[key]))
        return results

    def close(self):
        del self.annoy_index
Example #42
def main():
    # load the MNIST images
    train_imgs, train_lbls, test_imgs, test_lbls = load_mnist()
    print(train_imgs.shape, train_lbls.shape, test_imgs.shape, test_lbls.shape)

    if not os.path.isfile('./static/mnist_db.ann'):
        make_annoy_db(train_imgs)  # build the annoy db
    annoy_db = AnnoyIndex((28 * 28), metric='euclidean')
    annoy_db.load('./static/mnist_db.ann')  # load the annoy database

    # run the test data through the index, take the nearest neighbor, and compare with the true labels to get a rough accuracy
    y_pred = [
        train_lbls[annoy_db.get_nns_by_vector(test_img.flatten(), 1)[0]]
        for test_img in test_imgs
    ]
    score = accuracy_score(test_lbls, y_pred)
    print('acc:', score)
Example #43
def baseline_train(olddata, f, trees):
    """" olddata to train with using f number of features of the data and building an index with trees number of trees """
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if (os.path.isfile(saving_model)):
        print "Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data..."
        t.load(saving_model)
    else:
        print "Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data..."
        for i in olddata.index:
            v = list(olddata.ix[i, ['latitude', 'longitude', 'time_period']])
            t.add_item(i, v)
        print "Building the trees..."
        t.build(trees)
        assert t.get_n_items() == olddata.shape[0]
        print "Saving the model..."
        t.save(saving_model)  # Can easily be loaded into memory later.
    return (t)
Example #44
File: ninteenth.py Project: shishi11/nlp
    def test1(self):
        rows = self.query_country_name('%')
        annoyIndex = AnnoyIndex(768)
        # for i,row in enumerate(rows):
        #     encode=self.bc.encode([row[1]])
        #     annoyIndex.add_item(i,encode[0])
        # annoyIndex.build(10)
        # annoyIndex.save('articles')
        annoyIndex.load('articles')
        result, index = annoyIndex.get_nns_by_item(10,
                                                   5,
                                                   include_distances=True)
        print(rows[10])
        print(np.cos(index))
        for i in result:

            print(rows[i])
Example #45
    def get_similar_items(self, product_id: int,
                          rec_type: int) -> pd.DataFrame:
        '''
        Function that creates recommendation lists.

        The intuition behind using less components is reducing the number of latent factors
        that can be inferred. And, by excluding item features for the CAB model, recommendations
        will be less based off explicit features such as `aisle` and `department`.
        -------------------
        type:
        1 - Similar Items [DEFAULT_PARAMS]
        2 - Complement Items [CAB_PARAMS]
        '''
        logging.info(
            f'Logging recommendations for {self.model.config.ANNOY_PARAMS[rec_type]}'
        )
        if rec_type == 1:
            annoy_model = AnnoyIndex(
                self.model.config.LIGHTFM_PARAMS['no_components'])
            annoy_model.load(self.config.PATHS.models + '/item.ann')
        elif rec_type == 2:
            annoy_model = AnnoyIndex(
                self.model.config.LIGHTFM_CAB_PARAMS['no_components'])
            annoy_model.load(self.config.PATHS.models + '/item_cab.ann')
        similar_variants = annoy_model.get_nns_by_item(
            product_id,
            self.model.config.ANNOY_PARAMS['nn_count'],
            search_k=-1,
            include_distances=False)

        logging.info(type(similar_variants))
        logging.info(similar_variants)
        similar_variants_df = self.item_df.iloc[similar_variants, :]

        similarVariantsTable = PrettyTable(
            ['product_id', 'product_name', 'aisle', 'department', 'num'])
        similarVariantsTable.add_row([
            similar_variants_df['product_id'],
            similar_variants_df['product_name'], similar_variants_df['aisle'],
            similar_variants_df['department'], similar_variants_df['num']
        ])
        logging.info(
            f'{self.model.config.ANNOY_PARAMS[rec_type]} Data: \n{similarVariantsTable}'
        )

        return similar_variants_df
Example #46
def baseline_train(olddata, f, trees):
    """" olddata to train with using f number of features of the data and building an index with trees number of trees """
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if os.path.isfile(saving_model):
        print "Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data..."
        t.load(saving_model)
    else:
        print "Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data..."
        for i in olddata.index:
            v = list(olddata.ix[i, ["latitude", "longitude", "time_period"]])
            t.add_item(i, v)
        print "Building the trees..."
        t.build(trees)
        assert t.get_n_items() == olddata.shape[0]
        print "Saving the model..."
        t.save(saving_model)  # Can easily be loaded into memory later.
    return t
Example #47
def read_annoy(embedding):
    f = 64  # Embedding size
    u = AnnoyIndex(f, 'euclidean')
    u.load('./neural_networks/models/saved_annoy.ann'
           )  # super fast, will just mmap the file
    n = 1  # Num neighbors
    neighbors = []
    distances = []
    for emb in embedding:
        neighbor, dist = u.get_nns_by_vector(emb,
                                             n,
                                             search_k=-1,
                                             include_distances=True)
        neighbors.append(neighbor)
        distances.append(dist)

    return neighbors, distances
Example #48
 def similarity_search(image, num_closest_items, hash_table_file_path,
                       image_hash_file_path):
     """Input: image  Output: a list of images similar to the input
        Get the feature set associated with the image.
        Use feature set to query the ANNoy hashmap.
     """
     graph = create_graph(model_path)
     features = get_features_from_graph(graph, image)
     hash_table = AnnoyIndex(len(features))
     hash_table.load(hash_table_file_path)
     with open(image_hash_file_path, 'rb') as f:
         image_hash_table = pickle.load(f)
     ids, distances = hash_table.get_nns_by_vector(
         features, num_closest_items, include_distances=True)
     # Translate each integer id into an image path.
     file_paths = [image_hash_table[idx] for idx in ids]
     print(file_paths)
     return file_paths
Example #49
class Annoy(ANN):
    """
    Builds an ANN model using the Annoy library.
    """
    def load(self, path):
        # Load index
        self.model = AnnoyIndex(self.config["dimensions"],
                                self.config["metric"])
        self.model.load(path)

    def index(self, embeddings):
        # Inner product is equal to cosine similarity on normalized vectors
        self.config["metric"] = "dot"

        # Create index
        self.model = AnnoyIndex(self.config["dimensions"],
                                self.config["metric"])

        # Add items
        for x in range(embeddings.shape[0]):
            self.model.add_item(x, embeddings[x])

        # Build index
        self.model.build(self.setting("ntrees", 10))

    def search(self, queries, limit):
        # Lookup search k setting
        searchk = self.setting("searchk", -1)

        # Annoy doesn't have a built in batch query method
        results = []
        for query in queries:
            # Run the query
            ids, scores = self.model.get_nns_by_vector(query,
                                                       n=limit,
                                                       search_k=searchk,
                                                       include_distances=True)

            # Map results to [(id, score)]
            results.append(list(zip(ids, scores)))

        return results

    def save(self, path):
        # Write index
        self.model.save(path)
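The index() method above notes that the inner product equals cosine similarity on normalized vectors, which is why it switches the metric to "dot". A hypothetical sketch of the normalization step such a pipeline assumes (not taken from the project above):

import numpy as np

def normalize_rows(embeddings):
    # scale each row to unit L2 norm so that Annoy's "dot" metric
    # behaves like cosine similarity
    embeddings = np.asarray(embeddings, dtype='float32')
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return embeddings / norms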
Example #50
class FoodGetter:
    def __init__(self):
        pass

    def load(self, data_path, doc2vec_path, annoy_path):
        """
            This cannot be trained; it can only load ready-made models
            that were previously saved to disk.

            Loading takes about 5-7 seconds, which is normal.
        """
        self.data = pd.read_csv(data_path)
        self.data.ingredients = self.data.ingredients.apply(eval)
        self.data.steps = self.data.steps.apply(eval)

        self.doc2vec_model = gensim.models.doc2vec.Doc2Vec.load(doc2vec_path)
        self.length = len(self.doc2vec_model.infer_vector([" "]))

        self.annoy_model = AnnoyIndex(self.length, 'angular')
        self.annoy_model.load(annoy_path)

    def find(self, _input, N=5):
        """
            Input: a string with the ingredients
            Output: yields tuples of (name, ingredient list, index)
            If nothing is found, an empty list is returned
        """
        _input = _input.split(" ")
        _res = set(_input)
        idx = self.annoy_model.get_nns_by_vector(
            self.doc2vec_model.infer_vector(_input), 1000, search_k=2000)
        res = filter(
            lambda index: (lambda x: len(_res & x) / len(x) > 0.55)
            (set(self.data.ingredients[index])), idx)
        ans = []
        for i, index in enumerate(res):
            if i == N:
                return ans
            temp = self.data.loc[index]
            ans.append((temp["name"], " ".join(temp.ingredients), index))
        return ans

    def get_steps(self, idx):
        "Для понравившегося индекса возвращаем инструкцию по приготовлению и номер шага"
        return enumerate(self.data.loc[idx].steps)
Example #51
    def load(self):
        self.prev_id = -1
        self.indexes = []
        logger.info("Loading index {0}".format(self.actor_urn))
        for index in self.indexes:
            index.unload()

        for f in sorted(listdir(self.index_dir)):
            if f.endswith(".ann"):
                self.index_files.append(join(self.index_dir,f))
                index = AnnoyIndex(self.feat_size, metric='euclidean')
                index.load(join(self.index_dir, f))
                self.indexes.append(index)
                self.prev_id += index.get_n_items()
            elif f.endswith('saved_state'):
                self.mem_store = np.load(join(self.index_dir, f)).tolist()
        logger.info("Loaded {0} files with total {1} records for index {2}"
                    .format(len(self.indexes), self.prev_id + 1, self.actor_urn))
Example #52
class SimilarStringStore:

    def __init__(self, **kwargs):

        self.transformer = FeatureGenerator(k=1)

        print(self.transformer.n_features)

        self.store = AnnoyIndex(self.transformer.n_features)

    def vectorize(self, s):
        return self.transformer.transform(s)

    def add(self, id, s):
        ''' add a string to index '''

        vector = self.transformer.transform(s)
        self.store.add_item(int(id), vector)
        return vector

    def build(self):
        self.store.build(500)

    def save(self, filename='store.knn'):
        self.store.save(filename)

    def build_and_save(self, filename='store.knn'):
        self.build()
        self.save(filename)

    def load(self, filename='store.knn'):
        self.store.load(filename)


    def query(self, s):
        ''' query index '''
        vector = self.transformer.transform(s)
        neighbors = self.store.get_nns_by_vector(vector, 40)
        return neighbors


    def remove(self, id):
        ''' remove a string from the index '''
        pass
Example #53
File: annabel.py Project: tvldz/annabel
def create_collage(input_image, profile_name, version_count):
    """
    given an input image and an existing profile, create a set of new collages
    """
    profile_folder = PROFILES_DIRECTORY + profile_name + "/"
    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)
    # todo: load feature dimensions from profile
    nns_index = AnnoyIndex(SAMPLE_DIMENSION[0]*SAMPLE_DIMENSION[1], metric="euclidean")
    print("loading trees...")
    nns_index.load(profile_folder + profile_name + ".tree")
    print("done.")
    subimage_index = pickle.load(
        open(profile_folder + profile_name + ".p", "rb"))
    template_image = Image.open(input_image)
    image_width, image_height = template_image.size[0], template_image.size[1]
    crop_width, crop_height = subimage_index[-1]["crop_width"], subimage_index[-1]["crop_height"]
    for i in xrange(version_count):
        print("Creating collage {}/{}...").format(i+1, version_count)
        output_image = template_image.copy()
        for x in xrange(0, image_width-crop_width, crop_width):
            for y in xrange(0, image_height-crop_height, crop_height):
                box = (x, y, x + crop_width, y + crop_height)
                crop_box = output_image.crop(box)
                crop_sample = crop_box.convert("LA").resize(SAMPLE_DIMENSION)
                gs_pixeldata = []
                for pixel in list(crop_sample.getdata()):
                    gs_pixeldata.append(pixel[0])
                image_neighbor = nns_index.get_nns_by_vector(gs_pixeldata, version_count)[i]
                substitute_image = Image.open(subimage_index[image_neighbor]["image"])
                substitute_crop = substitute_image.crop(
                subimage_index[image_neighbor]["box"])
                output_image.paste(substitute_crop, box)
        output_path = OUTPUT_DIRECTORY + str(i) + ".png"
        output_image.save(output_path, "PNG")
        print("done.")
    print("{} image(s) saved in {}").format(
            version_count, OUTPUT_DIRECTORY)
    return
Example #54
    def merge_indicies(self, index_file_a, index_file_b, sender_urn):
        logger.info("Merging {0} and {1} for {2} index".format(index_file_a, index_file_b, sender_urn))
        index_a = AnnoyIndex(self.feat_size, metric='euclidean')
        index_b = AnnoyIndex(self.feat_size, metric='euclidean')
        new_index = AnnoyIndex(self.feat_size, metric='euclidean')

        index_a.load(index_file_a)
        index_b.load(index_file_b)

        cnt = 0
        for i in range(index_a.get_n_items()):
            new_index.add_item(cnt, index_a.get_item_vector(i))
            cnt += 1

        for i in range(index_b.get_n_items()):
            new_index.add_item(cnt, index_b.get_item_vector(i))
            cnt += 1


        new_index_file = index_file_a + ".merged"

        index_a.unload()
        index_b.unload()

        new_index.build(self.n_trees)
        new_index.save(new_index_file)
        logger.info("Merging {0} and {1} for {2} index, total number of items: {3}".format(
                index_file_a,
                index_file_b,
                sender_urn,
                cnt))

        new_index.unload()
        pykka.ActorRegistry.get_by_urn(sender_urn).proxy().complete_compaction(
                new_index_file=new_index_file,
                index_file_a=index_file_a,
                index_file_b=index_file_b
        )
Example #55
    def _get_index(self, f, distance):
        input = 'test/glove.twitter.27B.%dd.txt.gz' % f
        output = 'test/glove.%d.%s.annoy' % (f, distance)
        
        if not os.path.exists(output):
            if not os.path.exists(input):
                # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
                url = 'http://www-nlp.stanford.edu/data/glove.twitter.27B.%dd.txt.gz' % f
                print('downloading', url, '->', input)
                urlretrieve(url, input)

            print('building index', distance, f)
            annoy = AnnoyIndex(f, distance)
            for i, line in enumerate(gzip.open(input, 'rb')):
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v);
                
            annoy.build(10)
            annoy.save(output)

        annoy = AnnoyIndex(f, distance)
        annoy.load(output)
        return annoy
Example #56
    def test_load_save_get_item_vector(self):
        f = 3
        i = AnnoyIndex(f)
        i.add_item(0, [1.1, 2.2, 3.3])
        i.add_item(1, [4.4, 5.5, 6.6])
        i.add_item(2, [7.7, 8.8, 9.9])
 
        numpy.testing.assert_array_almost_equal(i.get_item_vector(0), [1.1, 2.2, 3.3])
        self.assertTrue(i.build(10))
        self.assertTrue(i.save('blah.ann'))
        numpy.testing.assert_array_almost_equal(i.get_item_vector(1), [4.4, 5.5, 6.6])
        j = AnnoyIndex(f)
        self.assertTrue(j.load('blah.ann'))
        numpy.testing.assert_array_almost_equal(j.get_item_vector(2), [7.7, 8.8, 9.9])
Example #57
    def _get_index(self, f, distance):
        input = "test/glove.twitter.27B.%dd.txt.gz" % f
        output = "test/glove.%d.%s.annoy" % (f, distance)
        output_correct = "test/glove.%d.%s.correct" % (f, distance)

        if not os.path.exists(output):
            if not os.path.exists(input):
                # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
                # Hosting them on my own S3 bucket since the original files changed format
                url = "https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz" % f
                print("downloading", url, "->", input)
                urlretrieve(url, input)

            print("adding items", distance, f)
            annoy = AnnoyIndex(f, distance)
            for i, line in enumerate(gzip.open(input, "rb")):
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v)

            print("building index")
            annoy.build(10)
            annoy.save(output)

        annoy = AnnoyIndex(f, distance)
        annoy.load(output)

        if not os.path.exists(output_correct):
            print("finding correct answers")
            f_output = open(output_correct, "w")
            for i in range(10000):
                js_slow = annoy.get_nns_by_item(i, 11, 100000)[1:]
                assert len(js_slow) == 10
                f_output.write(" ".join(map(str, js_slow)) + "\n")
            f_output.close()

        return annoy, open(output_correct)
Example #58
    def _get_index(self, f, distance):
        input = 'test/glove.twitter.27B.%dd.txt.gz' % f
        output = 'test/glove.%d.%s.annoy' % (f, distance)
        
        if not os.path.exists(output):
            if not os.path.exists(input):
                # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
                # Hosting them on my own S3 bucket since the original files changed format
                url = 'https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz' % f
                print('downloading', url, '->', input)
                urlretrieve(url, input)

            print('building index', distance, f)
            annoy = AnnoyIndex(f, distance)
            for i, line in enumerate(gzip.open(input, 'rb')):
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v);
                
            annoy.build(10)
            annoy.save(output)

        annoy = AnnoyIndex(f, distance)
        annoy.load(output)
        return annoy
Example #59
File: index_test.py Project: spotify/annoy
 def test_load_save(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     u = i.get_item_vector(99)
     i.save('i.tree')
     v = i.get_item_vector(99)
     self.assertEqual(u, v)
     j = AnnoyIndex(10)
     j.load('test/test.tree')
     w = i.get_item_vector(99)
     self.assertEqual(u, w)
     # Ensure specifying if prefault is allowed does not impact result
     j.save('j.tree', True)
     k = AnnoyIndex(10)
     k.load('j.tree', True)
     x = k.get_item_vector(99)
     self.assertEqual(u, x)
     k.save('k.tree', False)
     l = AnnoyIndex(10)
     l.load('k.tree', False)
     y = l.get_item_vector(99)
     self.assertEqual(u, y)