Example #1
def precision(f=40, n=1000000):
    t = AnnoyIndex(f)
    for i in xrange(n):
        v = []
        for z in xrange(f):
            v.append(random.gauss(0, 1))
        t.add_item(i, v)

    t.build(2 * f)
    t.save('test.tree')

    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}

    for i in xrange(prec_n):
        j = random.randrange(0, n)
        print 'finding nbs for', j
        
        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0
            
            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T

        for limit in limits:
            print 'limit: %-9d precision: %6.2f%% avg time: %.6fs' % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1))
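
The precision() example above is Python 2 code (xrange, print statements) and omits its imports. A minimal compatibility preamble it could sit under, assuming only the standard annoy package, is sketched below.

# Hedged sketch of the preamble the example above assumes.
import random
import time
from annoy import AnnoyIndex

try:
    xrange          # exists on Python 2
except NameError:
    xrange = range  # Python 3 fallback (the print statements above would also need print() calls)
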
Example #2
    def test_write_failed(self):
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)

        if sys.platform == "linux" or sys.platform == "linux2":
            # linux
            try:
                t.save("/dev/full") 
                self.fail("didn't get expected exception")
            except Exception as e:
                self.assertTrue(str(e).find("No space left on device") > 0)
        elif sys.platform == "darwin":
            volume = "FULLDISK"
            device = os.popen('hdiutil attach -nomount ram://64').read()
            os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device))
            os.popen('touch "/Volumes/%s/full"' % volume)
            try:
                t.save('/Volumes/%s/annoy.tree' % volume)
                self.fail("didn't get expected exception")
            except Exception as e:
                self.assertTrue(str(e).find("No space left on device") > 0)
            finally:
                os.popen("hdiutil detach %s" % device)
Example #3
def build_index(df,n_trees = 50,dist_metric='angular',out_dir="./"):
    n_records = df.shape[0]
    n_col = df.shape[1]
    index = AnnoyIndex(n_col,metric=dist_metric)
    patient_dict = {}
    index_dict = {}
    i = 0
    print "Adding items to the index..."
    for patient_id in df.index.values:
        if i % 10000 == 0:
            print str(i)
        vec = df.loc[patient_id].values
        index.add_item(i,vec)
        patient_dict[patient_id] = i
        index_dict[i] = patient_id
        i += 1
    print "Building the index..."
    index.build(n_trees)
    index.save(out_dir+"annoy_index.ann")
    ## Save the patient_id -> index mapping ##
    w = csv.writer(open(out_dir+"patient_mapping.csv", "w"))
    for key, val in patient_dict.items():
        w.writerow([key, val])
    w = csv.writer(open(out_dir+"index_mapping.csv", "w"))
    for key, val in index_dict.items():
        w.writerow([key, val])
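
As a complementary sketch (not part of the original), the files written above could be loaded back for querying roughly as follows; n_col must match the dimension used at build time, and the helper name is made up.

# Hypothetical query helper for the annoy_index.ann and patient_mapping.csv written above.
import csv
from annoy import AnnoyIndex

def query_similar_patients(patient_id, n_col, k=10, out_dir="./", dist_metric='angular'):
    index = AnnoyIndex(n_col, metric=dist_metric)
    index.load(out_dir + "annoy_index.ann")
    patient_dict = {}
    with open(out_dir + "patient_mapping.csv") as f:
        for row in csv.reader(f):
            if row:
                patient_dict[row[0]] = int(row[1])
    index_dict = {v: pid for pid, v in patient_dict.items()}
    # Nearest neighbours of the given patient, excluding the patient itself.
    neighbor_ids = index.get_nns_by_item(patient_dict[str(patient_id)], k + 1)[1:]
    return [index_dict[i] for i in neighbor_ids]
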
Example #4
 def test_single_vector(self):
     # https://github.com/spotify/annoy/issues/194
     a = AnnoyIndex(3)
     a.add_item(0, [1, 0, 0])
     a.build(10)
     a.save('1.ann')
     self.assertEquals(a.get_nns_by_vector([1, 0, 0], 3, include_distances=True), ([0], [0.0]))
Example #5
    def _get_index(self, dataset):
        url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
        vectors_fn = os.path.join('test', dataset + '.hdf5')
        index_fn = os.path.join('test', dataset + '.annoy')

        if not os.path.exists(vectors_fn):
            print('downloading', url, '->', vectors_fn)
            urlretrieve(url, vectors_fn)

        dataset_f = h5py.File(vectors_fn)
        distance = dataset_f.attrs['distance']
        f = dataset_f['train'].shape[1]
        annoy = AnnoyIndex(f, distance)

        if not os.path.exists(index_fn):
            print('adding items', distance, f)
            for i, v in enumerate(dataset_f['train']):
                annoy.add_item(i, v)

            print('building index')
            annoy.build(10)
            annoy.save(index_fn)
        else:
            annoy.load(index_fn)
        return annoy, dataset_f
Example #6
def build_annoy_index(corpus, dimension, winlen, winstep):
    print "Adding to Annoy index"
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    i = 0
    for filename, frames in corpus:
#        print filename, frames.shape
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            index.add_item(i, mfcc.tolist())
            assert mfcc_list[i] == (filename, index_in_file)
            i += 1

    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES
            }
    cache_filename = "annoy_index_" + hashlib.md5(str([filename for filename, frames in corpus])).hexdigest() + "." + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items())) + ".tree"
    
    if not os.path.exists(cache_filename):
        print "Building Annoy index with %d trees" % ANN_NTREES
    #    index.build(-1)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print "\tWrote cache to %s" % cache_filename
    else:
        print "\tReading cache from %s" % cache_filename
        index.load(cache_filename)
    return index, mfcc_list
Example #7
    def test_zero_vectors(self):
        # Mentioned on the annoy-user list
        bitstrings = [
            '0000000000011000001110000011111000101110111110000100000100000000',
            '0000000000011000001110000011111000101110111110000100000100000001',
            '0000000000011000001110000011111000101110111110000100000100000010',
            '0010010100011001001000010001100101011110000000110000011110001100',
            '1001011010000110100101101001111010001110100001101000111000001110',
            '0111100101111001011110010010001100010111000111100001101100011111',
            '0011000010011101000011010010111000101110100101111000011101001011',
            '0011000010011100000011010010111000101110100101111000011101001011',
            '1001100000111010001010000010110000111100100101001001010000000111',
            '0000000000111101010100010001000101101001000000011000001101000000',
            '1000101001010001011100010111001100110011001100110011001111001100',
            '1110011001001111100110010001100100001011000011010010111100100111',
        ]
        vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

        f = 64
        idx = AnnoyIndex(f, 'hamming')
        for i, v in enumerate(vectors):
            idx.add_item(i, v)

        idx.build(10)
        idx.save('idx.ann')
        idx = AnnoyIndex(f, 'hamming')
        idx.load('idx.ann')
        js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
        self.assertEquals(js[0], 0)
        self.assertEquals(ds[:4], [0, 1, 1, 22])
Example #8
def build_tree(df, metric):
    '''
    INPUTS: Pandas DataFrame, Choice of Metric Space String
    OUTPUTS: Returns the built AnnoyIndex tree, returns a dictionary
             mapping index numbers to the DataFrame's index

    Builds a ANN tree using Spotify's ANNoy library. Metric is the
    metric space (either euclidean or angular)
    '''
    tree = AnnoyIndex(len(df.iloc[0, :].values), metric=metric)

    indexes = {}

    for i in xrange(len(df)):
        v = df.iloc[i, :]
        indexes[i] = v.name
        tree.add_item(i, v.values)

    tree.build(50)

    tree.save(DATA_DIR + 'tree_' + metric + '.ann')
    with open(DATA_DIR + 'indexes_' + metric, 'wb') as f:
        pickle.dump(indexes, f)

    return (tree, indexes)
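
A hedged companion for the (tree, indexes) pair returned above; the helper name is made up.

# Hypothetical lookup of the most similar DataFrame rows using the objects returned by build_tree().
def similar_rows(tree, indexes, i, k=5):
    neighbor_ids = tree.get_nns_by_item(i, k + 1)[1:]  # skip the row itself
    return [indexes[n] for n in neighbor_ids]
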
Example #9
 def test_no_items(self):
     idx = AnnoyIndex(100)
     idx.build(n_trees=10)
     idx.save('foo.idx')
     idx = AnnoyIndex(100)
     idx.load('foo.idx')
     self.assertEquals(idx.get_n_items(), 0)
     self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [])
Example #10
 def test_save_without_build(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.add_item(1000, [random.gauss(0, 1) for z in xrange(10)])
     i.save('x.tree')
     j = AnnoyIndex(10)
     j.load('x.tree')
     j.build(10)
Example #11
 def test_only_one_item(self):
     # reported to annoy-user by Kireet Reddy
     idx = AnnoyIndex(100)
     idx.add_item(0, numpy.random.randn(100))
     idx.build(n_trees=10)
     idx.save('foo.idx')
     idx = AnnoyIndex(100)
     idx.load('foo.idx')
     self.assertEquals(idx.get_n_items(), 1)
     self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [0])
Example #12
def build_annoy_index(metric, input_filename, output_filename, n_trees):
    # Creates an index for Approximate Nearest Neighbors retrieval, using the annoy library.
    print 'Approximate Nearest Neighbors for: ' + input_filename
    centroids_array = np.load(input_filename)
    n_dimensions = centroids_array.shape[1]
    t = AnnoyIndex(n_dimensions, metric=metric)
    for i in range(centroids_array.shape[0]):
        t.add_item(i, centroids_array[i][:])
    print "Building Index - Number of Trees: ",str(n_trees)
    t.build(n_trees)
    t.save(output_filename)
Example #13
def build_annoy_tree(word2vec_model, output_file_name, n_trees=100):
    tree = AnnoyIndex(word2vec_model.layer1_size)
    for i, word in enumerate(word2vec_model.index2word):
        tree.add_item(i, list(word2vec_model[word]))

    tree.build(n_trees)


    tree.save(output_file_name)

    return output_file_name
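
A hedged companion sketch for querying the tree saved above. It assumes the same word2vec_model object (old gensim API with layer1_size and index2word) is still available, since the Annoy item ids are the positions produced by enumerate() above; the function name is made up.

# Hypothetical nearest-word lookup against the tree built by build_annoy_tree() above.
from annoy import AnnoyIndex

def most_similar_words(word, word2vec_model, tree_path, topn=10):
    tree = AnnoyIndex(word2vec_model.layer1_size)
    tree.load(tree_path)
    item_id = word2vec_model.index2word.index(word)             # id assigned at build time
    neighbor_ids = tree.get_nns_by_item(item_id, topn + 1)[1:]  # drop the query word itself
    return [word2vec_model.index2word[i] for i in neighbor_ids]
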
Example #14
def build_annoy_index(encoded, outfile):
    input_shape = encoded.shape
    f = input_shape[1]
    t = AnnoyIndex(f, metric='angular')  # Length of item vector that will be indexed
    for i,v in enumerate(encoded):
        t.add_item(i, v)

    t.build(100)  # 100 trees
    if outfile is not None:
        t.save(outfile)

    return t
Example #15
def create_profile(profile_name, image_folder, crop_width, crop_height, crop_increment):
    """
    given a folder and profile name, gather a series of subimages into a profile
    with which to create a collage
    """
    profile_folder = PROFILES_DIRECTORY + profile_name + "/"
    if not os.path.exists(profile_folder):
        os.makedirs(profile_folder)
    if not os.path.exists(profile_folder + "images/"):
        os.makedirs(profile_folder + "images/")
    image_file_list = [
        f for f in listdir(image_folder) if isfile(join(image_folder, f))]
    # todo: use crop ratio to calculate variable vector size
    nns_index = AnnoyIndex(SAMPLE_DIMENSION[0]*SAMPLE_DIMENSION[1], metric="euclidean")
    image_index = []
    index = 0
    # iterate over images for processing into boxes and associated feature vectors
    for image_file in image_file_list:
        print("processing {}...".format(image_file)),
        image_destination = profile_folder + "images/" + image_file
        copyfile(image_folder + image_file, image_destination)
        image = Image.open(image_destination)
        image_width, image_height = image.size[0], image.size[1]
        for x in xrange(0, image_width-crop_width, crop_increment):
            for y in xrange(0, image_height-crop_height, crop_increment):
                box = (x, y, x + crop_width, y + crop_height)
                image_sample = image.crop(box).resize(
                    SAMPLE_DIMENSION).convert("LA")  # dimensionality reduction
                gs_pixeldata = []  # reset feature vector
                # create feature vector for annoy
                for pixel in list(image_sample.getdata()):
                    gs_pixeldata.append(pixel[0])
                # add feature vector to annoy
                nns_index.add_item(index, gs_pixeldata)
                image_index.insert(
                    index, {"image": image_destination, "box": (x, y, x + crop_width, y + crop_height)})
                index += 1
        print("done.")
    # image_index[-1] holds profile metadata.
    image_index.append({"crop_width": crop_width, "crop_height": crop_height, "total_images": index-1})
    print("{} total subimages to be indexed...".format(index - 1))
    print("building trees (this can take awhile)...")
    nns_index.build(TREE_SIZE)  # annoy builds trees
    print("done.")
    print("serializing trees..."),
    nns_index.save(profile_folder + profile_name + ".tree")
    print("done.")
    print("serializing index..."),
    pickle.dump(image_index, open(profile_folder + profile_name + ".p", "wb"))
    print("done.")
    print("{} profile completed. Saved in {}".format(profile_name, profile_folder))
    return
Example #16
 def test_load_save(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     u = i.get_item_vector(99)
     i.save('x.tree')
     v = i.get_item_vector(99)
     self.assertEqual(u, v)
     j = AnnoyIndex(10)
     j.load('test/test.tree')
     w = j.get_item_vector(99)
     self.assertEqual(u, w)
Example #17
 def build(self, index_file, vectors, sender_urn):
     logger.info("Building {0}".format(index_file))
     logger.info("Vectors {0}".format(vectors))
     new_index = AnnoyIndex(self.feat_size, metric='euclidean')
     for idx, v in enumerate(vectors):
         logger.info("Adding item {0} with id {1}".format(v, idx))
         new_index.add_item(idx, v)
     new_index.build(self.n_trees)
     logger.info("Saving index file {0}".format(index_file))
     new_index.save(index_file)
     new_index.unload()
     pykka.ActorRegistry.get_by_urn(actor_urn=sender_urn).proxy().load()
     logger.info("Sent load command to worker")
Example #18
	def run(self):
		#get ids
		with self.output()['ids'].open('w') as ids_fd:
			corpus = FeaCorpus(self.input()[0].fn, onlyID=True)
			for id in corpus:
				print >> ids_fd, id
		corpus = FeaCorpus(self.input()[0].fn, sparse=False)
		t = AnnoyIndex(self.n_components, metric='angular')
		i = 0
		for v in corpus:
			t.add_item(i, v)
			i += 1
		t.build(int(self.n_components / 2))
		t.save(self.output()['index'].fn)
Example #19
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    f = data.base.shape[1]

    for ntrees in args.ntrees:
        t = AnnoyIndex(f)   # Length of item vector that will be indexed
        idxpath = os.path.join(args.exp_dir, 'sift_annoy_ntrees%d.idx' % ntrees)
        if not os.path.exists(idxpath):
            logging.info("Adding items ...")
            for i in xrange(data.nbae):
                t.add_item(i, data.base[i])
                if i % 100000 == 0:
                    logging.info("\t%d/%d" % (i, data.nbae))
            logging.info("\tDone!")
            logging.info("Building indexes ...")
            t.build(ntrees)
            logging.info("\tDone!")
            t.save(idxpath)
        else:
            logging.info("Loading indexes ...")
            t.load(idxpath)
            logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            ids[i, :] = np.array(t.get_nns_by_vector(data.query[i], args.topk))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-ntrees_%s\n" % ("Annoy", ntrees))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example #20
 def test_item_vector_after_save(self):
     # Issue #279
     a = AnnoyIndex(3)
     a.verbose(True)
     a.add_item(1, [1, 0, 0])
     a.add_item(2, [0, 1, 0])
     a.add_item(3, [0, 0, 1])
     a.build(-1)
     self.assertEqual(a.get_n_items(), 4)
     self.assertEqual(a.get_item_vector(3), [0, 0, 1])
     self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
     a.save('something.annoy')
     self.assertEqual(a.get_n_items(), 4)
     self.assertEqual(a.get_item_vector(3), [0, 0, 1])
     self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
Example #21
 def test_save_load(self):
     f = 100
     i = AnnoyIndex(f, 'hamming')
     u = numpy.random.binomial(1, 0.5, f)
     v = numpy.random.binomial(1, 0.5, f)
     i.add_item(0, u)
     i.add_item(1, v)
     i.build(10)
     i.save('blah.ann')
     j = AnnoyIndex(f, 'hamming')
     j.load('blah.ann')
     rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
     self.assertEquals(rs, [0, 1])
     self.assertAlmostEqual(ds[0], 0)
     self.assertAlmostEqual(ds[1], numpy.dot(u-v, u-v))
Example #22
class ANN:
    def __init__(self, dimension):
        self.ann = AnnoyIndex(dimension)
    def addVectors(self,vectors):
        for idx,v in enumerate(vectors):
            self.ann.add_item(idx,v)
        self.ann.build(10)
    def query(self,vector):
        match = self.ann.get_nns_by_vector(vector,1)[0]
        # return self.ann.get_item_vector(match),match
        return match
    def save(self):
        self.ann.save("analogies.ann")
    def load(self,filename):
        self.ann.load(filename)
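
A short, hypothetical usage of the ANN wrapper above; the random vectors are made up for illustration.

# Hypothetical usage of the ANN class defined above.
import random

dim = 50
vectors = [[random.gauss(0, 1) for _ in range(dim)] for _ in range(100)]
ann = ANN(dim)
ann.addVectors(vectors)
print(ann.query(vectors[0]))  # expected to print 0 (or a very close neighbour)
ann.save()                    # writes analogies.ann next to the script
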
Example #23
def baseline_train(olddata, f, trees):
    """Train with olddata, using f features per item, and build an index with the given number of trees."""
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if os.path.isfile(saving_model):
        print "Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data..."
        t.load(saving_model)
    else:
        print "Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data..."
        for i in olddata.index:
            v = list(olddata.ix[i, ["latitude", "longitude", "time_period"]])
            t.add_item(i, v)
        print "Building the trees..."
        t.build(trees)
        assert t.get_n_items() == olddata.shape[0]
        print "Saving the model..."
        t.save(saving_model)  # Can easily be loaded into memory later.
    return t
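
As a hedged companion sketch, the index returned above could be applied to holdout rows as follows; the column names match the ones indexed above and the helper name is made up.

# Hypothetical nearest-neighbour lookup for holdout data against the index built by baseline_train().
def baseline_neighbors(newdata, t, k=5):
    neighbors = {}
    for i in newdata.index:
        v = list(newdata.loc[i, ["latitude", "longitude", "time_period"]])
        neighbors[i] = t.get_nns_by_vector(v, k)  # ids of the k nearest training rows
    return neighbors
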
Example #24
class SimilarStringStore:

    def __init__(self, **kwargs):

        self.transformer = FeatureGenerator(k=1)

        print(self.transformer.n_features)

        self.store = AnnoyIndex(self.transformer.n_features)

    def vectorize(self, s):
        return self.transformer.transform(s)

    def add(self, id, s):
        ''' add a string to index '''

        vector = self.transformer.transform(s)
        self.store.add_item(int(id), vector)
        return vector

    def build(self):
        self.store.build(500)

    def save(self, filename='store.knn'):
        self.store.save(filename)

    def build_and_save(self, filename='store.knn'):
        self.build()
        self.save(filename)

    def load(self, filename='store.knn'):
        self.store.load(filename)


    def query(self, s):
        ''' query index '''
        vector = self.transformer.transform(s)
        neighbors = self.store.get_nns_by_vector(vector, 40)
        return neighbors


    def remove(self, id):
        ''' remove a string from the index '''
        pass
Example #25
def build_index_annoy(h5fname , dset,out='data.ann',trees = 128,lazy=True):
    #establish connection to HDF5 file
    h5f = h5py.File(h5fname,'r')
    if lazy:
        X = h5f[dset]
    else:
        X = h5f[dset][:]

    #get dimension
    f = X.shape[1]

    #initialize annoy
    t = AnnoyIndex(f,'angular')

    #iterate over features, add to annoy
    for i,v in enumerate(X):
        t.add_item(i, v)

    #build and save index
    t.build(trees)
    t.save(out)
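
A minimal, hypothetical query against the index saved above; f must equal the feature dimension used at build time.

# Hypothetical query helper for the data.ann file written by build_index_annoy() above.
from annoy import AnnoyIndex

def query_index_annoy(vector, f, out='data.ann', k=5):
    t = AnnoyIndex(f, 'angular')
    t.load(out)  # memory-maps the saved index
    return t.get_nns_by_vector(vector, k, include_distances=True)
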
Example #26
    def test_overwrite_index(self):
        # Issue #335
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)
        t.save('test.ann')

        # Load index file
        t2 = AnnoyIndex(f)
        t2.load('test.ann')

        # Overwrite index file
        t3 = AnnoyIndex(f)
        for i in range(500):
            v = [random.gauss(0, 1) for z in range(f)]
            t3.add_item(i, v)
        t3.build(10)
        if os.name == 'nt':
            # Can't overwrite on Windows
            with self.assertRaises(IOError):
                t3.save('test.ann')
        else:
            t3.save('test.ann')
            # Get nearest neighbors
            v = [random.gauss(0, 1) for z in range(f)]
            nns = t2.get_nns_by_vector(v, 1000)  # Should not crash
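
The Windows branch above fails because t2 keeps test.ann memory-mapped. A hedged sketch of a possible workaround (not part of the test) is to unload the old index before overwriting:

# Sketch of an overwrite workaround on Windows, reusing t2 and t3 from the test above.
t2.unload()          # release the memory map so the file is no longer locked
t3.save('test.ann')  # the overwrite should now succeed
t2.load('test.ann')  # re-load if the handle is still needed (now pointing at the new index)
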
Example #27
    def merge_indicies(self, index_file_a, index_file_b, sender_urn):
        logger.info("Merging {0} and {1} for {2} index".format(index_file_a, index_file_b, sender_urn))
        index_a = AnnoyIndex(self.feat_size, metric='euclidean')
        index_b = AnnoyIndex(self.feat_size, metric='euclidean')
        new_index = AnnoyIndex(self.feat_size, metric='euclidean')

        index_a.load(index_file_a)
        index_b.load(index_file_b)

        cnt = 0
        for i in range(index_a.get_n_items()):
            new_index.add_item(cnt, index_a.get_item_vector(i))
            cnt += 1

        for i in range(index_b.get_n_items()):
            new_index.add_item(cnt, index_b.get_item_vector(i))
            cnt += 1


        new_index_file = index_file_a + ".merged"

        index_a.unload()
        index_b.unload()

        new_index.build(self.n_trees)
        new_index.save(new_index_file)
        logger.info("Merging {0} and {1} for {2} index, total number of items: {3}".format(
                index_file_a,
                index_file_b,
                sender_urn,
                cnt))

        new_index.unload()
        pykka.ActorRegistry.get_by_urn(sender_urn).proxy().complete_compaction(
                new_index_file=new_index_file,
                index_file_a=index_file_a,
                index_file_b=index_file_b
        )
Example #28
    def _get_index(self, f, distance):
        input = 'test/glove.twitter.27B.%dd.txt.gz' % f
        output = 'test/glove.%d.%s.annoy' % (f, distance)
        
        if not os.path.exists(output):
            if not os.path.exists(input):
                # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
                url = 'http://www-nlp.stanford.edu/data/glove.twitter.27B.%dd.txt.gz' % f
                print('downloading', url, '->', input)
                urlretrieve(url, input)

            print('building index', distance, f)
            annoy = AnnoyIndex(f, distance)
            for i, line in enumerate(gzip.open(input, 'rb')):
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v);
                
            annoy.build(10)
            annoy.save(output)

        annoy = AnnoyIndex(f, distance)
        annoy.load(output)
        return annoy
Example #29
    def test_load_save_get_item_vector(self):
        f = 3
        i = AnnoyIndex(f)
        i.add_item(0, [1.1, 2.2, 3.3])
        i.add_item(1, [4.4, 5.5, 6.6])
        i.add_item(2, [7.7, 8.8, 9.9])
 
        numpy.testing.assert_array_almost_equal(i.get_item_vector(0), [1.1, 2.2, 3.3])
        self.assertTrue(i.build(10))
        self.assertTrue(i.save('blah.ann'))
        numpy.testing.assert_array_almost_equal(i.get_item_vector(1), [4.4, 5.5, 6.6])
        j = AnnoyIndex(f)
        self.assertTrue(j.load('blah.ann'))
        numpy.testing.assert_array_almost_equal(j.get_item_vector(2), [7.7, 8.8, 9.9])
Example #30
def create_index(file_list, start_count,model_filename, redis_index_file):
    f = 100
    t = AnnoyIndex(f)
    t.verbose(True)
    redisindex = open("/raid/ankit/"+redis_index_file,"w")
    i = start_count
    for f in file_list:
        print "Processing {} ...".format(f)
        with open(query_vectors_directory+f) as cur_f:
            for line in cur_f:
                #print line

                if not line.strip():
                    continue

                if i%1000000 == 0:
                    print "{} lines complete.".format(i)
                query, vector = line.split('\t')
                vector = normalize_redis_vector(vector)
                redisindex.write(str(query)+"\t\t"+str(i)+"\n")
                try:
                    t.add_item(i,vector)
                except:
                    print "Exception : "+ str(line)
                    pass
                #print i
                i+=1

    print "Done adding items, now starting to build 10 trees.."
    t.build(10)
    print "Saving Model on Disk..."
    t.save('/raid/ankit/ann_models/'+model_filename)

    print "Finished Building and Saving Model!"
    redisindex.close()

    return i
Example #31
    img_path = file
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    fc2_features = model.predict(x)
    annoy_model.add_item(numimg, fc2_features[0])
    print(id)
    numimg += 1

print('num files=' + str(numimg))

# Build the index and save it to a file
annoy_model.build(numimg)
save_path = os.path.join(base_dir, "result.ann")
annoy_model.save(save_path)

# Check the result
#annoy_model.unload()
#trained_model.load("D:/python/annoy/images_next.ann")

trained_model = AnnoyIndex(4096)
trained_model.load('D:/python/annoy/images_next.ann')  # the saved model can also be loaded this way
print(trained_model.get_nns_by_item(0, 6000))  # print the items nearest to index 0
items = trained_model.get_nns_by_item(1, 6, search_k=-1, include_distances=False)
print(items)


#### txt output ####
txt1 = 'D:/python/annoy/test.csv'
file = open(txt1, "w", encoding = "utf_8")
Example #32
class AnnoyTools:
    def __init__(self, config, is_build_annoy=False):
        self.word_embedding_path = config['word_embedding_path']
        self.annoy_path = config['annoy_path']
        self.word2index = {}
        self.index2word = {}
        self.annoy_index = AnnoyIndex(768, 'angular')
        self.word2index_path = config['word2index_path']
        self.index2word_path = config['index2word2_path']
        self.annoy_tree_num = config['annoy_tree_num']
        if is_build_annoy:
            self._save_annoy_index()
            logger.info("Build and save annoy index.")
        else:
            self._load_annoy_index()
            logger.info("Load the saved annoy index.")

    def _save_annoy_index(self):
        try:
            with codecs.open(self.word_embedding_path, 'r',
                             encoding="utf-8") as f:
                count = 0
                for line in f:

                    count += 1
                    result = line.strip('\n').split()
                    if len(result) == 2: continue
                    word = result[0]
                    # index2word[count] = word
                    self.word2index[word] = count
                    vector = list(map(float, result[1:]))
                    self.annoy_index.add_item(count, vector)
        except Exception as e:
            logger.info(e)
        self.index2word = {v: k for k, v in self.word2index.items()}
        with open(self.word2index_path,
                  'wb') as f1, open(self.index2word_path, 'wb') as f2:
            pickle.dump(self.word2index, f1, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.index2word, f2, protocol=pickle.HIGHEST_PROTOCOL)
        self.annoy_index.build(self.annoy_tree_num)
        logger.info("Save annoy tree done.")
        self.annoy_index.save(self.annoy_path)

    def _load_annoy_index(self):
        self.annoy_index.load(self.annoy_path)
        with open(self.word2index_path,
                  "rb") as f1, open(self.index2word_path, 'rb') as f2:
            self.word2index = pickle.load(f1)
            self.index2word = pickle.load(f2)
            logger.info("Loaded the saved word2index and index2word.")

    def get_similar_by_query(self, query, topk=21):
        query_vec = bc.encode([query])[0]
        idxes, dists = self.annoy_index.get_nns_by_vector(
            query_vec, topk, include_distances=True)
        idxes = [self.index2word[i] for i in idxes]
        similars = list(zip(idxes, dists))

        result = [(i, 0.1 * (abs(1 - score)) + 0.5)
                  for i, score in zip(idxes, dists)]
        print(result)
        return similars

    def _read_vector(self):
        model = KeyedVectors.load_word2vec_format("words.vector", binary=True)
        model.wv.save_word2vec_format(self.word_embedding_path, binary=False)
Example #33
class FMClassifier:
    """Class implementing the Features Matching Classifier (FMClassifier)

    Args:
        catalog_path (string): [description]
        params (dict): [description]
    """

    ##########################
    # Init
    ##########################

    def __init__(self, catalog_path: str, params: Dict = {}):
        self.catalog_path = catalog_path
        self._config_classifier(catalog_path, params)

    ##########################
    # Config
    ##########################

    def _config_classifier(self, catalog_path, params):
        self._get_classifier_config(params)
        self._get_catalog_images(catalog_path)
        self._get_catalog_labels(catalog_path)
        self._get_catalog_images2labels()
        self._load_fingerprints()

    def _get_classifier_config(self, params):
        self.config = edict({
            "verbose":
            params.get("verbose", constants.VERBOSE),
            "feature_descriptor":
            params.get("feature_descriptor", constants.FEATURE_DESCRIPTOR),
            "feature_dimension":
            params.get("feature_dimension", constants.FEATURE_DIMENSION),
            "image_size":
            params.get("image_size", constants.IMAGE_SIZE),
            "keypoint_stride":
            params.get("keypoint_stride", constants.KEYPOINT_STRIDE),
            "keypoint_sizes":
            params.get("keypoint_sizes", constants.KEYPOINT_SIZES),
            "matcher_path":
            params.get("matcher_path", constants.MATCHER_PATH),
            "matcher_distance":
            params.get("matcher_distance", constants.MATCHER_DISTANCE),
            "matcher_n_trees":
            params.get("matcher_n_trees", constants.MATCHER_N_TREES),
            "scoring":
            params.get("scoring", constants.SCORING),
            "k_nn":
            params.get("k_nn", constants.K_NN),
            "fingerprint_path":
            params.get("fingerprint_path", constants.FINGERPRINT_PATH),
        })

    def _get_catalog_images(self, catalog_path):
        self.catalog_images = utils.get_all_images_from_folder(catalog_path)

    def _get_catalog_labels(self, catalog_path):
        self.catalog_labels = utils.get_labels_from_catalog(catalog_path)

    def _get_catalog_images2labels(self):
        self.catalog_images2labels = utils.compute_images2labels(
            self.catalog_images, self.catalog_labels)

    def _load_fingerprints(self):
        # Previous fingerprint
        if os.path.exists(self.config.fingerprint_path):
            with open(self.config.fingerprint_path, "rb") as pickle_file:
                self.config.fingerprint = pickle.load(pickle_file)
        else:
            self.config.fingerprint = ""

        # Current fingerprint
        self.fingerprint = fm_utils.compute_fingerprint(
            self.catalog_path, self.config)

    ##########################
    # Train
    ##########################

    def train(self):
        """Method used to train the classifier.
        """
        # Init matcher
        self.matcher = AnnoyIndex(self.config.feature_dimension,
                                  self.config.matcher_distance)

        # Create or load matcher
        if self._should_create_index():
            self._create_matcher_index()
            self._save_matcher_index()
            self._save_fingerprint()
        else:
            self._load_matcher_index()

    def _should_create_index(self):
        fingerprint_changed = self.config.fingerprint != self.fingerprint
        matcher_file_exists = os.path.isfile(self.config.matcher_path)
        return fingerprint_changed or (not matcher_file_exists)

    def _create_matcher_index(self):
        # Get descriptors
        catalog_descriptors = self._get_catalog_descriptors()

        # Get iterator
        descriptors_iterator = utils.get_iterator(
            catalog_descriptors,
            verbose=self.config.verbose,
            description="Creating Index...")

        # Config matcher
        for k, descriptor in enumerate(descriptors_iterator):
            self.matcher.add_item(k, descriptor)
        self.matcher.build(self.config.matcher_n_trees)

    def _get_catalog_descriptors(self):
        # Init descriptors list
        catalog_descriptors = []

        # Init iterator
        iterator = utils.get_iterator(
            utils.get_all_images_from_folder(self.catalog_path),
            verbose=self.config.verbose,
            description="Computing catalog descriptors")

        # Compute all descriptors
        for path in iterator:
            # Read image
            img = utils.read_image(path, size=self.config.image_size)

            # Compute keypoints
            keypoints = utils.compute_keypoints(img,
                                                self.config.keypoint_stride,
                                                self.config.keypoint_sizes)

            # Compute descriptors
            descriptors = utils.compute_descriptors(
                img, keypoints, self.config.feature_descriptor)

            # Update descriptors list
            catalog_descriptors.append(descriptors)

        # Reshape descriptors list
        catalog_descriptors = np.array(catalog_descriptors)
        catalog_descriptors = catalog_descriptors.reshape(
            -1, catalog_descriptors.shape[-1])

        return catalog_descriptors

    def _save_matcher_index(self):
        matcher_folder = "/".join(self.config.matcher_path.split("/")[:-1])
        if not os.path.exists(matcher_folder):
            os.makedirs(matcher_folder)
        if self.config.verbose:
            print("Saving Index...")
        self.matcher.save(self.config.matcher_path)

    def _load_matcher_index(self):
        if self.config.verbose:
            print("Loading Index...")
        self.matcher.load(self.config.matcher_path)

    def _save_fingerprint(self):
        fingerprint_folder = "/".join(
            self.config.fingerprint_path.split("/")[:-1])
        if not os.path.exists(fingerprint_folder):
            os.makedirs(fingerprint_folder)
        with open(self.config.fingerprint_path, "wb") as pickle_file:
            pickle.dump(self.fingerprint, pickle_file)

    ##########################
    # Predict
    ##########################

    def predict(self, query_path: str) -> np.array:
        """Method used to predict a score per class for a given query.

        Args:
            query_path (str): The local path of the query.

        Returns:
            np.array: The list of scores per class.
        """
        # Read img
        query_img = utils.read_image(query_path, size=self.config.image_size)

        # Get keypoints
        query_keypoints = utils.compute_keypoints(query_img,
                                                  self.config.keypoint_stride,
                                                  self.config.keypoint_sizes)

        # Get descriptors
        query_descriptors = utils.compute_descriptors(
            query_img, query_keypoints, self.config.feature_descriptor)

        # Get scores
        scores = self._get_query_scores(query_descriptors)

        # To numpy
        scores = np.array(scores)

        return scores

    def predict_batch(self, query_paths: List[str]) -> np.array:
        """Method used to predict a class for a batch of queries.

        Args:
            query_paths (List[str]): The list of all query paths.

        Returns:
            np.array: The scores per class for each query.
        """
        # Init scores
        scores = []

        # Get iterator
        iterator = utils.get_iterator(query_paths,
                                      verbose=self.config.verbose,
                                      description="Prediction of all queries")

        # Loop over all queries
        for query_path in iterator:
            # Predict score of query
            query_scores = self.predict(query_path)

            # Update scores
            scores.append(query_scores)

        # To numpy
        scores = np.array(scores)

        return scores

    def _get_query_scores(self, query_descriptors):
        # Init scores variables
        scores = np.zeros((len(self.catalog_labels)))
        n_desc = query_descriptors.shape[0]

        # Compute matches
        train_idx, distances = self._compute_query_matches(query_descriptors)

        # Compute score matrix
        scores_matrix = self._compute_scores_matrix(distances)

        # Compute final scores
        for ind, nn_train_idx in enumerate(train_idx):
            for k, idx in enumerate(nn_train_idx):
                # Get image_path
                image_path = self.catalog_images[int(idx // n_desc)]

                # Get image_label
                image_label = self.catalog_images2labels[image_path]

                # Get label_idx
                label_idx = self.catalog_labels.index(image_label)

                # Update score
                scores[label_idx] += scores_matrix[ind, k]

        return scores

    def _compute_query_matches(self, query_descriptors):
        # Init matches variables
        n_matches = query_descriptors.shape[0]
        train_idx = np.zeros((n_matches, self.config.k_nn))
        distances = np.zeros((n_matches, self.config.k_nn))

        # Compute matches
        for i, descriptor in enumerate(query_descriptors):
            idx, dist = self.matcher.get_nns_by_vector(descriptor,
                                                       self.config.k_nn,
                                                       include_distances=True)
            train_idx[i] = idx
            distances[i] = dist

        return train_idx, distances

    def _compute_scores_matrix(self, distances):
        if self.config.scoring == "distance":
            return self._compute_scores_matrix_distance(distances)
        if self.config.scoring == "count":
            return self._compute_scores_matrix_count(distances)
        return self._compute_scores_matrix_distance(distances)

    def _compute_scores_matrix_distance(self, distances):
        return np.exp(-distances**2)

    def _compute_scores_matrix_count(self, distances):
        scores_matrix = np.zeros(distances.shape)
        for k in range(self.config.k_nn):
            scores_matrix[:, k] = 1 - k / self.config.k_nn
        return scores_matrix

    ##########################
    # Utils
    ##########################

    def label_id2str(self, label_id: int) -> str:
        """Gets the label_str given the label_id.

        Args:
            label_id (int): The given label_id.

        Returns:
            str: The label_str of the given label_id.
        """
        return self.catalog_labels[label_id]

    def label_str2id(self, label_str: str) -> int:
        """Gets the label_id given the label_str.

        Args:
            label_str (str): The given label_str.

        Returns:
            int: The label_id of the given label_str.
        """
        if label_str in self.catalog_labels:
            return self.catalog_labels.index(label_str)
        return -1
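
A hypothetical end-to-end usage of the FMClassifier above, assuming a catalog folder laid out the way the utils helpers expect; the paths and params are made up.

# Hypothetical usage of the FMClassifier defined above.
clf = FMClassifier("data/catalog", params={"k_nn": 5, "scoring": "distance"})
clf.train()                                    # creates or loads the Annoy matcher index
scores = clf.predict("data/queries/query_0.jpg")
print(clf.label_id2str(int(scores.argmax())))  # best-scoring catalog label
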
Example #34
class Face:
    def __init__(self, app):
        self.storage = app.config["storage"]
        self.db = app.db
        self.faces = []  # storage all faces in caches array of face object
        self.known_encoding_faces = []  # faces data for recognition
        self.face_user_keys = {}
        self.load_all()

    def load_user_by_index_key(self, index_key=0):

        key_str = str(index_key)

        if key_str in self.face_user_keys:
            return self.face_user_keys[key_str]

        return None

    def load_train_file_by_name(self, name):
        trained_storage = path.join(self.storage, 'trained')
        return path.join(trained_storage, name)

    def load_unknown_file_by_name(self, name):
        unknown_storage = path.join(self.storage, 'unknown')
        unknown_storage_face = path.join(self.storage, 'unknown_face')
        return (path.join(unknown_storage,
                          name), path.join(unknown_storage_face, name))

    def load_all(self):

        results = self.db.select(
            'SELECT faces.id, faces.user_id, faces.filename, faces.created FROM faces'
        )
        self.layer_size = 0
        count = 0
        for row in results:

            user_id = row[1]
            filename = row[2]

            face = {
                "id": row[0],
                "user_id": user_id,
                "filename": filename,
                "created": row[3]
            }
            self.faces.append(face)

            face_image = face_recognition_api.load_image_file(
                self.load_train_file_by_name(filename))
            face_image_encoding = face_recognition_api.face_encodings(
                face_image)[0]
            index_key = len(self.known_encoding_faces)
            self.known_encoding_faces.append(face_image_encoding)
            index_key_string = str(index_key)
            self.face_user_keys['{0}'.format(index_key_string)] = user_id
            print('user_id', user_id)
            if count == 0:
                self.layer_size = len(face_image_encoding)
                self.tree = AnnoyIndex(self.layer_size,
                                       metric)  # prepare index
            self.tree.add_item(user_id, face_image_encoding)
            count += 1
        print('building index...\n')
        if self.layer_size > 0:
            print('layer_size=', self.layer_size)
            self.tree.build(ntrees)
            self.tree.save('index.ann')

    def recognize(self, unknown_filename):
        tree = loadannoy()
        (unfile,
         unfile_face) = self.load_unknown_file_by_name(unknown_filename)

        unknown_image = face_recognition_api.load_image_file(unfile)
        unknown_encoding_image = face_recognition_api.face_encodings(
            unknown_image)[0]

        #results = face_recognition.compare_faces(self.known_encoding_faces, unknown_encoding_image);
        results2 = find_matching_id(unknown_encoding_image, tree)
        guess_age = age_predict.predict([unfile_face])
        guess_gender = gender_predict.predict([unfile_face])
        #print("results", results)
        print("results2", results2)
        if results2:
            matching_id, min_dist = results2
            user_id = matching_id  #self.load_user_by_index_key(matching_id)
            return (user_id, guess_age, guess_gender)

        return ('unknown', guess_age, guess_gender)
Example #35
class MLSAT_ANNSet(Dataset):
    @RedirectWrapper(target_cli=CLI)
    def __init__(self, K=15, train=False):

        self.load(train=train)

        # Run Approximate NN search

        self.annoy = AnnoyIndex(self.dim, 'euclidean')  # 'angular' ?

        if os.path.exists(self.annpth):
            self.annoy.load(self.annpth)
            print("Loaded ANN indices from %s" % self.annpth)

        else:
            print("Creating ANN indices ...")

            self.X = self.X.view(-1, self.dim)  # 28 * 28 = 784

            for i, x in enumerate(self.X):
                self.annoy.add_item(i, x)

            self.annoy.build(128)
            self.annoy.save(self.annpth)

            print("ANN index complete")

        self.size = len(self.X)
        self.X = self.X.view(-1, *self.shape).contiguous().numpy()
        # self.X = self.X.view(-1, 784).contiguous().numpy()
        self.K = K

        self.ANNIdx = np.zeros((len(self.Y), self.K)).astype(np.int32)
        for i in range(len(self.Y)):
            self.ANNIdx[i] = self.annoy.get_nns_by_item(i, self.K + 1)[1:]

    def load(self, train=False):
        raise NotImplementedError

    def __getitem__(self, idx):

        # No GCN now
        # neighbors = self.annoy.get_nns_by_item(idx, self.K+1)[1:]
        neighbors = self.ANNIdx[idx]
        edge_index = torch.tensor(
            [[0] * self.K, [i for i in range(1, self.K + 1)]],
            dtype=torch.long)  # 1-hop neighbor
        x = torch.tensor([self.X[idx]] + [self.X[i] for i in neighbors],
                         dtype=torch.float)

        sid = []
        for i in range(len(x)):
            sid.append(i if i <= self.K else (self.K + 1))

        scatter_idx = torch.tensor(sid, dtype=torch.long)
        raw_edges = torch.tensor([[idx, i] for i in neighbors],
                                 dtype=torch.long)
        center = torch.tensor([self.X[idx]], dtype=torch.float)

        data = Data(x=x,
                    edge_index=edge_index,
                    scatter_idx=scatter_idx,
                    raw_edges=raw_edges,
                    center=center)

        return data

    def generateGraph(self, path):

        G = nx.Graph()
        for i, x in enumerate(self.X):
            G.add_node(i, digit=self.Y[i].item())
        for i in range(len(self.X)):
            for j in self.annoy.get_nns_by_item(i, self.K + 1)[1:]:
                G.add_edge(i, j)

        nx.write_gexf(G, path)

    def __len__(self):
        return self.size
Example #36
class AnnoySearch(object):
    def __init__(self,
                 input_file=None,
                 model_path=None,
                 dict_path=None,
                 vec_dim=128,
                 tree_num=10):
        self.input_file = input_file
        self.model_path = model_path
        self.dict_path = dict_path
        self.vec_dim = vec_dim
        self.tree_num = tree_num
        self._vecs_train = []
        self._ids = []
        self._id_index = dict()
        self._index_id = dict()

        self._annoy_tree = None
        self.__load()

    def __load(self):
        if self.input_file:
            with open(self.input_file, 'r') as f:
                for line in f:
                    arr = line.strip().split(' ')
                    id = arr[0]
                    vec = [float(sub) for sub in arr[1:]]
                    self._vecs_train.append(vec)
                    self._ids.append(id)
        if self.model_path and self.dict_path:
            self._annoy_tree = AnnoyIndex(self.vec_dim)
            self._annoy_tree.load(self.model_path)
            dict_file = open(self.dict_path, 'rb')
            dict_list = pickle.load(dict_file)
            self._id_index, self._index_id = dict_list

    def build_tree(self):
        self._annoy_tree = AnnoyIndex(self.vec_dim)
        for index, id in enumerate(self._ids):
            self._id_index[id] = index
            self._index_id[index] = id
        for index, vec_train in enumerate(self._vecs_train):
            #print vec_train
            self._annoy_tree.add_item(index, vec_train)
        self._annoy_tree.build(self.tree_num)

    def save_tree(self, model_path, dict_path):
        self._annoy_tree.save(model_path)
        dict_file = open(dict_path, 'wb')
        pickle.dump([self._id_index, self._index_id], dict_file)
        dict_file.close()

    def find_nns_by_id(self,
                       id,
                       n_items=40,
                       search_k=-1,
                       include_distances=False):
        index = self._id_index[id]
        #print index
        if self._annoy_tree and self._id_index:
            ids_found = []
            res_found = self._annoy_tree.get_nns_by_item(
                index,
                n_items,
                search_k=search_k,
                include_distances=include_distances)
            #print res_found
            if include_distances:
                for index, dist in zip(res_found[0], res_found[1]):
                    id_found = self._index_id[index]
                    res = (id_found, dist)
                    ids_found.append(res)
            else:
                for index in res_found:
                    id_found = self._index_id[index]
                    ids_found.append(id_found)
        return ids_found

    def print_nns_by_file(self,
                          id_file,
                          n_items,
                          search_k=-1,
                          include_distances=False):
        for line in open(id_file, 'r').readlines():
            id = line.strip()
            index = self._id_index[id]
            res_found = self._annoy_tree.get_nns_by_item(
                index,
                n_items,
                search_k=search_k,
                include_distances=include_distances)
            #ids_found = self._index_id[index_found]

            if include_distances:
                for index, dist in zip(res_found[0], res_found[1]):
                    id_found = self._index_id[index]
                    print('%s\t%s\t%s' % (id, id_found, str(dist)))
            else:
                for index in res_found:
                    id_found = self._index_id[index]
                    print('%s\t%s' % (id, id_found))


if __name__ == '__main__':
    EMBEDDING_PATH = 'data/materials/zh.300.vec.gz'
    DEFAULT_KEYVEC = KeyedVectors.load_word2vec_format(EMBEDDING_PATH,
                                                       limit=50000)

    id2word = {i: word for i, word in enumerate(DEFAULT_KEYVEC.index2word)}
    word2id = {word: i for i, word in enumerate(DEFAULT_KEYVEC.index2word)}

    n_trees = 100
    emb_dim = 300
    ann_index = AnnoyIndex(emb_dim, metric='angular')
    for i, word in enumerate(DEFAULT_KEYVEC.index2word):
        vec = DEFAULT_KEYVEC.get_vector(word)
        ann_index.add_item(i, vec)

    ann_index.build(n_trees)
    ann_index.save('data/index/annoy.cosine.idx')
    pickle_dump(id2word, 'data/index/id2word.pkl')
    pickle_dump(word2id, 'data/index/word2id.pkl')

    with open('data/index/annoy.cosine.10neighbors.txt', 'w',
              encoding='utf-8') as wf:
        for i, word in enumerate(DEFAULT_KEYVEC.index2word):
            cur_word = id2word[i]
            neighbors = [
                id2word[id] for id in ann_index.get_nns_by_item(i, 11)
            ][1:]  # the first one is the word itself, so drop it
            wf.write('%s\t%s\n' %
                     (cur_word, json.dumps(neighbors, ensure_ascii=False)))
Example #38
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-large/5')
print('fetched model.')

r = redis.Redis(host='127.0.0.1', port=6379)

D = 512
NUM_TREES = 10
ann = AnnoyIndex(D, metric='angular')
embedding_counter = 0

texts = []

with open('wiki/AA/wiki_00') as f:
    for line_index, line in enumerate(f):
        # print(line)
        embeddings = embed(line)
        print(embeddings)
        ann.add_item(line_index, embeddings[0])
        if line_index == 0:
            texts.append(line)
            break

        # data.append(json.loads(line))
        # ann.add_item(embedding_counter, e)
        # embedding_counter += 1
embeddings = embed(texts)

ann.build(NUM_TREES)
ann.save('wiki_articles.index')
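
A hypothetical query step for the index saved above, reusing the same Universal Sentence Encoder embedding; the query sentence is made up.

# Hypothetical semantic lookup against wiki_articles.index built above.
query_vec = embed(['some query sentence'])[0]
nearest_lines, distances = ann.get_nns_by_vector(query_vec, 5, include_distances=True)
print(nearest_lines, distances)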
Example #39
 def test_fail_save(self):
     t = AnnoyIndex(40, 'angular')
     with self.assertRaises(IOError):
         t.save('')
Example #40
 def test_save_twice(self):
     # Issue #100
     t = AnnoyIndex(10)
     t.save("t.ann")
     t.save("t.ann")
Example #41
try:
    for i in range(0,160):
        for j in range(0,320):
            feat=extract_features("https://gibs.earthdata.nasa.gov/wmts/epsg4326/best/MODIS_Terra_CorrectedReflectance_TrueColor/default/2005-08-29/250m/8/"+str(i)+"/"+str(j)+".jpg", model1)
            print("length",len(feat))
            features.append(feat)
            print(str(i)+" "+str(j))
            
        time.sleep(30)
    


     # Length of item vector that will be indexed
    t=AnnoyIndex(len(features[0]))
    for p in range(len(features)):
        feature = features[p]
        t.add_item(p, feature)

    t.build(40)  # 40 trees
    t.save('hurricanes1.ann')

except:
    print("Error Occurred, indexing")
    t = AnnoyIndex(len(features[0]))  # the index dimension is the vector length, not the vector itself
    for p in range(len(features)):
        feature = features[p]
        t.add_item(p, feature)

    t.build(40)  # 40 trees
    t.save('hurricanes1.ann')

Example #42
class AnnoyIndexer(BaseChunkIndexer):
    def __init__(self,
                 num_dim: int,
                 data_path: str,
                 metric: str = 'angular',
                 n_trees=10,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.num_dim = num_dim
        self.data_path = data_path
        self.metric = metric
        self.n_trees = n_trees
        self._key_info_indexer = ListKeyIndexer()

    def post_init(self):
        from annoy import AnnoyIndex
        self._index = AnnoyIndex(self.num_dim, self.metric)
        try:
            if not os.path.exists(self.data_path):
                raise FileNotFoundError('"data_path" does not exist')
            if os.path.isdir(self.data_path):
                raise IsADirectoryError(
                    '"data_path" must be a file path, not a directory')
            self._index.load(self.data_path)
        except:
            self.logger.warning(
                'fail to load model from %s, will create an empty one' %
                self.data_path)

    def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray,
            weights: List[float], *args, **kwargs):
        last_idx = self._key_info_indexer.size

        if len(vectors) != len(keys):
            raise ValueError('vectors length should be equal to doc_ids')

        if vectors.dtype != np.float32:
            raise ValueError("vectors should be ndarray of float32")

        for idx, vec in enumerate(vectors):
            self._index.add_item(last_idx + idx, vec)

        self._key_info_indexer.add(keys, weights)

    def query(self, keys: 'np.ndarray', top_k: int, *args,
              **kwargs) -> List[List[Tuple]]:
        self._index.build(self.n_trees)
        if keys.dtype != np.float32:
            raise ValueError('vectors should be ndarray of float32')
        res = []
        for k in keys:
            ret, relevance_score = self._index.get_nns_by_vector(
                k, top_k, include_distances=True)
            relevance_score = self.normalize_score(relevance_score,
                                                   self.metric)
            chunk_info = self._key_info_indexer.query(ret)
            res.append([(*r, s) for r, s in zip(chunk_info, relevance_score)])
        return res

    def normalize_score(self, score: List[float], metrics: str, *args,
                        **kwargs) -> List[float]:
        if metrics == 'angular':
            return list(map(lambda x: 1 / (1 + x), score))
        elif metrics == 'euclidean':
            import math
            return list(
                map(lambda x: 1 / (1 + math.sqrt(x) / self.num_dim), score))
        elif metrics == 'manhattan':
            return list(map(lambda x: 1 / (1 + x / self.num_dim), score))
        elif metrics == 'hamming':
            return list(map(lambda x: 1 / (1 + x), score))
        elif metrics == 'dot':
            raise NotImplementedError

    @property
    def size(self):
        return self._index.get_n_items()

    def __getstate__(self):
        d = super().__getstate__()
        self._index.save(self.data_path)
        return d
Example #43
try:
    xrange
except NameError:
    # Python 3 compat
    xrange = range

n, f = 100000, 40

t = AnnoyIndex(f)
for i in xrange(n):
    v = []
    for z in xrange(f):
        v.append(random.gauss(0, 1))
    t.add_item(i, v)

t.build(2 * f)
t.save('test.tree')

limits = [10, 100, 1000, 10000]
k = 10
prec_sum = {}
prec_n = 1000
time_sum = {}

for i in xrange(prec_n):
    j = random.randrange(0, n)
    print('finding nbs for', j)
        
    closest = set(t.get_nns_by_item(j, k, n))
    for limit in limits:
        t0 = time.time()
        toplist = t.get_nns_by_item(j, k, limit)
Ejemplo n.º 44
0
def convert(input_file_path,
            output_file_path=None,
            precision=DEFAULT_PRECISION,
            subword=False,
            subword_start=DEFAULT_NGRAM_BEG,
            subword_end=DEFAULT_NGRAM_END,
            approx=False,
            approx_trees=None):

    files_to_remove = []
    subword = int(subword)
    approx = int(approx)

    # If no output_file_path specified, create it in a tempdir
    if output_file_path is None:
        output_file_path = os.path.join(
            tempfile.gettempdir(),
            fast_md5_file(input_file_path) + '.magnitude')
        if os.path.isfile(output_file_path):
            try:
                conn = sqlite3.connect(output_file_path)
                db = conn.cursor()
                size = db.execute(
                "SELECT value FROM magnitude_format WHERE key='size'") \
                .fetchall()[0][0]
                conn.close()
                return output_file_path  # File already exists and is functioning
            except:
                pass

    # Check args
    input_is_text = input_file_path.endswith('.txt') or \
        input_file_path.endswith('.vec')
    input_is_binary = input_file_path.endswith('.bin')
    if not input_is_text and not input_is_binary:
        exit("The input file path must be .txt, .bin, or .vec")
    if not output_file_path.endswith('.magnitude'):
        exit("The output file path must be .magnitude")

    # Detect GloVE format and convert to word2vec if detected
    detected_GloVE = False
    if input_is_text:
        with io.open(input_file_path,
                     mode="r",
                     encoding="utf-8",
                     errors="ignore") as ifp:
            line1 = None
            line2 = None
            while line1 is None or line2 is None:
                line = ifp.readline().strip()
                if len(line) > 0:
                    if line1 is None:
                        line1 = line
                    elif line2 is None:
                        line2 = line
            line1 = line1.replace('\t', ' ')
            line2 = line2.replace('\t', ' ')
            line1 = line1.split()
            line2 = line2.split()
            if len(line1) == len(line2):  # No header line present
                detected_GloVE = True
    if detected_GloVE:
        eprint("Detected GloVE format! Converting to word2vec format first..."
               "(this may take some time)")
        temp_file_path = os.path.join(
            tempfile.gettempdir(),
            os.path.basename(input_file_path) + '.txt')
        try:
            import gensim
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert GloVE files.")
        gensim.scripts.glove2word2vec.glove2word2vec(input_file_path,
                                                     temp_file_path)
        input_file_path = temp_file_path
        files_to_remove.append(temp_file_path)

    # Open and load vector file
    eprint("Loading vectors... (this may take some time)")
    number_of_keys = None
    dimensions = None
    if input_is_binary:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert binary files.")
        keyed_vectors = KeyedVectors.load_word2vec_format(
            input_file_path, binary=input_is_binary)
        number_of_keys = len(keyed_vectors.vectors)
        dimensions = len(keyed_vectors.vectors[0])
    else:
        # Read it manually instead of with gensim so we can stream large models
        class KeyedVectors:
            pass

        def keyed_vectors_generator():
            number_of_keys, dimensions = (None, None)
            f = io.open(input_file_path,
                        mode="r",
                        encoding="utf-8",
                        errors="ignore")
            first_line = True
            for line in f:
                line_split = line.strip().replace('\t', ' ').split()
                if len(line_split) == 0:
                    continue
                if first_line:
                    first_line = False
                    number_of_keys = int(line_split[0])
                    dimensions = int(line_split[1])
                    yield (number_of_keys, dimensions)
                else:
                    empty_key = len(line_split) == dimensions
                    vec_floats = line_split if empty_key else line_split[1:]
                    key = "" if empty_key else line_split[0]
                    if len(vec_floats) > dimensions:
                        key = " ".join([key] + \
                            vec_floats[0:len(vec_floats)-dimensions])
                        vec_floats = vec_floats[len(vec_floats) - dimensions:]
                    vector = np.asarray([float(elem) \
                        for elem in vec_floats])
                    yield (key, vector)

        keyed_vectors = KeyedVectors()
        kv_gen = keyed_vectors_generator()
        number_of_keys, dimensions = next(kv_gen)
        kv_gen_1, kv_gen_2 = tee(kv_gen)
        keyed_vectors.vectors = imap(lambda kv: kv[1], kv_gen_1)
        keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2)

    eprint("Found %d key(s)" % number_of_keys)
    eprint("Each vector has %d dimension(s)" % dimensions)

    # Connect to magnitude datastore
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")
    conn = sqlite3.connect(output_file_path)
    files_to_remove.append(output_file_path + "-shm")
    files_to_remove.append(output_file_path + "-wal")
    db = conn.cursor()

    # Make the database fast
    conn.isolation_level = None
    db.execute("PRAGMA synchronous = OFF;")
    db.execute("PRAGMA default_synchronous = OFF;")
    db.execute("PRAGMA journal_mode = WAL;")
    db.execute("PRAGMA count_changes = OFF;")

    # Create table structure
    eprint("Creating magnitude format...")
    db.execute("DROP TABLE IF EXISTS `magnitude`;")
    db.execute("""
        CREATE TABLE `magnitude` (
            key TEXT COLLATE NOCASE,
            """ + ",\n".join([("dim_%d INTEGER" % i)
                              for i in range(dimensions)]) + """
        );
    """)
    db.execute("""
        CREATE TABLE `magnitude_format` (
            key TEXT COLLATE NOCASE,
            value INTEGER
        );
    """)
    if subword:
        db.execute("""
            CREATE VIRTUAL TABLE `magnitude_subword` 
            USING fts3(
                char_ngrams,
                num_ngrams
            );
        """)
    if approx:
        db.execute("""
            CREATE TABLE `magnitude_approx` (
                trees INTEGER,
                index_file BLOB
            );
        """)

    # Create annoy index
    approx_index = None
    if approx:
        approx_index = AnnoyIndex(dimensions)

    # Write vectors
    eprint("Writing vectors... (this may take some time)")
    insert_query = """
        INSERT INTO `magnitude`(
            key,
            """ + \
        ",\n".join([("dim_%d" % i) for i in range(dimensions)]) \
        + """) 
        VALUES (
            """ + \
        (",\n".join(["?"] * (dimensions + 1))) \
        + """
        );
    """
    insert_subword_query = """
        INSERT INTO `magnitude_subword`(
            char_ngrams,
            num_ngrams
        ) 
        VALUES (
            ?, ?
        );
    """
    counters = [Counter() for i in range(dimensions)]
    key_vectors_iterable = zip(keyed_vectors.index2word, keyed_vectors.vectors)
    progress = -1
    db.execute("BEGIN;")
    for i, (key, vector) in enumerate(key_vectors_iterable):
        current_progress = int((float(i) / float(number_of_keys)) * 100)
        if current_progress > progress:
            progress = current_progress
            eprint("%d%% completed" % progress)
        if i % 100000 == 0 and i > 0:
            db.execute("COMMIT;")
            db.execute("BEGIN;")
        vector = vector / np.linalg.norm(vector)
        for d, v in enumerate(vector):
            counters[d][int(v * 100)] += 1
        db.execute(insert_query, (key,) + tuple(int(round(v*(10**precision))) \
            for v in vector))
        if subword:
            ngrams = set(
                (n.lower()
                 for n in char_ngrams(BOW + key +
                                      EOW, subword_start, subword_end)))
            num_ngrams = len(ngrams) * 4
            ngrams = set(
                (n for n in ngrams
                 if not any([c in SQLITE_TOKEN_SPLITTERS for c in n])))
            db.execute(insert_subword_query, (" ".join(ngrams), num_ngrams))
        if approx:
            approx_index.add_item(i, vector)
    eprint("Committing written vectors... (this may take some time)")
    db.execute("COMMIT;")

    # Figure out which dimensions have the most entropy
    entropies = [(d, entropy(counter)) for d, counter in enumerate(counters)]
    entropies.sort(key=lambda e: e[1], reverse=True)
    for e in entropies:
        eprint("Entropy of dimension %d is %f" % (e[0], e[1]))
    highest_entropy_dimensions = [e[0] for e in entropies]

    # Writing metadata
    insert_format_query = """
        INSERT INTO `magnitude_format`(
            key,
            value
        ) 
        VALUES (
            ?, ?
        );
    """

    db.execute(insert_format_query, ('size', number_of_keys))
    db.execute(insert_format_query, ('dim', dimensions))
    db.execute(insert_format_query, ('precision', precision))
    if subword:
        db.execute(insert_format_query, ('subword', subword))
        db.execute(insert_format_query, ('subword_start', subword_start))
        db.execute(insert_format_query, ('subword_end', subword_end))
    if approx:
        if approx_trees is None:
            approx_trees = max(50, int((number_of_keys / 3000000.0) * 50.0))
        db.execute(insert_format_query, ('approx', approx))
        db.execute(insert_format_query, ('approx_trees', approx_trees))
    for d in highest_entropy_dimensions:
        db.execute(insert_format_query, ('entropy', d))

    # Create indicies
    eprint("Creating search index... (this may take some time)")
    db.execute("CREATE INDEX `magnitude_key_idx` ON `magnitude` (key);")
    for i in highest_entropy_dimensions[0:1]:
        eprint("Creating spatial search index for dimension %d "
               "(it has high entropy)... (this may take some time)" % i)
        db.execute("""
            CREATE INDEX `magnitude_dim_%d_idx` ON `magnitude` (dim_%d);
        """ % (i, i))

    # Write approximate index to the database
    if approx:
        eprint("Creating approximate nearest neighbors index... \
(this may take some time)")
        approx_index.build(approx_trees)
        approx_index_file_path = os.path.join(
            tempfile.gettempdir(),
            fast_md5_file(input_file_path) + '.ann')
        eprint("Dumping approximate nearest neighbors index... \
(this may take some time)")
        approx_index.save(approx_index_file_path)
        eprint("Compressing approximate nearest neighbors index... \
(this may take some time)")
        chunk_size = 104857600
        full_size = os.path.getsize(approx_index_file_path)
        insert_approx_query = """
            INSERT INTO magnitude_approx(trees, index_file) VALUES (?, ?);
        """
        with open(approx_index_file_path, 'rb') as ifh, \
            lz4.frame.LZ4FrameCompressor() as compressor:
            for i, chunk in enumerate(iter(partial(ifh.read, chunk_size), b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_approx_query,
                               (approx_trees, sqlite3.Binary(chunk)))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_approx_query,
                           (approx_trees, sqlite3.Binary(chunk)))
        files_to_remove.append(approx_index_file_path)

    # VACUUM
    eprint("Vacuuming to save space... (this may take some time)")
    db.execute("VACUUM;")

    # Restore safe database settings
    db.execute("PRAGMA synchronous = FULL;")
    db.execute("PRAGMA default_synchronous = FULL;")
    db.execute("PRAGMA journal_mode = DELETE;")
    db.execute("PRAGMA count_changes = ON;")

    # Clean up connection
    conn.commit()
    conn.close()

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Print success
    eprint("Successfully converted '%s' to '%s'!" %
           (input_file_path, output_file_path))

    return output_file_path
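
# A hypothetical invocation of the converter above (file names are placeholders):
# it turns a word2vec-format text file into a .magnitude file and, with
# approx=True, also embeds the compressed Annoy index built during conversion.
if __name__ == '__main__':
    out_path = convert('vectors.txt',
                       output_file_path='vectors.magnitude',
                       approx=True)
    eprint('Converted file written to %s' % out_path)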
Ejemplo n.º 45
0
    def start_extraction(self):
        print('Start extraction')

        file_path = os.path.dirname(os.path.abspath(__file__))

        file_name_process = os.path.join(file_path,
                                         'export/nodes_export_process.json')

        text_arr = None

        # To process nodes from nodes_export.json, that file is copied to nodes_export_process.json. This way, nodes can be exported from Drupal
        # while nodes from nodes_export_process.json are still being processed, and content is not added to or removed from the same file by different
        # processes at the same time. Since this function is called recursively, each call checks whether nodes_export_process.json
        # still exists and its content can be loaded via json.loads.
        if (os.path.isfile(file_name_process)):
            try:
                file = open(file_name_process, 'r', encoding="utf-8")
                data = file.read()
                file.close()

                if (data == ''):
                    os.remove(file_name_process)

            except:
                self.add_log("Problem opening file " + file_name_process)
                self.add_log("exit Task")
                exit()

            try:
                text_arr = json.loads(data)

                # If the array length is 0, all nodes have been processed and the file can be deleted
                if (len(text_arr) == 0):
                    self.add_log('File ' + file_name_process +
                                 ' is empty. Delete file.')
                    os.remove(file_name_process)
                    text_arr = None
            except:
                self.add_log("Cant convert data from " + file_name_process +
                             ' into json dict')

        # If no JSON has been loaded up to this point, or nodes_export_process.json no longer exists,
        # try to open nodes_export.json instead
        if (text_arr == None):

            file_name_default = os.path.join(file_path,
                                             'export/nodes_export.json')

            try:
                file = open(file_name_default, 'r', encoding="utf-8")
                data = file.read()
                json_arr = json.loads(data)

                # If this file is empty because no nodes currently need processing, the application exits
                if (len(json_arr) == 0):
                    self.add_log("No input for processing in " +
                                 file_name_default)
                    self.add_log("exit Task")
                    exit()
            except:
                self.add_log("No input for processing in " + file_name_default)
                self.add_log("exit Task")
                exit()

            try:
                # If the file contains nodes, it is copied to nodes_export_process.json, which the next recursive call
                # will continue working with
                shutil.copy2(file_name_default, file_name_process)

                self.add_log("Copy file " + file_name_default + " to " +
                             file_name_process)

                # Since the file has been copied, the source file can now be emptied and saved.
                file = open(file_name_default, "w")
                file.write("{}")
                file.close()

                # Read the content from the new file and store it in data
                file = open(file_name_process, 'r', encoding="utf-8")
                data = file.read()
                file.close()
            except:
                self.add_log("Problem opening file " + file_name_process)
                self.add_log("exit Task")
                exit()

            # Try to load the JSON. If this fails, the whole application should exit, since there is no data left to process
            try:
                text_arr = json.loads(data)
            except:
                self.add_log("Cant convert data from " + file_name_process +
                             ' into json dict')
                self.add_log("exit Task")
                exit()

        # Failed nodes are stored in a file. Load that file first so further failed nodes can be appended to it.

        failed_nodes_name = os.path.join(file_path, 'export/nodes_failed.json')
        failed_nodes_arr = None

        if (os.path.isfile(failed_nodes_name)):
            f = open(failed_nodes_name, "r", encoding="utf-8")
            data = f.read()
            f.close()

            if (data == ''):
                failed_nodes_arr = {}
            else:
                try:
                    failed_nodes_arr = json.loads(data)
                except:
                    failed_nodes_arr = {}
        else:
            failed_nodes_arr = {}

        # Each recursive call of this function processes one node. First take the topmost content type from the array,
        # then, within that content type's multidimensional array, take the next node ID and its values
        content_type = next(iter(text_arr))
        content_type_values = next(iter(text_arr.values()))
        node_id = next(iter(content_type_values))
        node_values = next(iter(content_type_values.values()))

        title = node_values['title']
        created = node_values['created']
        changed = node_values['changed']

        self.add_log("Remaining Nodes: " +
                     str(len(content_type_values.keys())))
        self.add_log("Node ID: " + node_id + "; Title: " + title)

        print("Remaining Nodes: " + str(len(content_type_values.keys())))
        print(node_id)

        # Try to delete the existing node in Neo4j. Entities and synonyms are not deleted, only the root node,
        # content fields, sentences, tags and the relations between them. When a node is re-indexed and may have changed,
        # it is easier to remove the whole tree the node spans with its content fields, sentences, etc. in one go.
        tries = 3
        for i in range(tries):
            try:
                self.driver.del_node(node_id)
            except Exception as e:
                if (type(e).__name__ == "ServiceUnavailable"
                        and i < tries - 1):

                    self.add_log(str(e))
                    self.add_log("Retry")
                    continue
                else:
                    raise
            break

        # Iterate over all fields of the current node in the array, try to extract the information with CoreNLP and then store it in Neo4j.
        for field, content in node_values['fields'].items():

            # Some fields have multiple contents (for the Siblings field, for example, each listed sibling would be its own content) and Drupal returns
            # an array with the different contents for each field.
            for text in content:

                # Try to extract the information. Inserting into Neo4j only makes sense if this step succeeded. Otherwise the node and
                # its values are added to nodes_failed
                extract_success = False
                try:
                    extract_dict = self.extractInformations(text)
                    extract_success = True
                except RuntimeError as e:
                    if (content_type not in failed_nodes_arr):
                        failed_nodes_arr[content_type] = {}
                    failed_nodes_arr[content_type][node_id] = node_values
                    self.add_log(
                        "Problem occured during extraction. Maybe restart stanford core nlp. Message: "
                        + str(e))
                    print('runtimereror')
                except Exception as e:

                    if (content_type not in failed_nodes_arr):
                        failed_nodes_arr[content_type] = {}
                    failed_nodes_arr[content_type][node_id] = node_values

                    self.add_log(
                        "Problem occured during extraction. Maybe restart stanford core nlp. Message: "
                        + str(e))
                    print('generic error')

                # If the information was extracted successfully, the result should be stored in Neo4j. Here, too, if
                # saving is not possible, the node and its values are added to nodes_failed
                if (extract_success):
                    tries = 3
                    for i in range(tries):
                        try:

                            self.add_log("Insert field " + field +
                                         " with content in database")

                            print(
                                self.driver.create_root_node(
                                    extract_dict, node_id, content_type, field,
                                    title, created, changed).data())
                            #self.driver.create_root_node(extract_dict, node_id, content_type, field, title, created, changed)
                        except Exception as e:

                            if (type(e).__name__ == "ServiceUnavailable"
                                    and i < tries - 1):
                                self.add_log(str(e))
                                self.add_log("Retry")
                                continue
                            else:
                                if (content_type not in failed_nodes_arr):
                                    failed_nodes_arr[content_type] = {}
                                failed_nodes_arr[content_type][
                                    node_id] = node_values

                                self.add_log(
                                    "Problem occured during save. Maybe restart neo4j service. Message: "
                                    + str(e))

                        break

        # Whether or not extraction and saving succeeded, the node and its values are removed from the array afterwards so that
        # the next recursive call can process the next node and its values
        del (text_arr[content_type][node_id])

        # If the content type has no nodes left, remove it as well so that the next recursive call can, if necessary,
        # continue with the next content type and its nodes.
        if (len(text_arr[content_type].keys()) == 0):
            del (text_arr[content_type])

        # Save the array with the content types and nodes again
        file = open(file_name_process, "w", encoding="utf-8")
        file.write(json.dumps(text_arr))
        file.close()

        # Save the array with the failed nodes as well
        file = open(failed_nodes_name, "w", encoding="utf-8")
        file.write(json.dumps(failed_nodes_arr))
        file.close()

        # If the array length is 0, all content types and nodes have been processed. After that, rebuild the search index and add manually created entities
        # to the trees in the database.
        if (len(text_arr) == 0):

            # The search index makes it easier to search sentences and clauses for semantic similarity. If every request loaded all sentences from the database
            # and checked them for semantic similarity, a single call would take more than 10 seconds, since there are thousands of sentences. With the search index, the sentence vectors are stored
            # so that similar sentences can be found via nearest neighbor search, which reduces the time to milliseconds. However, the search index cannot be updated
            # and therefore has to be rebuilt from scratch each time.
            result = self.driver.get_all_sent_clauses()

            if (len(result) > 0):

                self.add_log('Creating search index')
                ann = AnnoyIndex(300)

                for res in result:

                    nlp_res = self.nlp(res['shorten_original'].lower())
                    ann.add_item(int(res['sen_id']), nlp_res.vector)

                    counter = 0
                    for clause in res['shorten_clauses']:

                        clause_count = clause[1]
                        nlp_clause = self.nlp(clause[0].lower())
                        ann.add_item(int(clause_count), nlp_clause.vector)
                        counter += 1

                ann.build(10)
                ann.save('search_index.ann')

                # The manually added entities come from Drupal and were written to a file after creation, which
                # this application can process. Since re-indexing deletes the node and its subnodes from Neo4j, so there is no longer any connection to
                # manually added entities, we afterwards check in which sentences of the nodes those entities occur and link them to the sentences.

                self.add_log('Adding manual created nodes')

                manually_entities = None
                try:
                    changed_entities = os.path.join(file_path,
                                                    'changed_entities.json')
                    file = open(changed_entities, 'r', encoding="utf-8")
                    data = file.read()
                    changed_entities = json.loads(data)
                    manually_entities = changed_entities['added_entities']
                except:
                    pass

                if (manually_entities != None):
                    for ent in manually_entities:
                        self.driver.add_entity(ent, manually_entities[ent])
                        print('manuelaly nodes')

        # Recursive call of the function
        self.start_extraction()
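
# A minimal query-side sketch for the search_index.ann file written above, assuming a
# 300-dimensional spaCy model (the model name and query sentence are placeholders); it
# returns the ids of the most similar sentences/clauses.
import spacy
from annoy import AnnoyIndex

nlp = spacy.load('de_core_news_md')        # assumed 300-dim German model
ann = AnnoyIndex(300)
ann.load('search_index.ann')               # mmaps the saved index

query_vec = nlp('wie alt ist das gebäude'.lower()).vector
ids, dists = ann.get_nns_by_vector(query_vec, 5, include_distances=True)
print(ids, dists)                          # nearest sentence/clause ids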
Ejemplo n.º 46
0
from annoy import AnnoyIndex
import rocksdb
import numpy as np
import io
import json

db = rocksdb.DB("fastText.db", rocksdb.Options(create_if_missing=False))
emojiDB = rocksdb.DB("emojiFastText.db", rocksdb.Options(create_if_missing=False))

with io.open('emojiData.json', encoding='utf8') as f:
    data = json.load(f)

f = 300
t = AnnoyIndex(f, metric='angular')

for i, e in enumerate(data):
    j = e["emoji"]
    X = np.frombuffer(emojiDB.get(j.encode()))
    t.add_item(i, X)

t.build(100)
t.save('emojis.ann')
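
# A query-side sketch for the index built above (hypothetical: it assumes the queried
# word exists in the fastText RocksDB) that maps a word vector back to the nearest
# emoji entries.
u = AnnoyIndex(f, metric='angular')
u.load('emojis.ann')

word_vec = np.frombuffer(db.get('happy'.encode()))   # assumes the key is present
nearest = u.get_nns_by_vector(word_vec, 5)
print([data[i]["emoji"] for i in nearest])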

Ejemplo n.º 47
0
class EditVectorCombinedDistanceSolver(VectorDistanceSolver):
    """
    A simple baseline model for doing OOV translation that takes
    the translation of an OOV word to be the translation of the
    in-vocabulary word with the highest interpolation of
    vector similarity + edit similarity.

    We find the word with the highest similarity in the source
    vocabulary, and pick its most likely translation (according to the
    t-table) as our predicted translation.

    We take advantage of the FastText package from Facebook to easily
    generate vectors for unknown words.
    """
    @overrides
    def __init__(self):
        super(EditVectorCombinedDistanceSolver, self).__init__()

        # We don't use self.foreign_vectors, delete to avoid bugs
        del self.foreign_vectors

        self.int_to_foreign = None
        self.annoy_index = None
        self.annoy_index_path = None
        self.vector_dim = None

    @overrides
    def get_state_dict(self):
        state_dict = {
            "solver_class": self.__class__,
            "solver_init_params": self.solver_init_params,
            "fasttext_model_path": self.fasttext_model_path,
            "foreign_to_english": self.foreign_to_english,
            "int_to_foreign": self.int_to_foreign,
            "annoy_index_path": self.annoy_index_path,
            "vector_dim": self.vector_dim
        }
        return state_dict

    @overrides
    def load_from_state_dict(self, state_dict):
        self.fasttext_model_path = state_dict["fasttext_model_path"]
        self.foreign_to_english = state_dict["foreign_to_english"]
        self.int_to_foreign = state_dict["int_to_foreign"]
        self.annoy_index_path = state_dict["annoy_index_path"]
        self.vector_dim = state_dict["vector_dim"]

        self.annoy_index = AnnoyIndex(self.vector_dim)
        self.annoy_index.load(self.annoy_index_path)

        self.was_loaded = True
        return self

    @overrides
    def save_to_file(self, save_dir, run_id):
        save_path = os.path.join(save_dir, run_id + "_model.pkl")
        # Move the fastText model we used to the save path
        logger.info("Copying fastText model from {} to "
                    "save dir at {}".format(self.fasttext_model_path,
                                            save_dir))
        shutil.copy(self.fasttext_model_path, save_dir)
        # Now edit the model path to point to file we wrote
        self.fasttext_model_path = os.path.join(
            save_dir, os.path.basename(self.fasttext_model_path))

        # Save the annoy index to the save path
        logger.info("Saving annoy index to save dir at {}".format(save_dir))
        self.annoy_index_path = os.path.join(save_dir,
                                             run_id + "_annoy_index.ann")
        self.annoy_index.save(self.annoy_index_path)

        state_dict = self.get_state_dict()
        torch.save(state_dict, save_path, pickle_module=dill)

    @overrides
    def train_model(self,
                    foreign_vectors,
                    foreign_to_english,
                    num_trees=500,
                    log_dir=None,
                    save_dir=None,
                    run_id=None):
        if self.was_loaded:
            raise ValueError(
                "EditVectorCombinedDistanceSolver does not support "
                "training from a saved model.")
        # This model has no parameters to optimize
        self.foreign_to_english = foreign_to_english

        # Use FastText to generate vectors for tokens in the
        # foreign_to_english dictionary that aren't in foreign_vectors.
        logger.info("Using FastText to make vectors for tokens that are in "
                    "our foreign to english dictionary, but not in the set "
                    "of pretrained vectors.")
        uncovered_foreign_tokens = [
            tok for tok in self.foreign_to_english
            if tok not in foreign_vectors
        ]
        uncovered_tokens_to_vectors = generate_fasttext_vectors_from_list(
            fasttext_binary_path=self.fasttext_bin_path,
            fasttext_model_path=self.fasttext_model_path,
            input_words=uncovered_foreign_tokens)

        # Add these vectors to the foreign_vectors dict
        for token, vector in uncovered_tokens_to_vectors.items():
            if self.vector_dim is None:
                self.vector_dim = len(vector)
            else:
                assert self.vector_dim == len(vector)
            foreign_vectors[token] = vector

        # Prune the foreign_vectors_dict until the set of foreign tokens in
        # our foreign to english dict is the same as the set of vectors we have
        pruned_foreign_vectors_dict = {
            k: v
            for k, v in foreign_vectors.items() if k in self.foreign_to_english
        }
        self.int_to_foreign = {
            k: v
            for k, v in enumerate(pruned_foreign_vectors_dict.keys())
        }

        # Build the annoy index
        logger.info("Building annoy index with {} trees".format(num_trees))
        self.annoy_index = AnnoyIndex(self.vector_dim)
        num_added = 0
        for index, foreign in self.int_to_foreign.items():
            # If we don't have translations for a foreign word, we don't
            # want to propose that as the source for a translation.
            if foreign not in self.foreign_to_english:
                continue
            vector = foreign_vectors[foreign]
            self.annoy_index.add_item(index, vector)
            num_added += 1
        self.annoy_index.build(num_trees)
        assert self.annoy_index.get_n_items() == len(self.foreign_to_english)

        if save_dir is not None and run_id is not None:
            logger.info("Saving trained model to save dir {} with run "
                        "id {}".format(save_dir, run_id))
            self.save_to_file(save_dir=save_dir, run_id=run_id)

    @overrides
    def translate_list(self,
                       oov_list,
                       show_progbar=True,
                       n_jobs=1,
                       debug=False):
        # Get vectors for all of the uncovered_oovs
        oov_vectors = generate_fasttext_vectors_from_list(
            fasttext_binary_path=self.fasttext_bin_path,
            fasttext_model_path=self.fasttext_model_path,
            input_words=oov_list)

        oov_token_candidates_list = []
        num_to_pick = int(math.ceil(0.2 * len(self.foreign_to_english)))
        logger.info("Using annoy to find top {} nearest "
                    "neighbors for each token".format(num_to_pick))
        # Use Annoy to find the top nearest neighbors for each oov token.
        for oov_token in oov_list:
            oov_vector = oov_vectors[oov_token]
            # Find the top 5% of nearest neighbors (in vector space) with the
            # oov token's vector. This tries to find words that are semantically
            # similar.
            nn_indices = self.annoy_index.get_nns_by_vector(
                oov_vector, num_to_pick, search_k=-1, include_distances=False)

            # Get the foreign words corresponding to the found nearest neighbors
            # These are the candidates we will use in the edit distance translation
            candidate_foreigns = [
                self.int_to_foreign[index] for index in nn_indices
            ]
            oov_token_candidates_list.append((oov_token, candidate_foreigns))

        if n_jobs > 1:
            # Since we can't pickle self.annoy_index, set it to a local variable
            # and then delete it.
            annoy_index = self.annoy_index
            del self.annoy_index
            logger.info("Translating with {} processes".format(n_jobs))
            pool = multiprocessing.Pool(processes=n_jobs)
            if six.PY2:
                # Create a multiprocess pool with the _get_nearest_neighbor alias.
                # This is not used in python 3 because there's overhead in passing
                # the object back and forth.
                _bound_get_nearest_neighbor_mp_alias = functools.partial(
                    _get_nearest_neighbor_mp_alias, self)
                closest_source_tokens = pool.map(
                    _bound_get_nearest_neighbor_mp_alias,
                    oov_token_candidates_list)
            else:
                closest_source_tokens = pool.map(self._get_nearest_neighbor,
                                                 oov_token_candidates_list)
            # Restore self.annoy_index
            self.annoy_index = annoy_index
        else:
            if show_progbar:
                oov_iterable = tqdm(oov_token_candidates_list)
            else:
                oov_iterable = oov_token_candidates_list
            closest_source_tokens = [
                self._get_nearest_neighbor(oov_token_vector_tuple)
                for oov_token_vector_tuple in oov_iterable
            ]
        predicted_translations = []
        for source_token in closest_source_tokens:
            english_translations = self.foreign_to_english[source_token]
            predicted_translation = max(english_translations.keys(),
                                        key=lambda k: english_translations[k])
            predicted_translations.append(predicted_translation)
        return predicted_translations

    def _get_nearest_neighbor(self, oov_token_candidates_tuple):
        """
        Given a single OOV token, find the best English translation.

        Parameters
        ----------
        oov_token_candidates_tuple: tuple of (str, List[str])
            Tuple of (oov_token, candidates). The oov token is the string
            to predict a translation for. Candidates are the words
            we can choose among as potential source words for translation.
        """
        oov_token, foreign_candidates = oov_token_candidates_tuple

        # Out of the candidates, pick the one with the highest
        # edit similarity.
        def calculate_edit_similarity_with_input_oov(x):
            if len(x) == 0:
                return 0
            longest_common_prefix_len = len(
                os.path.commonprefix([x, oov_token]))
            edit_distance = int(editdistance.eval(x, oov_token))
            score = (0.75 * (1 -
                             (edit_distance / max(len(x), len(oov_token)))) +
                     0.25 *
                     (longest_common_prefix_len / min(len(x), len(oov_token))))
            return score

        most_similar_source_token = max(
            foreign_candidates, key=calculate_edit_similarity_with_input_oov)
        return most_similar_source_token
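
# A small, self-contained check of the edit-similarity interpolation used in
# _get_nearest_neighbor above (0.75 * normalized edit similarity + 0.25 *
# common-prefix ratio); the word pairs are made up for illustration.
import os
import editdistance

def _edit_similarity(x, oov_token):
    longest_common_prefix_len = len(os.path.commonprefix([x, oov_token]))
    edit_distance = int(editdistance.eval(x, oov_token))
    return (0.75 * (1 - edit_distance / max(len(x), len(oov_token))) +
            0.25 * (longest_common_prefix_len / min(len(x), len(oov_token))))

print(_edit_similarity('translation', 'translations'))  # ~0.94
print(_edit_similarity('translation', 'xyz'))           # 0.0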
Ejemplo n.º 48
0
class AnnoyIndexer(BaseVectorIndexer):
    lock_work_dir = True

    def __init__(self,
                 num_dim: int,
                 data_path: str,
                 metric: str = 'angular',
                 n_trees=10,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.num_dim = num_dim
        self.work_dir = data_path
        self.indexer_file_path = os.path.join(self.work_dir,
                                              self.internal_index_path)
        self.metric = metric
        self.n_trees = n_trees
        self._key_info_indexer = ListKeyIndexer()

    def post_init(self):
        from annoy import AnnoyIndex
        self._index = AnnoyIndex(self.num_dim, self.metric)
        try:
            self._index.load(self.indexer_file_path)
        except:
            self.logger.warning(
                'fail to load model from %s, will create an empty one' %
                self.indexer_file_path)

    def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray,
            weights: List[float], *args, **kwargs):
        last_idx = self._key_info_indexer.size

        if len(vectors) != len(keys):
            raise ValueError('vectors length should be equal to doc_ids')

        if vectors.dtype != np.float32:
            raise ValueError("vectors should be ndarray of float32")

        for idx, vec in enumerate(vectors):
            self._index.add_item(last_idx + idx, vec)

        self._key_info_indexer.add(keys, weights)

    def query(self, keys: 'np.ndarray', top_k: int, *args,
              **kwargs) -> List[List[Tuple]]:
        self._index.build(self.n_trees)
        if keys.dtype != np.float32:
            raise ValueError('vectors should be ndarray of float32')
        res = []
        for k in keys:
            ret, relevance_score = self._index.get_nns_by_vector(
                k, top_k, include_distances=True)
            chunk_info = self._key_info_indexer.query(ret)
            res.append([(*r, -s) for r, s in zip(chunk_info, relevance_score)])
        return res

    @property
    def size(self):
        return self._index.get_n_items()

    def __getstate__(self):
        d = super().__getstate__()
        self._index.save(self.indexer_file_path)
        return d
Ejemplo n.º 49
0
class realtimeTrain:
    def __init__(self):
        self.storage = path.join(getcwd(), 'storage')
        self.db = Database()
        self.faces = []  # storage all faces in caches array of face object
        self.known_encoding_faces = []  # faces data for recognition
        self.face_user_keys = {}
        self.load_all()

    def load_user_by_index_key(self, index_key=0):

        key_str = str(index_key)

        if key_str in self.face_user_keys:
            return self.face_user_keys[key_str]

        return None

    def load_train_file_by_name(self, name):
        trained_storage = path.join(self.storage, 'trained')
        return path.join(trained_storage, name)

    def load_unknown_file_by_name(self, name):
        unknown_storage = path.join(self.storage, 'unknown')
        return path.join(unknown_storage, name)

    def load_all(self):

        results = self.db.select(
            'SELECT faces.id, faces.user_id, faces.filename, faces.created FROM faces'
        )
        self.layer_size = 0
        count = 0
        for row in results:

            user_id = row[1]
            filename = row[2]
            print('train::', user_id)
            face = {
                "id": row[0],
                "user_id": user_id,
                "filename": filename,
                "created": row[3]
            }
            self.faces.append(face)

            face_image = face_recognition_api.load_image_file(
                self.load_train_file_by_name(filename))
            face_image_encoding = face_recognition_api.face_encodings(
                face_image)[0]
            index_key = len(self.known_encoding_faces)
            self.known_encoding_faces.append(face_image_encoding)
            index_key_string = str(index_key)
            self.face_user_keys['{0}'.format(index_key_string)] = user_id
            if count == 0:
                self.layer_size = len(face_image_encoding)
                self.tree = AnnoyIndex(self.layer_size,
                                       metric)  # prepare index
            self.tree.add_item(user_id, face_image_encoding)
            count += 1
        print('building index...\n')
        if self.layer_size > 0:
            print('layer_size=', self.layer_size)
            self.tree.build(ntrees)
            self.tree.save('index.ann')
Ejemplo n.º 50
0
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.models import Model
from annoy import AnnoyIndex
import numpy as np

# img_dir_path = 'dataset/All/'
img_dir_path = 'dataDrivenArt/bin/data/images/'
annoy_model_path = 'model/x-fresh-fc2.ann'
annoy_dim = 4096

base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input,
              outputs=base_model.get_layer('fc2').output)

annoy_model = AnnoyIndex(annoy_dim)

for i in range(1, 3988):
    img_path = img_dir_path + str(i) + '.jpg'
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    fc2_features = model.predict(x)

    annoy_model.add_item(i, fc2_features[0])
    print(img_path, 'saved')

annoy_model.build(3987)
annoy_model.save(annoy_model_path)
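
# A query-side sketch: load the saved index, push one more image through the same fc2
# extractor and ask for its visually nearest neighbours (the query image path is a
# placeholder).
u = AnnoyIndex(annoy_dim)
u.load(annoy_model_path)

query_img = image.load_img(img_dir_path + '1.jpg', target_size=(224, 224))
q = preprocess_input(np.expand_dims(image.img_to_array(query_img), axis=0))
q_features = model.predict(q)
print(u.get_nns_by_vector(q_features[0], 10))   # indices of the 10 most similar images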
Ejemplo n.º 51
0
class QA_process():
    def __init__(self):
        self.baiduzhidao = Baiduzhidao_spider()
        load_file = open('./mod/zhishi_entity.bin', 'rb')
        self.zhishi_entity = pickle.load(load_file)
        self.bc = BertClient(ip='192.168.1.101', ignore_all_checks=True)
        self.annoyIndex = AnnoyIndex(768)
        self.annoyIndex.load('./mod/qa_index.mod')

        load_file = open('./mod/qs_dict.bin', 'rb')
        self.qa_dict = pickle.load(load_file)

    def getZhishi(self, entity):
        if self.zhishi_entity.get(entity):
            logging.info('find %s from zhishi_entity' % entity)
            return self.zhishi_entity.get(entity)

        url = 'http://zhishi.me/api/entity/%s?property=infobox'
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'Accept':
            'text / html, application / xhtml + xml, application / xml;q = 0.9,image/webp, * / *;q = 0.8',
            'Accept-Language': 'zh-CN, zh;q = 0.9'
        }
        try:
            wb_data = requests.get(url % entity,
                                   headers=headers,
                                   allow_redirects=True)
            wb_data.encoding = 'utf-8'

            content = wb_data.json()
            logging.info('request ' + str(content))
            result = jsonpath.jsonpath(content, "$..'infobox'")[0]
            self.zhishi_entity[entity] = result
            fou = open('./mod/zhishi_entity.bin', 'wb')
            pickle.dump(self.zhishi_entity, fou)
            fou.close()
            return result

        except:
            logging.error('未成功取得' + entity + '属性')
            return {}

    def getAllQA(self, theme):
        # Get the theme's attributes
        theme_json = self.getZhishi(theme)
        logging.info(str(theme_json))
        logging.info(str(theme_json.keys()))
        properties = theme_json.keys()
        properties = [p[:-1] for p in properties]
        qa_pairs = self.baiduzhidao.getQA(theme, 5)
        for p in properties:
            qa_pairs.extend(self.baiduzhidao.getQA(theme + '%20' + p, 5))
        fou = open('./mod/qa_pairs.bin', 'wb')
        pickle.dump(qa_pairs, fou)
        fou.close()

    def get_sim(self, something):
        url = 'http://10.122.141.12:9006/similar'
        r = requests.post(url,
                          json={
                              "ck": "synonym",
                              "synonym_word": something,
                              "synonym_selectedMode": "auto",
                              "homoionym_word": "",
                              "homoionym_selectedMode": "auto",
                              "homoionym_num": ""
                          })
        json = r.json()
        result = json['detail']['res']['synonym']
        return result

    def selectQA(self, qa_pairs, theme):
        theme_json = self.zhishi_entity.get(theme)
        # properties = theme_json.keys()
        # properties = [p[:-1] for p in properties]
        # qs=defaultdict(set)
        qs = dict()
        # Handle synonyms (alternative names) of the theme
        samenames = ['故宫']
        samenames.append(theme)
        for qa in qa_pairs:
            for samename in samenames:
                if samename in qa[0]:
                    # The question must contain the theme word; it acts as the trigger word
                    # t_p=qa[6].split('%20')
                    # if(len(t_p)==2):
                    #     p=t_p[1]
                    #     p_val=theme_json.get(p+':')
                    # qs[qa[6]].add(qa[0])
                    # Just store it directly
                    qs[qa[0]] = qa
        fou = open('./mod/qs_dict.bin', 'wb')
        pickle.dump(qs, fou)
        fou.close()

        self.save_qa_vec(qs)

    def save_qa_vec(self, qs):
        # Save the questions as sentence vectors
        q_arr = [q for q in qs]

        encodes = self.bc.encode(q_arr)
        for i, encode in enumerate(encodes):
            self.annoyIndex.add_item(i, encode)
        self.annoyIndex.build(10)
        self.annoyIndex.save('./mod/qa_index.mod')

    def anwser(self, q):
        encode = self.bc.encode([q])[0]
        restult, distance = self.annoyIndex.get_nns_by_vector(
            encode, 1, include_distances=True)
        answer_arr = [self.qa_dict.get(q) for q in self.qa_dict]
        quest_arr = [q for q in self.qa_dict]
        if np.cos(distance) > 0.8:
            logging.info(str(np.cos(distance)) + quest_arr[restult[0]])
            logging.info(str(answer_arr[restult[0]][5]))
        else:
            logging.info('不知道')
            logging.info(str(np.cos(distance)) + quest_arr[restult[0]])
            logging.info(str(answer_arr[restult[0]][5]))

    def test(self):
        # self.getAllQA('故宫博物院')
        # load_file = open('./mod/qa_pairs.bin', 'rb')
        # qa_pairs = pickle.load(load_file)
        # logging.info('qa_pairs size:%d'%len(qa_pairs))
        # self.selectQA(qa_pairs,'故宫博物院')

        # qs=set(q[0] for q in qa_pairs)
        # logging.info('qs size:%d' % len(qs))
        # sorted(maybe_errors, key=lambda k: k[1], reverse=False)

        # qs=sorted(qa_pairs,key=lambda k:int(k[4]),reverse=True)
        # logging.info(str(qs[:2]))
        q = '千里江山图?'
        self.anwser(q)
Ejemplo n.º 52
0
class Annoy(VectorIndex):
    def __init__(self, path, dims=None, metric='angular', build_on_disk=True):
        self.path = path
        self.is_mutable = None
        self.is_built = None
        self.build_on_disk = build_on_disk
        self.metric = metric

        if os.path.isfile(self.path):
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            assert self.dims == dims or not dims, \
              'Passed path to existing index but dims do not match'
            assert self.metric == metric or not metric, \
              'Passed path to existing index but metrics do not match'
            self.index = AnnoyIndex(self.dims, metric=self.metric)
        elif dims:
            logging.debug(
                f'Creating new index with {dims} dimensions and {self.metric} metric'
            )
            self.dims = dims
            self.index = AnnoyIndex(self.dims, metric=self.metric)
            if build_on_disk:
                self.index.on_disk_build(self.path)
        else:
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            self.index = AnnoyIndex(self.dims, metric=self.metric)

    @property
    def meta_path(self):
        return self.path + '.meta.json'

    @property
    def files(self):
        return [self.path, self.meta_path]

    def load_meta(self):
        self.__dict__.update(load_json(self.meta_path))

    def save_meta(self):
        d = {**self.__dict__}
        d.pop('index')
        save_json(d, self.meta_path)

    def build(self, num_trees=10):
        logging.debug(f'staring to build index: {self.path}')
        self.index.build(num_trees)
        logging.debug(f'finished building index: {self.path}')
        self.is_mutable = False
        self.is_built = True
        self.save_meta()

    def save(self):
        self.index.save(self.path)
        self.is_mutable = False
        self.save_meta()

    def load(self, memory=False):
        self.index.load(self.path, prefault=memory)
        self.is_mutable = False

    def unload(self):
        self.index.unload()

    def __del__(self):
        self.unload()

    def __setitem__(self, idx, vector):
        self.index.add_item(idx, vector)

    def __getitem__(self, idx):
        return self.index.get_item_vector(idx)

    def __len__(self):
        return self.index.get_n_items()

    def add(self, vector):
        idx = len(self)
        self[idx] = vector
        return idx

    def add_bulk(self, vectors):
        start = len(self)
        for n, v in enumerate(vectors):
            self[start + n] = v
        return self

    def set_bulk(self, indices, vectors):
        for idx, vector in zip(indices, vectors):
            self[idx] = vector

    def search(self, vector, num=10, depth=None, distances=True):
        return self.index.get_nns_by_vector(vector, num, depth or -1,
                                            distances)

    def search_index(self, idx, num=10, depth=None, distances=True):
        return self.index.get_nns_by_item(idx, num, depth or -1, distances)

    def distance(self, i, j):
        return self.index.get_distance(i, j)
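
# A hypothetical usage sketch for the wrapper above, kept as comments because the class
# relies on load_json/save_json helpers and a VectorIndex base class defined elsewhere
# in this module:
#
#   idx = Annoy('/tmp/demo.ann', dims=3)            # new on-disk index
#   idx.add_bulk([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
#   idx.build(num_trees=10)                         # freezes the index and writes meta
#   ids, dists = idx.search([1, 0, 0], num=2)       # nearest neighbours + distances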
Ejemplo n.º 53
0
        else:
            relations[w].add(rel)
for line in open('predicates_fw.tsv').readlines():
    line = line.strip().lower().split('\t')
    rel = line[0]
    label = [x for x in ' '.join(line[1:]).split(' ') if x not in stop]
    for w in label:
        if w not in relations:
            relations[w] = set([])
        else:
            relations[w].add(rel)
all_relation_words = set([])
all_relation_words.update(relations.keys())
word2vec_pretrain_embed = gensim.models.Word2Vec.load_word2vec_format(
    '/dccstor/cssblr/amrita/resources/glove/GoogleNews-vectors-negative300.bin',
    binary=True)
f = 300
index = AnnoyIndex(f, metric='euclidean')
index_desc = {}
count = 0
for word in all_relation_words:
    word = word
    if word in word2vec_pretrain_embed:
        embed = word2vec_pretrain_embed[word]
        index.add_item(count, embed)
        index_desc[count] = word
        count = count + 1
index.build(100)
index.save('annoy_index_noisy/glove_embedding_of_vocab.ann')
pkl.dump(index_desc, open('annoy_index_noisy/index2word.pkl', 'wb'))
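
# A query-side sketch for the index built above: look up the embedding of a query word
# (a placeholder here) and map the nearest items back to relation-vocabulary words via
# index_desc.
query_word = 'birthplace'   # placeholder query word
if query_word in word2vec_pretrain_embed:
    nn_ids = index.get_nns_by_vector(word2vec_pretrain_embed[query_word], 10)
    print([index_desc[i] for i in nn_ids])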
Ejemplo n.º 54
0
item_vectors = movielens['item_features'] * model.item_embeddings

# Now let's make an annoy index for item to item querying:

# In[93]:

from annoy import AnnoyIndex

f = item_vectors.shape[1]  # Length of item vector that will be indexed
t = AnnoyIndex(f)
for i in range(item_vectors.shape[0]):
    v = item_vectors[i]
    t.add_item(i, v)

t.build(10)  # 10 trees
t.save('movielens_item_Annoy_idx.ann')

# And query the index for similar movies:

# In[94]:


def nearest_movies_Annoy(movie_id, index, n=10, print_output=True):
    nn = index.get_nns_by_item(movie_id, n)
    if print_output == True:
        print('Closest to %s : \n' % movielens['item_labels'][movie_id])
    titles = [movielens['item_labels'][i] for i in nn]
    if print_output == True:
        print("\n".join(titles))
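
# A hypothetical call for the helper above; movie id 0 is a placeholder and t is the
# AnnoyIndex built earlier in this notebook.
nearest_movies_Annoy(0, t, n=10)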

Ejemplo n.º 55
0
def create_annoy(target_features):
    t = AnnoyIndex(layer_dimension)
    for idx, target_feature in enumerate(target_features):
        t.add_item(idx, target_feature)
    t.build(10)
    t.save(os.path.join(work_dir, 'annoy.ann'))
Ejemplo n.º 56
0
class SearchIndex():
    """The search index manages search indexes on disk

    This supports creating indexes and operations to save/load them to/from disk
    """
    def __init__(self):
        """Generates a new SearchIndex, used in Server Class

        The main purpose of this class is to generate an index without
        the Server class needing to know which search index is being used.

        A search index is ready to be used when an index exists
        and it is ready (i.e. the index has been built).
        """
        self.index = None
        self.ready = False

    def build_from_trained_model(self, trained_model, depth):
        """Creates an index from a trained model

        :param TrainedModel trained_model: The trained model
        :param int depth: The depth desired to generate the search index
        """
        entities_matrix = trained_model.E
        nrows, emb_size = entities_matrix.shape

        self.index = AnnoyIndex(emb_size)

        # Populate the search index with the trained embedding
        for row in range(0, nrows):
            vector = list(entities_matrix[row])
            self.index.add_item(row, vector)

        # Generate the index itself. This may take long time
        self.index.build(depth)

        # Index ready
        self.ready = True

    def save_to_binary(self, filepath):
        """Dump the search tree on a file on disk

        :param string filepath: The path where the file will be saved
        :return: If operations had or not errors
        :rtype: boolean
        """
        if self.index is None or self.ready is False:
            print("The index is not ready to be saved")
            return False

        self.index.save(filepath)
        return True

    def load_from_file(self, filepath, emb_size):
        """Load the search tree from a file on disk

        :param string filepath: The path where the file will be saved
        :param int emb_size: The size of embedding vector used
        :return: If operations had or not errors
        :rtype: boolean
        """
        self.index = AnnoyIndex(emb_size)
        self.index.load(filepath)
        self.ready = True
        return True
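
# A usage sketch, assuming a trained_model object exposing an E embedding matrix
# (as expected by build_from_trained_model above); file name and depth are illustrative.
search = SearchIndex()
search.build_from_trained_model(trained_model, depth=50)
if search.save_to_binary('entities.ann'):
    restored = SearchIndex()
    restored.load_from_file('entities.ann', emb_size=trained_model.E.shape[1])
    print(restored.index.get_nns_by_item(0, 10))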
Ejemplo n.º 57
0
logging.info('building index for %s' % (EMB_DIR % corpus))

aidx = AnnoyIndex(DIMENSIONS)
for f in os.listdir(EMB_DIR % corpus):

    logging.debug('indexing %s' % f)

    with open('%s/%s' % (EMB_DIR % corpus, f)) as embf:
        data = json.loads(embf.read())
        # print(repr(data))

        i = int(f.replace('.json', ''))

        aidx.add_item(i, data['emb'])

# for i in xrange(1000):
#     v = [random.gauss(0, 1) for z in xrange(f)]

logging.info('building %d trees' % NUM_TREES)
aidx.build(NUM_TREES)

aidx.save(INDEX_FN % corpus)
logging.debug('%s written.' % (INDEX_FN % corpus))

# # test index
#
# u = AnnoyIndex(f)
# u.load(INDEX_FN) # super fast, will just mmap the file
# print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors
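
# Since each item id is the numeric part of its embedding file name, neighbours map straight
# back to documents. A minimal query sketch, assuming DIMENSIONS, EMB_DIR, INDEX_FN and corpus
# as defined above, and a query embedding query_emb (hypothetical):
u = AnnoyIndex(DIMENSIONS)
u.load(INDEX_FN % corpus)  # mmaps the index file

ids, dists = u.get_nns_by_vector(query_emb, 10, include_distances=True)
for i, d in zip(ids, dists):
    logging.info('neighbour %s/%d.json (distance %.4f)' % (EMB_DIR % corpus, i, d))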
Ejemplo n.º 58
0
def process(args):
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_batch = args.batch(tokenizer, args.max_lens)
    train_batch.set_data(utils.read_lines(args.path['train_x']),
                         utils.read_lines(args.path['train_y']))
    dev_batch = args.batch(tokenizer, args.max_lens)
    dev_batch.set_data(utils.read_lines(args.path['dev_x']),
                       utils.read_lines(args.path['dev_y']))
    model = args.model(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_device
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(pad_step_number=True)
        recorder = Recorder()
        starter = time.time()

        for i in range(args.max_steps):
            input_x, input_y, idx, update_epoch = train_batch.next_batch(
                args.batch_size, recorder.train_idx)
            train_features = {
                'input_x_ph': input_x,
                'input_y_ph': input_y,
                'keep_prob_ph': args.keep_prob
            }
            recorder.train_idx = idx
            train_fetches, train_feed = model.train_step(train_features)
            _, train_loss, train_acc = sess.run(train_fetches, train_feed)
            recorder.train_losses.append(train_loss)
            recorder.train_accs.append(train_acc)

            if not i % args.show_steps and i:
                input_x, input_y, idx, update_epoch = dev_batch.next_batch(
                    args.batch_size, recorder.dev_idx)
                dev_features = {
                    'input_x_ph': input_x,
                    'input_y_ph': input_y,
                    'keep_prob_ph': 1.0
                }
                recorder.dev_idx = idx
                dev_fetches, dev_feed = model.dev_step(dev_features)
                dev_loss, dev_acc = sess.run(dev_fetches, dev_feed)
                recorder.dev_losses.append(dev_loss)
                recorder.dev_accs.append(dev_acc)
                speed = args.show_steps / (time.time() - starter)
                utils.verbose(
                    r'        step {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}] | speed {:.5f} it/s'.format(
                        i, train_loss, train_acc, dev_loss, dev_acc, speed))
                starter = time.time()

            if not i % args.save_steps and i:
                features = recorder.stats()
                if features['save']:
                    saver.save(sess, args.path['model'])
                utils.verbose(
                    r'step {:05d} - {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}]'.format(i - args.save_steps, i,
                                                  features['train_loss'],
                                                  features['train_acc'],
                                                  features['dev_loss'],
                                                  features['dev_acc']))
                print('-+' * 55)
                utils.write_result(args, recorder.lowest_loss)

        utils.verbose('Start building vector space from dual encoder model')
        vectors = []
        infer_batch = args.batch(tokenizer, args.max_lens)
        infer_batch.set_data(utils.read_lines(args.path['train_x']),
                             utils.read_lines(args.path['train_y']))
        starter = time.time()
        idx = 0
        update_epoch = False
        i = 0
        while not update_epoch:
            input_x, input_y, idx, update_epoch = infer_batch.next_batch(
                args.batch_size, idx)
            infer_features = {'input_x_ph': input_x, 'keep_prob_ph': 1.0}
            infer_fetches, infer_feed = model.infer_step(infer_features)
            enc_questions = sess.run(infer_fetches, infer_feed)
            vectors += enc_questions
            if not i % args.show_steps and i:
                speed = args.show_steps / (time.time() - starter)
                utils.verbose('step : {:05d} | speed: {:.5f} it/s'.format(
                    i, speed))
                starter = time.time()
            i += 1
    vectors = np.reshape(np.array(vectors),
                         [-1, args.hidden])[:infer_batch.data_size]
    vec_dim = vectors.shape[-1]
    ann = AnnoyIndex(vec_dim)
    for n, ii in enumerate(vectors):
        ann.add_item(n, ii)
    ann.build(args.num_trees)
    ann.save(args.path['ann'])
    utils.verbose('Annoy index has been dumped to {}'.format(args.path['ann']))
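
# A retrieval sketch under assumptions: args.hidden and args.path['ann'] as above, the batcher
# preserves the order of the train_x lines, and query_vec is a question encoding produced by
# the same infer_step pipeline (hypothetical).
ann = AnnoyIndex(args.hidden)
ann.load(args.path['ann'])
neighbour_ids = ann.get_nns_by_vector(query_vec, 10)
train_x = utils.read_lines(args.path['train_x'])
neighbour_questions = [train_x[i] for i in neighbour_ids]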
Ejemplo n.º 59
0
import csv
import itertools

from annoy import AnnoyIndex
from pandas import DataFrame

# document_embedding() and song_store (an HDFStore) are assumed to be defined elsewhere
songs = []
embedding_size = 300
t = AnnoyIndex(embedding_size, 'angular')

with open('lyrics.csv', encoding="utf8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    index = 0
    for row in itertools.islice(csv_reader, 0, 380000, 10):
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
            fields = row
        else:
            lyrics = row[5]
            if line_count == 1:
                testPhrase = lyrics
            if len(lyrics) > 0:
                embedding = document_embedding(lyrics)
                t.add_item(index, embedding)
                songs.append((row[1],row[3],row[5]))
                index += 1
            line_count += 1

    print(f'Processed {line_count} lines.')

t.build(10) # 10 trees
t.save('test.ann')
song_df = DataFrame(songs, columns = ['songtitle' , 'artist', 'lyrics'])
song_store.put('/mat', song_df)
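
# A minimal query sketch, reusing document_embedding() and testPhrase from above:
# find the songs whose lyrics embeddings are closest to the query phrase.
query_vec = document_embedding(testPhrase)
for i in t.get_nns_by_vector(query_vec, 5):
    title, artist, _ = songs[i]
    print(f'{title} - {artist}')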
Ejemplo n.º 60
0
        self.out = tf.squeeze(
            self.sess.graph.get_tensor_by_name('vgg_16/avgp5/AvgPool:0'))

    def feat1(self, image_path):
        img_data = np.expand_dims(np.array(open(image_path, 'rb').read()), 0)
        return self.sess.run(self.out, {self.img: img_data})

    def feat2(self, feat_string):
        img_data = np.expand_dims(np.array(feat_string), 0)
        return self.sess.run(self.out, {self.img: img_data})


names = np.load('data/name.npy')
if not os.path.exists('model/inshop.ann'):
    feats = np.load('data/feats.npy')
    t = AnnoyIndex(512)
    for i, a in enumerate(feats):
        t.add_item(i, a)
    t.build(200)
    t.save('model/inshop.ann')
else:
    t = AnnoyIndex(512)
    t.load('model/inshop.ann')

worker = Feature2()
str_ = open('./test.jpg', 'rb').read()
feat1 = worker.feat2(str_)
feat2 = np.load('rst2.npy')
print 'Extract Feature:', t.get_nns_by_vector(feat1, 20)
print 'Serving Feature:', t.get_nns_by_vector(feat2, 20)
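
# A sketch for mapping neighbour positions back to the file names loaded above
# (assumes data/name.npy is aligned with the row order of data/feats.npy).
neighbour_ids = t.get_nns_by_vector(feat1, 20)
print 'Nearest items:', [names[i] for i in neighbour_ids]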