def test_metric_kwarg(self):
    # Issue 211
    i = AnnoyIndex(2, metric='euclidean')
    i.add_item(0, [1, 0])
    i.add_item(1, [9, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 8)
    self.assertEqual(i.f, 2)
def test_dist(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    self.assertAlmostEqual(i.get_distance(0, 1), 1.0)
def test_basic_conversion(self):
    f = 100
    i = AnnoyIndex(f, 'hamming')
    u = numpy.random.binomial(1, 0.5, f)
    v = numpy.random.binomial(1, 0.5, f)
    i.add_item(0, u)
    i.add_item(1, v)
    u2 = i.get_item_vector(0)
    v2 = i.get_item_vector(1)
    self.assertAlmostEqual(numpy.dot(u - u2, u - u2), 0.0)
    self.assertAlmostEqual(numpy.dot(v - v2, v - v2), 0.0)
    self.assertAlmostEqual(i.get_distance(0, 0), 0.0)
    self.assertAlmostEqual(i.get_distance(1, 1), 0.0)
    self.assertAlmostEqual(i.get_distance(0, 1), numpy.dot(u - v, u - v))
    self.assertAlmostEqual(i.get_distance(1, 0), numpy.dot(u - v, u - v))
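# Aside (not from the test suite above): for 0/1 vectors, the squared Euclidean
# gap numpy.dot(u - v, u - v) equals the number of differing bits, which is why
# the 'hamming' assertions above are phrased that way. A minimal standalone
# sketch of the same identity:
import numpy
from annoy import AnnoyIndex

idx = AnnoyIndex(8, 'hamming')
u = numpy.array([0, 1, 1, 0, 1, 0, 0, 1])
v = numpy.array([1, 1, 0, 0, 1, 0, 1, 1])
idx.add_item(0, u)
idx.add_item(1, v)
assert idx.get_distance(0, 1) == numpy.sum(u != v)  # 3 differing bits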
def test_dist_degen(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [1, 0])
    i.add_item(1, [0, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 2.0 ** 0.5)
def test_dist_2(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [1000, 0])
    i.add_item(1, [10, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 0)
def test_dist(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    self.assertAlmostEqual(i.get_distance(0, 1), (2 * (1.0 - 2 ** -0.5)) ** 0.5)
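# Aside: the expected value in test_dist above is just the angular metric's
# definition, sqrt(2 * (1 - cos(u, v))), i.e. the Euclidean distance between
# the unit-normalized vectors. A quick illustrative check with numpy:
import numpy
from annoy import AnnoyIndex

idx = AnnoyIndex(2, 'angular')
idx.add_item(0, [0, 1])
idx.add_item(1, [1, 1])
cos = numpy.dot([0, 1], [1, 1]) / numpy.sqrt(2)  # cosine of the angle = 2**-0.5
assert abs(idx.get_distance(0, 1) - (2 * (1 - cos)) ** 0.5) < 1e-5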
def t1est_dist_2(self):  # 't1est_' prefix leaves this legacy test disabled
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 2
    # old AnnoyIndex constructor signature from an early version of the library
    i = AnnoyIndex(f, 2, "test_db", 64, 1000, 3048576000)
    i.add_item(0, [1000, 0])
    i.add_item(1, [10, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 0)
def test_dist(self):
    f = 2
    i = AnnoyIndex(f, 'manhattan')
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    i.add_item(2, [0, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 1.0)
    self.assertAlmostEqual(i.get_distance(1, 2), 2.0)
def test_dist_3(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [97, 0])
    i.add_item(1, [42, 42])
    dist = ((1 - 2 ** -0.5) ** 2 + (2 ** -0.5) ** 2) ** 0.5
    self.assertAlmostEqual(i.get_distance(0, 1), dist)
def test_dist_2(self):
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 2
    i = AnnoyIndex(f, 2, "test_db", 64, 1000, 3048576000, 0)
    i.add_item(0, [1000, 0])
    i.add_item(1, [10, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 0)
def generate_triplets(X, n_inliers, n_outliers, n_random,
                      distance="euclidean", weight_adj=500.0, verbose=True):
    distance_dict = {"euclidean": 0, "manhattan": 1, "angular": 2, "hamming": 3}
    distance_index = distance_dict[distance]
    n, dim = X.shape
    n_extra = min(n_inliers + 50, n)
    tree = AnnoyIndex(dim, metric=distance)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)
    nbrs = np.empty((n, n_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_extra), dtype=np.float32)
    for i in range(n):
        nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
        for j in range(n_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    if verbose:
        print("found nearest neighbors")
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)  # scale parameter
    P = find_p(knn_distances, sig, nbrs)
    triplets = sample_knn_triplets(P, nbrs, n_inliers, n_outliers)
    n_triplets = triplets.shape[0]
    outlier_distances = np.empty(n_triplets, dtype=np.float32)
    for t in range(n_triplets):
        outlier_distances[t] = calculate_dist(X[triplets[t, 0], :],
                                              X[triplets[t, 2], :], distance_index)
    weights = find_weights(triplets, P, nbrs, outlier_distances, sig)
    if n_random > 0:
        rand_triplets = sample_random_triplets(X, n_random, sig, distance_index)
        rand_weights = rand_triplets[:, -1]
        rand_triplets = rand_triplets[:, :-1].astype(np.int32)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, rand_weights))
    weights[np.isnan(weights)] = 0.0
    weights /= np.max(weights)
    weights += 0.0001
    if weight_adj:
        if not isinstance(weight_adj, (int, float)):
            weight_adj = 500.0
        weights = np.log(1 + weight_adj * weights)
        weights /= np.max(weights)
    return (triplets, weights)
def test_dist_3(self):
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 2
    i = AnnoyIndex(f, 2, "test_db", 64, 1000, 3048576000, 0)
    i.add_item(0, [97, 0])
    i.add_item(1, [42, 42])
    dist = (1 - 2 ** -0.5) ** 2 + (2 ** -0.5) ** 2
    self.assertAlmostEqual(i.get_distance(0, 1), dist)
def test_dist(self):
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 2
    i = AnnoyIndex(f, 2, "test_db", 64, 1000, 3048576000, 0)
    # i.verbose(True)
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    self.assertAlmostEqual(i.get_distance(0, 1), 2 * (1.0 - 2 ** -0.5))
def generate_triplets(X, n_inliers, n_outliers, n_random,
                      distance="euclidean", verbose=False, weight_temp=0.5):
    distance_dict = {"euclidean": 0, "manhattan": 1, "angular": 2, "hamming": 3}
    distance_index = distance_dict[distance]
    n, dim = X.shape
    n_extra = min(n_inliers + _NUM_EXTRA_KNN, n)
    tree = AnnoyIndex(dim, metric=distance)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(_NUM_TREES)
    nbrs = np.empty((n, n_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_extra), dtype=np.float32)
    for i in range(n):
        nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
        for j in range(n_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    if verbose:
        print("found nearest neighbors")
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)  # scale parameter
    P = find_p(knn_distances, sig, nbrs)
    triplets = sample_knn_triplets(P, nbrs, n_inliers, n_outliers)
    n_triplets = triplets.shape[0]
    outlier_distances = np.empty(n_triplets, dtype=np.float32)
    for t in range(n_triplets):
        outlier_distances[t] = calculate_dist(X[triplets[t, 0], :],
                                              X[triplets[t, 2], :], distance_index)
    weights = find_weights(triplets, P, nbrs, outlier_distances, sig)
    if n_random > 0:
        rand_triplets = sample_random_triplets(X, n_random, sig, distance_index)
        rand_weights = rand_triplets[:, -1]
        rand_triplets = rand_triplets[:, :-1].astype(np.int32)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, _RAND_WEIGHT_SCALE * rand_weights))
    weights[np.isnan(weights)] = 0.0
    weights -= np.min(weights)
    weights = tempered_log(1.0 + weights, weight_temp)
    return (triplets, weights)
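# Both generate_triplets variants above lean on project-local helpers (find_p,
# sample_knn_triplets, find_weights, calculate_dist, ...), so they are not
# runnable in isolation. The Annoy portion they share can be sketched on its
# own with toy data; the shapes and the get_distance loop mirror the code above:
import numpy as np
from annoy import AnnoyIndex

X = np.random.rand(500, 10).astype(np.float32)  # stand-in for the real input
n, dim = X.shape
n_extra = 15

tree = AnnoyIndex(dim, metric='euclidean')
for i in range(n):
    tree.add_item(i, X[i, :])
tree.build(20)

nbrs = np.empty((n, n_extra), dtype=np.int32)
knn_distances = np.empty((n, n_extra), dtype=np.float32)
for i in range(n):
    nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
    for j in range(n_extra):
        knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])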
def test_distance_consistency(self):
    n, f = 1000, 3
    i = AnnoyIndex(f, 'dot')
    for j in range(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(
                dist, numpy.dot(i.get_item_vector(a), i.get_item_vector(b)))
            self.assertAlmostEqual(dist, i.get_distance(a, b))
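# Aside: as the test above asserts, with the 'dot' metric get_distance reports
# the raw (possibly negative) dot product rather than a true distance, so
# "nearest" means the largest dot product. Minimal illustration:
from annoy import AnnoyIndex

idx = AnnoyIndex(3, 'dot')
idx.add_item(0, [1.0, 0.0, 0.0])
idx.add_item(1, [2.0, 0.0, 0.0])
idx.add_item(2, [-1.0, 0.0, 0.0])
idx.build(10)
assert abs(idx.get_distance(0, 1) - 2.0) < 1e-6  # dot([1,0,0], [2,0,0])
assert idx.get_nns_by_item(0, 3)[0] == 1         # largest dot product ranks first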
def do_annoy(model, texts, tokenizer, verbose):
    unique_text = []
    entity_idx = []
    entity2same = {}

    for i in range(len(texts['anchor'])):
        if not texts['anchor'][i] in entity2same:
            entity2same[texts['anchor'][i]] = []
            entity_idx.append(len(unique_text))
            unique_text.append(texts['anchor'][i])
        l = entity2same[texts['anchor'][i]]
        if texts['positive'][i] not in l:
            entity2same[texts['anchor'][i]].append(texts['positive'][i])
            unique_text.append(texts['positive'][i])

    print(entity2same)
    print(unique_text)

    sequences = tokenizer.texts_to_sequences(unique_text)
    sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(sequences)

    t = AnnoyIndex(len(predictions[0]),
                   metric='euclidean')  # Length of item vector that will be indexed
    for i in range(len(predictions)):
        v = predictions[i]
        t.add_item(i, v)
    t.build(100)  # 100 trees

    match = 0
    no_match = 0
    for index in entity_idx:
        nearest = t.get_nns_by_vector(predictions[index], 5)
        print(nearest)
        nearest_text = set([unique_text[i] for i in nearest])
        expected_text = set(entity2same[unique_text[index]])
        nearest_text.remove(unique_text[index])
        print("query={} names = {} true_match = {}".format(
            unique_text[index], nearest_text, expected_text))
        if verbose:
            print([t.get_distance(index, i) for i in nearest])
        overlap = expected_text.intersection(nearest_text)
        print(overlap)
        m = len(overlap)
        match += m
        no_match += len(expected_text) - m
    print("match: {} no_match: {}".format(match, no_match))
def test_distance_consistency(self):
    n, f = 1000, 3
    i = AnnoyIndex(f, 'manhattan')
    for j in range(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, i.get_distance(a, b))
            u = numpy.array(i.get_item_vector(a))
            v = numpy.array(i.get_item_vector(b))
            self.assertAlmostEqual(dist, numpy.sum(numpy.fabs(u - v)))
            self.assertAlmostEqual(dist, sum([abs(float(x) - float(y)) for x, y in zip(u, v)]))
def t1est_dist(self):  # 't1est_' prefix leaves this legacy test disabled
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 2
    # old AnnoyIndex constructor signature from an early version of the library
    i = AnnoyIndex(f, 2, "test_db", 64, 1000, 3048576000)
    print("creating object")
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    print("creating object")
    self.assertAlmostEqual(i.get_distance(0, 1), 2 * (1.0 - 2 ** -0.5))
    print("done")
class KNN(object):
    def __init__(self, label: Label, model: Model):
        self.tree = AnnoyIndex(SIZE[model], "angular")
        self.tree.load(TREE[model][label])

    def nearest_index(self, y_pred: np.ndarray) -> int:
        return self.tree.get_nns_by_vector(vector=y_pred.tolist(), n=1)[0]

    def nearest(self, y_pred: np.ndarray) -> np.ndarray:
        index = self.nearest_index(y_pred=y_pred)
        return np.asarray(self.tree.get_item_vector(index))

    def distance(self, left_index: int, right_index: int) -> float:
        return self.tree.get_distance(left_index, right_index)
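# Hypothetical usage of the KNN wrapper above. Label, Model, SIZE and TREE are
# project-local lookup tables (a label/model pair mapped to the vector
# dimensionality and a saved .ann path), so the member names below are
# illustrative only:
import numpy as np

knn = KNN(label=Label.TRAIN, model=Model.BASELINE)  # assumed enum members
y_pred = np.random.rand(SIZE[Model.BASELINE]).astype(np.float32)
print(knn.nearest_index(y_pred))  # id of the closest indexed vector
print(knn.nearest(y_pred))        # the closest indexed vector itself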
def test_dist(self):
    print("test_dist")
    f = 2
    print("creating object")
    i = AnnoyIndex(f, 2, 64, "test_db", 100, 3048576000)
    print("creating object")
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    print("creating object")
    self.assertAlmostEqual(i.get_distance(0, 1), 2 * (1.0 - 2 ** -0.5))
    print("done")
def test_distance_consistency(self):
    n, f = 1000, 3
    i = AnnoyIndex(f, 'dot')
    for j in range(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, numpy.dot(
                i.get_item_vector(a), i.get_item_vector(b)))
            self.assertEqual(dist, i.get_distance(a, b))
def test_distance_consistency(self):
    n, f = 1000, 3
    i = AnnoyIndex(f, 'euclidean')
    for j in range(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, i.get_distance(a, b))
            u = numpy.array(i.get_item_vector(a))
            v = numpy.array(i.get_item_vector(b))
            # self.assertAlmostEqual(dist, euclidean(u, v))
            self.assertAlmostEqual(dist, numpy.dot(u - v, u - v) ** 0.5)
            self.assertAlmostEqual(dist, sum([(x - y) ** 2 for x, y in zip(u, v)]) ** 0.5)
def test_distance_consistency(self):
    n, f = 1000, 3
    i = AnnoyIndex(f)
    for j in range(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, i.get_distance(a, b))
            u = i.get_item_vector(a)
            v = i.get_item_vector(b)
            u_norm = numpy.array(u) * numpy.dot(u, u) ** -0.5
            v_norm = numpy.array(v) * numpy.dot(v, v) ** -0.5
            # cos = numpy.clip(1 - cosine(u, v), -1, 1)  # scipy returns 1 - cos
            self.assertAlmostEqual(dist, numpy.dot(u_norm - v_norm, u_norm - v_norm) ** 0.5)
            # self.assertAlmostEqual(dist, (2 * (1 - cos)) ** 0.5)
            self.assertAlmostEqual(dist, sum([(x - y) ** 2 for x, y in zip(u_norm, v_norm)]) ** 0.5)
def generate_pair(X, n_neighbors, n_MN, n_FP, distance='euclidean', verbose=True):
    '''Generate pairs for the dataset.
    '''
    n, dim = X.shape
    # sample more neighbors than needed
    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    tree = AnnoyIndex(dim, metric=distance)
    if _RANDOM_STATE is not None:
        tree.set_seed(_RANDOM_STATE)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)

    option = distance_to_option(distance=distance)

    nbrs = np.zeros((n, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_neighbors_extra), dtype=np.float32)

    for i in range(n):
        nbrs_ = tree.get_nns_by_item(i, n_neighbors_extra + 1)
        nbrs[i, :] = nbrs_[1:]
        for j in range(n_neighbors_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    print_verbose("Found nearest neighbor", verbose)
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    print_verbose("Calculated sigma", verbose)
    scaled_dist = scale_dist(knn_distances, sig, nbrs)
    print_verbose("Found scaled dist", verbose)
    pair_neighbors = sample_neighbors_pair(X, scaled_dist, nbrs, n_neighbors)
    if _RANDOM_STATE is None:
        pair_MN = sample_MN_pair(X, n_MN, option)
        pair_FP = sample_FP_pair(X, pair_neighbors, n_neighbors, n_FP)
    else:
        pair_MN = sample_MN_pair_deterministic(X, n_MN, _RANDOM_STATE, option)
        pair_FP = sample_FP_pair_deterministic(X, pair_neighbors, n_neighbors,
                                               n_FP, _RANDOM_STATE)
    return pair_neighbors, pair_MN, pair_FP, tree
def KNN_Annoy(X, KK):
    NK = KK
    NN, NF = X.shape
    if KK > NF:
        raise ValueError("KK must not exceed the second dimension of X")
    t = AnnoyIndex(NF, metric='euclidean')
    for i, v in enumerate(X):
        t.add_item(i, v)
    t.build(100)
    ind = []
    val = []
    for i in range(NN):
        closest = t.get_nns_by_item(i, NK)
        ind.append(closest)
        val.append([t.get_distance(i, j) for j in closest])
    return np.array(ind), np.array(val)
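# Illustrative smoke test for KNN_Annoy on random data (assumes the snippet's
# own numpy/annoy imports are in scope):
import numpy as np

X = np.random.rand(100, 16)
ind, val = KNN_Annoy(X, 5)
print(ind.shape, val.shape)  # (100, 5) and (100, 5)
print(val[0])                # get_nns_by_item includes the query point, so val[0][0] == 0.0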
def _random_nn(self, X):  # uses self.n_clusters, so it must be a method
    idx = AnnoyIndex(X.shape[1], 'euclidean')
    for i in range(X.shape[0]):
        idx.add_item(i, X[i])
    logging.info("building an index with %d items" % X.shape[0])
    idx.build(50)
    logging.info("finding %d neighbor groups" % self.n_clusters)
    seen = {}
    label = 0
    guess = np.random.randint(X.shape[0])
    centers = {guess: 0}
    while label < self.n_clusters:
        neighbors = idx.get_nns_by_item(guess, _get_num_neighbors())
        for point in neighbors:
            seen[point] = label
        seen[guess] = label
        # find a distant point
        dists = np.array([[idx.get_distance(i, j) for i in centers]
                          for j in range(X.shape[0])])
        avg_dists = np.average(dists, axis=1)
        dist_prob = softmax(avg_dists)
        guess = np.random.choice(X.shape[0], p=dist_prob)
        while guess in seen:
            guess = np.random.choice(X.shape[0], p=dist_prob)
        centers[guess] = label
        label = label + 1
    y = np.zeros(X.shape[0])
    for k, v in seen.items():
        y[k] = v
    return y
def find_nearest(self):
    ann = AnnoyIndex(num_merchants)
    for customer in self.customers:
        customer_vector = list(matrix.loc[[customer]])
        ann.add_item(customer, customer_vector)
        if customer % 200 == 0:
            print('Adding ' + str(customer))
    print("Building")
    if len(self.merchantIDs) > max_trees:
        ann.build(max_trees)
    else:
        ann.build(len(self.merchantIDs))
    print("...done")
    for customer in self.customers:
        neighbors = ann.get_nns_by_item(customer, num_neighbors)
        if customer % 200 == 0:
            print("Found neighbors for " + str(customer))
        self.nearest[customer] = []
        for neighbor in neighbors:
            if neighbor != customer:
                self.nearest[customer].append(
                    (neighbor, ann.get_distance(neighbor, customer)))
def ann_annoy(data, metric='euclidean', n_neighbors=10, trees=10):
    """My Approximate Nearest Neighbors function (ANN)
    using the annoy package.

    Parameters
    ----------
    data : array, shape (n_samples, n_features)
        points to index
    metric : str, default 'euclidean'
        distance measure passed to Annoy
    n_neighbors : int, default 10
        number of neighbors to return per point
    trees : int, default 10
        number of trees used to build the index

    Returns
    -------
    distVals : array, shape (n_samples, n_neighbors)
        distances to each neighbor
    idx : array, shape (n_samples, n_neighbors)
        indices of each neighbor
    """
    datapoints = data.shape[0]
    dimension = data.shape[1]

    # initialize the annoy database with the requested metric
    ann = AnnoyIndex(dimension, metric)

    # store the datapoints
    for (i, row) in enumerate(data):
        ann.add_item(i, row.tolist())

    # build the index
    ann.build(trees)

    # find the k-nearest neighbors for all points
    idx = np.zeros((datapoints, n_neighbors), dtype='int')
    distVals = idx.copy().astype(float)

    # extract the distance values
    for i in range(0, datapoints):
        idx[i, :] = ann.get_nns_by_item(i, n_neighbors)
        for j in range(0, n_neighbors):
            distVals[i, j] = ann.get_distance(i, idx[i, j])
    return distVals, idx
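# Illustrative call for ann_annoy on random data (note the return order:
# distances first, then neighbor indices; assumes the snippet's own
# numpy/annoy imports are in scope):
import numpy as np

data = np.random.rand(200, 8)
distVals, idx = ann_annoy(data, n_neighbors=5, trees=10)
print(idx[0], distVals[0])  # ids and distances of the 5 nearest to point 0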
def generate_triplets_from_ANN(model, sequences, entity2unique, entity2same, unique_text, test):
    predictions = model.predict(sequences)
    t = AnnoyIndex(len(predictions[0]), metric='euclidean')  # Length of item vector that will be indexed
    t.set_seed(123)
    for i in range(len(predictions)):
        # print(predictions[i])
        v = predictions[i]
        t.add_item(i, v)

    t.build(100)  # 100 trees

    match = 0
    no_match = 0
    ann_accuracy = 0
    total = 0
    triplets = {}

    pos_distances = []
    neg_distances = []

    triplets['anchor'] = []
    triplets['positive'] = []
    triplets['negative'] = []

    if test:
        NNlen = TEST_NEIGHBOR_LEN
    else:
        NNlen = TRAIN_NEIGHBOR_LEN

    for key in entity2same:
        index = entity2unique[key]
        nearest = t.get_nns_by_vector(predictions[index], NNlen)
        nearest_text = set([unique_text[i] for i in nearest])
        expected_text = set(entity2same[key])
        # annoy has this annoying habit of returning the queried item back as a
        # nearest neighbor. Remove it.
        if key in nearest_text:
            nearest_text.remove(key)
        # print("query={} names = {} true_match = {}".format(unique_text[index], nearest_text, expected_text))
        overlap = expected_text.intersection(nearest_text)
        # collect up some statistics on how well we did on the match
        m = len(overlap)
        match += m
        # since we asked for only x nearest neighbors, and we get at most x-1
        # neighbors that are not the same as key (!), make sure we adjust our
        # estimate of no match appropriately
        no_match += min(len(expected_text), NNlen - 1) - m

        # sample only the negatives that are true negatives,
        # that is, they are not in the expected set - sampling only
        # 'semi-hard negatives' is not defined here
        # positives = expected_text - nearest_text
        positives = overlap
        negatives = nearest_text - expected_text

        # print(key + str(expected_text) + str(nearest_text))
        for i in negatives:
            for j in positives:
                dist_pos = t.get_distance(index, entity2unique[j])
                pos_distances.append(dist_pos)
                dist_neg = t.get_distance(index, entity2unique[i])
                neg_distances.append(dist_neg)
                if dist_pos < dist_neg:
                    ann_accuracy += 1
                total += 1
                # print(key + "|" + j + "|" + i)
                # print(dist_pos)
                # print(dist_neg)

        for i in negatives:
            for j in expected_text:
                triplets['anchor'].append(key)
                triplets['positive'].append(j)
                triplets['negative'].append(i)

    print("mean positive distance:" + str(statistics.mean(pos_distances)))
    print("stdev positive distance:" + str(statistics.stdev(pos_distances)))
    print("max positive distance:" + str(max(pos_distances)))
    print("mean neg distance:" + str(statistics.mean(neg_distances)))
    print("stdev neg distance:" + str(statistics.stdev(neg_distances)))
    print("max neg distance:" + str(max(neg_distances)))
    print("Accuracy in the ANN for triplets that obey the distance func:" + str(ann_accuracy / total))

    obj = {}
    obj['accuracy'] = ann_accuracy / total
    obj['steps'] = 1
    with open(output_file_name_for_hpo, 'w') as out:
        json.dump(obj, out)

    if test:
        return match / (match + no_match)
    else:
        return triplets, match / (match + no_match)
tc_index.add_item(i, v)
i += 1

tc_index.build(10)
# save this index to disk
tc_index.save(r'D:\Data_Science\data\Tencent_AILab_ChineseEmbedding\tc_index_build10.index')

# loading it back later:
with open(r'D:\Data_Science\data\Tencent_AILab_ChineseEmbedding\tc_word_index.json', 'r') as fp:
    word_index = json.load(fp)

tc_index = AnnoyIndex(200, metric='angular')
tc_index.load(r'D:\Data_Science\data\Tencent_AILab_ChineseEmbedding\tc_index_build10.index')

# reverse id ==> word lookup table
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# get_nns_by_item asks annoy for the 10 vectors nearest to a word;
# the result is a list of item indices
for item in tc_index.get_nns_by_item(word_index[u'卖空'], 10):
    print(reverse_word_index[item])  # look up the word for each index

print(tc_index.get_distance(word_index['卫生'], word_index['风险']))  # the value computed is the squared distance
# https://github.com/spotify/annoy
imdb_id_gf1 = '0068646'
imdb_id_gf2 = '0071562'
imdb_id_ts3 = '0435761'
imdb_id_fpr = '0120815'
imdb_id_fn = '0266543'

# find 10 closest matches
for ann_index in t.get_nns_by_item(imdb_to_index[imdb_id_ts3], 10):
    imdb_id = index_to_imdb[ann_index]
    movie_title = cdb.get(str(imdb_id))['title']
    print(movie_title, t.get_item_vector(ann_index))

# distances between some movies (imdb_id_sr is assumed to be defined earlier in the original script)
print("GF1 <-> GF2", t.get_distance(imdb_to_index[imdb_id_gf1], imdb_to_index[imdb_id_gf2]))
print("GF1 <-> SR", t.get_distance(imdb_to_index[imdb_id_gf1], imdb_to_index[imdb_id_sr]))
print("GF2 <-> SR", t.get_distance(imdb_to_index[imdb_id_gf2], imdb_to_index[imdb_id_sr]))
print("GF1 <-> TS3", t.get_distance(imdb_to_index[imdb_id_gf1], imdb_to_index[imdb_id_ts3]))
print("FPR <-> FN", t.get_distance(imdb_to_index[imdb_id_fpr], imdb_to_index[imdb_id_fn]))

with open('closest_matches.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # find the closest match for each movie
    for imdb_id in imdb_to_index:
        closest_match = t.get_nns_by_item(imdb_to_index[imdb_id], 2)[1]
        distance = t.get_distance(imdb_to_index[imdb_id], closest_match)
        movie1_name = cdb.get(str(imdb_id))['title']
        movie2_name = cdb.get(str(index_to_imdb[closest_match]))['title']
        writer.writerow([imdb_id, movie1_name, index_to_imdb[closest_match],
                         movie2_name, distance])
# ...
all_texts = np.concatenate((texts1, texts2))
match = 0
no_match = 0

print("shape of annoy data1")
print(annoy_data1[0].shape)
print(tr_pairs[:, 0].shape)

for index in range(len(texts1)):
    nearest = t.get_nns_by_vector(mid_predictions[index], 2)
    print("query={} names = {} true_match = {}".format(
        texts1[index], [all_texts[i] for i in nearest], texts2[index]))
    for i in nearest:
        print(t.get_distance(index, i))
        print(model.predict([np.array([annoy_data1[index]]),
                             np.array([annoy_data2[i - len(annoy_data1)]])]))
    print(t.get_distance(index, index + len(texts1)))
    print(model.predict([np.array([annoy_data1[index]]),
                         np.array([annoy_data2[index]])]))
    if (index + len(texts1)) in nearest:
        match += 1
    else:
        no_match += 1

print("match: {} no_match: {}".format(match, no_match))
print("Machine Learning Accuracy")
print(tr_acc)
print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
t = AnnoyIndex(100, 'angular')

if __name__ == '__main__':
    counter = 0
    for batch in tqdm(dataloader):
        for _, vec in enumerate(batch):
            t.add_item(counter, vec)
            counter += 1

    # build the tree
    t.build(100)

    # save the tree
    t.save('./trees/test.tree')

    start = timer()
    print(t.get_nns_by_item(0, 10))
    end = timer()
    print('before filter->', end - start)

    start = timer()
    print([i for i in t.get_nns_by_item(0, 100)
           if round(t.get_distance(0, i), 2) == 0.0])
    end = timer()
    print('after filter->', end - start)

    # query using a vector
    # print(t.get_nns_by_vector(..., 10))
class AnnoyModel(object):
    def __init__(self, metric_name, n_trees=10, distance_type='angular', load_existing=False):
        """
        Args:
            - metric_name: the name of the metric that vectors in the index will
              represent, a string.
            - n_trees: the number of trees used in building the index, a
              positive integer.
            - distance_type: distance measure, a string. Possibilities are
              "angular", "euclidean", "manhattan", "hamming", or "dot".
            - load_existing: if load_existing is True, then load function will
              be called upon initialization.
        """
        # Check params
        self.parse_initial_params(metric_name, n_trees, distance_type)
        self.dimensionality = db.similarity.get_metric_dimensionality(self.metric_name)
        self.index = AnnoyIndex(self.dimensionality, metric=self.distance_type)
        # in_loaded_state set to True if the index is built, loaded, or saved.
        # At any of these points, items can no longer be added to the index.
        self.in_loaded_state = False
        if load_existing:
            self.load()

    def parse_initial_params(self, metric_name, n_trees, distance_type):
        # Validate the index parameters passed to AnnoyModel.
        if metric_name in BASE_INDICES:
            self.metric_name = metric_name
        else:
            raise similarity.exceptions.IndexNotFoundException(
                'Index for specified metric is not possible.')

        if distance_type in BASE_INDICES[self.metric_name]:
            self.distance_type = distance_type
        else:
            raise similarity.exceptions.IndexNotFoundException(
                'Index for specified distance_type is not possible.')

        if n_trees in BASE_INDICES[self.metric_name][self.distance_type]:
            self.n_trees = n_trees
        else:
            raise similarity.exceptions.IndexNotFoundException(
                'Index for specified number of trees is not possible.')

    def build(self):
        """Build and load the index using the specified number of trees.
        An index must be built before it can be queried."""
        n_jobs = current_app.config['SIMILARITY_BUILD_NUM_JOBS']
        self.index.build(n_trees=self.n_trees, n_jobs=n_jobs)
        self.in_loaded_state = True

    def save(self, location=None, name=None):
        # Save and load the index using the metric name.
        if not self.in_loaded_state:
            raise similarity.exceptions.LoadStateException(
                'Index must be built before saving.')
        if not location:
            location = current_app.config['SIMILARITY_INDEX_DIR']
        try:
            os.makedirs(location)
        except OSError:
            if not os.path.isdir(location):
                raise
        name = '_'.join([name or self.metric_name, self.distance_type, str(self.n_trees)]) + '.ann'
        file_path = os.path.join(location, name)
        self.index.save(file_path)

    def load(self, name=None):
        """
        Args:
            name: name of the metric that should be loaded. If None, it will
            use the metric specified when initializing the index.
        Raises:
            IndexNotFoundException: if there is no saved index with the given parameters.
        """
        # Load and build an existing annoy index.
        file_path = current_app.config['SIMILARITY_INDEX_DIR']
        name = '_'.join([name or self.metric_name, self.distance_type, str(self.n_trees)]) + '.ann'
        full_path = os.path.join(file_path, name)
        try:
            self.index.load(full_path)
            self.in_loaded_state = True
        except IOError:
            raise similarity.exceptions.IndexNotFoundException

    def add_recording_by_mbid(self, mbid, offset):
        """Add a single recording specified by (mbid, offset) to the index.
        Note that when adding a single recording, space is allocated for the
        lowlevel.id + 1 items.
""" if self.in_loaded_state: raise similarity.exceptions.CannotAddItemException( "Item cannot be added once index is in load state.") item = db.similarity.get_similarity_by_mbid(mbid, offset) if item: recording_vector = item[self.metric_name] id = item['id'] # If an item already exists, this should not error # and we should not add the item. try: self.index.get_item_vector(id) except IndexError: self.index.add_item(id, recording_vector) def add_recording_by_id(self, id): """Add a single recording specified by its lowlevel.id to the index. Note that when adding a single recording, space is allocated for lowlevel.id + 1 items. """ if self.in_loaded_state: raise similarity.exceptions.CannotAddItemException( "Item cannot be added once index is in load state.") item = db.similarity.get_similarity_row_id(id) if item: # If an item already exists, this should not error # and we should not add the item. try: self.index.get_item_vector(id) except IndexError: self.index.add_item(item["id"], item[self.metric_name]) def add_recording_with_vector(self, id, vector): """Add a single recording to the index using its lowlevel.id and a precomputed metric vector. Args: id: non-negative integer lowlevel.id for a recording. vector: metric vector (list) corresponding to the lowlevel.id. Dimensionality of the vector must match the dimensionality with which the index is initialized *NOTE*: Annoy will allocate memory for max(n) + 1 items. """ if self.in_loaded_state: raise similarity.exceptions.CannotAddItemException( "Item cannot be added once index is in load state.") if not len(vector) == self.dimensionality: raise similarity.exceptions.CannotAddItemException( "Dimensionality of vector provided does not match index dimensionality." ) self.index.add_item(id, vector) def get_nns_by_id(self, id, num_neighbours): """Get the most similar recordings for a recording with the specified id. Args: id: non-negative integer lowlevel.id for a recording. num_neighbours: positive integer, number of similar recordings to be returned in the query. Returns: A list of the form [<lowlevel.ids>, <recordings>, <distances>] where <lowlevel.ids> is a list of lowlevel.ids [id_1, ..., id_n], <recordings> is a list of tuples (MBID, offset), and <distances> is a list of distances, corresponding to each similar recording. """ try: items = self.index.get_nns_by_item(id, num_neighbours, include_distances=True) except IndexError: raise similarity.exceptions.ItemNotFoundException( 'The item you are requesting is not indexed.') # Unpack to get ids and distances ids = items[0] distances = items[1] recordings = db.data.get_mbids_by_ids(ids) return ids, recordings, distances def get_nns_by_mbid(self, mbid, offset, num_neighbours): # Find corresponding lowlevel.id to (mbid, offset) combination, # then call get_nns_by_id lookup = self.get_bulk_nns_by_mbid([(mbid, offset)], num_neighbours) return lookup.get(mbid, {}).get(str(offset), []) def get_bulk_nns_by_mbid(self, recordings, num_neighbours): """Get most similar recordings for each (MBID, offset) tuple provided. Similar recordings list returned is ordered with the most similar at index 0. Arguments: recordings: a list of tuples of form (MBID, offset), for which similar recordings will be found num_neighbours (int): the number of similar recordings desired for each recording specified. Returns: a dictionary of the mbids and offsets given in the recordings parameter. 
            Each item is a list of dictionaries, containing the keys
            recording_mbid, offset, and distance:

                {"mbid1": {"offset1": [{"recording_mbid": MBID, "offset": offset, "distance": distance}, ...],
                           ...,
                           "offsetn": [{"recording_mbid": MBID, "offset": offset, "distance": distance}, ...]},
                 ...,
                 "mbidn": {"offset1": [{"recording_mbid": MBID, "offset": offset, "distance": distance}, ...],
                           ...,
                           "offsetn": [{"recording_mbid": MBID, "offset": offset, "distance": distance}, ...]}}
        """
        recordings_info = defaultdict(dict)
        ids = db.data.get_ids_by_mbids(recordings)
        for recording_id, (mbid, offset) in zip(ids, recordings):
            try:
                ids, similar_recordings, distances = self.get_nns_by_id(
                    recording_id, num_neighbours)
                data = []
                for recording, distance in zip(similar_recordings, distances):
                    data.append({
                        'recording_mbid': recording[0],
                        'offset': recording[1],
                        'distance': distance
                    })
                recordings_info[mbid][str(offset)] = data
            except (similarity.exceptions.ItemNotFoundException,
                    db.exceptions.NoDataFoundException):
                continue

        return recordings_info

    def get_similarity_between(self, rec_one, rec_two):
        """Get the distance of the similarity measure between two recordings.

        Args:
            rec_one and rec_two are tuples of the form (MBID, offset).

        Returns:
            Distance between two recordings, of type float.
            If an IndexError occurs (one or more of ids is not indexed)
            then None is returned.
        """
        id_1, id_2 = db.data.get_ids_by_mbids([rec_one, rec_two])
        if id_1 is None or id_2 is None:
            return None
        try:
            return self.index.get_distance(id_1, id_2)
        except IndexError:
            return None
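# Sketch of the intended AnnoyModel lifecycle, assuming a Flask app context and
# the project-local db/similarity modules the class imports from; the metric
# name and vectors below are illustrative only:
model = AnnoyModel('mfccs', n_trees=10, distance_type='angular')
model.add_recording_with_vector(0, [0.1] * model.dimensionality)
model.add_recording_with_vector(1, [0.2] * model.dimensionality)
model.build()  # reads SIMILARITY_BUILD_NUM_JOBS from the app config
model.save()   # writes e.g. mfccs_angular_10.ann under SIMILARITY_INDEX_DIR
ids, recordings, distances = model.get_nns_by_id(0, num_neighbours=2)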
class WordVecSpaceAnnoy(WordVecSpaceDisk):

    N_TREES = 1
    METRIC = "angular"
    ANN_FILE = "vectors.ann"

    def __init__(self, input_dir, n_trees=N_TREES, metric=METRIC, index_fpath=None):
        super().__init__(input_dir)
        self.ann = AnnoyIndex(self.dim, metric=metric)

        self.ann_file = self.J(input_dir, self.ANN_FILE)
        if index_fpath:
            self.ann_file = self.J(index_fpath, self.ANN_FILE)

        self._create_annoy_file(n_trees)
        self.ann.load(self.ann_file)

    def J(self, p1, p2):
        return os.path.join(p1, p2)

    def _create_annoy_file(self, n_trees):
        for i in range(self.nvecs):
            v = self.vecs[i]
            self.ann.add_item(i, v)
        self.ann.build(n_trees)
        self.ann.save(self.ann_file)

    def get_distance(self,
                     word_or_index1: Union[int, str, np.ndarray],
                     word_or_index2: Union[int, str, np.ndarray]):
        v1 = self._check_index_or_word(word_or_index1)
        v2 = self._check_index_or_word(word_or_index2)
        return self.ann.get_distance(v1, v2)

    def get_distances(self,
                      row_words_or_indices: Union[int, str, np.ndarray],
                      col_words_or_indices: Union[int, str, np.ndarray, None] = None):
        r = row_words_or_indices
        c = col_words_or_indices

        if not isinstance(r, (list, tuple, np.ndarray)):
            r = [r]

        if c:
            if not isinstance(c, (list, tuple, np.ndarray)):
                c = [c]

            mat = self._make_array(shape=(len(r), len(c)), dtype=np.float32)

            for i, row_word in enumerate(r):
                dist = []
                for col_word in c:
                    dist.append(self.get_distance(row_word, col_word))
                mat[i] = np.asarray(dist, dtype=np.float32)
        else:
            mat = self._make_array(shape=(len(r), self.nvecs), dtype=np.float32)
            dist = {}

            for i, row_word in enumerate(r):
                index = self._check_index_or_word(row_word)
                key, val = self.ann.get_nns_by_item(index, self.nvecs,
                                                    include_distances=True)
                for k, v in zip(key, val):
                    dist[k] = v
                mat[i] = np.asarray(
                    [dist[key] for key in sorted(dist.keys(), reverse=False)],
                    dtype=np.float32)

        return mat

    DEFAULT_K = 512

    def get_nearest(self,
                    v_w_i: Union[int, str, np.ndarray],
                    k: int = DEFAULT_K,
                    combination: bool = False):
        if isinstance(v_w_i, (tuple, list)):
            res = []
            for word in v_w_i:
                index = self._check_index_or_word(word)
                if index:
                    res.append(self.ann.get_nns_by_item(index, k))  # the k nearest neighbors

            # find common nearest neighbors among the given words
            if combination and len(v_w_i) > 1:
                return list(set(res[0]).intersection(*res))
            return res

        index = self._check_index_or_word(v_w_i)
        return self.ann.get_nns_by_item(index, k)
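# Hypothetical usage, assuming input_dir holds vectors in the WordVecSpaceDisk
# layout this class builds on (the path and words below are illustrative):
wv = WordVecSpaceAnnoy('/path/to/vecspace', n_trees=10)
print(wv.get_distance('king', 'queen'))  # angular distance between two words
print(wv.get_nearest('king', k=5))       # ids of the 5 nearest words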
class Recommender:
    def __init__(self):
        # connect to couchdb
        couch = couchdb.Server()
        couch.resource.credentials = ("imdb", "imdb")
        cdb = couch["filmdings3"]

        # fetch all sentiments
        all_sentiments = cdb.view("filmdings_design/sentiments_for_id")

        self.imdb_to_index = {}
        self.index_to_imdb = {}

        f = 10  # dimensions (= different sentiments)
        self.t = AnnoyIndex(f, metric="angular")
        for i in range(len(all_sentiments)):
            row = all_sentiments.rows[i]
            imdb_id = row.key
            s = row.value
            v = [
                s["anger"], s["anticipation"], s["disgust"], s["fear"], s["joy"],
                s["negative"], s["positive"], s["sadness"], s["surprise"], s["trust"],
            ]
            self.t.add_item(i, v)
            # we have to keep track which index belongs to which imdb id (and vice versa):
            self.imdb_to_index[imdb_id] = i
            self.index_to_imdb[i] = imdb_id
        self.t.build(100)  # number of trees (higher accuracy, but needs more memory)

    def recommend_for_filter(self, s):
        v = [int(s[0]), int(s[1]), int(s[2]), int(s[3]), int(s[4]),
             0, 0, int(s[5]), int(s[6]), int(s[7])]
        print(v)
        movie_ids = []
        for ann_index in self.t.get_nns_by_vector(v, 500):
            imdb_id = self.index_to_imdb[ann_index]
            movie_ids.append(imdb_id)
        return movie_ids

    def recommend(self, movie_answers):
        votes_per_movie = {}
        movies_to_recommend = {}
        for movie in movie_answers:
            print("checking for", movie, "answer=", movie_answers[movie])
            for ann_index in self.t.get_nns_by_item(self.imdb_to_index[movie], 1000):
                imdb_id = self.index_to_imdb[ann_index]
                if imdb_id not in movies_to_recommend:
                    movies_to_recommend[imdb_id] = {}
                    movies_to_recommend[imdb_id]["votes"] = 0
                    movies_to_recommend[imdb_id]["distances"] = []
                    movies_to_recommend[imdb_id]["matched_by"] = []
                if movie_answers[movie] == "yes":
                    movies_to_recommend[imdb_id]["votes"] += 1
                    movies_to_recommend[imdb_id]["matched_by"].append(movie)
                else:
                    movies_to_recommend[imdb_id]["votes"] -= 1
                distance = self.t.get_distance(ann_index, self.imdb_to_index[movie])
                movies_to_recommend[imdb_id]["distances"].append(distance)

        # sort movies by number of votes and average distance
        dicts = list(movies_to_recommend.items())
        dicts.sort(
            key=lambda kd: (kd[1]["votes"],
                            1 / (sum(kd[1]["distances"]) / float(len(kd[1]["distances"])))),
            reverse=True,
        )

        best_movie_ids = []
        for tupel in dicts[0:50]:
            movie_id = tupel[0]
            # only return movies that are not in movie_answers (user has already
            # rated them, would be stupid to recommend them again)
            if movie_id not in movie_answers:
                best_movie_ids.append(movie_id)
        print(best_movie_ids)
        return best_movie_ids
class W2V_ANN(Model):
    def __init__(self, config):
        self.requirement = [
            'test_file', 'lastN', 'topN', 'type', 'item_vec_file',
            'index_file_file'
        ]
        self.config = config
        miss = set()
        for item in self.requirement:
            if item not in self.config:
                miss.add(item)
        if len(miss) > 0:
            raise Exception(f"Miss the key : {miss}")

        Model.__init__(self, self.config['test_file'], self.config['lastN'],
                       self.config['topN'])
        self.type = config['type']  # behavior / item
        self.action_w = {'ViewContent': 1.0, 'AddToCart': 3.0, 'revenue': 6.0}

    def train(self):
        b_time = time.time()
        self.item_idx = {}
        self.item_idx_reverse = {}
        tmp_vector = {}
        with open(self.config['item_vec_file'], 'r') as in_f:
            num_items, dim = in_f.readline().strip().split()
            print(f'Num of items : {num_items}, dim : {dim}')
            self.t = AnnoyIndex(int(dim), 'angular')
            for idx, line in tqdm(enumerate(in_f)):
                tmp = line.split()
                if self.type == 'item':
                    try:
                        action, item_org = tmp[0].split(':', 1)
                    except:
                        continue
                else:
                    action = None
                    item_org = tmp[0]

                if item_org not in self.item_idx:
                    self.item_idx[item_org] = idx
                    self.item_idx_reverse[idx] = item_org
                    tmp_vector[idx] = np.zeros(int(dim))
                else:
                    idx = self.item_idx[item_org]

                if self.type == 'item':
                    tmp_vector[idx] += np.array(tmp[1:], dtype=float) * self.action_w[action]
                else:
                    tmp_vector[idx] += np.array(tmp[1:], dtype=float)

        for idx in tmp_vector:
            self.t.add_item(idx, tmp_vector[idx].tolist())

        print("Read file finished ...")
        file_name = self.config['index_file_file'] + '.' + self.type
        self.t.build(30)  # 30 trees
        self.t.save(f'{file_name}.ann')
        # self.t.load(f'{file_name}.ann')
        print(f"Train finished ...{time.time() - b_time}")

    def predict(self, last_n_events, topN):
        b_time = time.time()
        candidate_set = set()
        if self.type == 'item':
            last_n_items = [
                self.item_idx[e.split(':', 1)[1]] for e in last_n_events[::-1]
                if e.split(':', 1)[1] in self.item_idx
            ]
        else:
            last_n_items = [
                self.item_idx[e] for e in last_n_events[::-1]
                if e in self.item_idx
            ]

        if len(last_n_items) == 0:
            return []

        for item_idx in last_n_items:
            candidate = self.__item_topK_similar(item_idx, topN)
            candidate_set.update(candidate)
        candidate_set -= set(last_n_items)

        candidate_list = list(candidate_set)
        score_matric = np.zeros((len(last_n_items), len(candidate_list)))
        for i, item_id in enumerate(last_n_items):
            score_matric[i] = self.__item_item_arr_norm_score(item_id, candidate_list)

        rank_weight = np.array(
            [1 / np.log2(rank + 2) for rank in range(len(last_n_items))])
        final_score = rank_weight.dot(score_matric).tolist()
        # print(last_n_items, list(zip(candidate_list, final_score)))

        final_items = sorted(zip(candidate_list, final_score),
                             key=lambda x: x[1], reverse=True)
        # print(f"[Time|Preict] {time.time()-b_time}")

        res = []
        for item, score in final_items:
            try:
                if self.type == 'item':
                    item_raw = self.item_idx_reverse[item]
                else:
                    item_raw = self.item_idx_reverse[item].split(':', 1)[1]
                if item_raw in res:
                    continue
                res.append(item_raw)
            except:
                pass
            if len(res) == topN:
                break
        return res

    def __item_topK_similar(self, given_idx, topK):
        return self.t.get_nns_by_item(given_idx, topK)

    def __item_item_arr_norm_score(self, given_idx, candidate_idx_arr):
        res = [
            1 - self.t.get_distance(given_idx, candidate_idx)
            for candidate_idx in candidate_idx_arr
        ]
        # print(given_idx, sorted(zip(candidate_idx_arr, res), key=lambda x: x[1], reverse=True))
        res = np.array(res)
        return res / np.linalg.norm(res)
class ProbCalculator():
    def __init__(self):
        ## fixme: replace the static part with a dict precomputed offline
        from annoy import AnnoyIndex
        from utils import pickle_load
        self.ann_index = AnnoyIndex(300, metric='euclidean')
        self.ann_index.load('data/index/annoy.euclidean.idx')
        self.id2word = pickle_load('data/index/id2word.pkl')
        self.word2id = pickle_load('data/index/word2id.pkl')

    def calc_probs(self, char, candidates):
        static_probs = self._static_probs(char, candidates)
        probs = static_probs
        # the dynamic probs below turned out to be useless
        # if static_probs is not None:
        #     dynamic_probs = self._dynamic_probs(candidates, tokens, idx)
        #     joint_probs = [0.4 * sp + 0.6 * dp for sp, dp in zip(static_probs, dynamic_probs)]
        #     probs_sum = sum(joint_probs)
        #     if probs_sum > 0:
        #         probs = [p / probs_sum for p in joint_probs]
        return probs

    def _static_probs(self, char, candidates):
        """
        Compute static embedding similarity.
        :param char:
        :param candidates:
        :return:
        """
        probs = None
        if char in self.word2id:
            # char_vec = np.array(self.ann_index.get_item_vector(self.word2id[char]))
            # unk_sim = 1 / np.sqrt(sum(char_vec ** 2))
            # unk is equivalent to vec = np.zeros, so dis = |v[char]|; but under
            # Euclidean distance this sim would be higher than for most non-unk words
            unk_sim = 0
            probs_sum = 0
            probs = []
            for candidate in candidates:
                sim = unk_sim
                if candidate in self.word2id:
                    sim = 1 / self.ann_index.get_distance(
                        self.word2id[char], self.word2id[candidate])
                    # the code above has already filtered out the dis == 0 case
                probs.append(sim)
                probs_sum += sim
            if probs_sum > 0:
                probs = [p / probs_sum for p in probs]
            else:
                probs = None
        return probs

    def _dynamic_probs(self, candidates, tokens, idx):
        """
        Estimate, from context, each candidate's effect on edit distance and
        Jaccard similarity.
        :param candidates:
        :param tokens:
        :param idx:
        :return:
        """
        # target_token = tokens[idx]
        # t_set = set(target_token)
        # len_target_token = len(target_token)
        # sentence = ''.join(tokens)
        # probs = []
        # for candidate in candidates:
        #     if len(candidate) == 0:
        #         probs.append(0.5)
        #         continue
        #     c_set = set(candidate)
        #     delta_len = len(c_set) - len(c_set & t_set)
        #     edit_sim = 1 - delta_len / (len(sentence) + 1e-8)
        #     jaccard_sim = len(c_set & set(sentence)) / len(c_set)
        #     sim = 0.4 * edit_sim + 0.6 * jaccard_sim
        #     probs.append(sim)
        probs = []
        s_set = set(''.join(tokens))
        for candidate in candidates:
            prob = 0
            for c in candidate:
                if c in s_set:
                    prob = 1
                    break
            probs.append(prob)
        probs_sum = sum(probs)
        if probs_sum > 0:
            probs = [p / probs_sum for p in probs]
        else:
            probs = [1] * len(candidates)
        return probs
def generate_graph(img, graph_type='grid', d=10, n_tree=10, search_k=-1):
    '''
    assume the image is channel last.
    generate a grid graph or a nearest-neighbor graph
    return a list of sorted edges
    '''
    # Arguments:
    #   img: A numpy array, channel last
    #   graph_type: 'grid' or 'nn'
    #   n_tree: more trees give more precise results
    img = _preprocessing(img)
    graphs = []
    rows = img.shape[0]
    cols = img.shape[1]
    num_vertices = rows * cols
    num_edges = (rows - 1) * cols + (cols - 1) * rows
    if graph_type == 'grid':
        # generate grid graph
        for c in range(img.shape[2]):
            # create edges array
            edges = np.empty((num_edges, 3), dtype=np.float64)
            # insert edges into the edges array: each node connects to its
            # right and down neighbors, except that the rightmost column has
            # no right connection and the bottom row has no down connection
            index = 0
            for i in range(rows):
                for j in range(cols):
                    # add edge from the current node to its right node,
                    # if not in the rightmost column
                    if j < cols - 1:
                        edges[index][0] = i * cols + j
                        edges[index][1] = i * cols + j + 1
                        edges[index][2] = abs(img[i][j][c] - img[i][j + 1][c])  # weight
                        index += 1
                    # add edge from the current node to its down node,
                    # if not in the bottom row
                    if i < rows - 1:
                        edges[index][0] = i * cols + j
                        edges[index][1] = (i + 1) * cols + j
                        edges[index][2] = abs(img[i][j][c] - img[i + 1][j][c])  # weight
                        index += 1
            edges = edges[edges[:, 2].argsort()]
            graphs.append(edges)
    elif graph_type == 'nn':
        # generate nearest-neighbor graph: use the Annoy library to find the
        # d approximate nearest neighbors of each pixel in (r, g, b, i, j) space
        f = 5
        t = AnnoyIndex(f, 'euclidean')
        nn_graph = []
        for i in range(rows):
            for j in range(cols):
                v = [img[i, j, 0], img[i, j, 1], img[i, j, 2], i, j]
                t.add_item(i * cols + j, v)
        t.build(n_tree)
        for i in range(rows * cols):
            for neighbor in t.get_nns_by_item(i, d):
                if neighbor > i:
                    nn_graph.append([i, neighbor, t.get_distance(i, neighbor)])
                elif neighbor < i:
                    nn_graph.append([neighbor, i, t.get_distance(i, neighbor)])
        nn_graph = np.array(nn_graph)
        nn_graph = nn_graph[np.unique(nn_graph[:, :2], axis=0, return_index=True)[1]]
        graphs.append(nn_graph[nn_graph[:, 2].argsort()])
    else:
        raise ValueError("No such graph type, must be 'grid' or 'nn'")

    print('------------------------------------------')
    print(f'{graph_type} type graph construction done')
    print('------------------------------------------')
    return graphs
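# Illustrative call for generate_graph on a random image (assumes the
# project-local _preprocessing helper accepts a float HxWxC array):
import numpy as np

img = np.random.rand(32, 32, 3)
graphs = generate_graph(img, graph_type='nn', d=10, n_tree=10)
print(graphs[0].shape)  # (n_edges, 3) rows of [u, v, weight], sorted by weight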
def test_rounding_error(self):
    # https://github.com/spotify/annoy/issues/314
    i = AnnoyIndex(1, 'euclidean')
    i.add_item(0, [0.7125930])
    i.add_item(1, [0.7123166])
    self.assertGreater(i.get_distance(0, 1), 0.0)
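# Hedged aside on the kind of float32 rounding this regression test guards
# against: expanding ||a - b||^2 as a*a - 2*a*b + b*b cancels catastrophically
# for near-identical vectors and can round to zero (or below), while the
# direct form (a - b)**2 stays positive:
import numpy as np

a = np.float32(0.7125930)
b = np.float32(0.7123166)
print(a * a - 2 * a * b + b * b)  # nearly all significant bits are lost
print((a - b) * (a - b))          # ~7.6e-08, safely positive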
def generate_triplets(X, n_inlier, n_outlier, n_random, fast_trimap=True,
                      weight_adj=False, verbose=True):
    n, dim = X.shape
    if dim > 100:
        X = TruncatedSVD(n_components=100, random_state=0).fit_transform(X)
        dim = 100
    exact = n <= 10000
    n_extra = min(max(n_inlier, 200), n)
    if exact:  # do exact knn search
        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)
    elif fast_trimap:  # use annoy
        tree = AnnoyIndex(dim)
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(50)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
            for j in range(n_extra):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            sort_indices = np.argsort(dij)
            nbrs[i, :] = nbrs[i, sort_indices]
            # for j in range(n_extra):
            #     distances[i, j] = tree.get_distance(i, nbrs[i, j])
            distances[i, :] = dij[sort_indices]
    else:
        n_bf = 10
        n_extra += n_bf
        knn_tree = knn(n_neighbors=n_bf, algorithm='auto').fit(X)
        _, nbrs_bf = knn_tree.kneighbors(X)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        nbrs[:, :n_bf] = nbrs_bf
        tree = AnnoyIndex(dim)
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(60)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, n_bf:] = tree.get_nns_by_item(i, n_extra - n_bf)
            unique_nn = np.unique(nbrs[i, :])
            n_unique = len(unique_nn)
            nbrs[i, :n_unique] = unique_nn
            for j in range(n_unique):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            sort_indices = np.argsort(dij[:n_unique])
            nbrs[i, :n_unique] = nbrs[i, sort_indices]
            distances[i, :n_unique] = dij[sort_indices]
    if verbose:
        print("found nearest neighbors")
    sig = np.maximum(np.mean(distances[:, 10:20], axis=1), 1e-20)  # scale parameter
    P = find_p(distances, sig, nbrs)
    triplets = sample_knn_triplets(P, nbrs, n_inlier, n_outlier)
    n_triplets = triplets.shape[0]
    outlier_dist = np.empty(n_triplets, dtype=np.float64)
    if exact or not fast_trimap:
        for t in range(n_triplets):
            outlier_dist[t] = np.sqrt(np.sum((X[triplets[t, 0], :] - X[triplets[t, 2], :]) ** 2))
    else:
        for t in range(n_triplets):
            outlier_dist[t] = tree.get_distance(triplets[t, 0], triplets[t, 2])
    weights = find_weights(triplets, P, nbrs, outlier_dist, sig)
    if n_random > 0:
        rand_triplets = sample_random_triplets(X, n_random, sig)
        rand_weights = rand_triplets[:, -1]
        rand_triplets = rand_triplets[:, :-1].astype(np.int64)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, rand_weights))
    weights /= np.max(weights)
    weights += 0.0001
    if weight_adj:
        weights = np.log(1 + 50 * weights)
        weights /= np.max(weights)
    return (triplets, weights)