Example #1
 def test_metric_kwarg(self):
     # Issue 211
     i = AnnoyIndex(2, metric='euclidean')
     i.add_item(0, [1, 0])
     i.add_item(1, [9, 0])
     self.assertAlmostEqual(i.get_distance(0, 1), 8)
     self.assertEqual(i.f, 2)
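These snippets are excerpts from larger test suites and omit their imports. A minimal, self-contained sketch of this first test (plain asserts instead of unittest):

from annoy import AnnoyIndex

i = AnnoyIndex(2, metric='euclidean')
i.add_item(0, [1, 0])
i.add_item(1, [9, 0])
assert abs(i.get_distance(0, 1) - 8) < 1e-6  # |9 - 1| along the x axis
assert i.f == 2  # the index keeps its dimensionality in .f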
Example #2
    def test_dist(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        self.assertAlmostEqual(i.get_distance(0, 1), 1.0)
Example #3
 def test_basic_conversion(self):
     f = 100
     i = AnnoyIndex(f, 'hamming')
     u = numpy.random.binomial(1, 0.5, f)
     v = numpy.random.binomial(1, 0.5, f)
     i.add_item(0, u)
     i.add_item(1, v)
     u2 = i.get_item_vector(0)
     v2 = i.get_item_vector(1)
     self.assertAlmostEqual(numpy.dot(u - u2, u - u2), 0.0)
     self.assertAlmostEqual(numpy.dot(v - v2, v - v2), 0.0)
     self.assertAlmostEqual(i.get_distance(0, 0), 0.0)
     self.assertAlmostEqual(i.get_distance(1, 1), 0.0)
     self.assertAlmostEqual(i.get_distance(0, 1), numpy.dot(u - v, u - v))
     self.assertAlmostEqual(i.get_distance(1, 0), numpy.dot(u - v, u - v))
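For the 'hamming' metric, get_distance counts differing bits, which on 0/1 vectors coincides with the squared Euclidean distance numpy.dot(u - v, u - v) used in the assertions above. A minimal sketch of that equivalence:

import numpy
from annoy import AnnoyIndex

u = numpy.array([1, 0, 1, 1, 0])
v = numpy.array([1, 1, 0, 1, 0])
i = AnnoyIndex(5, 'hamming')
i.add_item(0, u)
i.add_item(1, v)
# two positions differ, so both quantities equal 2
assert i.get_distance(0, 1) == numpy.dot(u - v, u - v) == 2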
Example #4
    def test_dist_degen(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [1, 0])
        i.add_item(1, [0, 0])

        self.assertAlmostEqual(i.get_distance(0, 1), 2.0**0.5)
Example #5
    def test_dist_2(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [1000, 0])
        i.add_item(1, [10, 0])

        self.assertAlmostEqual(i.get_distance(0, 1), 0)
Example #6
    def test_dist(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        self.assertAlmostEqual(i.get_distance(0, 1), (2 * (1.0 - 2 ** -0.5))**0.5)
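The expected value is Annoy's angular distance, sqrt(2 * (1 - cos(u, v))). For [0, 1] and [1, 1] the cosine is 2 ** -0.5, which gives the expression in the assertion; a short numpy check of the identity:

import numpy

u = numpy.array([0.0, 1.0])
v = numpy.array([1.0, 1.0])
cos = numpy.dot(u, v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v))
print((2 * (1 - cos)) ** 0.5)  # ~0.7654, the value asserted above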
Example #7
    def test_dist_2(self):
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 2
        i = AnnoyIndex(f, 2, "test_db", 64,  1000, 3048576000)
        i.add_item(0, [1000, 0])
        i.add_item(1, [10, 0])

        self.assertAlmostEqual(i.get_distance(0, 1), 0)
Example #8
    def test_dist(self):
        f = 2
        i = AnnoyIndex(f, 'manhattan')
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])
        i.add_item(2, [0, 0])

        self.assertAlmostEqual(i.get_distance(0, 1), 1.0)
        self.assertAlmostEqual(i.get_distance(1, 2), 2.0)
Example #9
    def test_dist_3(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [97, 0])
        i.add_item(1, [42, 42])

        dist = ((1 - 2 ** -0.5) ** 2 + (2 ** -0.5) ** 2)**0.5

        self.assertAlmostEqual(i.get_distance(0, 1), dist)
Example #12
    def test_dist_2(self):
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 2
        i = AnnoyIndex(f, 2, "test_db", 64,  1000, 3048576000, 0)
        i.add_item(0, [1000, 0])
        i.add_item(1, [10, 0])

        self.assertAlmostEqual(i.get_distance(0, 1), 0)
Example #13
def generate_triplets(
    X,
    n_inliers,
    n_outliers,
    n_random,
    distance="euclidean",
    weight_adj=500.0,
    verbose=True,
):
    distance_dict = {
        "euclidean": 0,
        "manhattan": 1,
        "angular": 2,
        "hamming": 3
    }
    distance_index = distance_dict[distance]
    n, dim = X.shape
    n_extra = min(n_inliers + 50, n)
    tree = AnnoyIndex(dim, metric=distance)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)
    nbrs = np.empty((n, n_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_extra), dtype=np.float32)
    for i in range(n):
        nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
        for j in range(n_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    if verbose:
        print("found nearest neighbors")
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1),
                     1e-10)  # scale parameter
    P = find_p(knn_distances, sig, nbrs)
    triplets = sample_knn_triplets(P, nbrs, n_inliers, n_outliers)
    n_triplets = triplets.shape[0]
    outlier_distances = np.empty(n_triplets, dtype=np.float32)
    for t in range(n_triplets):
        outlier_distances[t] = calculate_dist(X[triplets[t, 0], :],
                                              X[triplets[t, 2], :],
                                              distance_index)
    weights = find_weights(triplets, P, nbrs, outlier_distances, sig)
    if n_random > 0:
        rand_triplets = sample_random_triplets(X, n_random, sig,
                                               distance_index)
        rand_weights = rand_triplets[:, -1]
        rand_triplets = rand_triplets[:, :-1].astype(np.int32)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, rand_weights))
    weights[np.isnan(weights)] = 0.0
    weights /= np.max(weights)
    weights += 0.0001
    if weight_adj:
        if not isinstance(weight_adj, (int, float)):
            weight_adj = 500.0
        weights = np.log(1 + weight_adj * weights)
        weights /= np.max(weights)
    return (triplets, weights)
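The helpers (find_p, sample_knn_triplets, find_weights, and so on) belong to the surrounding TriMap-style codebase and are not shown here. The Annoy-specific core — index the rows, then pull n_extra neighbors and their distances per point — can be isolated into a self-contained sketch:

import numpy as np
from annoy import AnnoyIndex

X = np.random.RandomState(0).normal(size=(50, 4))
n, dim = X.shape
n_extra = 10
tree = AnnoyIndex(dim, metric='euclidean')
for i in range(n):
    tree.add_item(i, X[i, :])
tree.build(20)

nbrs = np.empty((n, n_extra), dtype=np.int32)
knn_distances = np.empty((n, n_extra), dtype=np.float32)
for i in range(n):
    nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
    for j in range(n_extra):
        knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])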
Example #14
    def test_dist_3(self):
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 2
        i = AnnoyIndex(f, 2, "test_db", 64,  1000, 3048576000, 0)
        i.add_item(0, [97, 0])
        i.add_item(1, [42, 42])

        dist = (1 - 2 ** -0.5) ** 2 + (2 ** -0.5) ** 2

        self.assertAlmostEqual(i.get_distance(0, 1), dist)
Example #15
    def test_dist(self):
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        
        f = 2   
        i = AnnoyIndex(f,  2, "test_db", 64,  1000, 3048576000, 0)
        # i.verbose(True)
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        self.assertAlmostEqual(i.get_distance(0, 1), 2 * (1.0 - 2 ** -0.5))
Example #16
def generate_triplets(
    X,
    n_inliers,
    n_outliers,
    n_random,
    distance="euclidean",
    verbose=False,
    weight_temp=0.5,
):
    distance_dict = {
        "euclidean": 0,
        "manhattan": 1,
        "angular": 2,
        "hamming": 3
    }
    distance_index = distance_dict[distance]
    n, dim = X.shape
    n_extra = min(n_inliers + _NUM_EXTRA_KNN, n)
    tree = AnnoyIndex(dim, metric=distance)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(_NUM_TREES)
    nbrs = np.empty((n, n_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_extra), dtype=np.float32)
    for i in range(n):
        nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
        for j in range(n_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    if verbose:
        print("found nearest neighbors")
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1),
                     1e-10)  # scale parameter
    P = find_p(knn_distances, sig, nbrs)
    triplets = sample_knn_triplets(P, nbrs, n_inliers, n_outliers)
    n_triplets = triplets.shape[0]
    outlier_distances = np.empty(n_triplets, dtype=np.float32)
    for t in range(n_triplets):
        outlier_distances[t] = calculate_dist(X[triplets[t, 0], :],
                                              X[triplets[t, 2], :],
                                              distance_index)
    weights = find_weights(triplets, P, nbrs, outlier_distances, sig)
    if n_random > 0:
        rand_triplets = sample_random_triplets(X, n_random, sig,
                                               distance_index)
        rand_weights = rand_triplets[:, -1]
        rand_triplets = rand_triplets[:, :-1].astype(np.int32)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, _RAND_WEIGHT_SCALE * rand_weights))
    weights[np.isnan(weights)] = 0.0
    weights -= np.min(weights)
    weights = tempered_log(1.0 + weights, weight_temp)
    return (triplets, weights)
Example #17
 def test_distance_consistency(self):
     n, f = 1000, 3
     i = AnnoyIndex(f, 'dot')
     for j in range(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(
                 dist, numpy.dot(i.get_item_vector(a),
                                 i.get_item_vector(b)))
             self.assertAlmostEqual(dist, i.get_distance(a, b))
Example #18
def do_annoy(model, texts, tokenizer, verbose):
    unique_text = []
    entity_idx = []
    entity2same = {}

    for i in range(len(texts['anchor'])):
        if not texts['anchor'][i] in entity2same:
            entity2same[texts['anchor'][i]] = []
            entity_idx.append(len(unique_text))
            unique_text.append(texts['anchor'][i])
        l = entity2same[texts['anchor'][i]]
        if texts['positive'][i] not in l:
            entity2same[texts['anchor'][i]].append(texts['positive'][i])
            unique_text.append(texts['positive'][i])

    print(entity2same)
    print(unique_text)

    sequences = tokenizer.texts_to_sequences(unique_text)
    sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(sequences)

    t = AnnoyIndex(
        len(predictions[0]),
        metric='euclidean')  # Length of item vector that will be indexed
    for i in range(len(predictions)):
        v = predictions[i]
        t.add_item(i, v)

    t.build(100)  # 100 trees

    match = 0
    no_match = 0

    for index in entity_idx:
        nearest = t.get_nns_by_vector(predictions[index], 5)
        print(nearest)
        nearest_text = set([unique_text[i] for i in nearest])
        expected_text = set(entity2same[unique_text[index]])
        nearest_text.remove(unique_text[index])
        print("query={} names = {} true_match = {}".format(
            unique_text[index], nearest_text, expected_text))
        if verbose:
            print([t.get_distance(index, i) for i in nearest])
        overlap = expected_text.intersection(nearest_text)
        print(overlap)
        m = len(overlap)
        match += m
        no_match += len(expected_text) - m

    print("match: {} no_match: {}".format(match, no_match))
Example #19
 def test_distance_consistency(self):
     n, f = 1000, 3
     i = AnnoyIndex(f, 'manhattan')
     for j in range(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = numpy.array(i.get_item_vector(a))
             v = numpy.array(i.get_item_vector(b))
             self.assertAlmostEqual(dist, numpy.sum(numpy.fabs(u - v)))
             self.assertAlmostEqual(dist, sum([abs(float(x)-float(y)) for x, y in zip(u, v)]))
Example #20
 def test_distance_consistency(self):
     n, f = 1000, 3
     i = AnnoyIndex(f, 'manhattan')
     for j in xrange(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = numpy.array(i.get_item_vector(a))
             v = numpy.array(i.get_item_vector(b))
             self.assertAlmostEqual(dist, numpy.sum(numpy.fabs(u - v)))
             self.assertAlmostEqual(dist, sum([abs(float(x)-float(y)) for x, y in zip(u, v)]))
Example #21
    def test_dist(self):
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        
        f = 2   
        i = AnnoyIndex(f,  2, "test_db", 64,  1000, 3048576000)
        
        print "creating object"
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        print "creating object"
        self.assertAlmostEqual(i.get_distance(0, 1), 2 * (1.0 - 2 ** -0.5))
        print "done"
Example #22
class KNN(object):
    def __init__(self, label: Label, model: Model):
        self.tree = AnnoyIndex(SIZE[model], "angular")
        self.tree.load(TREE[model][label])

    def nearest_index(self, y_pred: np.ndarray) -> int:
        return self.tree.get_nns_by_vector(vector=y_pred.tolist(), n=1)[0]

    def nearest(self, y_pred: np.ndarray) -> np.ndarray:
        index = self.nearest_index(y_pred=y_pred)
        return np.asarray(self.tree.get_item_vector(index))

    def distance(self, left_index: int, right_index: int) -> float:
        return self.tree.get_distance(left_index, right_index)
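Using this wrapper needs the SIZE and TREE lookup tables (and the Label/Model aliases) from its module; a hypothetical setup, with all names and paths assumed, might look like:

import numpy as np

# hypothetical tables: SIZE maps a model to its embedding width,
# TREE maps model -> label -> path of a saved .ann index
SIZE = {"resnet": 512}
TREE = {"resnet": {"cat": "trees/resnet_cat.ann"}}

knn = KNN(label="cat", model="resnet")    # loads trees/resnet_cat.ann
y_pred = np.zeros(512, dtype=np.float32)
print(knn.nearest_index(y_pred))          # id of the closest stored vector
print(knn.distance(0, 1))                 # angular distance between items 0 and 1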
Example #23
    def test_dist(self):
        print "test_dist "
        f = 2
        print "creating object"
        
        i = AnnoyIndex(f, 2, 64, "test_db", 100, 3048576000)
        
        print "creating object"
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        print "creating object"
        self.assertAlmostEqual(i.get_distance(0, 1), 2 * (1.0 - 2 ** -0.5))
        print "don"
Example #24
 def test_distance_consistency(self):
     n, f = 1000, 3
     i = AnnoyIndex(f, 'dot')
     for j in range(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, numpy.dot(
                 i.get_item_vector(a),
                 i.get_item_vector(b)
             ))
             self.assertEqual(dist, i.get_distance(a, b))
Example #25
    def test_dist(self):
        print "test_dist "
        f = 2
        print "creating object"

        i = AnnoyIndex(f, 2, 64, "test_db", 100, 3048576000)

        print "creating object"
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        print "creating object"
        self.assertAlmostEqual(i.get_distance(0, 1), 2 * (1.0 - 2**-0.5))
        print "don"
Example #26
 def test_distance_consistency(self):
     n, f = 1000, 3
     i = AnnoyIndex(f, 'euclidean')
     for j in xrange(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = numpy.array(i.get_item_vector(a))
             v = numpy.array(i.get_item_vector(b))
             # self.assertAlmostEqual(dist, euclidean(u, v))
             self.assertAlmostEqual(dist, numpy.dot(u - v, u - v) ** 0.5)
             self.assertAlmostEqual(dist, sum([(x-y)**2 for x, y in zip(u, v)])**0.5)
Example #28
 def test_distance_consistency(self):
     n, f = 1000, 3
     i = AnnoyIndex(f)
     for j in xrange(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = i.get_item_vector(a)
             v = i.get_item_vector(b)
             u_norm = numpy.array(u) * numpy.dot(u, u)**-0.5
             v_norm = numpy.array(v) * numpy.dot(v, v)**-0.5
             # cos = numpy.clip(1 - cosine(u, v), -1, 1) # scipy returns 1 - cos
             self.assertAlmostEqual(dist, numpy.dot(u_norm - v_norm, u_norm - v_norm) ** 0.5)
             # self.assertAlmostEqual(dist, (2*(1 - cos))**0.5)
             self.assertAlmostEqual(dist, sum([(x-y)**2 for x, y in zip(u_norm, v_norm)])**0.5)
Example #30
def generate_pair(X,
                  n_neighbors,
                  n_MN,
                  n_FP,
                  distance='euclidean',
                  verbose=True):
    '''Generate pairs for the dataset.
    '''
    n, dim = X.shape
    # sample more neighbors than needed
    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    tree = AnnoyIndex(dim, metric=distance)
    if _RANDOM_STATE is not None:
        tree.set_seed(_RANDOM_STATE)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)

    option = distance_to_option(distance=distance)

    nbrs = np.zeros((n, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_neighbors_extra), dtype=np.float32)

    for i in range(n):
        nbrs_ = tree.get_nns_by_item(i, n_neighbors_extra + 1)
        nbrs[i, :] = nbrs_[1:]
        for j in range(n_neighbors_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    print_verbose("Found nearest neighbor", verbose)
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    print_verbose("Calculated sigma", verbose)
    scaled_dist = scale_dist(knn_distances, sig, nbrs)
    print_verbose("Found scaled dist", verbose)
    pair_neighbors = sample_neighbors_pair(X, scaled_dist, nbrs, n_neighbors)
    if _RANDOM_STATE is None:
        pair_MN = sample_MN_pair(X, n_MN, option)
        pair_FP = sample_FP_pair(X, pair_neighbors, n_neighbors, n_FP)
    else:
        pair_MN = sample_MN_pair_deterministic(X, n_MN, _RANDOM_STATE, option)
        pair_FP = sample_FP_pair_deterministic(X, pair_neighbors, n_neighbors,
                                               n_FP, _RANDOM_STATE)
    return pair_neighbors, pair_MN, pair_FP, tree
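Note the n_neighbors_extra + 1 query followed by nbrs_[1:]: get_nns_by_item returns the query item itself among its own nearest neighbors (at distance 0, so in practice first), so one extra neighbor is requested and the first result dropped. A minimal sketch of that behavior:

import numpy as np
from annoy import AnnoyIndex

X = np.random.RandomState(0).rand(20, 3)
t = AnnoyIndex(3, 'euclidean')
for i in range(20):
    t.add_item(i, X[i])
t.build(10)

nbrs_ = t.get_nns_by_item(0, 5)
print(nbrs_[0], t.get_distance(0, nbrs_[0]))  # 0 0.0 - the query comes back first
neighbors = t.get_nns_by_item(0, 5 + 1)[1:]   # drop the query itself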
Example #31
def KNN_Annoy(X, KK):
    NK = KK
    NN, NF = X.shape
    if KK > NF:
        raise ValueError("KK should be less than 2th-dim of X")

    t = AnnoyIndex(NF, metric='euclidean')
    for i, v in enumerate(X):
        t.add_item(i, v)

    t.build(100)
    ind = []
    val = []

    for i in range(NN):
        closest = t.get_nns_by_item(i, NK)
        ind.append(closest)
        val.append([t.get_distance(i, j) for j in closest])

    return np.array(ind), np.array(val)
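A usage sketch for this helper (shapes assumed):

import numpy as np

X = np.random.rand(100, 8)
ind, val = KNN_Annoy(X, 5)   # 5 nearest neighbors per row; the row itself is included at distance 0
print(ind.shape, val.shape)  # (100, 5) (100, 5)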
Example #32
        def _random_nn(X):
            idx = AnnoyIndex(X.shape[1], 'euclidean')
            for i in range(X.shape[0]):
                idx.add_item(i, X[i])

            logging.info("building an index with %d items" % X.shape[0])
            idx.build(50)

            logging.info("finding %d neighbor groups" % self.n_clusters)
            seen = {}
            label = 0

            guess = np.random.randint(X.shape[0])
            centers = {guess: 0}

            while label < self.n_clusters:
                neighbors = idx.get_nns_by_item(guess, _get_num_neighbors())
                for point in neighbors:
                    seen[point] = label
                seen[guess] = label

                # find a distant point
                dists = np.array([[idx.get_distance(i, j) for i in centers]
                                  for j in range(X.shape[0])])

                avg_dists = np.average(dists, axis=1)
                dist_prob = softmax(avg_dists)

                guess = np.random.choice(X.shape[0], p=dist_prob)

                while guess in seen:
                    guess = np.random.choice(X.shape[0], p=dist_prob)
                centers[guess] = label

                label = label + 1

            y = np.zeros(X.shape[0])

            for k, v in seen.items():
                y[k] = v
            return y
Example #33
	def find_nearest(self):
		ann = AnnoyIndex(num_merchants)
		for customer in self.customers:
			customer_vector = list(matrix.loc[[customer]])
			ann.add_item(customer, customer_vector)
			if customer%200 == 0:
				print 'Adding '+ str(customer)
		print "Building"
		if len(self.merchantIDs) > max_trees:
			ann.build(max_trees)
		else:
			ann.build(len(self.merchantIDs))
		print "...done"
		for customer in self.customers:
			neighbors = ann.get_nns_by_item(customer, num_neighbors)
			if customer%200 == 0:
				print "Found neighbors for " + str(customer)
			self.nearest[customer] = []
			for neighbor in neighbors:
				if neighbor != customer:
					self.nearest[customer].append((neighbor, ann.get_distance(neighbor, customer)))
Example #34
def ann_annoy(data, metric='euclidean',
              n_neighbors=10,
              trees=10):
    """My Approximate Nearest Neighbors function (ANN)
    using the annoy package.

    Parameters
    ----------


    Returns
    -------


    """
    datapoints = data.shape[0]
    dimension = data.shape[1]

    # initialize the annoy database
    ann = AnnoyIndex(dimension)

    # store the datapoints
    for (i, row) in enumerate(data):
        ann.add_item(i, row.tolist())

    # build the index
    ann.build(trees)

    # find the k-nearest neighbors for all points
    idx = np.zeros((datapoints, n_neighbors), dtype='int')
    distVals = idx.copy().astype(float)  # np.float was removed from NumPy; use the builtin

    # extract the distance values
    for i in range(0, datapoints):
        idx[i,:] = ann.get_nns_by_item(i, n_neighbors)

        for j in range(0, n_neighbors):
            distVals[i,j] = ann.get_distance(i, idx[i,j])

    return distVals, idx
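A usage sketch (shapes assumed); note that the metric argument is accepted but never forwarded, so the index is built with Annoy's default metric:

import numpy as np

data = np.random.rand(200, 16)
distVals, idx = ann_annoy(data, n_neighbors=5, trees=10)
print(idx.shape, distVals.shape)  # (200, 5) (200, 5)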
def generate_triplets_from_ANN(model, sequences, entity2unique, entity2same, unique_text, test):
    predictions = model.predict(sequences)
    t = AnnoyIndex(len(predictions[0]), metric='euclidean')  # Length of item vector that will be indexed
    t.set_seed(123)
    for i in range(len(predictions)):
        # print(predictions[i])
        v = predictions[i]
        t.add_item(i, v)

    t.build(100) # 100 trees

    match = 0
    no_match = 0
    ann_accuracy = 0
    total = 0

    triplets = {}

    pos_distances = []
    neg_distances = []

    triplets['anchor'] = []
    triplets['positive'] = []
    triplets['negative'] = []

    if test:
        NNlen = TEST_NEIGHBOR_LEN
    else:
        NNlen = TRAIN_NEIGHBOR_LEN

    for key in entity2same:
        index = entity2unique[key]
        nearest = t.get_nns_by_vector(predictions[index], NNlen)
        nearest_text = set([unique_text[i] for i in nearest])
        expected_text = set(entity2same[key])
        # annoy has this annoying habit of returning the queried item back as a nearest neighbor.  Remove it.
        if key in nearest_text:
            nearest_text.remove(key)
        # print("query={} names = {} true_match = {}".format(unique_text[index], nearest_text, expected_text))
        overlap = expected_text.intersection(nearest_text)
        # collect up some statistics on how well we did on the match
        m = len(overlap)
        match += m
        # since we asked for only x nearest neighbors, and we get at most x-1 neighbors that are not the same as key (!)
        # make sure we adjust our estimate of no match appropriately
        no_match += min(len(expected_text), NNlen - 1) - m

        # sample only the negatives that are true negatives,
        # that is, they are not in the expected set - sampling only 'semi-hard' negatives is not defined here
        # positives = expected_text - nearest_text
        positives = overlap
        negatives = nearest_text - expected_text

        # print(key + str(expected_text) + str(nearest_text))
        for i in negatives:
            for j in positives:
                dist_pos = t.get_distance(index, entity2unique[j])
                pos_distances.append(dist_pos)
                dist_neg = t.get_distance(index, entity2unique[i])
                neg_distances.append(dist_neg)
                if dist_pos < dist_neg:
                    ann_accuracy += 1
                total += 1
                # print(key + "|" +  j + "|" + i)
                # print(dist_pos)
                # print(dist_neg)               

        for i in negatives:
            for j in expected_text:
                triplets['anchor'].append(key)
                triplets['positive'].append(j)
                triplets['negative'].append(i)

    print("mean positive distance:" + str(statistics.mean(pos_distances)))
    print("stdev positive distance:" + str(statistics.stdev(pos_distances)))
    print("max positive distance:" + str(max(pos_distances)))
    print("mean neg distance:" + str(statistics.mean(neg_distances)))
    print("stdev neg distance:" + str(statistics.stdev(neg_distances)))
    print("max neg distance:" + str(max(neg_distances)))
    print("Accuracy in the ANN for triplets that obey the distance func:" + str(ann_accuracy / total))

    obj = {}
    obj['accuracy'] = ann_accuracy / total
    obj['steps'] = 1
    with open(output_file_name_for_hpo, 'w') as out:
        json.dump(obj, out)

    if test:
        return match/(match + no_match)
    else:
        return triplets, match/(match + no_match)
Example #37
    tc_index.add_item(i, v)
    i += 1

tc_index.build(10)

# save this index to disk
tc_index.save(r'D:\Data_Science\data\Tencent_AILab_ChineseEmbedding\tc_index_build10.index')
'''

# later: read everything back
with open(
        r'D:\Data_Science\data\Tencent_AILab_ChineseEmbedding\tc_word_index.json',
        'r') as fp:
    word_index = json.load(fp)

tc_index = AnnoyIndex(200, metric='angular')
tc_index.build(10)
tc_index.load(
    r'D:\Data_Science\data\Tencent_AILab_ChineseEmbedding\tc_index_build10.index'
)

# reverse id ==> word lookup table
reverse_word_index = dict([(value, key)
                           for (key, value) in word_index.items()])

# get_nns_by_item queries annoy for the 10 vectors nearest to the word; the result is a list of indices
for item in tc_index.get_nns_by_item(word_index[u'卖空'], 10):
    print(reverse_word_index[item])  # look up the word for each index

print(tc_index.get_distance(word_index['卫生'], word_index['风险']))  # the value computed is the squared distance
#https://github.com/spotify/annoy
Example #38
imdb_id_gf1 = '0068646' 
imdb_id_gf2 = '0071562'
imdb_id_ts3 = '0435761'
imdb_id_fpr = '0120815'
imdb_id_fn = '0266543'

# find 10 closest matches
for ann_index in t.get_nns_by_item(imdb_to_index[imdb_id_ts3], 10):
    imdb_id = index_to_imdb[ann_index]
    
    movie_title = cdb.get(str(imdb_id))['title']
    
    print movie_title, t.get_item_vector(ann_index)
    
# distances between some movies
print "GF1 <-> GF2", t.get_distance(imdb_to_index[imdb_id_gf1], imdb_to_index[imdb_id_gf2])
print "GF1 <-> SR", t.get_distance(imdb_to_index[imdb_id_gf1], imdb_to_index[imdb_id_sr])
print "GF2 <-> SR", t.get_distance(imdb_to_index[imdb_id_gf2], imdb_to_index[imdb_id_sr])
print "GF1 <-> TS3", t.get_distance(imdb_to_index[imdb_id_gf1], imdb_to_index[imdb_id_ts3])
print "FPR <-> FN", t.get_distance(imdb_to_index[imdb_id_fpr], imdb_to_index[imdb_id_fn])


with open('closest_matches.csv', 'wb') as f:
    writer = csv.writer(f)
    # find the closest match for each movie
    for imdb_id in imdb_to_index:
        closest_match = t.get_nns_by_item(imdb_to_index[imdb_id], 2)[1]
        distance = t.get_distance(imdb_to_index[imdb_id], closest_match)
        movie1_name = cdb.get(str(imdb_id))['title']
        movie2_name = cdb.get(str(index_to_imdb[closest_match]))['title']
        writer.writerow([imdb_id, movie1_name, index_to_imdb[closest_match], movie2_name, distance])
# ...

all_texts = np.concatenate((texts1, texts2))
match = 0
no_match = 0
print("shape of annoy data1")
print(annoy_data1[0].shape)
print(tr_pairs[:, 0].shape)

for index in range(len(texts1)):
    nearest = t.get_nns_by_vector(mid_predictions[index], 2)
    print("query={} names = {} true_match = {}".format(texts1[index], [all_texts[i] for i in nearest], texts2[index]))
    
    for i in nearest:
        print(t.get_distance(index, i))
        print(model.predict([np.array([annoy_data1[index]]), np.array([annoy_data2[i - len(annoy_data1)]])]))

    print(t.get_distance(index, index + len(texts1)))
    print(model.predict([np.array([annoy_data1[index]]), np.array([annoy_data2[index]])]))

    if (index + len(texts1)) in nearest:
        match += 1
    else:
        no_match += 1
print("match: {} no_match: {}".format(match, no_match))

print("Machine Learning Accuracy")
print(tr_acc)
print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
Example #40
t = AnnoyIndex(100, 'angular')

if __name__ == '__main__':
    counter = 0
    for batch in tqdm(dataloader):
        for _, vec in enumerate(batch):
            t.add_item(counter, vec)
            counter += 1

    # build the tree
    t.build(100)

    # save the tree
    t.save('./trees/test.tree')

    start = timer()
    print(t.get_nns_by_item(0, 10))
    end = timer()
    print('before filter->', end - start)

    start = timer()
    print([
        i for i in t.get_nns_by_item(0, 100)
        if round(t.get_distance(0, i), 2) == 0.0
    ])
    end = timer()
    print('after filter->', end - start)

    # query using a vector
    # print(t.get_nns_by_vector(..., 10))
class AnnoyModel(object):
    def __init__(self,
                 metric_name,
                 n_trees=10,
                 distance_type='angular',
                 load_existing=False):
        """
        Args:
            - metric_name: the name of the metric that vectors in the index will
              represent, a string.
            - n_trees: the number of trees used in building the index, a positive
              integer.
            - distance_type: distance measure, a string. Possibilities are
              "angular", "euclidean", "manhattan", "hamming", or "dot".
            - load_existing: if load_existing is True, then load function will be
              called upon initialization.
        """
        # Check params
        self.parse_initial_params(metric_name, n_trees, distance_type)
        self.dimensionality = db.similarity.get_metric_dimensionality(
            self.metric_name)
        self.index = AnnoyIndex(self.dimensionality, metric=self.distance_type)

        # in_loaded_state set to True if the index is built, loaded, or saved.
        # At any of these points, items can no longer be added to the index.
        self.in_loaded_state = False
        if load_existing:
            self.load()

    def parse_initial_params(self, metric_name, n_trees, distance_type):
        # Validate the index parameters passed to AnnoyModel.
        if metric_name in BASE_INDICES:
            self.metric_name = metric_name
        else:
            raise similarity.exceptions.IndexNotFoundException(
                'Index for specified metric is not possible.')

        if distance_type in BASE_INDICES[self.metric_name]:
            self.distance_type = distance_type
        else:
            raise similarity.exceptions.IndexNotFoundException(
                'Index for specified distance_type is not possible.')

        if n_trees in BASE_INDICES[self.metric_name][self.distance_type]:
            self.n_trees = n_trees
        else:
            raise similarity.exceptions.IndexNotFoundException(
                'Index for specified number of trees is not possible.')

    def build(self):
        """Build and load the index using the specified number of trees. An index
        must be built before it can be queried."""
        n_jobs = current_app.config['SIMILARITY_BUILD_NUM_JOBS']
        self.index.build(n_trees=self.n_trees, n_jobs=n_jobs)
        self.in_loaded_state = True

    def save(self, location=None, name=None):
        # Save and load the index using the metric name.
        if not self.in_loaded_state:
            raise similarity.exceptions.LoadStateException(
                'Index must be built before saving.')
        if not location:
            location = current_app.config['SIMILARITY_INDEX_DIR']
        try:
            os.makedirs(location)
        except OSError:
            if not os.path.isdir(location):
                raise
        name = '_'.join(
            [name or self.metric_name, self.distance_type,
             str(self.n_trees)]) + '.ann'
        file_path = os.path.join(location, name)
        self.index.save(file_path)

    def load(self, name=None):
        """
        Args:
            name: name of the metric that should be loaded. If None, it will use the
            metric specified when initializing the index.
        Raises:
            IndexNotFoundException: if there is no saved index with the given parameters.
        """
        # Load and build an existing annoy index.
        file_path = current_app.config['SIMILARITY_INDEX_DIR']
        name = '_'.join(
            [name or self.metric_name, self.distance_type,
             str(self.n_trees)]) + '.ann'
        full_path = os.path.join(file_path, name)
        try:
            self.index.load(full_path)
            self.in_loaded_state = True
        except IOError:
            raise similarity.exceptions.IndexNotFoundException

    def add_recording_by_mbid(self, mbid, offset):
        """Add a single recording specified by (mbid, offset) to the index.
        Note that when adding a single recording, space is allocated for
        the lowlevel.id + 1 items.
        """
        if self.in_loaded_state:
            raise similarity.exceptions.CannotAddItemException(
                "Item cannot be added once index is in load state.")
        item = db.similarity.get_similarity_by_mbid(mbid, offset)
        if item:
            recording_vector = item[self.metric_name]
            id = item['id']
            # If an item already exists, this should not error
            # and we should not add the item.
            try:
                self.index.get_item_vector(id)
            except IndexError:
                self.index.add_item(id, recording_vector)

    def add_recording_by_id(self, id):
        """Add a single recording specified by its lowlevel.id to the index.
        Note that when adding a single recording, space is allocated for
        lowlevel.id + 1 items.
        """
        if self.in_loaded_state:
            raise similarity.exceptions.CannotAddItemException(
                "Item cannot be added once index is in load state.")
        item = db.similarity.get_similarity_row_id(id)
        if item:
            # If an item already exists, this should not error
            # and we should not add the item.
            try:
                self.index.get_item_vector(id)
            except IndexError:
                self.index.add_item(item["id"], item[self.metric_name])

    def add_recording_with_vector(self, id, vector):
        """Add a single recording to the index using its lowlevel.id and
        a precomputed metric vector.

        Args:
            id: non-negative integer lowlevel.id for a recording.

            vector: metric vector (list) corresponding to the lowlevel.id.
            Dimensionality of the vector must match the dimensionality with which the
            index is initialized

        *NOTE*: Annoy will allocate memory for max(n) + 1 items.
        """
        if self.in_loaded_state:
            raise similarity.exceptions.CannotAddItemException(
                "Item cannot be added once index is in load state.")
        if not len(vector) == self.dimensionality:
            raise similarity.exceptions.CannotAddItemException(
                "Dimensionality of vector provided does not match index dimensionality."
            )

        self.index.add_item(id, vector)

    def get_nns_by_id(self, id, num_neighbours):
        """Get the most similar recordings for a recording with the
           specified id.

        Args:
            id: non-negative integer lowlevel.id for a recording.

            num_neighbours: positive integer, number of similar recordings
            to be returned in the query.

        Returns:
            A list of the form [<lowlevel.ids>, <recordings>, <distances>]
            where <lowlevel.ids> is a list of lowlevel.ids [id_1, ..., id_n],
            <recordings> is a list of tuples (MBID, offset),
            and <distances> is a list of distances, corresponding to each similar recording.
        """
        try:
            items = self.index.get_nns_by_item(id,
                                               num_neighbours,
                                               include_distances=True)
        except IndexError:
            raise similarity.exceptions.ItemNotFoundException(
                'The item you are requesting is not indexed.')

        # Unpack to get ids and distances
        ids = items[0]
        distances = items[1]
        recordings = db.data.get_mbids_by_ids(ids)
        return ids, recordings, distances

    def get_nns_by_mbid(self, mbid, offset, num_neighbours):
        # Find corresponding lowlevel.id to (mbid, offset) combination,
        # then call get_nns_by_id
        lookup = self.get_bulk_nns_by_mbid([(mbid, offset)], num_neighbours)
        return lookup.get(mbid, {}).get(str(offset), [])

    def get_bulk_nns_by_mbid(self, recordings, num_neighbours):
        """Get most similar recordings for each (MBID, offset) tuple provided.
        Similar recordings list returned is ordered with the most similar at
        index 0.

        Arguments:
            recordings: a list of tuples of form (MBID, offset), for which
            similar recordings will be found

            num_neighbours (int): the number of similar recordings desired
            for each recording specified.

        Returns:
            a dictionary of the mbids and offsets given in the recordings parameter.
            Each item is a list of dictionaries, containing the keys recording_mbid, offset, and distance:

                {"mbid1": {"offset1": [{"recording_mbid": MBID, "offset": offset, "distance": distance}, ...],
                           ...,
                           "offsetn": [{"recording_mbid": MBID, "offset": offset, "distance": distance}, ...]
                          },
                 ...,
                 "mbidn": {"offset1": [{"recording_mbid": MBID, "offset": offset, "distance": distance}, ...],
                           ...,
                           "offsetn": [{"recording_mbid": MBID, "offset": offset, "distance": distance}, ...]
                          }
                }
        """
        recordings_info = defaultdict(dict)

        ids = db.data.get_ids_by_mbids(recordings)
        for recording_id, (mbid, offset) in zip(ids, recordings):
            try:
                ids, similar_recordings, distances = self.get_nns_by_id(
                    recording_id, num_neighbours)
                data = []
                for recording, distance in zip(similar_recordings, distances):
                    data.append({
                        'recording_mbid': recording[0],
                        'offset': recording[1],
                        'distance': distance
                    })
                recordings_info[mbid][str(offset)] = data
            except (similarity.exceptions.ItemNotFoundException,
                    db.exceptions.NoDataFoundException):
                continue

        return recordings_info

    def get_similarity_between(self, rec_one, rec_two):
        """Get the distance of the similarity measure between
        two recordings.

        Args:
            rec_one and rec_two are tuples of the form (MBID, offset)

        Returns:
            Distance between two recordings, of type float.
            If an IndexError occurs (one or more of ids is not indexed)
            then None is returned.
        """
        id_1, id_2 = db.data.get_ids_by_mbids([rec_one, rec_two])
        if id_1 is None or id_2 is None:
            return None
        try:
            return self.index.get_distance(id_1, id_2)
        except IndexError:
            return None
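The class is wired to a database layer (db.similarity, db.data) and a Flask app config, so it cannot run standalone; the intended call flow, sketched with assumed values:

# sketch only - assumes a Flask app context and a populated database
model = AnnoyModel('mfccs', n_trees=10, distance_type='angular')
# ... add_recording_with_vector(id, vector) for each recording ...
model.build()
model.save()
ids, recordings, distances = model.get_nns_by_id(id=3, num_neighbours=5)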
Example #42
class WordVecSpaceAnnoy(WordVecSpaceDisk):

    N_TREES = 1
    METRIC = "angular"
    ANN_FILE = "vectors.ann"

    def __init__(self,
                 input_dir,
                 n_trees=N_TREES,
                 metric=METRIC,
                 index_fpath=None):
        super().__init__(input_dir)
        self.ann = AnnoyIndex(self.dim, metric=metric)

        self.ann_file = self.J(input_dir, self.ANN_FILE)
        if index_fpath:
            self.ann_file = self.J(index_fpath, self.ANN_FILE)

        self._create_annoy_file(n_trees)

        self.ann.load(self.ann_file)

    def J(self, p1, p2):
        return os.path.join(p1, p2)

    def _create_annoy_file(self, n_trees):
        for i in range(self.nvecs):
            v = self.vecs[i]
            self.ann.add_item(i, v)

        self.ann.build(n_trees)
        self.ann.save(self.ann_file)

    def get_distance(
        self,
        word_or_index1: Union[int, str, np.ndarray],
        word_or_index2: Union[int, str, np.ndarray],
    ):
        v1 = self._check_index_or_word(word_or_index1)
        v2 = self._check_index_or_word(word_or_index2)

        return self.ann.get_distance(v1, v2)

    def get_distances(
        self,
        row_words_or_indices: Union[int, str, np.ndarray],
        col_words_or_indices: Union[int, str, np.ndarray, None] = None,
    ):

        r = row_words_or_indices
        c = col_words_or_indices

        if not isinstance(r, (list, tuple, np.ndarray)):
            r = [r]

        if c is not None:  # a plain truth test raises on multi-element numpy arrays
            if not isinstance(c, (list, tuple, np.ndarray)):
                c = [c]

            mat = self._make_array(shape=((len(r)), len(c)), dtype=np.float32)

            for i, row_word in enumerate(r):
                dist = []
                for col_word in c:
                    dist.append(self.get_distance(row_word, col_word))

                mat[i] = np.asarray(dist, dtype=np.float32)

        else:
            mat = self._make_array(shape=((len(r)), self.nvecs),
                                   dtype=np.float32)
            dist = {}

            for i, row_word in enumerate(r):
                index = self._check_index_or_word(row_word)
                key, val = self.ann.get_nns_by_item(index,
                                                    self.nvecs,
                                                    include_distances=True)

                for k, v in zip(key, val):
                    dist[k] = v

                mat[i] = np.asarray(
                    [dist[key] for key in sorted(dist.keys(), reverse=False)],
                    dtype=np.float32,
                )

        return mat

    DEFAULT_K = 512

    def get_nearest(
        self,
        v_w_i: Union[int, str, np.ndarray],
        k: int = DEFAULT_K,
        combination: bool = False,
    ):
        if isinstance(v_w_i, (tuple, list)):
            res = []
            for word in v_w_i:
                index = self._check_index_or_word(word)

                if index:
                    res.append(self.ann.get_nns_by_item(
                        index, k))  # will find the k nearest neighbors

            # will find common nearest neighbors among given words
            if combination and len(v_w_i) > 1:
                return list(set(res[0]).intersection(*res))

            return res

        index = self._check_index_or_word(v_w_i)

        return self.ann.get_nns_by_item(index, k)
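The combination branch of get_nearest intersects the per-word neighbor lists to keep only items close to all query words; the same pattern with a bare AnnoyIndex, as a sketch:

import numpy as np
from annoy import AnnoyIndex

t = AnnoyIndex(4, 'angular')
rng = np.random.RandomState(0)
for i in range(50):
    t.add_item(i, rng.normal(size=4))
t.build(10)

res = [t.get_nns_by_item(w, 10) for w in (0, 1)]
common = set(res[0]).intersection(*res)  # neighbors shared by items 0 and 1
print(sorted(common))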
Example #43
class Recommender:
    def __init__(self):

        # connect to couchdb
        couch = couchdb.Server()
        couch.resource.credentials = ("imdb", "imdb")
        cdb = couch["filmdings3"]

        # fetch all sentiments
        all_sentiments = cdb.view("filmdings_design/sentiments_for_id")

        self.imdb_to_index = {}
        self.index_to_imdb = {}

        f = 10  # dimensions (= different sentiments)
        self.t = AnnoyIndex(f, metric="angular")

        for i in xrange(len(all_sentiments)):

            row = all_sentiments.rows[i]
            imdb_id = row.key
            s = row.value
            v = [
                s["anger"],
                s["anticipation"],
                s["disgust"],
                s["fear"],
                s["joy"],
                s["negative"],
                s["positive"],
                s["sadness"],
                s["surprise"],
                s["trust"],
            ]

            self.t.add_item(i, v)

            # we have to keep track which index belongs to which imdb id (and vice versa):
            self.imdb_to_index[imdb_id] = i
            self.index_to_imdb[i] = imdb_id

        self.t.build(100)  # number of trees (higher accuracy, but needs more memory)

    def recommend_for_filter(self, s):

        v = [int(s[0]), int(s[1]), int(s[2]), int(s[3]), int(s[4]), 0, 0, int(s[5]), int(s[6]), int(s[7])]

        print v
        movie_ids = []

        for ann_index in self.t.get_nns_by_vector(v, 500):
            imdb_id = self.index_to_imdb[ann_index]
            movie_ids.append(imdb_id)

        return movie_ids

    def recommend(self, movie_answers):

        votes_per_movie = {}

        movies_to_recommend = {}

        for movie in movie_answers:
            print "checking for", movie, "answer=", movie_answers[movie]
            for ann_index in self.t.get_nns_by_item(self.imdb_to_index[movie], 1000):
                imdb_id = self.index_to_imdb[ann_index]
                if not imdb_id in movies_to_recommend:
                    movies_to_recommend[imdb_id] = {}
                    movies_to_recommend[imdb_id]["votes"] = 0
                    movies_to_recommend[imdb_id]["distances"] = []
                    movies_to_recommend[imdb_id]["matched_by"] = []

                if movie_answers[movie] == "yes":
                    movies_to_recommend[imdb_id]["votes"] += 1
                    movies_to_recommend[imdb_id]["matched_by"].append(movie)
                else:
                    movies_to_recommend[imdb_id]["votes"] -= 1

                distance = self.t.get_distance(ann_index, self.imdb_to_index[movie])
                movies_to_recommend[imdb_id]["distances"].append(distance)

        # sort movies by number of votes and average distance
        dicts = movies_to_recommend.items()
        dicts.sort(
            key=lambda (k, d): (d["votes"], 1 / (sum(d["distances"]) / float(len(d["distances"])))), reverse=True
        )

        best_movie_ids = []
        for tupel in dicts[0:50]:
            movie_id = tupel[0]

            # only return movies that are not in movie_answers (user has already rated them, would be stupid to recommend them again)
            if not movie_id in movie_answers:
                best_movie_ids.append(movie_id)

        print best_movie_ids
        return best_movie_ids
class W2V_ANN(Model):
    def __init__(self, config):
        self.requirement = [
            'test_file', 'lastN', 'topN', 'type', 'item_vec_file',
            'index_file_file'
        ]
        self.config = config
        miss = set()
        for item in self.requirement:
            if item not in self.config:
                miss.add(item)
        if len(miss) > 0:
            raise Exception(f"Miss the key : {miss}")

        Model.__init__(self, self.config['test_file'], self.config['lastN'],
                       self.config['topN'])
        self.type = config['type']  # behavior / item
        self.action_w = {'ViewContent': 1.0, 'AddToCart': 3.0, 'revenue': 6.0}

    def train(self):
        b_time = time.time()
        self.item_idx = {}
        self.item_idx_reverse = {}
        tmp_vector = {}
        with open(self.config['item_vec_file'], 'r') as in_f:
            num_items, dim = in_f.readline().strip().split()
            print(f'Num of items : {num_items}, dim : {dim}')
            self.t = AnnoyIndex(int(dim), 'angular')

            for idx, line in tqdm(enumerate(in_f)):
                tmp = line.split()
                if self.type == 'item':
                    try:
                        action, item_org = tmp[0].split(':', 1)
                    except:
                        continue
                else:
                    action = None
                    item_org = tmp[0]

                if item_org not in self.item_idx:
                    self.item_idx[item_org] = idx
                    self.item_idx_reverse[idx] = item_org
                    tmp_vector[idx] = np.zeros(int(dim))
                else:
                    idx = self.item_idx[item_org]

                if self.type == 'item':
                    tmp_vector[idx] += np.array(
                        tmp[1:], dtype=float) * self.action_w[action]
                else:
                    tmp_vector[idx] += np.array(tmp[1:], dtype=float)

        for idx in tmp_vector:
            self.t.add_item(idx, tmp_vector[idx].tolist())
        print("Read file finished ...")
        file_name = self.config['index_file_file'] + '.' + self.type

        self.t.build(30)  # 10 trees
        self.t.save(f'{file_name}.ann')

        # self.t.load(f'{file_name}.ann')

        print(f"Train finished ...{time.time() - b_time}")

    def predict(self, last_n_events, topN):
        b_time = time.time()
        candidate_set = set()
        if self.type == 'item':
            last_n_items = [
                self.item_idx[e.split(':', 1)[1]] for e in last_n_events[::-1]
                if e.split(':', 1)[1] in self.item_idx
            ]
        else:
            last_n_items = [
                self.item_idx[e] for e in last_n_events[::-1]
                if e in self.item_idx
            ]

        if len(last_n_items) == 0:
            return []

        for item_idx in last_n_items:
            candidate = self.__item_topK_similar(item_idx, topN)
            candidate_set.update(candidate)

        candidate_set -= set(last_n_items)
        candidate_list = list(candidate_set)
        score_matric = np.zeros((len(last_n_items), len(candidate_list)))
        for i, item_id in enumerate(last_n_items):
            score_matric[i] = self.__item_item_arr_norm_score(
                item_id, candidate_list)

        rank_weight = np.array(
            [1 / np.log2(rank + 2) for rank in range(len(last_n_items))])
        final_score = rank_weight.dot(score_matric).tolist()
        # print(last_n_items, list(zip(candidate_list, final_score)))
        final_items = sorted(zip(candidate_list, final_score),
                             key=lambda x: x[1],
                             reverse=True)
        # print(f"[Time|Preict] {time.time()-b_time}")

        res = []
        for item, score in final_items:
            try:
                if self.type == 'item':
                    item_raw = self.item_idx_reverse[item]
                else:
                    item_raw = self.item_idx_reverse[item].split(':', 1)[1]

                if item_raw in res:
                    continue
                res.append(item_raw)
            except:
                pass
            if len(res) == topN:
                break
        return res

    def __item_topK_similar(self, given_idx, topK):
        return self.t.get_nns_by_item(given_idx, topK)

    def __item_item_arr_norm_score(self, given_idx, candidate_idx_arr):
        res = [
            1 - self.t.get_distance(given_idx, candidate_idx)
            for candidate_idx in candidate_idx_arr
        ]
        # print(given_idx, sorted(zip(candidate_idx_arr, res), key=lambda x:x[1], reverse=True))
        res = np.array(res)
        return res / np.linalg.norm(res)
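__item_item_arr_norm_score converts angular distances to similarities with 1 - d and then L2-normalizes the row; a small standalone sketch of that conversion (values assumed):

import numpy as np

dists = np.array([0.2, 0.5, 1.1])  # angular distances, range [0, 2]
scores = 1 - dists                  # higher = more similar; can go negative
print(scores / np.linalg.norm(scores))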
class ProbCalculator():
    def __init__(self):
        ## fixme: replace the static part with a dict precomputed offline
        from annoy import AnnoyIndex
        from utils import pickle_load
        self.ann_index = AnnoyIndex(300, metric='euclidean')
        self.ann_index.load('data/index/annoy.euclidean.idx')
        self.id2word = pickle_load('data/index/id2word.pkl')
        self.word2id = pickle_load('data/index/word2id.pkl')

    def calc_probs(self, char, candidates):
        static_probs = self._static_probs(char, candidates)
        probs = static_probs  # the dynamic probs below turned out to be useless
        # if static_probs is not None:
        #   dynamic_probs = self._dynamic_probs(candidates, tokens, idx)
        #   joint_probs = [0.4 * sp + 0.6 * dp for sp, dp in zip(static_probs, dynamic_probs)]
        #   probs_sum = sum(joint_probs)
        #   if probs_sum > 0:
        #     probs = [p / probs_sum for p in joint_probs]
        return probs

    def _static_probs(self, char, candidates):
        """
    计算静态的emb相似度
    :param char:
    :param candidates:
    :return:
    """
        probs = None
        if char in self.word2id:
            # char_vec = np.array(self.ann_index.get_item_vector(self.word2id[char]))
            # unk_sim = 1 / np.sqrt(sum(char_vec ** 2))  # unk is equivalent to vec = np.zeros, so dis = |v[char]|, but under Euclidean distance this sim would be higher than for most non-unk words
            unk_sim = 0

            probs_sum = 0
            probs = []
            for candidate in candidates:
                sim = unk_sim
                if candidate in self.word2id:
                    sim = 1 / self.ann_index.get_distance(
                        self.word2id[char],
                        self.word2id[candidate])  # the code above has already filtered out the dis == 0 case
                probs.append(sim)
                probs_sum += sim
            if probs_sum > 0:
                probs = [p / probs_sum for p in probs]
            else:
                probs = None
        return probs

    def _dynamic_probs(self, candidates, tokens, idx):
        """
    根据上下文计算candidate对edit distance和jaccard的影响
    :param char:
    :param candidates:
    :param tokens:
    :param idx:
    :return:
    """
        # target_token = tokens[idx]
        # t_set = set(target_token)
        # len_target_token = len(target_token)
        # sentence = ''.join(tokens)
        # probs = []
        # for candidate in candidates:
        #   if len(candidate) == 0:
        #     probs.append(0.5)
        #     continue
        #   c_set = set(candidate)
        #   delta_len = len(c_set) - len(c_set & t_set)
        #   edit_sim = 1 - delta_len / (len(sentence) + 1e-8)
        #   jaccard_sim = len(c_set & set(sentence)) / len(c_set)
        #   sim = 0.4 * edit_sim + 0.6 * jaccard_sim
        #   probs.append(sim)
        probs = []
        s_set = set(''.join(tokens))
        for candidate in candidates:
            prob = 0
            for c in candidate:
                if c in s_set:
                    prob = 1
                    break
            probs.append(prob)
        probs_sum = sum(probs)
        if probs_sum > 0:
            probs = [p / probs_sum for p in probs]
        else:
            probs = [1] * len(candidates)
        return probs
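
The inverse-distance weighting in `_static_probs` can be checked in isolation; this toy sketch uses made-up 2-d vectors in place of the 300-d index loaded from `data/index/annoy.euclidean.idx`:

from annoy import AnnoyIndex

idx = AnnoyIndex(2, 'euclidean')
idx.add_item(0, [0.0, 0.0])  # plays the role of char
idx.add_item(1, [1.0, 0.0])  # candidate at distance 1
idx.add_item(2, [3.0, 0.0])  # candidate at distance 3
idx.build(10)

sims = [1 / idx.get_distance(0, c) for c in (1, 2)]  # inverse distance, as in _static_probs
total = sum(sims)
print([s / total for s in sims])  # [0.75, 0.25]: the closer candidate wins
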
def generate_graph(img, graph_type='grid', d=10, n_tree=10, search_k=-1):
    '''
    Assumes the image is channel-last.
    Generates a grid graph or a nearest-neighbor graph and returns a
    list of edge arrays, each sorted by edge weight.

    Arguments:
        img: a numpy array, channel last
        graph_type: 'grid' or 'nn'
        d: number of nearest neighbors per pixel ('nn' only)
        n_tree: more trees give a more precise index
        search_k: Annoy search-time parameter (unused in the body shown)
    '''

    img = _preprocessing(img)

    graphs = []
    rows = img.shape[0]
    cols = img.shape[1]
    num_vertices = rows * cols
    num_edges = (rows - 1) * cols + (cols - 1) * rows

    if graph_type == 'grid':
        #generate grid_graph
        for c in range(img.shape[2]):
            #create edges array
            edges = np.empty((num_edges, 3), dtype=np.float64)

            #insert edges into the edges array
            #each node connects to its right and down neighbors,
            #except the rightmost column has no right connection
            #and the bottom row has no down connection
            index = 0
            for i in range(rows):
                for j in range(cols):

                    #add an edge from the current node to its right neighbor
                    #unless this is the rightmost column
                    if j < cols - 1:
                        edges[index][0] = i * cols + j
                        edges[index][1] = i * cols + j + 1
                        edges[index][2] = abs(img[i][j][c] -
                                              img[i][j + 1][c])  #weight
                        index += 1

                    #add an edge from the current node to its down neighbor
                    #unless this is the bottom row
                    if i < rows - 1:
                        edges[index][0] = i * cols + j
                        edges[index][1] = (i + 1) * cols + j
                        edges[index][2] = abs(img[i][j][c] -
                                              img[i + 1][j][c])  #weight
                        index += 1

            edges = edges[edges[:, 2].argsort()]
            graphs.append(edges)

    elif graph_type == 'nn':
        #generate a nearest neighbor graph:
        #use the Annoy library (approximate nearest neighbors)
        #to find the d nearest neighbors of each pixel

        f = 5  # feature dimension per pixel: 3 color channels + (row, col)
        t = AnnoyIndex(f, 'euclidean')
        nn_graph = []
        rows = img.shape[0]
        cols = img.shape[1]

        for i in range(rows):
            for j in range(cols):
                v = [img[i, j, 0], img[i, j, 1], img[i, j, 2], i, j]
                t.add_item(i * cols + j, v)

        t.build(n_tree)

        for i in range(rows * cols):
            for neighbor in t.get_nns_by_item(i, d):
                if neighbor > i:
                    nn_graph.append([i, neighbor, t.get_distance(i, neighbor)])
                elif neighbor < i:
                    nn_graph.append([neighbor, i, t.get_distance(i, neighbor)])

        nn_graph = np.array(nn_graph)
        nn_graph = nn_graph[np.unique(nn_graph[:, :2],
                                      axis=0,
                                      return_index=True)[1]]
        graphs.append(nn_graph[nn_graph[:, 2].argsort()])

    else:
        raise ValueError("No such graph type, must be 'grid' or 'nn'")

    print('------------------------------------------')
    print(f'{graph_type} type graph construction done')
    print('------------------------------------------')
    return graphs
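
A possible usage sketch for `generate_graph`; the random image is hypothetical, and it assumes `_preprocessing` and the Annoy import are available in the same module:

import numpy as np

img = np.random.rand(32, 32, 3)  # hypothetical channel-last RGB image
grid_graphs = generate_graph(img, graph_type='grid')  # one sorted edge list per channel
nn_graphs = generate_graph(img, graph_type='nn', d=10, n_tree=10)  # single sorted edge list
print(grid_graphs[0].shape)  # (num_edges, 3): node_u, node_v, weight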
Example #47
0
 def test_rounding_error(self):
     # https://github.com/spotify/annoy/issues/314
     i = AnnoyIndex(1, 'euclidean')
     i.add_item(0, [0.7125930])
     i.add_item(1, [0.7123166])
     self.assertGreater(i.get_distance(0, 1), 0.0)
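
A hedged illustration of the float32 cancellation such a test guards against: computing a squared distance via the expanded form ||u||^2 - 2*u.v + ||v||^2 can collapse to zero for near-identical points, while the direct difference keeps the gap:

import numpy as np

u = np.float32(0.7125930)
v = np.float32(0.7123166)
expanded = u * u - 2 * u * v + v * v  # heavy cancellation of ~0.5-sized terms
direct = (u - v) * (u - v)            # the small gap survives
print(expanded, direct)               # expanded may round to 0.0; direct ~ 7.6e-08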
Example #48
0
def generate_triplets(X, n_inlier, n_outlier, n_random, fast_trimap=True, weight_adj=False, verbose=True):
    n, dim = X.shape
    if dim > 100:
        X = TruncatedSVD(n_components=100, random_state=0).fit_transform(X)
        dim = 100
    exact = n <= 10000
    n_extra = min(max(n_inlier, 200), n)
    if exact: # do exact knn search
        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)
    elif fast_trimap: # use annoy
        tree = AnnoyIndex(dim)
        for i in range(n):
            tree.add_item(i, X[i,:])
        tree.build(50)
        nbrs = np.empty((n,n_extra), dtype=np.int64)
        distances = np.empty((n,n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i,:] = tree.get_nns_by_item(i, n_extra)
            for j in range(n_extra):
                dij[j] = euclid_dist(X[i,:], X[nbrs[i,j],:])
            sort_indices = np.argsort(dij)
            nbrs[i,:] = nbrs[i,sort_indices]
            # for j in range(n_extra):
            #     distances[i,j] = tree.get_distance(i, nbrs[i,j])
            distances[i,:] = dij[sort_indices]
    else:
        n_bf = 10
        n_extra += n_bf
        knn_tree = knn(n_neighbors=n_bf, algorithm='auto').fit(X)
        _, nbrs_bf = knn_tree.kneighbors(X)
        nbrs = np.empty((n,n_extra), dtype=np.int64)
        nbrs[:,:n_bf] = nbrs_bf
        tree = AnnoyIndex(dim)
        for i in range(n):
            tree.add_item(i, X[i,:])
        tree.build(60)
        distances = np.empty((n,n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i,n_bf:] = tree.get_nns_by_item(i, n_extra-n_bf)
            unique_nn = np.unique(nbrs[i,:])
            n_unique = len(unique_nn)
            nbrs[i,:n_unique] = unique_nn
            for j in range(n_unique):
                dij[j] = euclid_dist(X[i,:], X[nbrs[i,j],:])
            sort_indices = np.argsort(dij[:n_unique])
            nbrs[i,:n_unique] = nbrs[i,sort_indices]
            distances[i,:n_unique] = dij[sort_indices]
    if verbose:
        print("found nearest neighbors")
    sig = np.maximum(np.mean(distances[:, 10:20], axis=1), 1e-20) # scale parameter
    P = find_p(distances, sig, nbrs)
    triplets = sample_knn_triplets(P, nbrs, n_inlier, n_outlier)
    n_triplets = triplets.shape[0]
    outlier_dist = np.empty(n_triplets, dtype=np.float64)
    if exact or not fast_trimap:
        for t in range(n_triplets):
            outlier_dist[t] = np.sqrt(np.sum((X[triplets[t,0],:] - X[triplets[t,2],:])**2))
    else:
        for t in range(n_triplets):
            outlier_dist[t] = tree.get_distance(triplets[t,0], triplets[t,2])
    weights = find_weights(triplets, P, nbrs, outlier_dist, sig)
    if n_random > 0:
        rand_triplets = sample_random_triplets(X, n_random, sig)
        rand_weights = rand_triplets[:,-1]
        rand_triplets = rand_triplets[:,:-1].astype(np.int64)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, rand_weights))
    weights /= np.max(weights)
    weights += 0.0001
    if weight_adj:
        weights = np.log(1 + 50 * weights)
        weights /= np.max(weights)
    return (triplets, weights)
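
The Annoy-backed branch of `generate_triplets` can be exercised on its own; this sketch mirrors the index construction above on random data (the 'angular' metric is stated explicitly here, matching what older Annoy versions defaulted to when `AnnoyIndex(dim)` was called without one):

import numpy as np
from annoy import AnnoyIndex

X = np.random.rand(100, 20)
tree = AnnoyIndex(20, 'angular')  # explicit metric; AnnoyIndex(dim) defaulted to this
for i in range(X.shape[0]):
    tree.add_item(i, X[i, :])
tree.build(50)  # same tree count as the fast_trimap branch above

n_extra = 5  # arbitrary for the sketch
nbrs = np.array([tree.get_nns_by_item(i, n_extra) for i in range(X.shape[0])])
print(nbrs.shape)  # (100, 5); each row typically starts with i itself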