Code example #1
 def test_with_cache(self):
     cache = libwmdrelax.emd_cache_init(4)
     w1, w2, dist = self._get_w1_w2_dist()
     r = libwmdrelax.emd(w1, w2, dist, cache)
     self.assertAlmostEqual(r, 0.6125115)
     r = libwmdrelax.emd(w1, w2, dist, cache=cache)
     self.assertAlmostEqual(r, 0.6125115)
     libwmdrelax.emd_cache_fini(cache)
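The test above (see also example #7) assumes a helper `_get_w1_w2_dist` that builds the solver inputs. The exact values behind 0.6125115 are not shown, so the sketch below only illustrates what valid inputs look like: two normalized float32 weight vectors over a joint vocabulary and a float32 cost matrix, with the optional reusable cache from example #1. Sizes and numbers are illustrative.

    import numpy
    import libwmdrelax

    # Two float32 weight vectors over a joint vocabulary of 4 words;
    # each is normalized to sum to 1, as the other examples do.
    w1 = numpy.array([0.5, 0.5, 0.0, 0.0], dtype=numpy.float32)
    w2 = numpy.array([0.0, 0.0, 0.5, 0.5], dtype=numpy.float32)
    # A symmetric 4x4 float32 cost matrix with a zero diagonal.
    dist = numpy.random.rand(4, 4).astype(numpy.float32)
    dist = (dist + dist.T) / 2
    numpy.fill_diagonal(dist, 0)

    r = libwmdrelax.emd(w1, w2, dist)      # one-shot call, no cache
    cache = libwmdrelax.emd_cache_init(4)  # reusable buffer sized to the vocabulary
    r = libwmdrelax.emd(w1, w2, dist, cache)
    libwmdrelax.emd_cache_fini(cache)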
Code example #2
        def compute_similarity(self, doc1, doc2, evec=None, single_vector=False):
            """
            Calculates the similarity between two spaCy documents. Extracts the
            nBOW from them and evaluates the WMD.

            :return: The calculated similarity.
            :rtype: float.
            """
            doc1 = self._convert_document(doc1)
            doc2 = self._convert_document(doc2)
            vocabulary = {
                w: i
                for i, w in enumerate(sorted(set(doc1).union(doc2)))
            }
            w1 = self._generate_weights(doc1, vocabulary)
            w2 = self._generate_weights(doc2, vocabulary)

            if not single_vector:
                evec = numpy.zeros(
                    (len(vocabulary), self.nlp.vocab.vectors_length),
                    dtype=numpy.float32)
                for w, i in vocabulary.items():
                    evec[i] = self.nlp.vocab[w].vector
            evec_sqr = (evec * evec).sum(axis=1)
            dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
            dists[dists < 0] = 0
            dists = numpy.sqrt(dists)
            return libwmdrelax.emd(w1, w2, dists)
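The three lines that build `dists` expand the squared Euclidean distance as ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, clamp small negative values produced by floating-point error, and take the square root. A quick self-contained check that this matches a reference pairwise distance; scipy is used only for the comparison and is not needed by the code above.

    import numpy
    from scipy.spatial.distance import cdist

    evec = numpy.random.rand(6, 3).astype(numpy.float32)
    evec_sqr = (evec * evec).sum(axis=1)
    dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
    dists[dists < 0] = 0
    dists = numpy.sqrt(dists)
    # agrees with scipy's pairwise Euclidean distances up to float32 noise
    assert numpy.allclose(dists, cdist(evec, evec), atol=1e-4)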
Code example #3
import numpy as np
import libwmdrelax


def wmd_similarity(span1, span2):
    """Slightly simplified Earth Mover's Distance for ELMo embeddings.

    EMD solvers take two weight vectors and a distance matrix. With ELMo
    there is no single vector per word type, so every token is treated as
    its own vocabulary entry, since each occurrence has a different vector.
    Both weight vectors therefore span the two concatenated token lists,
    with equal weight 1 / len(doc) for each token of its own document. The
    distance matrix holds the pairwise Euclidean distances between the
    stacked token embeddings.

    libwmdrelax.emd(weights1, weights2, distance_matrix)

    :param span1: dict with keys 'id', 'doc' (token list) and 'elmo'.
    :param span2: dict with the same structure as span1.
    :return: tuple of (negated WMD, beam dict).
    """
    beam = {span1['id']: {span2['id']: 'x'}}
    doc1 = span1['doc']
    doc2 = span2['doc']
    n = len(doc1) + len(doc2)
    w1 = np.zeros(n, dtype=np.float32)
    w1[:len(doc1)] = 1 / len(doc1)
    w2 = np.zeros(n, dtype=np.float32)
    w2[len(doc1):] = 1 / len(doc2)
    elmo1 = span1['elmo'][2]
    elmo2 = span2['elmo'][2]
    evec = np.vstack((elmo1, elmo2))
    evec_sqr = (evec * evec).sum(axis=1)
    dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, np.newaxis]
    dists[dists < 0] = 0
    dists = np.sqrt(dists)

    try:
        return -libwmdrelax.emd(w1, w2, dists), beam
    except RuntimeError:
        print(w1, w2)
        return 0, beam
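A hedged usage sketch for the function above. The span dictionaries here are hypothetical and only include the fields the function actually reads: 'id', 'doc' (a token list), and 'elmo', whose element [2] is assumed to hold one embedding row per token.

    import numpy as np

    span1 = {'id': 'a', 'doc': ['hello', 'world'],
             'elmo': [None, None, np.random.rand(2, 1024).astype(np.float32)]}
    span2 = {'id': 'b', 'doc': ['hi', 'there', 'world'],
             'elmo': [None, None, np.random.rand(3, 1024).astype(np.float32)]}
    score, beam = wmd_similarity(span1, span2)  # score is the negated WMD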
Code example #4
 def _WMD_batch(self, words1, weights1, i2):
     joint, w1, w2 = self._common_vocabulary_batch(words1, weights1, i2)
     w1 /= w1.sum()
     w2 /= w2.sum()
     evec = self.embeddings[joint]
     evec_sqr = (evec * evec).sum(axis=1)
     dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
     dists[dists < 0] = 0
     dists = numpy.sqrt(dists)
     return libwmdrelax.emd(w1, w2, dists, self._exact_cache)
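Examples #4 and #5 pass a pre-allocated `self._exact_cache` to every call. Going by the cache API shown in example #1, such a cache is presumably created once with a capacity covering the largest joint vocabulary and released when done; a sketch under that assumption, where `vocabulary_max` is illustrative:

    import libwmdrelax

    vocabulary_max = 1000  # illustrative upper bound on the joint vocabulary size
    exact_cache = libwmdrelax.emd_cache_init(vocabulary_max)
    # ... repeated libwmdrelax.emd(w1, w2, dists, exact_cache) calls ...
    libwmdrelax.emd_cache_fini(exact_cache)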
Code example #5
 def _WMD_batch(self, words1, weights1, i2):
     joint, w1, w2 = self._common_vocabulary_batch(words1, weights1, i2)
     w1 /= w1.sum()  # normalize counts into weights by dividing by the sum of counts
     w2 /= w2.sum()
     evec = self.embeddings[joint]  # embeddings for words in both sentences
     evec_sqr = (evec * evec).sum(axis=1)
     dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
     dists[dists < 0] = 0
     dists = numpy.sqrt(dists)  # euclidean matrix between embeddings
     numpy.fill_diagonal(dists, 0)  # floating-point noise can leave tiny nonzero self-distances
     return libwmdrelax.emd(w1, w2, dists, self._exact_cache)
Code example #6
 def compute_similarity(self, doc1, doc2):
     doc1 = self._convert_document(doc1)
     doc2 = self._convert_document(doc2)
     vocabulary = {
         w: i
         for i, w in enumerate(sorted(set(doc1).union(doc2)))
     }
     w1 = self._generate_weights(doc1, vocabulary)
     w2 = self._generate_weights(doc2, vocabulary)
     evec = numpy.zeros(
         (len(vocabulary), self.nlp.vocab.vectors_length),
         dtype=numpy.float32)
     for w, i in vocabulary.items():
         evec[i] = self.nlp.vocab[w].vector
     evec_sqr = (evec * evec).sum(axis=1)
     dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
     dists[dists < 0] = 0
     dists = numpy.sqrt(dists)
     return libwmdrelax.emd(w1, w2, dists)
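Examples #2 and #6 are the spaCy integration from the `wmd` package. Based on the wmd-relax README, the hook is wired into a pipeline roughly as follows; the model name is only an example, and the `add_pipe` call shown here follows the spaCy 2.x API, which differs in spaCy 3.

    import spacy
    import wmd

    nlp = spacy.load('en_core_web_md')  # any model that ships word vectors
    nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)
    doc1 = nlp("Politician speaks to the media in Illinois.")
    doc2 = nlp("The president greets the press in Chicago.")
    print(doc1.similarity(doc2))  # WMD instead of the default cosine similarity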
Code example #7
 def test_no_cache(self):
     w1, w2, dist = self._get_w1_w2_dist()
     r = libwmdrelax.emd(w1, w2, dist)
     self.assertAlmostEqual(r, 0.6125115)
Code example #8
File: __init__.py  Project: GhibliField/wmd-relax
    def nearest_neighbors(self, origin, k=10, early_stop=0.5, max_time=3600,
                          skipped_stop=0.99, throw=True):
        """
        Find the samples closest to the specified one by the WMD metric.
        Call :func:`~wmd.WMD.cache_centroids()` beforehand to accelerate the
        first (sorting) stage.

        :param origin: Identifier of the queried sample.
        :param k: The number of nearest neighbors to return.
        :param early_stop: Stop after looking through this ratio of the whole \
                           dataset.
        :param max_time: Maximum time to run. If the runtime exceeds this \
                         threshold, this method stops.
        :param skipped_stop: The stop trigger which is the ratio of samples \
                             which have been skipped thanks to the second \
                             relaxation. The closer to 1, the less chance of \
                             missing an important nearest neighbor.
        :param throw: If True, raise an exception when an invalid sample is \
                      evaluated, instead of just logging it.
        :type origin: suitable for :py:attr:`~wmd.WMD.nbow`
        :type k: int
        :type early_stop: float
        :type max_time: int
        :type skipped_stop: float
        :type throw: bool
        :return: List of tuples, each tuple has length 2. The first element \
                 is a sample identifier, the second is the WMD. This list \
                 is sorted in distance ascending order, so the first tuple is \
                 the closest sample.

        :raises ValueError: if the queried entity has too small a vocabulary \
                            (see :py:attr:`~wmd.WMD.vocabulary_min`).
        :raises RuntimeError: if the native code which calculates the EMD fails.
        """
        # origin can be either a text query or an id
        if isinstance(origin, (tuple, list)):
            words, weights = origin
            weights = numpy.array(weights, dtype=numpy.float32)
            if len(words) > self.vocabulary_max:
                words, weights = self.vocabulary_optimizer(
                    words, weights, self.vocabulary_max)
            index = None
            avg = self._get_centroid(words, weights, force=True)
        else:
            index = origin
            words, weights = self._get_vocabulary(index)
            avg = self._get_centroid_by_index(index)
        if avg is None:
            raise ValueError(
                "Too little vocabulary for %s: %d" % (index, len(words)))
        self._log.info("Vocabulary size: %d %d",
                       len(words), self.vocabulary_max)
        self._log.info("WCD")
        ts = time()
        if self._centroid_cache is None:
            queue = []
            for i2 in self.nbow:
                if i2 == index:
                    continue
                d = self._estimate_WMD_centroid_batch(avg, i2)
                if d is not None:
                    queue.append((d, i2))
            queue.sort()
        else:
            keys, centroids = self._centroid_cache
            dists = numpy.linalg.norm(centroids - avg, axis=-1)
            queue = [(None, k) for k in keys[numpy.argsort(dists)]
                     if k is not None]
        self._log.info("%.1f", time() - ts)
        self._log.info("First K WMD")
        ts = time()
        try:
            neighbors = [(-self._WMD_batch(words, weights, i2), i2)
                         for (_, i2) in queue[:k]]
        except RuntimeError as e:
            e.keys = [i2 for (_, i2) in queue[:k]]
            raise e from None
        heapq.heapify(neighbors)
        self._log.info("%s", neighbors[:10])
        self._log.info("%.1f", time() - ts)
        self._log.info("P&P")
        skipped = estimated_d = 0
        ppts = time()
        for progress, (_, i2) in enumerate(queue[k:int(len(queue) * early_stop)]):
            if progress % 10 == 0 \
                    and time() - ppts > self.main_loop_log_interval:
                skipped_ratio = skipped / max(progress, 1)
                self._log.info(
                    "%s %s %s %s %s", progress, skipped_ratio, estimated_d,
                    neighbors[:3], [self.nbow[n[1]][0] for n in neighbors[-3:]])
                ppts = time()
                if ppts - ts > max_time:
                    self._log.info("stopped by max_time condition")
                    break
                if skipped_ratio >= skipped_stop:
                    self._log.info("stopped by skipped_stop condition")
                    break
            estimated_d, w1, w2, dists = self._estimate_WMD_relaxation_batch(
                words, weights, i2)
            farthest = -neighbors[0][0]
            if farthest == 0:
                self._log.info("stopped by farthest == 0 condition")
                break
            if estimated_d >= farthest:
                skipped += 1
                continue
            try:
                d = libwmdrelax.emd(w1, w2, dists, self._exact_cache)
            except RuntimeError as e:
                if throw:
                    e.w1 = w1
                    e.w2 = w2
                    e.dists = dists
                    e.key = i2
                    raise e from None
                else:
                    self._log.error("#%s: %s", i2, e)
                    continue
            if d < farthest:
                heapq.heapreplace(neighbors, (-d, i2))
        else:
            self._log.info("stopped by early_stop condition")
        neighbors = [(-n[0], n[1]) for n in neighbors]
        neighbors.sort()
        return [(n[1], n[0]) for n in neighbors]
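The two-stage pruning in `nearest_neighbors` rests on cheap lower bounds for the exact WMD: the word-centroid distance (WCD) behind the initial sort, and the pairwise relaxation evaluated inside the loop so that candidates that cannot beat the current k-th neighbor are skipped without running the EMD solver. A minimal sketch of both bounds in the spirit of Kusner et al.'s WMD paper; the function names are illustrative, not the private methods above.

    import numpy

    def wcd(w1, evec1, w2, evec2):
        # Word Centroid Distance: distance between the weighted mean embeddings.
        return numpy.linalg.norm(w1.dot(evec1) - w2.dot(evec2))

    def relaxed_wmd(w1, w2, dists):
        # Drop one side's constraints: every word ships all of its mass to the
        # nearest word on the other side. The max over both directions is still
        # a lower bound on the exact EMD, but a tighter one.
        lb1 = (w1 * dists.min(axis=1)).sum()
        lb2 = (w2 * dists.min(axis=0)).sum()
        return max(lb1, lb2)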
Code example #9
 def nearest_neighbors(self, origin, k=10, early_stop=0.5, max_time=3600,
                       skipped_stop=0.999):
     if isinstance(origin, (tuple, list)):
         words, weights = origin
         index = None
         avg = self._get_centroid(words, weights)
     else:
         index = origin
         words, weights = self._get_vocabulary(index)
         avg = self._get_centroid_by_index(index)
     if avg is None:
         raise ValueError("Too little vocabulary for %s: %d" %
                          (index, len(words)))
     self._log.info("Vocabulary size: %d %d", len(words),
                    self.vocabulary_max)
     self._log.info("WCD")
     ts = time()
     if self._centroid_cache is None:
         queue = []
         for i2 in self.nbow:
             if i2 == index:
                 continue
             d = self._estimate_WMD_centroid_batch(avg, i2)
             if d is not None:
                 queue.append((d, i2))
         queue.sort()
     else:
         keys, centroids = self._centroid_cache
         dists = numpy.linalg.norm(centroids - avg, axis=-1)
         queue = [(None, k) for k in keys[numpy.argsort(dists)]
                  if k is not None]
     self._log.info("%.1f", time() - ts)
     self._log.info("First K WMD")
     ts = time()
     neighbors = [(-self._WMD_batch(words, weights, i2), i2)
                  for (_, i2) in queue[:k]]
     heapq.heapify(neighbors)
     self._log.info("%s", neighbors[:10])
     self._log.info("%.1f", time() - ts)
     self._log.info("P&P")
     skipped = estimated_d = 0
     ppts = time()
     for progress, (_, i2) in enumerate(queue[k:int(len(queue) * early_stop)]):
         if progress % 10 == 0 \
                 and time() - ppts > self.main_loop_log_interval:
             skipped_ratio = skipped / max(progress, 1)
             self._log.info("%s %s %s %s %s", progress, skipped_ratio,
                            estimated_d, neighbors[:3],
                            [self.nbow[n[1]][0] for n in neighbors[-3:]])
             ppts = time()
             if ppts - ts > max_time:
                 self._log.info("stopped by max_time condition")
                 break
             if skipped_ratio >= skipped_stop:
                 self._log.info("stopped by skipped_stop condition")
                 break
         estimated_d, w1, w2, dists = self._estimate_WMD_relaxation_batch(
             words, weights, i2)
         farthest = -neighbors[0][0]
         if farthest == 0:
             self._log.info("stopped by farthest == 0 condition")
             break
         if estimated_d >= farthest:
             skipped += 1
             continue
         d = libwmdrelax.emd(w1, w2, dists, self._exact_cache)
         if d < farthest:
             heapq.heapreplace(neighbors, (-d, i2))
     else:
         self._log.info("stopped by early_stop condition")
     neighbors = [(-n[0], n[1]) for n in neighbors]
     neighbors.sort()
     return [(n[1], n[0]) for n in neighbors]