def test_with_cache(self):
    cache = libwmdrelax.emd_cache_init(4)
    w1, w2, dist = self._get_w1_w2_dist()
    r = libwmdrelax.emd(w1, w2, dist, cache)
    self.assertAlmostEqual(r, 0.6125115)
    r = libwmdrelax.emd(w1, w2, dist, cache=cache)
    self.assertAlmostEqual(r, 0.6125115)
    libwmdrelax.emd_cache_fini(cache)
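A minimal sketch of the inputs libwmdrelax.emd() expects: two float32 weight vectors over a shared vocabulary and a square float32 distance matrix. The values below are illustrative stand-ins, not the actual fixture behind _get_w1_w2_dist(), and the assumption that the capacity passed to emd_cache_init() must cover the vocabulary size is inferred from the test above.

import numpy
import libwmdrelax

# Hypothetical stand-in for _get_w1_w2_dist(): normalized weights plus a
# symmetric distance matrix with a zero diagonal.
w1 = numpy.array([0.25, 0.25, 0.25, 0.25], dtype=numpy.float32)
w2 = numpy.array([0.5, 0.5, 0.0, 0.0], dtype=numpy.float32)
dist = numpy.random.RandomState(7).rand(4, 4).astype(numpy.float32)
dist = (dist + dist.T) / 2
numpy.fill_diagonal(dist, 0)

cache = libwmdrelax.emd_cache_init(4)  # capacity >= vocabulary size (assumed)
try:
    print(libwmdrelax.emd(w1, w2, dist, cache))
finally:
    libwmdrelax.emd_cache_fini(cache)  # the cache is not freed automatically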
def compute_similarity(self, doc1, doc2, evec=None, single_vector=False):
    """
    Calculates the similarity between two spaCy documents. Extracts the
    nBOW from them and evaluates the WMD.

    :param evec: Precomputed embedding matrix; it is used only when \
                 ``single_vector`` is True, otherwise it is rebuilt from \
                 the joint vocabulary.
    :return: The calculated similarity.
    :rtype: float.
    """
    doc1 = self._convert_document(doc1)
    doc2 = self._convert_document(doc2)
    vocabulary = {
        w: i for i, w in enumerate(sorted(set(doc1).union(doc2)))}
    w1 = self._generate_weights(doc1, vocabulary)
    w2 = self._generate_weights(doc2, vocabulary)
    if not single_vector:
        evec = numpy.zeros((len(vocabulary), self.nlp.vocab.vectors_length),
                           dtype=numpy.float32)
        for w, i in vocabulary.items():
            evec[i] = self.nlp.vocab[w].vector
    evec_sqr = (evec * evec).sum(axis=1)
    dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
    dists[dists < 0] = 0
    dists = numpy.sqrt(dists)
    return libwmdrelax.emd(w1, w2, dists)
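A hedged usage sketch for the method above, assuming it is wmd-relax's SpacySimilarityHook registered via the spaCy 2.x pipeline API; the model name is an assumption and any vectors-enabled model would do.

import spacy
import wmd

nlp = spacy.load("en_core_web_md")
nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)
doc1 = nlp("Politician speaks to the media in Illinois.")
doc2 = nlp("The president greets the press in Chicago.")
print(doc1.similarity(doc2))  # routed to compute_similarity() above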
def wmd_similarity(span1, span2):
    """
    Slightly simplified Earth Mover's Distance for ELMo embeddings.

    EMD solvers take weight vectors and a distance matrix. With ELMo there
    is no single vector per word type, so every token is treated as an
    individual entry, because repeated words have different vectors. The
    weight vectors therefore span the joint vocabulary of both documents,
    with a uniform weight of 1/len(doc) for each token. The solver is then
    called as libwmdrelax.emd(weights1, weights2, distance_matrix).

    :param span1: First span dict with 'id', 'doc' and 'elmo' entries.
    :param span2: Second span dict with 'id', 'doc' and 'elmo' entries.
    :return: Tuple of the negated EMD and the beam mapping.
    """
    beam = {span1['id']: {span2['id']: 'x'}}
    doc1 = span1['doc']
    doc2 = span2['doc']
    l = len(doc1) + len(doc2)
    w1 = np.zeros(l, dtype=np.float32)
    w1[:len(doc1)] = 1 / len(doc1)
    w2 = np.zeros(l, dtype=np.float32)
    w2[len(doc1):] = 1 / len(doc2)
    elmo1 = span1['elmo'][2]
    elmo2 = span2['elmo'][2]
    evec = np.vstack((elmo1, elmo2))
    evec_sqr = (evec * evec).sum(axis=1)
    dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, np.newaxis]
    dists[dists < 0] = 0
    dists = np.sqrt(dists)
    try:
        return -libwmdrelax.emd(w1, w2, dists), beam
    except RuntimeError:
        print(w1, w2)
        return 0, beam
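The dists computation above is the standard expansion ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2, with negative round-off clamped before the square root. A small self-contained check against scipy (assuming scipy is available) makes the equivalence concrete:

import numpy as np
from scipy.spatial.distance import cdist

evec = np.random.RandomState(0).rand(5, 16).astype(np.float32)
evec_sqr = (evec * evec).sum(axis=1)
dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, np.newaxis]
dists[dists < 0] = 0  # clamp tiny negatives caused by float round-off
dists = np.sqrt(dists)
assert np.allclose(dists, cdist(evec, evec), atol=1e-5)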
def _WMD_batch(self, words1, weights1, i2):
    joint, w1, w2 = self._common_vocabulary_batch(words1, weights1, i2)
    w1 /= w1.sum()
    w2 /= w2.sum()
    evec = self.embeddings[joint]
    evec_sqr = (evec * evec).sum(axis=1)
    dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
    dists[dists < 0] = 0
    dists = numpy.sqrt(dists)
    return libwmdrelax.emd(w1, w2, dists, self._exact_cache)
def _WMD_batch(self, words1, weights1, i2):
    joint, w1, w2 = self._common_vocabulary_batch(words1, weights1, i2)
    # Normalize counts into weights by dividing by the sum of counts.
    w1 /= w1.sum()
    w2 /= w2.sum()
    # Embeddings for the words occurring in either sentence.
    evec = self.embeddings[joint]
    evec_sqr = (evec * evec).sum(axis=1)
    # Euclidean distance matrix between the embeddings.
    dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
    dists[dists < 0] = 0
    dists = numpy.sqrt(dists)
    # Zero the diagonal explicitly to remove float round-off residue.
    numpy.fill_diagonal(dists, 0)
    return libwmdrelax.emd(w1, w2, dists, self._exact_cache)
def compute_similarity(self, doc1, doc2):
    doc1 = self._convert_document(doc1)
    doc2 = self._convert_document(doc2)
    vocabulary = {
        w: i for i, w in enumerate(sorted(set(doc1).union(doc2)))}
    w1 = self._generate_weights(doc1, vocabulary)
    w2 = self._generate_weights(doc2, vocabulary)
    evec = numpy.zeros((len(vocabulary), self.nlp.vocab.vectors_length),
                       dtype=numpy.float32)
    for w, i in vocabulary.items():
        evec[i] = self.nlp.vocab[w].vector
    evec_sqr = (evec * evec).sum(axis=1)
    dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, numpy.newaxis]
    dists[dists < 0] = 0
    dists = numpy.sqrt(dists)
    return libwmdrelax.emd(w1, w2, dists)
def test_no_cache(self):
    w1, w2, dist = self._get_w1_w2_dist()
    r = libwmdrelax.emd(w1, w2, dist)
    self.assertAlmostEqual(r, 0.6125115)
def nearest_neighbors(self, origin, k=10, early_stop=0.5, max_time=3600,
                      skipped_stop=0.99, throw=True):
    """
    Find the closest samples to the specified one by the WMD metric.
    Call :func:`~wmd.WMD.cache_centroids()` beforehand to accelerate the
    first (sorting) stage.

    :param origin: Identifier of the queried sample.
    :param k: The number of nearest neighbors to return.
    :param early_stop: Stop after looking through this ratio of the whole \
                       dataset.
    :param max_time: Maximum time to run. If the runtime exceeds this \
                     threshold, this method stops.
    :param skipped_stop: The stop trigger which is the ratio of samples \
                         which have been skipped thanks to the second \
                         relaxation. The closer to 1, the less chance of \
                         missing an important nearest neighbor.
    :param throw: If True, when an invalid sample is evaluated, an \
                  exception is thrown instead of being logged.
    :type origin: suitable for :py:attr:`~wmd.WMD.nbow`
    :type k: int
    :type early_stop: float
    :type max_time: int
    :type skipped_stop: float
    :type throw: bool
    :return: List of tuples, each of length 2. The first element \
             is a sample identifier, the second is the WMD. The list \
             is sorted in distance ascending order, so the first tuple is \
             the closest sample.
    :raises ValueError: if the queried entity has too small a vocabulary \
                        (see :py:attr:`~wmd.WMD.vocabulary_min`).
    :raises RuntimeError: if the native code which calculates the EMD \
                          fails.
    """
    # origin can be either a text query or an id
    if isinstance(origin, (tuple, list)):
        words, weights = origin
        weights = numpy.array(weights, dtype=numpy.float32)
        if len(words) > self.vocabulary_max:
            words, weights = self.vocabulary_optimizer(
                words, weights, self.vocabulary_max)
        index = None
        avg = self._get_centroid(words, weights, force=True)
    else:
        index = origin
        words, weights = self._get_vocabulary(index)
        avg = self._get_centroid_by_index(index)
        if avg is None:
            raise ValueError(
                "Too little vocabulary for %s: %d" % (index, len(words)))
    self._log.info("Vocabulary size: %d %d", len(words), self.vocabulary_max)
    self._log.info("WCD")
    ts = time()
    if self._centroid_cache is None:
        queue = []
        for i2 in self.nbow:
            if i2 == index:
                continue
            d = self._estimate_WMD_centroid_batch(avg, i2)
            if d is not None:
                queue.append((d, i2))
        queue.sort()
    else:
        keys, centroids = self._centroid_cache
        dists = numpy.linalg.norm(centroids - avg, axis=-1)
        queue = [(None, key) for key in keys[numpy.argsort(dists)]
                 if key is not None]
    self._log.info("%.1f", time() - ts)
    self._log.info("First K WMD")
    ts = time()
    try:
        neighbors = [(-self._WMD_batch(words, weights, i2), i2)
                     for (_, i2) in queue[:k]]
    except RuntimeError as e:
        e.keys = [i2 for (_, i2) in queue[:k]]
        raise e from None
    heapq.heapify(neighbors)
    self._log.info("%s", neighbors[:10])
    self._log.info("%.1f", time() - ts)
    self._log.info("P&P")
    skipped = estimated_d = 0
    ppts = time()
    for progress, (_, i2) in enumerate(
            queue[k:int(len(queue) * early_stop)]):
        if progress % 10 == 0 \
                and time() - ppts > self.main_loop_log_interval:
            skipped_ratio = skipped / max(progress, 1)
            self._log.info(
                "%s %s %s %s %s", progress, skipped_ratio, estimated_d,
                neighbors[:3],
                [self.nbow[n[1]][0] for n in neighbors[-3:]])
            ppts = time()
            if ppts - ts > max_time:
                self._log.info("stopped by max_time condition")
                break
            if skipped_ratio >= skipped_stop:
                self._log.info("stopped by skipped_stop condition")
                break
        estimated_d, w1, w2, dists = self._estimate_WMD_relaxation_batch(
            words, weights, i2)
        farthest = -neighbors[0][0]
        if farthest == 0:
            self._log.info("stopped by farthest == 0 condition")
            break
        if estimated_d >= farthest:
            skipped += 1
            continue
        try:
            d = libwmdrelax.emd(w1, w2, dists, self._exact_cache)
        except RuntimeError as e:
            if throw:
                e.w1 = w1
                e.w2 = w2
                e.dists = dists
                e.key = i2
                raise e from None
            else:
                self._log.error("#%s: %s", i2, e)
                continue
        if d < farthest:
            heapq.heapreplace(neighbors, (-d, i2))
    else:
        self._log.info("stopped by early_stop condition")
    neighbors = [(-n[0], n[1]) for n in neighbors]
    neighbors.sort()
    return [(n[1], n[0]) for n in neighbors]
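A hedged usage sketch for the method above, assuming it lives on wmd-relax's WMD class, where nbow maps a sample id to (name, word indices, float32 weights) and embeddings holds the word vectors those indices point into; the toy numbers are illustrative only.

import numpy
from wmd import WMD

embeddings = numpy.array([[0.1, 1.0], [1.0, 0.1]], dtype=numpy.float32)
nbow = {
    "first": ("#1", [0, 1], numpy.array([1.5, 0.5], dtype=numpy.float32)),
    "second": ("#2", [0, 1], numpy.array([0.75, 0.15], dtype=numpy.float32)),
}
calc = WMD(embeddings, nbow, vocabulary_min=2)
print(calc.nearest_neighbors("first", k=1))  # e.g. [('second', <WMD value>)]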
def nearest_neighbors(self, origin, k=10, early_stop=0.5, max_time=3600,
                      skipped_stop=0.999):
    if isinstance(origin, (tuple, list)):
        words, weights = origin
        index = None
        avg = self._get_centroid(words, weights)
    else:
        index = origin
        words, weights = self._get_vocabulary(index)
        avg = self._get_centroid_by_index(index)
        if avg is None:
            raise ValueError("Too little vocabulary for %s: %d"
                             % (index, len(words)))
    self._log.info("Vocabulary size: %d %d", len(words), self.vocabulary_max)
    self._log.info("WCD")
    ts = time()
    if self._centroid_cache is None:
        queue = []
        for i2 in self.nbow:
            if i2 == index:
                continue
            d = self._estimate_WMD_centroid_batch(avg, i2)
            if d is not None:
                queue.append((d, i2))
        queue.sort()
    else:
        keys, centroids = self._centroid_cache
        dists = numpy.linalg.norm(centroids - avg, axis=-1)
        queue = [(None, key) for key in keys[numpy.argsort(dists)]
                 if key is not None]
    self._log.info("%.1f", time() - ts)
    self._log.info("First K WMD")
    ts = time()
    neighbors = [(-self._WMD_batch(words, weights, i2), i2)
                 for (_, i2) in queue[:k]]
    heapq.heapify(neighbors)
    self._log.info("%s", neighbors[:10])
    self._log.info("%.1f", time() - ts)
    self._log.info("P&P")
    skipped = estimated_d = 0
    ppts = time()
    for progress, (_, i2) in enumerate(
            queue[k:int(len(queue) * early_stop)]):
        if progress % 10 == 0 \
                and time() - ppts > self.main_loop_log_interval:
            skipped_ratio = skipped / max(progress, 1)
            self._log.info("%s %s %s %s %s", progress, skipped_ratio,
                           estimated_d, neighbors[:3],
                           [self.nbow[n[1]][0] for n in neighbors[-3:]])
            ppts = time()
            if ppts - ts > max_time:
                self._log.info("stopped by max_time condition")
                break
            if skipped_ratio >= skipped_stop:
                self._log.info("stopped by skipped_stop condition")
                break
        estimated_d, w1, w2, dists = self._estimate_WMD_relaxation_batch(
            words, weights, i2)
        farthest = -neighbors[0][0]
        if farthest == 0:
            self._log.info("stopped by farthest == 0 condition")
            break
        if estimated_d >= farthest:
            skipped += 1
            continue
        d = libwmdrelax.emd(w1, w2, dists, self._exact_cache)
        if d < farthest:
            heapq.heapreplace(neighbors, (-d, i2))
    else:
        self._log.info("stopped by early_stop condition")
    neighbors = [(-n[0], n[1]) for n in neighbors]
    neighbors.sort()
    return [(n[1], n[0]) for n in neighbors]