def test_emd_validate_larger_signatures_1():
    """emd() is expected to raise ValueError for 3-bin signatures and a 2x2 matrix."""
    sig_a = np.array([0.0, 1.0, 2.0])
    sig_b = np.array([5.0, 3.0, 3.0])
    # Ground distances only cover two bins, one fewer than the signatures.
    ground = np.array([[0.0, 0.5],
                       [0.5, 0.0]])
    with pytest.raises(ValueError):
        emd(sig_a, sig_b, ground)
def test_symmetric_distance_matrix():
    """emd() is expected to raise ValueError for a ragged distance matrix."""
    first_signature = np.array([0.0, 1.0])
    second_signature = np.array([5.0, 3.0])
    # The rows deliberately have different lengths. NumPy >= 1.24 refuses to
    # build a ragged array implicitly and raises ValueError at construction
    # time, i.e. before emd() is reached and outside the raises-block below.
    # dtype=object reproduces what older NumPy built here, so the malformed
    # input is handed to emd() itself.
    distance_matrix = np.array([[0.0, 0.5, 3.0], [0.5, 0.0]], dtype=object)
    with pytest.raises(ValueError):
        emd(first_signature, second_signature, distance_matrix)
def test_error_wrong_distance_matrix_ndim(self):
    """emd() is expected to raise ValueError when the distance matrix is 3-D."""
    hist_a = np.array([6.0, 1.0])
    hist_b = np.array([1.0, 7.0])
    # The extra pair of brackets yields shape (1, 2, 2) instead of (2, 2).
    cube = np.array([[[0.0, 1.0],
                      [1.0, 0.0]]])
    with self.assertRaises(ValueError):
        emd(hist_a, hist_b, cube)
def test_error_different_signature_lengths(self):
    """emd() is expected to raise ValueError for signatures of length 3 and 2."""
    longer = np.array([6.0, 1.0, 9.0])
    shorter = np.array([1.0, 7.0])
    ground = np.array([[0.0, 1.0],
                       [1.0, 0.0]])
    with self.assertRaises(ValueError):
        emd(longer, shorter, ground)
def test_emd_validate_different_signature_dims():
    """emd() is expected to raise ValueError when the signatures differ in length."""
    two_bins = np.array([0.0, 1.0])
    three_bins = np.array([5.0, 3.0, 3.0])
    # 3x3 matrix: matches the second signature but not the first.
    ground = np.array([
        [0.0, 0.5, 0.0],
        [0.5, 0.0, 0.0],
        [0.5, 0.0, 0.0],
    ])
    with pytest.raises(ValueError):
        emd(two_bins, three_bins, ground)
def calc_wmd(d1, d2, dm, vob_index_dict):
    """Return the EMD between two documents over their joint vocabulary.

    Args:
        d1: Iterable of hashable tokens making up the first document.
        d2: Iterable of hashable tokens making up the second document.
        dm: Precomputed token distance matrix, indexed as ``dm[row, col]``.
        vob_index_dict: Maps a token to its row/column index in ``dm``.

    Returns:
        The value returned by ``emd`` for the two ``nBOW`` vectors and the
        distance matrix restricted to the joint vocabulary.
    """
    du = set(d1).union(d2)
    f1 = np.array(nBOW(d1, du))
    f2 = np.array(nBOW(d2, du))
    du_list = list(du)
    dul = len(du_list)
    # np.float was removed in NumPy 1.24; float64 is the dtype it aliased.
    dum = np.zeros((dul, dul), dtype=np.float64)
    # Only the strict upper triangle is looked up; each value is mirrored so
    # the result is symmetric with a zero diagonal. Iterating j > i directly
    # replaces the former "skip already processed indices" list scan.
    for i, t1 in enumerate(du_list):
        for j in range(i + 1, dul):
            dist = dm[vob_index_dict[t1], vob_index_dict[du_list[j]]]
            dum[i][j] = dist
            dum[j][i] = dist
    return emd(f1, f2, dum)
def score_word2vec_wmd(src, dst, wv):
    """Score tab-separated sentence pairs with an EMD over word vectors.

    Args:
        src: Path of the input file. Each line is split on tabs; field 0 is
            the first sentence and field 1 (minus its last character) the
            second sentence.
        dst: Path of the output file; receives one rounded score per line.
        wv: Word-vector lookup supporting ``word in wv`` and ``wv[word]``.
    """
    b1 = []
    b2 = []
    with open(src) as p:
        for line in p:
            s = line.split('\t')
            b1.append(s[0])
            b2.append(s[1][:-1])  # remove \n

    # First pass only learns the vocabulary; keep the words that have a vector.
    vectorizer = CountVectorizer()
    vectorizer.fit(b1 + b2)
    common = [word for word in vectorizer.get_feature_names() if word in wv]
    W_common = [wv[w] for w in common]

    vectorizer = CountVectorizer(vocabulary=common, dtype=np.double)
    b1_v = vectorizer.transform(b1)
    b2_v = vectorizer.transform(b2)

    # Pairwise word distances, scaled so the largest distance is 1.
    D_ = sklearn.metrics.euclidean_distances(W_common)
    D_ = D_.astype(np.double)
    D_ /= D_.max()

    b1_vecs = b1_v.toarray()
    # Previously this was built from b1_v, so the second sentences were
    # never actually compared against the first ones.
    b2_vecs = b2_v.toarray()
    # NOTE(review): masses are divided by the total count over ALL rows, not
    # per sentence, so individual rows do not sum to 1 - confirm intended.
    b1_vecs /= b1_v.sum()
    b2_vecs /= b2_v.sum()

    res = [round(emd(v1, v2, D_), 2) for v1, v2 in zip(b1_vecs, b2_vecs)]
    with open(dst, 'w') as thefile:
        thefile.write("\n".join(str(i) for i in res))
    print(src + ' finished!')
def wordMoverDistance(d1, d2):
    """Return the EMD between two token lists using module-level embeddings.

    ``d1`` and ``d2`` are iterables of tokens. Tokens missing from the
    module-level ``vocab_dict`` are dropped; the remaining tokens are counted
    with a ``CountVectorizer`` and the normalized counts are compared using
    euclidean distances between the matching rows of the module-level ``W``.
    The distance matrix is not rescaled.
    """
    # Rule out words that are not in the vocabulary, then re-join as text.
    text_1 = " ".join([tok for tok in d1 if tok in vocab_dict])
    text_2 = " ".join([tok for tok in d2 if tok in vocab_dict])

    vect = CountVectorizer().fit([text_1, text_2])
    feature_names = vect.get_feature_names()

    # Word matrix restricted to the features, then its distance matrix.
    W_ = W[[vocab_dict[name] for name in feature_names]]
    D_ = euclidean_distances(W_).astype(np.double)

    counts_1, counts_2 = vect.transform([text_1, text_2])
    v_1 = counts_1.toarray().ravel().astype(np.double)
    v_2 = counts_2.toarray().ravel().astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()

    return emd(v_1, v_2, D_)
def test_emd_3():
    """With an all-zero distance matrix the expected EMD is 0.0."""
    sig_a = np.array([6.0, 1.0])
    sig_b = np.array([1.0, 7.0])
    free_transport = np.array([[0.0, 0.0],
                               [0.0, 0.0]])
    result = emd(sig_a, sig_b, free_transport)
    emd_assert(result, 0.0)
def test_emd_1():
    """Two-bin signatures with off-diagonal cost 0.5; expected EMD is 3.5."""
    sig_a = np.array([0.0, 1.0])
    sig_b = np.array([5.0, 3.0])
    ground = np.array([[0.0, 0.5],
                       [0.5, 0.0]])
    result = emd(sig_a, sig_b, ground)
    emd_assert(result, 3.5)
def _wh_ne_distance(self, other, w):
    """Return a penalized EMD between attribute ``w`` of ``self`` and ``other``.

    ``getattr(self, w)`` and ``getattr(other, w)`` must support ``len``,
    ``keys()`` and item lookup (presumably entity -> weight mappings; verify
    against the callers). Returns ``np.nan`` when either side is empty or
    when ``NE.matrix`` yields no entities to compare.
    """
    c1 = getattr(self, w)
    c2 = getattr(other, w)
    if len(c1) == 0 or len(c2) == 0:
        # One of them has nothing to compare.
        return np.nan

    # Keys ordered by descending weight.
    s1 = sorted(c1.keys(), key=lambda k: c1[k], reverse=True)
    s2 = sorted(c2.keys(), key=lambda k: c2[k], reverse=True)

    penalty = 0
    if self.max_nes > 0:
        # The penalty is the larger total weight of the entries cut off
        # beyond the first ``max_nes``; it stands in for items whose weight
        # sits mostly in low-scoring entries.
        dropped_1 = sum(c1[k] for k in s1[self.max_nes:])
        dropped_2 = sum(c2[k] for k in s2[self.max_nes:])
        penalty = max(dropped_1, dropped_2)
        s1 = s1[:self.max_nes]
        s2 = s2[:self.max_nes]

    matrix, nes = NE.matrix(set(s1).union(set(s2)))
    if not nes:
        # Not a single entity to compare.
        return np.nan

    # NE.matrix returns title-cased names; the lookups below use lowercase.
    nes = [ne.lower() for ne in nes]
    v1 = np.array([c1[ne] for ne in nes])
    v2 = np.array([c2[ne] for ne in nes])

    # Rescale each vector in place to sum to 1 (skipped when the sum is not
    # positive).
    for vec in (v1, v2):
        total = vec.sum()
        if total > 0:
            vec /= total

    # Blend the EMD with the penalty. Per the original author the EMD lies in
    # [0, 1], which would put the result in [penalty, 1] - not verifiable
    # from this function alone.
    return (1 - penalty) * emd(v1, v2, matrix) + penalty
def hamming_emd(d1, d2):
    """Return the EMD between two distributions under a Hamming-matrix cost.

    Singleton dimensions of both arrays are squeezed out first. The cost
    matrix comes from ``_hamming_matrix`` called with the number of
    dimensions left in ``d1``, and both arrays are raveled before being
    passed to ``emd``.
    """
    squeezed_1 = d1.squeeze()
    squeezed_2 = d2.squeeze()
    flat_1 = squeezed_1.ravel()
    flat_2 = squeezed_2.ravel()
    cost = _hamming_matrix(squeezed_1.ndim)
    return emd(flat_1, flat_2, cost)
def hamming_emd(d1, d2):
    """Return the EMD between two distributions under a Hamming-matrix cost.

    The cost matrix comes from ``_hamming_matrix`` called with the number of
    dimensions ``d1`` has after squeezing out singleton dimensions. Both
    inputs are passed through ``flatten`` before the call to ``emd``.
    """
    n_dims = d1.squeeze().ndim
    flat_1 = flatten(d1)
    flat_2 = flatten(d2)
    cost = _hamming_matrix(n_dims)
    return emd(flat_1, flat_2, cost)
def dist_hist(X, Y, distance_matrices):
    """Return the norm of per-segment EMDs between two concatenated vectors.

    ``X`` and ``Y`` are sliced into consecutive segments whose lengths are
    the row counts of the matrices in ``distance_matrices``; segment ``k`` of
    each is compared with ``emd`` using matrix ``k``. The result is
    ``np.linalg.norm`` of the list of per-segment distances.
    """
    per_segment = []
    offset = 0
    for M in distance_matrices:
        stop = offset + M.shape[0]
        per_segment.append(emd(X[offset:stop], Y[offset:stop], M))
        offset = stop
    return np.linalg.norm(per_segment)
def dist_hist_withoutnullhist(X, Y, distance_matrices):
    """Like a per-segment EMD norm, but skipping segments that sum to zero.

    ``X`` and ``Y`` are sliced into consecutive segments whose lengths are
    the row counts of the matrices in ``distance_matrices``. A segment
    contributes an ``emd`` value only when both of its slices have a non-zero
    sum. Returns ``np.linalg.norm`` of the collected values.
    """
    kept = []
    offset = 0
    for M in distance_matrices:
        stop = offset + M.shape[0]
        x_part = X[offset:stop]
        y_part = Y[offset:stop]
        # Skip the segment as soon as one side carries no mass.
        if not (sum(x_part) == 0.0 or sum(y_part) == 0.0):
            kept.append(emd(x_part, y_part, M))
        offset = stop
    return np.linalg.norm(kept)
def _wmd(self, i, row, X_train):
    """Return the WMD between training sample ``i`` and the given test row.

    The caller is expected to pass sparse BOW vectors that each sum to 1
    (stated by the original author, not checked here).
    """
    train_row = X_train[i]
    # Columns that are non-zero in either vector, shifted down by one.
    # NOTE(review): the reason for the -1 shift is not visible here - confirm.
    union_idx = np.union1d(train_row.indices, row.indices) - 1

    # Pairwise distances between the embeddings of just those columns.
    W_dist = euclidean_distances(self.W_embed[union_idx])

    bow_i = X_train[i, union_idx].A.ravel()
    bow_j = row[:, union_idx].A.ravel()
    return emd(bow_i, bow_j, W_dist)
def test_emd_extra_mass_penalty():
    """With extra_mass_penalty=2.5 on these 4-bin inputs the expected EMD is 4.5."""
    sig_a = np.array([0.0, 2.0, 1.0, 2.0])
    sig_b = np.array([2.0, 1.0, 2.0, 1.0])
    ground = np.array([
        [0.0, 1.0, 1.0, 2.0],
        [1.0, 0.0, 2.0, 1.0],
        [1.0, 2.0, 0.0, 1.0],
        [2.0, 1.0, 1.0, 0.0],
    ])
    result = emd(sig_a, sig_b, ground, extra_mass_penalty=2.5)
    emd_assert(result, 4.5)
def hist_emd(reference_hist_df, compare_hist_df, key, distance_matrix=None):
    """Return the EMD between column ``key`` of two index-aligned histograms.

    Args:
        reference_hist_df: Reference histogram DataFrame.
        compare_hist_df: Histogram DataFrame to compare against the reference.
        key: Column name present in both frames; after the merge the two
            copies are read back as ``key + '_x'`` and ``key + '_y'``.
        distance_matrix: Optional ground-distance matrix. When omitted it is
            computed with ``calc_distance_matrix`` over the merged index.

    Returns:
        The value returned by ``emd`` for the two aligned columns.
    """
    # Outer-merge on the index so both columns cover the union of bins.
    merged_df = pd.merge(reference_hist_df, compare_hist_df, how='outer',
                         left_index=True, right_index=True)
    merged_df.fillna(0., inplace=True)  # Treat missing values as zero

    ref_merged_key = key + '_x'
    comp_merged_key = key + '_y'

    # Identity test: "== None" on a NumPy array compares elementwise and
    # makes the "if" raise whenever a matrix is actually supplied.
    if distance_matrix is None:
        distance_matrix = calc_distance_matrix(merged_df.index, merged_df.index)

    return emd(merged_df[ref_merged_key].values,
               merged_df[comp_merged_key].values,
               distance_matrix)
def __sub__(self, other):
    """Return the summed per-channel EMD between two histograms.

    Channels of ``self`` and ``other`` are paired up in order with ``zip``;
    each pair is converted to float64 and compared with ``emd`` using
    ``Histogram._L1_DISTANCE_MATRIX``. The per-channel distances are added.
    """
    # np.float was removed in NumPy 1.24; float64 is the dtype it aliased.
    return sum(
        emd(
            mine.astype(np.float64),
            theirs.astype(np.float64),
            Histogram._L1_DISTANCE_MATRIX,
        )
        for mine, theirs in zip(self.channels, other.channels)
    )
def word_movers(doc1, doc2, metric='cosine'):
    """
    Measure the semantic similarity between two documents using Word Movers
    Distance.

    Args:
        doc1 (``textacy.Doc`` or ``spacy.Doc``)
        doc2 (``textacy.Doc`` or ``spacy.Doc``)
        metric ({'cosine', 'euclidean', 'l1', 'l2', 'manhattan'})

    Returns:
        float: ``1.0 - emd(...)`` computed on normalized word counts and a
        distance matrix scaled by its maximum; larger values correspond to
        more similar documents.

    References:
        Ofir Pele and Michael Werman, "A linear time histogram metric for
        improved SIFT matching," in Computer Vision - ECCV 2008, Marseille,
        France, 2008.
        Ofir Pele and Michael Werman, "Fast and robust earth mover's
        distances," in Proc. 2009 IEEE 12th Int. Conf. on Computer Vision,
        Kyoto, Japan, 2009.
        Kusner, Matt J., et al. "From word embeddings to document distances."
        Proceedings of the 32nd International Conference on Machine Learning
        (ICML 2015). 2015.
        http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
    """
    stringstore = StringStore()

    # Collect one vector per distinct word, in first-seen order. A word is new
    # exactly when its store id (minus one, since stringstore[0] is always the
    # empty string) equals the number of vectors gathered so far.
    word_vecs = []
    for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
        if word.has_vector and stringstore[word.text] - 1 == len(word_vecs):
            word_vecs.append(word.vector)

    distance_mat = pairwise_distances(
        np.array(word_vecs), metric=metric).astype(np.double)
    distance_mat /= distance_mat.max()

    def normalized_counts(doc):
        # Count words by store id, lay the counts out densely, normalize.
        counts = collections.Counter(
            stringstore[word.text] - 1
            for word in extract.words(doc) if word.has_vector)
        dense = np.array(
            [counts[word_idx] for word_idx in range(len(stringstore))]
        ).astype(np.double)
        dense /= dense.sum()
        return dense

    vec1 = normalized_counts(doc1)
    vec2 = normalized_counts(doc2)
    return 1.0 - emd(vec1, vec2, distance_mat)
def wordMoversDistance(self, s1, s2):
    """Return the EMD between the word counts of two strings.

    Both strings are vectorized with an English-stop-word ``CountVectorizer``.
    Each feature is mapped to a row of ``self.W`` through ``self.vocab_dict``;
    features missing from the dictionary fall back to the row of the
    dictionary's first key. Counts are passed to ``emd`` unnormalized.
    """
    vect = CountVectorizer(stop_words="english").fit([s1, s2])
    v_1, v_2 = vect.transform([s1, s2])
    # pyemd needs double precision input.
    v_1 = v_1.toarray().ravel().astype(np.double)
    v_2 = v_2.toarray().ravel().astype(np.double)

    # dict.keys() is not subscriptable on Python 3; next(iter(...)) yields
    # the same first key on both Python 2 and 3.
    fallback_idx = self.vocab_dict[next(iter(self.vocab_dict))]
    rows = [self.vocab_dict.get(w, fallback_idx)
            for w in vect.get_feature_names()]

    D_ = euclidean_distances(self.W[rows]).astype(np.double)
    return emd(v_1, v_2, D_)
def emd_distance(x, y, distance_scaling=1.0):
    """Return the EMD between two 1-D histograms on an integer grid.

    Args:
        x: NumPy array of histogram values.
        y: NumPy array of histogram values; may differ in length from ``x``.
        distance_scaling: Divisor applied to the ground distance ``|i - j|``
            between bins ``i`` and ``j``.

    Returns:
        The value returned by ``pyemd.emd``. The shorter histogram is
        zero-padded on the right to the length of the longer one.
    """
    support_size = max(len(x), len(y))
    # toeplitz(0..n-1) gives |i - j|. np.float was removed in NumPy 1.24;
    # float64 is the dtype it aliased.
    d_mat = toeplitz(range(support_size)).astype(np.float64)
    distance_mat = d_mat / distance_scaling

    # Convert histogram values to float and make them equal length.
    x = x.astype(np.float64)
    y = y.astype(np.float64)
    if len(x) < len(y):
        x = np.hstack((x, [0.0] * (support_size - len(x))))
    elif len(y) < len(x):
        y = np.hstack((y, [0.0] * (support_size - len(y))))

    return pyemd.emd(x, y, distance_mat)
def distance(self, stat, distance_type):
    """Return the distance between this histogram and ``stat``.

    For ``DistanceType.EARTH_MOVER`` the EMD is computed with ground
    distances equal to the absolute difference between bin centers of
    ``self.bin_edges``. Every other distance type is delegated to
    ``Distrib.distance``.
    """
    assert(isinstance(stat, Histogram))

    is_earth_mover = distance_type == DistanceType.EARTH_MOVER
    if not is_earth_mover:
        return Distrib.distance(self, stat, distance_type)

    # Bin centers: mean of each pair of consecutive edges.
    bin_locs = np.mean([self.bin_edges[:-1], self.bin_edges[1:]], axis=0)
    bins = len(bin_locs)

    # Entry (i, j) is |center_i - center_j|.
    repeated = np.repeat(bin_locs, bins)
    tiled = np.tile(bin_locs, bins)
    distance_matrix = np.abs(repeated - tiled).reshape(bins, bins)

    assert(len(distance_matrix) == len(distance_matrix[0]))
    assert(self.data.shape[0] <= len(distance_matrix))
    assert(stat.data.shape[0] <= len(distance_matrix))

    return emd(self.data.astype(np.float64),
               stat.data.astype(np.float64),
               distance_matrix.astype(np.float64))
def earth_movers_distance(distance_matrix, image1, image2):
    """Return the Earth Mover's Distance between image1 and image2.

    Per the original author, distance_matrix is an N x N matrix with
    N = x * y * z for images of shape (x, y, z), and distance_matrix[i][j]
    is the distance between the ith and jth element of the unraveled image
    (see numpy.ravel() for the flattening order).

    Each image is clipped to [0, 999] and divided by its sum before the
    comparison.
    """
    def as_distribution(img):
        # Clip the values, then scale so they sum to 1.
        clipped = np.clip(img, 0, 999)
        return clipped / np.sum(clipped)

    dist_1 = as_distribution(image1)
    dist_2 = as_distribution(image2)
    return pyemd.emd(dist_1.ravel(), dist_2.ravel(), distance_matrix)
def wmd(s1, s2):
    """Return the EMD between the word counts of two strings.

    Both strings are vectorized with an English-stop-word ``CountVectorizer``.
    Each feature is mapped to a row of the module-level ``W`` through the
    module-level ``vocab_dict``; features missing from the dictionary fall
    back to the row of the dictionary's first key. Counts are passed to
    ``emd`` unnormalized.
    """
    vect = CountVectorizer(stop_words="english").fit([s1, s2])
    v_1, v_2 = vect.transform([s1, s2])
    # pyemd needs double precision input.
    v_1 = v_1.toarray().ravel().astype(np.double)
    v_2 = v_2.toarray().ravel().astype(np.double)

    # dict.keys() is not subscriptable on Python 3; next(iter(...)) yields
    # the same first key on both Python 2 and 3.
    fallback_idx = vocab_dict[next(iter(vocab_dict))]
    rows = [vocab_dict.get(w, fallback_idx) for w in vect.get_feature_names()]

    D_ = euclidean_distances(W[rows]).astype(np.double)
    return emd(v_1, v_2, D_)
def similarity(doc1, doc2):
    """Return the EMD between the word-presence vectors of two documents.

    This code is from
    http://vene.ro/blog/word-movers-distance-in-python.html

    Features come from an English-stop-word ``CountVectorizer`` fitted on
    both documents; features whose unidecoded form is missing from the
    embedding vocabulary are ignored. Each document gets a 0/1 vector that
    marks which remaining features occur in it (substring test on the raw
    document), normalized to sum to 1. Distances are euclidean distances
    between embeddings, scaled by their maximum.
    """
    W, vocab_dict = wordEmbedding()
    vect = CountVectorizer(stop_words="english").fit([doc1, doc2])

    # Some features are missing from vocab_dict; simply ignore them.
    newFeature = []
    for w in vect.get_feature_names():
        key = unidecode(w)
        try:
            vocab_dict[key]
        except KeyError:
            continue
        newFeature.append(key)

    v_1 = numpy.zeros(len(newFeature), dtype=numpy.double)
    v_2 = numpy.zeros(len(newFeature), dtype=numpy.double)
    for wNo, w in enumerate(newFeature):
        # Two independent tests: a word shared by both documents must count
        # for both. With "elif" it only counted for doc1, so identical
        # documents left v_2 all zeros and the normalization divided by 0.
        if w in doc1:
            v_1[wNo] = 1
        if w in doc2:
            v_2[wNo] = 1

    W_ = W[[vocab_dict[w] for w in newFeature]]
    D_ = euclidean_distances(W_).astype(numpy.double)
    D_ /= D_.max()

    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    return emd(v_1, v_2, D_)
def WMD(document1, document2, model):
    '''
    Compute WMD.

    Input:
        document1: List of words.
        document2: List of words.
        model: Word2vec model, providing the word embeddings; must expose
            ``model.vocab`` (membership test) and ``model[word]`` (vector).

    Returns:
        WMD between documents, float. NaN when either document has no
        in-vocabulary words left.
    '''
    # Remove out-of-vocabulary words. Membership is tested on the mapping
    # itself: on Python 2 ``.keys()`` builds a list, making each test O(n).
    len_pre_oov1 = len(document1)
    len_pre_oov2 = len(document2)
    document1 = [token for token in document1 if token in model.vocab]
    document2 = [token for token in document2 if token in model.vocab]
    logging.info('Removed %d and %d OOV words from document 1 and 2 (respectively).',
                 len_pre_oov1 - len(document1), len_pre_oov2 - len(document2))

    if len(document1) == 0 or len(document2) == 0:
        logging.info('At least one of the documents had no words that were in the vocabulary. Aborting (returning NaN).')
        return float('nan')

    vocab = set(document1 + document2)

    # Compute nBOW representation of documents.
    d1 = np.array(nBOW(document1, vocab))
    d2 = np.array(nBOW(document2, vocab))

    vocab_len = len(vocab)
    # np.float was removed in NumPy 1.24; float64 is the dtype it aliased.
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.float64)

    # Sets make the membership tests inside the O(V^2) loop constant time.
    words1 = set(document1)
    words2 = set(document2)
    for i, t1 in enumerate(vocab):
        if t1 not in words1:
            continue
        for j, t2 in enumerate(vocab):
            if t2 not in words2:
                # Only compute the distances that we need.
                continue
            # Euclidean distance between word vectors.
            distance_matrix[i][j] = np.sqrt(np.sum((model[t1] - model[t2]) ** 2))

    # Return WMD.
    return emd(d1, d2, distance_matrix)
def gaussian_emd(x, y, sigma=1.0, distance_scaling=1.0):
    '''
    Gaussian kernel with the squared distance in the exponential term
    replaced by the squared EMD.

    Args:
        x, y: 1D NumPy arrays holding the pmf of two distributions; the
            shorter one is zero-padded on the right to the longer length.
        sigma: standard deviation of the kernel.
        distance_scaling: divisor applied to the ground distance ``|i - j|``
            between bins ``i`` and ``j``.

    Returns:
        ``exp(-emd**2 / (2 * sigma**2))``.
    '''
    support_size = max(len(x), len(y))
    # toeplitz(0..n-1) gives |i - j|. np.float was removed in NumPy 1.24;
    # float64 is the dtype it aliased.
    d_mat = toeplitz(range(support_size)).astype(np.float64)
    distance_mat = d_mat / distance_scaling

    # Convert histogram values to float and make them equal length.
    x = x.astype(np.float64)
    y = y.astype(np.float64)
    if len(x) < len(y):
        x = np.hstack((x, [0.0] * (support_size - len(x))))
    elif len(y) < len(x):
        y = np.hstack((y, [0.0] * (support_size - len(y))))

    dist = pyemd.emd(x, y, distance_mat)
    return np.exp(-dist * dist / (2 * sigma * sigma))
def wmd(edu_vecs, i, j, D_embed):
    """Compute the Word Mover's Distance between two EDUs.

    Parameters
    ----------
    edu_vecs : sparse matrix
        One row per EDU.
    i : int
        Index of the first EDU.
    j : int
        Index of the second EDU.
    D_embed : dense matrix of np.double
        Distance matrix between each pair of word embeddings.

    Returns
    -------
    s : np.double
        Word Mover's Distance between EDUs i and j.

    Notes
    -----
    This is a reference implementation for a single pair of EDUs. Per the
    original author it is not recommended as is when speed and memory
    matter: copy it into your script, drop the `edu_vecs` and `D_embed`
    parameters and let them refer to variables from a wider scope (e.g.
    module globals, even ones defined under `if __name__ == "__main__"`),
    so that joblib.Parallel does not have to pickle them. Memmapping the
    parameters, as described in the joblib.Parallel documentation, is an
    alternative that was reported to be an order of magnitude slower.
    """
    first = edu_vecs[i].toarray().ravel()
    second = edu_vecs[j].toarray().ravel()
    # emd() also accepts a named parameter extra_mass_penalty; per the
    # original note pyemd defaults it to -1, i.e. the max value in the
    # distance matrix.
    return emd(first, second, D_embed)
def worker(antigen):
    """Return the EMD between the clusterings of samples A and B for one antigen.

    Uses the module-level names ``A``, ``B``, ``backbone_antigens``,
    ``clusters`` and ``kmeans_jobs``. The erythroid-masked data of both
    samples, restricted to the backbone antigens plus ``antigen``, is
    clustered with ``kmeans_clustering``. Normalized cluster sizes are the
    masses and cityblock distances between the cluster outputs are the
    ground distances.
    """
    # Function-call form: the former "print antigen" statement is a syntax
    # error on Python 3 and was inconsistent with the print() call below.
    print(antigen)
    atg_list = backbone_antigens + [antigen]

    A_dat = A.data[atg_list][A.erythroid_mask]
    B_dat = B.data[atg_list][B.erythroid_mask]
    A_cluster = kmeans_clustering(A_dat, n_clusters=clusters, n_jobs=kmeans_jobs)
    B_cluster = kmeans_clustering(B_dat, n_clusters=clusters, n_jobs=kmeans_jobs)

    distance_matrix = cdist(A_cluster.output[atg_list],
                            B_cluster.output[atg_list],
                            'cityblock')

    # Cluster sizes as probability masses.
    XA = A_cluster.output["cluster_size"].values.astype(np.double)
    XA = XA / np.sum(XA)
    XB = B_cluster.output["cluster_size"].values.astype(np.double)
    XB = XB / np.sum(XB)

    cost = emd(XA, XB, distance_matrix)
    print("{} has emd of {}".format(antigen, cost))
    return cost
def Wmd_Distance(src_seq, cur_sent, embed_mat):
    """Return the word mover's distance between two token lists, minus 1.

    ``src_seq`` and ``cur_sent`` are lists of hashable tokens. Pairwise
    distances come from ``compute_distance`` and the masses from
    ``compute_normalized_word_freq``.
    """
    word_set = list(set(src_seq + cur_sent))
    vocab = list(word_set)
    vocab_len = len(word_set)

    # Semantic distance between every pair of words.
    distance_matrix = np.zeros((vocab_len, vocab_len))
    for r, row_word in enumerate(vocab):
        for c, col_word in enumerate(vocab):
            distance_matrix[r][c] = compute_distance(row_word, col_word, embed_mat)

    # Normalized word-frequency probabilities.
    d1 = compute_normalized_word_freq(src_seq, word_set)
    d2 = compute_normalized_word_freq(cur_sent, word_set)

    # Word mover's distance. The original note says the value lies in (0, 1)
    # and that smaller is better.
    # NOTE(review): 1 is subtracted from that value on return - confirm that
    # callers expect the shifted range.
    wmd_distance = emd(d1, d2, distance_matrix)
    return wmd_distance - 1
def calculate_emd(input_distribution, output_distribution):
    '''
    Calculate Earth Mover's Distance (aka Wasserstein distance) between two
    distributions of equal length, with every ground distance set to 1.

    Parameters
    ----------
    input_distribution : numpy.ndarray
        Probabilities assigned to style classes for an input text
    output_distribution : numpy.ndarray
        Probabilities assigned to style classes for an output text,
        e.g. of a style transfer model

    Returns
    -------
    Earth Mover's Distance (float) between the two given style distributions
    '''
    n_classes = len(input_distribution)
    # All-ones cost matrix.
    # NOTE(review): the diagonal is 1 as well, not 0 - confirm intended.
    uniform_cost = np.ones((n_classes, n_classes))
    return emd(input_distribution, output_distribution, uniform_cost)
def pos_distance(p1, p2, pos, P_, dat):
    """Return ``dat`` times the EMD between two tag histograms.

    For each of ``p1`` and ``p2`` every item is mapped through ``pos`` and
    the occurrences of each entry of the module-level ``pos_word`` are
    counted. A histogram is divided by its sum unless that sum is zero.
    ``P_`` is the ground-distance matrix handed to ``emd``.
    """
    def tag_histogram(items):
        # Count the tags listed in pos_word; other tags are ignored.
        tally = {tag: 0 for tag in pos_word}
        for item in items:
            if pos[item] in tally:
                tally[pos[item]] += 1
        return np.array([tally[tag] for tag in pos_word], dtype=np.double)

    def normalized(vec):
        # Leave an all-zero histogram untouched to avoid dividing by zero.
        total = vec.sum()
        return vec / total if total != 0 else vec

    hist_1 = tag_histogram(p1)
    hist_2 = tag_histogram(p2)
    return dat * emd(normalized(hist_1), normalized(hist_2), P_)
def wmdistance(model, words1, words2, all_distances):
    """Return the EMD between the normalized bags of words of two documents.

    A gensim ``Dictionary`` built from both token lists assigns the vector
    positions. The ground distances come from ``create_distance_matrix``,
    which is given the set of tokens of ``words2``.
    """
    dictionary = gensim.corpora.Dictionary(documents=[words1, words2])
    vocab_len = len(dictionary)

    def create_bow(doc):
        # Token counts divided by the document length, laid out by token id.
        doc_len = float(len(doc))
        norm_bow = np.zeros(vocab_len, dtype=np.double)
        for idx, count in dictionary.doc2bow(doc):
            norm_bow[idx] = count / doc_len
        return norm_bow

    bow1 = create_bow(words1)
    bow2 = create_bow(words2)
    distances = create_distance_matrix(model, dictionary, set(words2), all_distances)
    return emd(bow1, bow2, distances)
def calc_wmd(self, doc1, doc2):
    """Return the EMD between the normalized word counts of two documents.

    Features come from a ``CountVectorizer`` fitted on both documents. A
    feature found in ``self.word2vec.vocab`` uses its embedding; any other
    feature gets a fresh uniform random vector in [-1, 1) of length
    ``self.w2v_len``, so results are not deterministic when such features
    occur. Distances are scaled by their maximum.
    """
    vect = CountVectorizer().fit([doc1, doc2])
    counts_1, counts_2 = vect.transform([doc1, doc2])

    embeddings = [
        self.word2vec[w] if w in self.word2vec.vocab
        else np.random.uniform(-1.0, 1.0, self.w2v_len)
        for w in vect.get_feature_names()
    ]
    D = euclidean_distances(embeddings)

    # pyemd needs double precision input.
    vec1 = counts_1.toarray().ravel().astype(np.double)
    vec2 = counts_2.toarray().ravel().astype(np.double)
    vec1 /= vec1.sum()
    vec2 /= vec2.sum()

    D = D.astype(np.double)
    D /= D.max()
    return emd(vec1, vec2, D)
def TopicDistance(topicA, topicB):
    """Return the Word Mover's Distance between two topics.

    ``topicA`` and ``topicB`` map words to weights (used as given, without
    normalization). Only words present in the module-level ``wvmodel`` take
    part; ground distances are euclidean distances between their embeddings.
    """
    # Joint vocabulary, restricted to words that have an embedding.
    known_words = [w for w in topicA if w in wvmodel]
    known_words = known_words + [w for w in topicB if w in wvmodel]
    vocab = set(known_words)

    # Weight vector of each topic over the joint vocabulary.
    nBOW_A = np.array([topicA[w] if w in topicA else 0 for w in vocab],
                      dtype=np.float64)
    nBOW_B = np.array([topicB[w] if w in topicB else 0 for w in vocab],
                      dtype=np.float64)

    # Pairwise embedding distances as a square matrix.
    embeddings = [wvmodel[w] for w in vocab]
    condensed = pdist(embeddings, 'euclidean')
    D_Mat = squareform(condensed).astype(np.float64)

    return emd(nBOW_A, nBOW_B, D_Mat)
def get_wmd_distance(d1, d2, min_vocab=7, verbose=False):
    """Return the EMD between two texts, or 1 when they share too few words.

    The vocabulary is the set of lower-cased whitespace tokens of both texts
    that are in ``model.vocab`` and are not English stop words. When it has
    fewer than ``min_vocab`` entries, 1 is returned without computing
    anything. Otherwise normalized word counts are compared using euclidean
    embedding distances scaled by their maximum.
    """
    tokens = set(d1.lower().split() + d2.lower().split())
    vocabulary = [
        w for w in tokens
        if w in model.vocab and w not in stop_words.ENGLISH_STOP_WORDS
    ]
    if len(vocabulary) < min_vocab:
        return 1

    vect = CountVectorizer(vocabulary=vocabulary, stop_words='english').fit([d1, d2])
    v_1, v_2 = vect.transform([d1, d2])

    W_ = np.array([model[w] for w in vect.get_feature_names() if w in model])
    D_ = euclidean_distances(W_).astype(np.double)
    D_ /= D_.max()  # just for comparison purposes

    # pyemd needs double precision input.
    v_1 = v_1.toarray().ravel().astype(np.double)
    v_2 = v_2.toarray().ravel().astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()

    if verbose:
        print(vocabulary)
        print(v_1, v_2)
    return emd(v_1, v_2, D_)
def emd(x_vals, estimated_func_vals, true_func):
    """Return the EMD between estimated values and a reference function.

    ``true_func`` is evaluated at every point of ``x_vals``; the ground
    distance between points ``i`` and ``j`` is ``|x_vals[i] - x_vals[j]|``.
    ``estimated_func_vals`` must have the same length as ``x_vals``.
    """
    assert (len(x_vals) == len(estimated_func_vals))
    n = len(x_vals)

    # Reference values at the sample points.
    reference = np.zeros(n)
    for idx, point in enumerate(x_vals):
        reference[idx] = true_func(point)

    # Pairwise absolute differences between sample points.
    ground = np.zeros((n, n))
    for row in range(n):
        for col in range(n):
            ground[row][col] = abs(x_vals[row] - x_vals[col])

    estimated = np.array(estimated_func_vals)  # list -> np.array
    return pyemd.emd(estimated, reference, ground)  # expensive computation
def EMD_between_two_models_on_board(model1_name, input_plains_num_1, i1,
                                    model2_name, input_plains_num_2, i2,
                                    board1, board2,
                                    width=6, height=6, use_gpu=True):
    """Return the EMD between the move probabilities of two saved policies.

    Policy 1 is loaded from checkpoint ``i1`` of ``model1_name`` and
    evaluated on ``board1``; policy 2 likewise from ``i2`` / ``model2_name``
    on ``board2``. The checkpoint paths are hard-coded below. Ground
    distances come from ``generate_matrix_dist_metric(width)``.
    """
    model_file_1 = f'/home/lirontyomkin/AlphaZero_Gomoku/models/{model1_name}/current_policy_{i1}.model'
    policy_1 = PolicyValueNet(width, height,
                              model_file=model_file_1,
                              input_plains_num=input_plains_num_1,
                              use_gpu=use_gpu)

    model_file_2 = f'/home/lirontyomkin/AlphaZero_Gomoku/models/{model2_name}/current_policy_{i2}.model'
    policy_2 = PolicyValueNet(width, height,
                              model_file=model_file_2,
                              input_plains_num=input_plains_num_2,
                              use_gpu=use_gpu)

    # The returned states are not used; the calls are kept in case
    # current_state() has side effects on the boards - TODO confirm.
    _ = board1.current_state(last_move=True, is_random_last_turn=False)
    _ = board2.current_state(last_move=True, is_random_last_turn=False)

    # policy_value_fn(board)[0] yields (action, probability) pairs.
    _, probas_policy1 = zip(*policy_1.policy_value_fn(board1)[0])
    _, probas_policy2 = zip(*policy_2.policy_value_fn(board2)[0])

    dist_matrix = generate_matrix_dist_metric(width)
    first = np.asarray(probas_policy1, dtype='float64')
    second = np.asarray(probas_policy2, dtype='float64')
    return emd(first, second, dist_matrix)
def perform_analysis(rv, fns, mn, mx):
    """Compute the pairwise EMD matrix of image histograms and save it.

    For every entry of ``fns`` the image and optional mask are read from
    ``rv[i][1:3]``. A 128-bin histogram over ``(mn, mx)`` is accumulated with
    a cumulative sum and divided by its total. All pairs are then compared
    with ``emd`` using distances between bin centers, and the matrix is
    written to ``emd_results.npz``.
    """
    nbins = 128
    curves = []
    edges = []
    for i in range(len(fns)):
        img, mask = rv[i][1:3]
        if mask is not None:
            img = img[mask == 1]
        # Same bin count and range for every image.
        counts, edges = np.histogram(img, bins=nbins, range=(mn, mx))
        # The cumulative sum smooths the noise present in a histogram of
        # quantized data.
        cdf = np.cumsum(counts * np.diff(edges))
        # Normalize, removing the effect of image size.
        cdf /= np.sum(cdf)
        curves.append(cdf)

    rv2 = np.zeros([len(rv), len(rv)])

    # Ground distance: absolute difference between bin centers.
    bin_centers = (edges[1:] + edges[:-1]) / 2
    column = bin_centers.reshape(-1, 1)
    w = euclidean_distances(column, bin_centers.reshape(-1, 1))

    for i, first in enumerate(curves):
        print(f'{i + 1}/{len(fns)}...')
        for j, second in enumerate(curves):
            rv2[i, j] = emd(first, second, w)

    np.savez('emd_results.npz', rv2)
    print('To plot the EMD results run ./old/plot_emd_mat.py')
def points_best_cluster(self, centroids, dataPoint):
    """Return the index of the centroid closest to ``dataPoint`` by EMD.

    Args:
        centroids: The list of centroids.
        dataPoint: The point to assign. The fixed 3x3 ground-distance matrix
            below implies three-bin points.

    Returns:
        Index of the first centroid with the smallest distance, or ``None``
        when ``centroids`` is empty.
    """
    # Distance between bins i and j is |i - j| / 3.
    ground = numpy.array([[0, 1 / 3.0, 2 / 3.0],
                          [1 / 3.0, 0, 1 / 3.0],
                          [2 / 3.0, 1 / 3.0, 0]])
    best_index = None
    best_distance = None
    for index, centroid in enumerate(centroids):
        candidate = emd(numpy.array(dataPoint), numpy.array(centroid), ground)
        # Strict comparison keeps the earliest centroid on ties.
        if best_distance is None or candidate < best_distance:
            best_index = index
            best_distance = candidate
    return best_index
def WMDsimilarity(q1, q2):
    """Return the word mover's distance between question 1 and question 2.

    Args:
        q1: Text of question 1.
        q2: Text of question 2.

    Returns:
        The EMD between the normalized word counts of the two questions.
        Ground distances are euclidean distances between rows of the
        module-level ``W`` (looked up through ``vocab_dictionary``, row 0
        for unknown words), scaled by their maximum.
    """
    # CountVectorizer must be instantiated: calling fit on the class itself
    # binds the list of questions as ``self`` and fails.
    vect = CountVectorizer().fit([q1, q2])

    # Embeddings for the words in the questions.
    W_value = W[[vocab_dictionary[w] if w in vocab_dictionary else 0
                 for w in vect.get_feature_names()]]
    data_vectors = euclidean_distances(W_value)

    vector_1, vector_2 = vect.transform([q1, q2])
    # pyemd needs double precision input.
    vector_1 = vector_1.toarray().ravel().astype(np.double)
    vector_2 = vector_2.toarray().ravel().astype(np.double)
    vector_1 /= vector_1.sum()
    vector_2 /= vector_2.sum()

    data_vectors = data_vectors.astype(np.double)
    data_vectors /= data_vectors.max()
    return emd(vector_1, vector_2, data_vectors)
def solver_pyemd(self):
    """Iterate the state-distance matrix until it changes by less than the threshold.

    Each sweep takes, for every source state, the action returned by
    ``self.src_agent.get_best_action`` and, for every target state, the
    minimum over target actions of the discounted reward term plus the
    discounted EMD between the two transition rows. The sweep reads
    ``self.dist_matrix`` and writes ``self.tmp_dist_matrix``; afterwards
    ``self.dist_matrix`` is replaced by a copy of the latter. Iteration stops
    once the mean absolute change is below ``self.opts.threshold``.
    """
    converged = False
    while not converged:
        for _, s1_state in self.src_env.state2idx.items():
            a = self.src_agent.get_best_action(s1_state, self.src_possible_actions)
            for _, s2_state in self.tgt_env.state2idx.items():
                for b in range(self.action_space):
                    kd = emd(self.src_env.tp_matrix[s1_state, a],
                             self.tgt_env.tp_matrix[s2_state, b],
                             self.dist_matrix)  # pyemd
                    reward_term = self.opts.discount_r * self.reward_matrix_tmp[
                        s1_state, a, s2_state, b]
                    self.d[s1_state, 0, s2_state, b] = (
                        reward_term + self.opts.discount_kd * kd)
                self.tmp_dist_matrix[s1_state, s2_state] = np.min(
                    self.d[s1_state, 0, s2_state])

        # Measure the change before overwriting the previous matrix.
        change = np.mean(np.abs(self.dist_matrix - self.tmp_dist_matrix))
        self.dist_matrix = self.tmp_dist_matrix.copy()
        converged = change < self.opts.threshold
def compute_wasserstein_PDF(NNeigh, nR, R):
    """Return the EMD between consecutive rows of ``NNeigh``.

    Uses pyemd (https://github.com/wmayner/pyemd).

    Args:
        NNeigh: Sequence of at least ``nR`` equally long numeric rows.
        nR: Number of rows to process.
        R: Sequence indexed like ``NNeigh``; ``R[i]`` labels the distance
            between rows ``i`` and ``i - 1``.

    Returns:
        A list of ``[R[i], distance]`` pairs for ``i`` in ``1 .. nR - 1``.
        The ground distance between bins ``a`` and ``b`` is
        ``|a - b| / len(NNeigh[0])``.
    """
    dPDF = []
    Ar = len(NNeigh[0])
    KF = 1.0 / float(Ar)

    # Symmetric |a - b| * KF matrix with a zero diagonal.
    bins = numpy.arange(Ar)
    dmx = numpy.abs(bins[:, None] - bins[None, :]).astype(float) * KF
    # Single-argument print() emits identical text on Python 2 and 3.
    print("dmx =  {} {}".format(len(dmx), len(dmx[0])))

    # numpy.float was removed in NumPy 1.24; the builtin float is what it
    # aliased.
    V = [numpy.array(NNeigh[i], dtype=float) for i in range(nR)]
    print("V =  {} {}".format(len(V), len(V[0])))

    for i in range(1, nR):
        print("i =  {}".format(i))
        D = emd(V[i], V[i - 1], dmx)
        dPDF.append([R[i], D])
    return dPDF
def wmd(i, j):
    """Compute the Word Mover's Distance between two EDUs.

    This presupposes the existence of two global variables:
    * `edu_vecs` is a sparse 2-dimensional matrix where each row
      corresponds to the vector representation of an EDU,
    * `D_common` is a dense 2-dimensional ndarray that contains the
      euclidean distance between each pair of word embeddings.

    Parameters
    ----------
    i : int
        Index of the first EDU.
    j : int
        Index of the second EDU.

    Returns
    -------
    s : np.double
        Word Mover's Distance between EDUs i and j; 0.0 when both vectors
        have no non-zero entries.
    """
    # EMD is extremely sensitive to the number of dimensions it has to work
    # with; keep only the dimensions where at least one of the two vectors
    # is != 0.
    union_idx = np.union1d(edu_vecs[i].indices, edu_vecs[j].indices)

    # Per the original author EMD segfaults on incorrect parameters, so two
    # all-zero vectors are treated as identical. The test is on emptiness:
    # np.any(union_idx) inspects the index *values* and also reported
    # "empty" for a union consisting only of column 0.
    if union_idx.size == 0:
        return 0.0

    D_minimal = D_common[np.ix_(union_idx, union_idx)]
    bow_i = edu_vecs[i, union_idx].A.ravel()
    bow_j = edu_vecs[j, union_idx].A.ravel()
    # emd() also accepts a named parameter extra_mass_penalty; per the
    # original note pyemd defaults it to -1, i.e. the max value in the
    # distance matrix.
    return emd(bow_i, bow_j, D_minimal)
def wasserstein_distance(checkpoint_1, checkpoint_2):
    """Return the Wasserstein ("Earth Mover's") distance between the clustered
    fixed points of two checkpoints.

    Each checkpoint must already contain ``'cluster_means'`` and
    ``'cluster_labels'``. Cluster sizes are the masses. The two sets of
    clusters are placed side by side in one combined support, with ``norm``
    of the difference between cluster means as the ground distance.
    """
    def means_and_sizes(checkpoint):
        # One weight per cluster: the number of labels equal to its index.
        means = checkpoint['cluster_means']
        labels = checkpoint['cluster_labels']
        sizes = [len(np.where(labels == k)[0]) for k in range(means.shape[0])]
        return means, np.array(sizes)

    cluster_means_1, cluster_weights_1 = means_and_sizes(checkpoint_1)
    cluster_means_2, cluster_weights_2 = means_and_sizes(checkpoint_2)

    # Histogram 1 holds mass only on the first block of the combined
    # support, histogram 2 only on the second block.
    hist1 = np.concatenate(
        [cluster_weights_1, np.zeros_like(cluster_weights_2)],
        axis=0).astype(np.float64)
    hist2 = np.concatenate(
        [np.zeros_like(cluster_weights_1), cluster_weights_2],
        axis=0).astype(np.float64)

    combined_means = np.concatenate([cluster_means_1, cluster_means_2], axis=0)
    N = len(cluster_weights_1) + len(cluster_weights_2)
    distances = np.zeros((N, N))
    for row in range(N):
        for col in range(N):
            distances[row, col] = norm(combined_means[row] - combined_means[col])

    return emd(hist1, hist2, distances)
def emd(p1, p2, dmat):
    """
    Compute a sparsified Earth-Mover distance between two discrete
    distributions p1, p2 with the metric described by dmat.

    Sparsification: only the points where p1 + p2 exceeds the module-level
    ``tol`` (1e-6 by default, per the original author) are kept; two reduced
    distributions are built from them and their EMD is computed. The
    rationale is that the EMD depends only on the points where p1 and p2 are
    not too small.

    Input:
        p1: 1d non-negative np.array of size (N,), a flattened distribution
        p2: 1d non-negative np.array of size (N,), a flattened distribution
        dmat: np.array of size (N, N); dmat[i, j] is the distance between
              the i-th and j-th points

    Output:
        The Earth-Mover distance between the reduced p1, p2 with the
        correspondingly reduced metric.
    """
    keep = (p1 + p2) > tol
    # Select the kept rows, transpose, then select the kept rows again; the
    # result is the transposed (keep x keep) sub-matrix.
    reduced_dmat = dmat[keep].T[keep]
    return pyemd.emd(p1[keep], p2[keep], reduced_dmat)
def quantile_emd(column1: CorrelationClusteringColumn,
                 column2: CorrelationClusteringColumn,
                 quantiles: int = 256):
    """
    Earth Mover's Distance (EMD) between the quantile histograms of two columns.

    When `quantiles` is omitted, 256 is used, the default of the paper
    "Automatic Discovery of Attributes in Relational Databases".

    Parameters
    ---------
    column1 : Column
        The first column; its histogram comes from `column1.get_histogram()`
        and also supplies the distance matrix handed to `emd`.
    column2 : Column
        The second column; its `QuantileHistogram` is built with the first
        column's histogram passed as `reference_hist`.
    quantiles: int, optional
        The number of quantiles that the histograms are split on
        (default is 256)

    Returns
    -------
    float
        the EMD value between column1 and column2, or `math.inf` when either
        column has size 0 or the second histogram reports `is_empty`
    """
    if column1.size == 0 or column2.size == 0:
        return math.inf

    reference = column1.get_histogram()
    candidate = QuantileHistogram(
        column2.long_name,
        column2.ranks,
        column2.size,
        quantiles,
        reference_hist=reference,
    )

    if not candidate.is_empty:
        # `get_values` is accessed as an attribute, exactly as before.
        return emd(reference.get_values, candidate.get_values,
                   reference.dist_matrix)
    return math.inf
def component_merge_emd(net1, net2, metric_space):
    """EMD between two networks after collapsing each to its connected components.

    Both networks are turned into networkx graphs from their
    ``adjacency_matrix``, their connected components are listed, and
    ``build_induced_graph`` is applied to each network with its components.
    The diagonal of each induced adjacency matrix, normalised to sum to one,
    is used as that network's mass vector.  The two mass vectors are placed on
    a joint support (first network's entries, then the second's), with the
    cross distances supplied by ``network_merge_distance`` and zeros inside
    each network's own block.

    Returns the value of ``pyemd.emd`` for that joint problem.
    """

    def _components(net):
        graph = nx.from_scipy_sparse_matrix(net.adjacency_matrix)
        return list(nx.connected_components(graph))

    def _normalised_diagonal(net):
        diagonal = np.diag(net.adjacency_matrix.toarray())
        return diagonal / diagonal.sum()

    parts_1 = _components(net1)
    parts_2 = _components(net2)

    induced_1 = build_induced_graph(net1, parts_1, metric_space)
    induced_2 = build_induced_graph(net2, parts_2, metric_space)

    mass_1 = _normalised_diagonal(induced_1)
    mass_2 = _normalised_diagonal(induced_2)

    # Zero-pad so both mass vectors live on the concatenated support.
    padded_1 = np.concatenate([mass_1, np.zeros_like(mass_2)])
    padded_2 = np.concatenate([np.zeros_like(mass_1), mass_2])

    cross = network_merge_distance(induced_1, induced_2, metric_space)

    size_1 = mass_1.shape[0]
    size_2 = mass_2.shape[0]
    # Block layout: [[0, cross], [cross.T, 0]].
    upper = np.hstack([np.zeros((size_1, size_1)), cross])
    lower = np.hstack([cross.T, np.zeros((size_2, size_2))])
    ground = np.vstack([upper, lower])

    return pyemd.emd(padded_1, padded_2, ground)
def solver_pyemd(self):
    """Sweep the state-pair update until ``self.dist_matrix`` stops changing.

    One sweep visits every (source state, target state) pair.  For each pair
    and every action pair (a, b) it stores in ``self.d`` the sum of
    ``opts.discount_r`` times the entry of ``self.reward_matrix_tmp`` and
    ``opts.discount_kd`` times the ``emd`` between the two transition rows,
    measured with the current ``self.dist_matrix``.  The pair's entry in
    ``self.tmp_dist_matrix`` is then the larger of max-over-a-of-min-over-b
    and max-over-b-of-min-over-a of those values.

    After each sweep ``self.dist_matrix`` is replaced by a copy of
    ``self.tmp_dist_matrix``; the loop ends once the mean absolute change of
    that sweep is below ``opts.threshold``.
    """
    while True:
        for _, src_idx in self.src_env.state2idx.items():
            for _, tgt_idx in sorted(self.tgt_env.state2idx.items()):
                for act_src in range(self.action_space):
                    for act_tgt in range(self.action_space):
                        transport = emd(self.src_env.tp_matrix[src_idx, act_src],
                                        self.tgt_env.tp_matrix[tgt_idx, act_tgt],
                                        self.dist_matrix)  # pyemd
                        reward_gap = self.reward_matrix_tmp[
                            src_idx, act_src, tgt_idx, act_tgt]
                        self.d[src_idx, act_src, tgt_idx, act_tgt] = (
                            self.opts.discount_r * reward_gap
                            + self.opts.discount_kd * transport)

                action_costs = self.d[src_idx, :, tgt_idx, :]
                best_over_src = np.max(np.min(action_costs, axis=1))
                best_over_tgt = np.max(np.min(action_costs, axis=0))
                self.tmp_dist_matrix[src_idx, tgt_idx] = max(best_over_src,
                                                             best_over_tgt)

        # Measure the change before overwriting the current matrix; the
        # matrix is refreshed whether or not this sweep converged.
        converged = np.mean(
            np.abs(self.dist_matrix - self.tmp_dist_matrix)) < self.opts.threshold
        self.dist_matrix = self.tmp_dist_matrix.copy()
        if converged:
            break
def wmd(self, sent1="this is sentence", sent2="this is sentence") -> float:
    '''
    Calculates the word mover's distance between two sentences.

    sent1, sent2: Two input sentences in text form.

    Each sentence is converted with ``self._sent_to_sparse``.  The distance is
    computed only over the union of the indices present in either sentence:
    the ground distance is the Euclidean distance between the corresponding
    rows of ``self.embedding`` and the masses are the sentences' values at
    those indices.

    Returns the value of ``pyemd.emd``, or ``None`` when either sentence
    could not be converted (``_sent_to_sparse`` returned ``None``).
    '''
    sp1 = self._sent_to_sparse(sent1)
    sp2 = self._sent_to_sparse(sent2)
    if sp1 is None or sp2 is None:
        # Kept as a silent None so existing callers that test for it still work.
        return None

    union_idx = np.union1d(sp1.indices, sp2.indices)
    W = sklearn.metrics.euclidean_distances(self.embedding[union_idx])
    W = W.astype("float64")

    # `.toarray()` instead of the `.A` shorthand: same dense result, and it
    # is available on every SciPy sparse container (presumably what
    # `_sent_to_sparse` returns, given the `.indices` access above).
    sp1 = sp1[:, union_idx].toarray().ravel()
    sp2 = sp2[:, union_idx].toarray().ravel()
    return pyemd.emd(sp1, sp2, W)
def worker(antigen):
    """Compute the EMD between samples A and B for one extra antigen.

    The antigen is appended to ``backbone_antigens`` to form the column list.
    Those columns are taken from ``A.data`` and ``B.data`` (restricted by each
    sample's ``erythroid_mask``), each sample is clustered with
    ``kmeans_clustering``, and the city-block distances between the two sets
    of cluster rows form the ground-distance matrix.  The cluster sizes,
    normalised to sum to one, are the two mass vectors.

    ``A``, ``B``, ``backbone_antigens``, ``clusters`` and ``kmeans_jobs`` are
    read from the enclosing scope.

    Returns the EMD value, which is also printed.
    """
    # Function-call form so the module parses on Python 3 as well; output is
    # unchanged for a single argument.
    print(antigen)
    atg_list = backbone_antigens + [antigen]

    A_dat = A.data[atg_list][A.erythroid_mask]
    B_dat = B.data[atg_list][B.erythroid_mask]

    A_cluster = kmeans_clustering(A_dat, n_clusters=clusters, n_jobs=kmeans_jobs)
    B_cluster = kmeans_clustering(B_dat, n_clusters=clusters, n_jobs=kmeans_jobs)

    distance_matrix = cdist(A_cluster.output[atg_list],
                            B_cluster.output[atg_list],
                            'cityblock')

    # Cluster sizes as double-precision weights normalised to unit mass.
    XA = A_cluster.output["cluster_size"].values.astype(np.double)
    XA = XA / np.sum(XA)
    XB = B_cluster.output["cluster_size"].values.astype(np.double)
    XB = XB / np.sum(XB)

    cost = emd(XA, XB, distance_matrix)
    print("{} has emd of {}".format(antigen, cost))
    return cost
raise AnnoyingError('F**k this') H1, _, _ = np.histogram2d(*D1.T, bins=(bx, by)) H2, _, _ = np.histogram2d(*D2.T, bins=(bx, by)) H1 /= H1.sum() H2 /= H2.sum() _x, _y = np.indices(H1.shape) coords = np.array(zip(_x.ravel(), _y.ravel())) D = distance(coords, coords) return emd(H1.ravel(), H2.ravel(), D) def _calculate_emd_1D(D1, D2, bins=40): """ Args: ----- D1, D2: two np arrays with potentially differing numbers of rows, but two columns. The empirical distributions you want a similarity over bins: number of bins in each dim """ D1 = D1[np.isnan(D1).sum(axis=-1) < 1] D2 = D2[np.isnan(D2).sum(axis=-1) < 1]
def word_mover_score(ngram, refs, hyps, batch_size=256, device='cuda:0'):
    """Score each (reference, hypothesis) pair as ``1 - emd`` over n-gram embeddings.

    Args:
        ngram: n-gram size forwarded to ``load_ngram``.
        refs: reference texts; processed in slices of ``batch_size``.
        hyps: hypothesis texts, sliced with the same batch boundaries as
            ``refs`` (presumably aligned one-to-one with ``refs`` -- confirm
            against callers).
        batch_size: number of texts embedded per ``get_bert_embedding`` call.
        device: device string forwarded to ``get_bert_embedding``.

    Returns:
        A list with one score per entry of each batch's pooled reference
        embedding, in order.

    ``model`` and ``tokenizer`` are read from the enclosing scope.  IDF
    weights default to 1.0 for every token (fresh ``defaultdict`` per call).

    Example (from the original inline notes):
        system = ['This is test summary']
        references = ['This is ref summary two', 'this is test summary']
        score = word_mover_score(2, references, system * len(references))
    """
    idf_dict_ref = defaultdict(lambda: 1.)
    idf_dict_hyp = defaultdict(lambda: 1.)

    preds = []
    for batch_start in range(0, len(refs), batch_size):
        batch_refs = refs[batch_start:batch_start + batch_size]
        batch_hyps = hyps[batch_start:batch_start + batch_size]

        ref_embedding, ref_lens, ref_masks, ref_idf, ref_tokens = get_bert_embedding(
            batch_refs, model, tokenizer, idf_dict_ref, device=device)
        hyp_embedding, hyp_lens, hyp_masks, hyp_idf, hyp_tokens = get_bert_embedding(
            batch_hyps, model, tokenizer, idf_dict_hyp, device=device)

        # In-place L2 normalisation along the last dimension.
        ref_embedding.div_(torch.norm(ref_embedding, dim=-1).unsqueeze(-1))
        hyp_embedding.div_(torch.norm(hyp_embedding, dim=-1).unsqueeze(-1))

        # Pool the last five slices along dim 0 (presumably the last five
        # model layers -- confirm) with max, min and mean, then concatenate
        # the three pooled views on the feature axis.
        ref_embedding_max, _ = torch.max(ref_embedding[-5:], dim=0, out=None)
        hyp_embedding_max, _ = torch.max(hyp_embedding[-5:], dim=0, out=None)

        ref_embedding_min, _ = torch.min(ref_embedding[-5:], dim=0, out=None)
        hyp_embedding_min, _ = torch.min(hyp_embedding[-5:], dim=0, out=None)

        ref_embedding_avg = ref_embedding[-5:].mean(0)
        hyp_embedding_avg = hyp_embedding[-5:].mean(0)

        ref_embedding = torch.cat(
            [ref_embedding_min, ref_embedding_avg, ref_embedding_max], -1)
        hyp_embedding = torch.cat(
            [hyp_embedding_min, hyp_embedding_avg, hyp_embedding_max], -1)

        num_refs = len(ref_embedding)
        for i in range(num_refs):
            ref_ids = range(0, len(ref_tokens[i]))
            hyp_ids = range(0, len(hyp_tokens[i]))

            ref_embedding_i, ref_idf_i = load_ngram(
                ref_ids, ref_embedding[i], ref_idf[i], ngram, 1)
            hyp_embedding_i, hyp_idf_i = load_ngram(
                hyp_ids, hyp_embedding[i], hyp_idf[i], ngram, 1)

            # Joint support: reference n-grams first, hypothesis n-grams after.
            raw = torch.cat([ref_embedding_i, hyp_embedding_i], 0)
            # The small epsilon keeps all-zero rows from dividing by zero.
            raw.div_(torch.norm(raw, dim=-1).unsqueeze(-1) + 0.000001)

            distance_matrix = pairwise_distances(raw, raw)

            n_ref = len(ref_idf_i)
            n_hyp = len(hyp_idf_i)
            c1 = np.zeros(n_ref + n_hyp, dtype=np.double)
            c2 = np.zeros(n_ref + n_hyp, dtype=np.double)

            c1[:n_ref] = ref_idf_i
            # Positive start index: the former `c2[-n_hyp:]` selects the
            # WHOLE array when n_hyp == 0 and the assignment then fails.
            c2[n_ref:] = hyp_idf_i

            c1 = _safe_divide(c1, np.sum(c1))
            c2 = _safe_divide(c2, np.sum(c2))

            score = 1 - emd(c1, c2, distance_matrix.double().cpu().numpy())
            preds.append(score)

    return preds
def myfun1(pair, datai, dataj, W_dist):
    """Return ``(distance, pair)`` for one pair of weight containers.

    The distance is ``pyemd.emd`` of ``datai.values`` and ``dataj.values``
    under the ground-distance matrix ``W_dist``; ``pair`` is passed through
    unchanged so the caller can tell which pair the distance belongs to.
    """
    return pyemd.emd(datai.values, dataj.values, W_dist), pair
f_t = np.load(feature_dir + '%s.npy' % td) w_s = np.load(feature_dir + '%s_weight.npy' % sd) w_t = np.load(feature_dir + '%s_weight.npy' % td) f_s = f_s[idx, :] w_s = w_s[idx] # Make sure two histograms have the same length and distance matrix is square. data = np.float64(np.append(f_s, f_t, axis=0)) w_1 = np.zeros((len(w_s) + len(w_t), ), np.float64) w_2 = np.zeros((len(w_s) + len(w_t), ), np.float64) w_1[:len(w_s)] = w_s / np.sum(w_s) w_2[len(w_s):] = w_t / np.sum(w_t) D = euclidean_distances(data, data) emd = pyemd.emd(np.float64(w_1), np.float64(w_2), np.float64(D)) domain_similarity = np.exp(-gamma * emd) similarity_matrix[sd][td] = domain_similarity print('EMD: %.3f Domain Similarity: %.3f\n' % (emd, domain_similarity)) np.save('cui_similarity_mat.npy', similarity_matrix) print('Elapsed time: %.3fs' % (time.time() - tic)) np.fill_diagonal(similarity_matrix, np.nan) def draw_figure_to_plt(distance_matrix, names, label_size=14): fig = plt.figure(figsize=(15 / 25. * len(names), 15 / 25. * len(names))) ax = plt.gca() plt.imshow(distance_matrix, cmap='viridis_r') ax.set_xticks(np.arange(len(names)))
def two_sentence_dis(sentence1, sentence2):
    """Word Mover's Distance between two tokenised sentences.

    Args:
        sentence1: sequence of tokens of the first document.
        sentence2: sequence of tokens of the second document.

    Returns:
        The value of ``pyemd.emd`` over the normalised bag-of-words vectors of
        the two sentences, or ``float('inf')`` when either sentence has no
        in-vocabulary token or the distance matrix is all zeros.

    Tokens missing from ``model_dic['word_index']`` are dropped.  The
    embedding table is loaded from ``BASE_DIR + 'embedding.pkl'`` the first
    time it is needed and cached in ``model_dic['embedding']``.
    """
    if 'embedding' not in model_dic:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point BASE_DIR at a trusted location.
        with open(BASE_DIR + 'embedding.pkl', 'rb') as vocab:
            model_dic['embedding'] = pickle.load(vocab)

    def _in_vocabulary_ids(tokens):
        # Map tokens to vocabulary ids, dropping out-of-vocabulary words.
        return [
            model_dic['word_index'][token]
            for token in tokens
            if token in model_dic['word_index']
        ]

    len_sentence1 = len(sentence1)
    len_sentence2 = len(sentence2)

    sentence1 = _in_vocabulary_ids(sentence1)
    sentence2 = _in_vocabulary_ids(sentence2)

    diff1 = len_sentence1 - len(sentence1)
    diff2 = len_sentence2 - len(sentence2)
    if diff1 > 0 or diff2 > 0:
        # The counts are interpolated here; print() does not apply
        # logger-style lazy formatting to extra arguments.
        print('Removed %d and %d OOV words from document 1 and 2 (respectively).'
              % (diff1, diff2))

    if len(sentence1) == 0 or len(sentence2) == 0:
        print('At least one of the documents had no words that were '
              'in the vocabulary. Aborting (returning inf).')
        return float('inf')

    # Slot index -> vocabulary id, over the ids present in either sentence.
    dictionary = dict(enumerate(set(sentence1 + sentence2)))
    vocab_len = len(dictionary)

    sen_set1 = set(sentence1)
    sen_set2 = set(sentence2)

    # Only (id in sentence 1, id in sentence 2) cells are filled; the rest
    # stay zero.
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in dictionary.items():
        if t1 not in sen_set1:
            continue
        for j, t2 in dictionary.items():
            if t2 not in sen_set2:
                continue
            # Euclidean distance between the two embedding vectors.
            distance_matrix[i, j] = sqrt(
                np_sum((model_dic['embedding'][t1] - model_dic['embedding'][t2])**2))

    if np_sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        print('The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    # Vocabulary id -> slot index (inverse of `dictionary`).
    slot_of = {token_id: slot for slot, token_id in dictionary.items()}

    def nbow(document):
        # Normalised bag-of-words: occurrence count of each id divided by
        # the document length.
        d = zeros(vocab_len, dtype=double)
        for token_id in document:
            d[slot_of[token_id]] += 1
        d /= float(len(document))
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(sentence1)
    d2 = nbow(sentence2)

    # Compute WMD.
    return pyemd.emd(d1, d2, distance_matrix)
d1 = f.read().replace('\n', '') with open('/home/avinash/Downloads/autonomous_soft_robot.txt') as f: d2 = f.read().replace('\n', '') #d2 = "The President addresses the press in Chicago" vect = CountVectorizer(stop_words="english").fit([d1, d2]) print("Features:", ", ".join(vect.get_feature_names())) v_1, v_2 = vect.transform([d1, d2]) v_1 = v_1.toarray().ravel() v_2 = v_2.toarray().ravel() print(v_1, v_2) print("cosine(doc_1, doc_2) = {:.2f}".format(cosine(v_1, v_2))) W_ = W[[vocab_dict[w] for w in vect.get_feature_names()]] D_ = euclidean_distances(W_) # pyemd needs double precision input v_1 = v_1.astype(np.double) v_2 = v_2.astype(np.double) v_1 /= v_1.sum() v_2 /= v_2.sum() D_ = D_.astype(np.double) D_ /= D_.max() # just for comparison purposes print("d(doc_1, doc_2) = {:.2f}".format(emd(v_1, v_2, D_)))
def compute_d(least_fixed_iters=10, threshold=0.00001, emd_func='cv2', use_manhattan_as_d=False):
    """
    Computes state-action and state-state bisimulation metric

    Reads ``src_env``, ``tgt_env``, ``src_state_space``, ``tgt_state_space``
    and ``action_space`` from the enclosing scope, and steps both
    environments while building the reward-difference table.

    Parameters:
        least_fixed_iters (int): Number of outer solve iterations
        threshold (float): Threshold value for stopping the solver for distance matrix
        emd_func (str): Specify which function to use for calculating the Earth Mover's distance
                        or Wasserstein distance or Kantorovich metric
                        Options: ['scipy', 'cv2', 'opt', 'pyemd']
                        NOTE(review): in this body the value only selects
                        float64 ('pyemd') versus float32 buffers and the
                        initial distance matrix; the distance itself is
                        always computed through `emd`.
        use_manhattan_as_d (bool): If True, use manhattan distance as distance matrix
                        NOTE(review): this flag is not read anywhere in this body.

    Returns:
        d_final: Bisimulation state-action metric with dim (S1 x a x S2 x b)
        dist_matrix_final: Bisimulation state-state metric with dim (S1 x S2)
    """
    print ("EMD computed using: ", emd_func)
    print ("Number of lfp iterations: ", least_fixed_iters)
    print ("Threshold value: ", threshold)
    print ("Source size: ", src_state_space, " Target size: ", tgt_state_space)

    # Allocate work buffers: default dtype for 'pyemd', float32 otherwise.
    if emd_func == 'pyemd':
        reward_matrix_tmp = np.zeros((src_state_space, action_space, tgt_state_space, action_space))
        reward_matrix = np.zeros((src_state_space, tgt_state_space))
        dist_matrix_final = np.zeros((src_state_space, tgt_state_space))
        d_final = np.zeros((src_state_space, action_space, tgt_state_space, action_space))
    else:
        reward_matrix_tmp = np.zeros((src_state_space, action_space, tgt_state_space, action_space)).astype(np.float32)
        reward_matrix = np.zeros((src_state_space, tgt_state_space)).astype(np.float32)
        # The environments' transition matrices are converted in place too.
        src_env.tp_matrix = src_env.tp_matrix.astype(np.float32)
        tgt_env.tp_matrix = tgt_env.tp_matrix.astype(np.float32)
        dist_matrix_final = np.zeros((src_state_space, tgt_state_space)).astype(np.float32)
        d_final = np.zeros((src_state_space, action_space, tgt_state_space, action_space)).astype(np.float32)

    # Absolute reward difference for every (s1, a, s2, b): place each
    # environment at the state, take one step, then put it back.
    for s1_pos, s1_state in src_env.state2idx.items():
        src_env.position = s1_pos
        src_env.start_position = s1_pos
        for s2_pos, s2_state in tgt_env.state2idx.items():
            tgt_env.position = s2_pos
            tgt_env.start_position = s2_pos
            for a in range(action_space):
                next_state, reward_a, done, next_possible_states = src_env.step(a)
                src_env.start_position = s1_pos
                src_env.position = s1_pos
                for b in range(action_space):
                    next_state, reward_b, done, next_possible_states = tgt_env.step(b)
                    reward_matrix_tmp[s1_state, a, s2_state, b] = math.fabs(reward_a - reward_b)
                    tgt_env.start_position = s2_pos
                    tgt_env.position = s2_pos

    # Largest reward difference over all action pairs, per state pair.
    # NOTE(review): reward_matrix is filled here but not read again below.
    for s1_pos, s1_state in src_env.state2idx.items():
        for s2_pos, s2_state in tgt_env.state2idx.items():
            reward_matrix[s1_state, s2_state] = np.max(reward_matrix_tmp[s1_state,:,s2_state,:])

    # Supply Manhattan distance as an alternative to reward distance for calculation of EMD
    # DO NOT USE when S1 and S2 are of different sizes
    # NOTE(review): manhattan_distance is computed but never used afterwards.
    manhattan_distance = np.zeros((src_env.state_space, tgt_env.state_space))
    for s1_pos, s1_state in src_env.state2idx.items():
        for s2_pos, s2_state in tgt_env.state2idx.items():
            manhattan_distance[s1_state, s2_state] = distance.cityblock(s1_pos, s2_pos)

    dist_matrix_final.fill(1000.0)
    d_final.fill(1000.0)

    # Starting point of the solver: constant 0.01 for 'pyemd', random otherwise.
    if emd_func == 'pyemd':
        d = np.zeros((src_state_space, action_space, tgt_state_space, action_space))
        dist_matrix = np.zeros((src_state_space, tgt_state_space))
        dist_matrix.fill(0.01)
        tmp_dist_matrix = np.zeros((src_state_space, tgt_state_space))
    else:
        d = np.zeros((src_state_space, action_space, tgt_state_space, action_space)).astype(np.float32)
        dist_matrix = np.random.rand(src_state_space, tgt_state_space).astype(np.float32)
        tmp_dist_matrix = np.random.rand(src_state_space, tgt_state_space).astype(np.float32)

    for i in range(least_fixed_iters):
        print ("Iteration: ", i, "/", least_fixed_iters, " Loss: ", np.mean(np.abs(dist_matrix_final - dist_matrix)))
        dist_matrix_final = dist_matrix.copy()
        # NOTE(review): ctr is assigned but never read.
        ctr = 0
        # Repeat full sweeps until the mean absolute change of the
        # state-state matrix falls below `threshold`.
        while True:
            for s1_pos, s1_state in src_env.state2idx.items():
                for s2_pos, s2_state in sorted(tgt_env.state2idx.items()):
                    for a in range(action_space):
                        for b in range(action_space):
                            kd = emd(src_env.tp_matrix[s1_state,a], tgt_env.tp_matrix[s2_state,b], dist_matrix) # pyemd
                            # Fixed 0.1 / 0.9 weighting of reward gap and transport cost.
                            new_val = 0.1 * reward_matrix_tmp[s1_state, a, s2_state, b] + 0.9 * kd
                            d[s1_state, a, s2_state, b] = new_val
                    d_st = d[s1_state, :, s2_state, :]
                    # Larger of max-over-a of min-over-b and max-over-b of min-over-a.
                    val = max(np.max(np.min(d_st, axis=1)), np.max(np.min(d_st, axis=0)))
                    tmp_dist_matrix[s1_state, s2_state] = val
            if np.mean(np.abs(dist_matrix - tmp_dist_matrix)) < threshold:
                dist_matrix = tmp_dist_matrix.copy()
                break
            dist_matrix = tmp_dist_matrix.copy()

    dist_matrix_final = dist_matrix.copy()
    d_final = d.copy()
    return d_final, dist_matrix_final