Beispiel #1
0
def test_emd_validate_larger_signatures_1():
    """emd() must raise ValueError when both signatures outgrow the matrix.

    Each signature has three entries while the distance matrix is only 2x2.
    """
    ground_distances = np.array([[0.0, 0.5],
                                 [0.5, 0.0]])
    sig_a = np.array([0.0, 1.0, 2.0])
    sig_b = np.array([5.0, 3.0, 3.0])
    with pytest.raises(ValueError):
        emd(sig_a, sig_b, ground_distances)
Beispiel #2
0
def test_symmetric_distance_matrix():
    """emd() must raise ValueError for a malformed distance matrix."""
    sig_a = np.array([0.0, 1.0])
    sig_b = np.array([5.0, 3.0])
    # NOTE(review): the two rows have different lengths, so this literal is
    # ragged -- confirm the installed NumPy still builds it rather than
    # raising here, before emd() is ever reached.
    ragged_rows = [[0.0, 0.5, 3.0],
                   [0.5, 0.0]]
    ground_distances = np.array(ragged_rows)
    with pytest.raises(ValueError):
        emd(sig_a, sig_b, ground_distances)
Beispiel #3
0
 def test_error_wrong_distance_matrix_ndim(self):
     """emd() must raise ValueError when the distance matrix is not 2-D."""
     sig_a = np.array([6.0, 1.0])
     sig_b = np.array([1.0, 7.0])
     # The extra pair of brackets turns the 2x2 matrix into a 3-D array.
     ground_distances = np.array([[[0.0, 1.0],
                                   [1.0, 0.0]]])
     with self.assertRaises(ValueError):
         emd(sig_a, sig_b, ground_distances)
Beispiel #4
0
 def test_error_different_signature_lengths(self):
     """emd() must raise ValueError when the signatures differ in length."""
     ground_distances = np.array([[0.0, 1.0],
                                  [1.0, 0.0]])
     longer_sig = np.array([6.0, 1.0, 9.0])
     shorter_sig = np.array([1.0, 7.0])
     with self.assertRaises(ValueError):
         emd(longer_sig, shorter_sig, ground_distances)
Beispiel #5
0
def test_emd_validate_different_signature_dims():
    """emd() must raise ValueError for signatures of unequal length.

    The first signature has two entries, the second has three, and the
    distance matrix is 3x3.
    """
    ground_distances = np.array([[0.0, 0.5, 0.0],
                                 [0.5, 0.0, 0.0],
                                 [0.5, 0.0, 0.0]])
    shorter_sig = np.array([0.0, 1.0])
    longer_sig = np.array([5.0, 3.0, 3.0])
    with pytest.raises(ValueError):
        emd(shorter_sig, longer_sig, ground_distances)
Beispiel #6
0
def calc_wmd(d1, d2, dm, vob_index_dict):
    """Compute the Word Mover's Distance between two tokenised documents.

    Args:
        d1: Tokens of the first document.
        d2: Tokens of the second document.
        dm: Precomputed word-to-word distance matrix, indexed ``dm[row, col]``.
        vob_index_dict: Mapping from a token to its row/column index in ``dm``.

    Returns:
        The result of ``emd`` on the two nBOW vectors, using the sub-matrix
        of ``dm`` restricted to the tokens of both documents.
    """
    du = set(d1).union(set(d2))

    f1 = np.array(nBOW(d1, du))
    f2 = np.array(nBOW(d2, du))

    du_list = list(du)
    dul = len(du_list)
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    dum = np.zeros((dul, dul), dtype=np.float64)

    # Fill the strict upper triangle and mirror it; the diagonal stays 0.
    # Starting j at i + 1 replaces the original "already processed" list
    # lookup, which visited the same pairs at O(n) cost per membership test.
    for i in range(dul):
        for j in range(i + 1, dul):
            dist = dm[vob_index_dict[du_list[i]], vob_index_dict[du_list[j]]]
            dum[i][j] = dist
            dum[j][i] = dist

    return emd(f1, f2, dum)
Beispiel #7
0
def score_word2vec_wmd(src, dst, wv):
    """Score tab-separated sentence pairs with the Word Mover's Distance.

    Each line of ``src`` holds two sentences separated by a tab. One score
    per line, rounded to two decimals, is written to ``dst``.

    Args:
        src: Path of the input file (``sentence1<TAB>sentence2`` per line).
        dst: Path of the output file, one score per line.
        wv: Word-vector lookup supporting ``word in wv`` and ``wv[word]``.
    """
    b1 = []
    b2 = []
    with open(src) as p:
        for line in p:
            s = line.split('\t')
            b1.append(s[0])
            # Strip only the line terminator. Slicing off the last character
            # removed real text when the final line had no newline, or when
            # the line had more than two tab-separated fields.
            b2.append(s[1].rstrip('\n'))
    lines = len(b1)

    # First pass only discovers the corpus vocabulary.
    vectorizer = CountVectorizer()
    vectorizer.fit(b1 + b2)
    common = [word for word in vectorizer.get_feature_names() if word in wv]
    W_common = [wv[w] for w in common]

    # Second pass counts only the words that have an embedding.
    vectorizer = CountVectorizer(vocabulary=common, dtype=np.double)
    b1_v = vectorizer.transform(b1)
    b2_v = vectorizer.transform(b2)

    D_ = sklearn.metrics.euclidean_distances(W_common)
    D_ = D_.astype(np.double)
    D_ /= D_.max()

    b1_vecs = b1_v.toarray()
    # Fixed: this used to be built from b1_v, so every sentence was compared
    # with itself instead of with its partner.
    b2_vecs = b2_v.toarray()
    # NOTE(review): both matrices are divided by their *total* count, not by
    # a per-row sum, so individual rows do not sum to 1 -- confirm that this
    # is the intended normalisation.
    b1_vecs /= b1_v.sum()
    b2_vecs /= b2_v.sum()
    b1_vecs = b1_vecs.astype(np.double)
    b2_vecs = b2_vecs.astype(np.double)

    res = [round(emd(b1_vecs[i], b2_vecs[i], D_), 2) for i in range(lines)]

    with open(dst, 'w') as thefile:
        thefile.write("\n".join(str(i) for i in res))
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print(src + ' finished!')
Beispiel #8
0
def wordMoverDistance(d1, d2):
    """Return the Word Mover's Distance between two token lists.

    Args:
        d1: List of tokens of the first document.
        d2: List of tokens of the second document.

    Returns:
        The ``emd`` value of the two normalised bag-of-words vectors, using
        un-normalised Euclidean distances between the embedding rows.
    """
    # Drop out-of-vocabulary tokens, then rebuild each document as a string.
    kept_1 = [tok for tok in d1 if tok in vocab_dict]
    kept_2 = [tok for tok in d2 if tok in vocab_dict]
    d1 = " ".join(kept_1)
    d2 = " ".join(kept_2)

    vect = CountVectorizer().fit([d1, d2])

    # Embedding rows of the fitted features -> pairwise distance matrix.
    embedding_rows = [vocab_dict[name] for name in vect.get_feature_names()]
    W_ = W[embedding_rows]
    D_ = euclidean_distances(W_).astype(np.double)

    # Word counts as double precision, each normalised to sum to 1.
    row_1, row_2 = vect.transform([d1, d2])
    v_1 = row_1.toarray().ravel().astype(np.double)
    v_2 = row_2.toarray().ravel().astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()

    return emd(v_1, v_2, D_)
Beispiel #9
0
def test_emd_3():
    """With an all-zero distance matrix the expected EMD is 0.0."""
    sig_a = np.array([6.0, 1.0])
    sig_b = np.array([1.0, 7.0])
    zero_cost = np.array([[0.0, 0.0],
                          [0.0, 0.0]])
    result = emd(sig_a, sig_b, zero_cost)
    emd_assert(result, 0.0)
Beispiel #10
0
def test_emd_1():
    """Check emd() against the expected value 3.5 for a 2-bin example."""
    sig_a = np.array([0.0, 1.0])
    sig_b = np.array([5.0, 3.0])
    ground_distances = np.array([[0.0, 0.5],
                                 [0.5, 0.0]])
    result = emd(sig_a, sig_b, ground_distances)
    emd_assert(result, 3.5)
    def _wh_ne_distance(self, other, w):
        """Return an EMD-based distance between two named-entity weightings.

        Args:
            other: Object to compare against; must expose attribute ``w``.
            w: Name of the attribute holding the entity -> weight mapping on
                both ``self`` and ``other``.

        Returns:
            ``np.nan`` when either mapping is empty or ``NE.matrix`` yields
            no entities; otherwise
            ``(1 - penalty) * emd(v1, v2, matrix) + penalty``.
        """
        c1 = getattr(self, w)
        c2 = getattr(other, w)

        if not len(c1) or not len(c2):
            # One of them has nothing to compare; distance is np.nan.
            return np.nan

        # Entities ordered by decreasing weight.
        s1 = sorted(c1.keys(), key=lambda k: c1[k], reverse=True)
        s2 = sorted(c2.keys(), key=lambda k: c2[k], reverse=True)

        if self.max_nes > 0:
            # The penalty is the larger of the two weights cut off by the
            # max_nes limit: documents dominated by low-ranked entities
            # should not look close to anything.
            # (Loop variable is ``ne`` so it no longer shadows parameter w.)
            penalty = max(
                sum(c1[ne] for ne in s1[self.max_nes:]),
                sum(c2[ne] for ne in s2[self.max_nes:])
            )
            s1 = s1[:self.max_nes]
            s2 = s2[:self.max_nes]
        else:
            penalty = 0

        matrix, nes = NE.matrix(set(s1).union(set(s2)))

        if not nes:
            # Not a single NE to compare; distance is np.nan.
            return np.nan

        nes = [ne.lower() for ne in nes]  # NE.matrix returns Titles
        # NOTE(review): an entity present in only one document is looked up
        # in both mappings -- presumably they are Counter-like and return 0
        # for missing keys; confirm.
        # Explicit double dtype keeps the in-place division below valid even
        # when the stored weights are integers.
        v1 = np.array([c1[ne] for ne in nes], dtype=np.double)
        v2 = np.array([c2[ne] for ne in nes], dtype=np.double)

        # Normalise each vector to sum to 1 (all-zero vectors are left as is).
        s = v1.sum()
        if s > 0:
            v1 /= s

        s = v2.sum()
        if s > 0:
            v2 /= s

        # Per the original author's note the EMD lies in [0, 1]; rescaling
        # by (1 - penalty) and adding penalty maps it to [penalty, 1].
        return (1 - penalty) * emd(v1, v2, matrix) + penalty
Beispiel #12
0
def hamming_emd(d1, d2):
    """Earth Mover's Distance between two distributions under Hamming cost.

    The distributions are indexed by state, one dimension per node.
    Singleton dimensions are squeezed out before the comparison, and the
    transportation cost matrix comes from ``_hamming_matrix`` for the
    remaining number of dimensions.
    """
    squeezed_1 = d1.squeeze()
    squeezed_2 = d2.squeeze()
    cost = _hamming_matrix(squeezed_1.ndim)
    return emd(squeezed_1.ravel(), squeezed_2.ravel(), cost)
Beispiel #13
0
def hamming_emd(d1, d2):
    """Earth Mover's Distance between two distributions under Hamming cost.

    The distributions are indexed by state, one dimension per node. The
    number of nodes is the dimensionality of ``d1`` once singleton
    dimensions are squeezed out; both inputs are passed through ``flatten``
    before being handed to ``emd``.
    """
    n_nodes = d1.squeeze().ndim
    flat_1 = flatten(d1)
    flat_2 = flatten(d2)
    cost = _hamming_matrix(n_nodes)
    return emd(flat_1, flat_2, cost)
def dist_hist(X, Y, distance_matrices):
    """Combine per-block EMDs of two concatenated histograms into one norm.

    ``X`` and ``Y`` are read as consecutive slices, one per matrix in
    ``distance_matrices``; each slice is as long as that matrix has rows.
    The EMD of every pair of slices is computed and the 2-norm of the
    resulting list is returned.
    """
    per_block = []
    offset = 0
    for ground in distance_matrices:
        stop = offset + ground.shape[0]
        per_block.append(emd(X[offset:stop], Y[offset:stop], ground))
        offset = stop
    return np.linalg.norm(per_block)
def dist_hist_withoutnullhist(X, Y, distance_matrices):
    """Like a per-block EMD norm, but ignore blocks with an empty histogram.

    ``X`` and ``Y`` are read as consecutive slices, one per matrix in
    ``distance_matrices``. A block contributes its EMD only when both of
    its slices have a non-zero sum; the 2-norm of the collected EMDs is
    returned.
    """
    per_block = []
    offset = 0
    for ground in distance_matrices:
        stop = offset + ground.shape[0]
        x_part = X[offset:stop]
        y_part = Y[offset:stop]
        offset = stop
        # Skip the block as soon as either side carries no mass.
        if sum(x_part) == 0.0 or sum(y_part) == 0.0:
            continue
        per_block.append(emd(x_part, y_part, ground))
    return np.linalg.norm(per_block)
 def _wmd(self, i, row, X_train):
     """Compute the WMD between training sample i and given test row.

     Assumes that `row` and train samples are sparse BOW vectors summing to 1.

     Only the columns that are non-zero in either vector are used, so the
     distance matrix handed to ``emd`` stays small.
     """
     # NOTE(review): the union of column indices is shifted by -1 before it
     # is used to index both W_embed and the BOW columns -- confirm this
     # offset is intended (index 0 would wrap around to the last entry).
     union_idx = np.union1d(X_train[i].indices, row.indices) - 1
     W_minimal = self.W_embed[union_idx]
     W_dist = euclidean_distances(W_minimal)
     # toarray() is the explicit spelling of the ``.A`` shorthand, which is
     # no longer available on recent SciPy sparse matrices.
     bow_i = X_train[i, union_idx].toarray().ravel()
     bow_j = row[:, union_idx].toarray().ravel()
     return emd(bow_i, bow_j, W_dist)
Beispiel #17
0
def test_emd_extra_mass_penalty():
    """Check emd() with an explicit extra_mass_penalty of 2.5.

    The expected value for these 4-bin signatures is 4.5.
    """
    ground_distances = np.array([[0.0, 1.0, 1.0, 2.0],
                                 [1.0, 0.0, 2.0, 1.0],
                                 [1.0, 2.0, 0.0, 1.0],
                                 [2.0, 1.0, 1.0, 0.0]])
    sig_a = np.array([0.0, 2.0, 1.0, 2.0])
    sig_b = np.array([2.0, 1.0, 2.0, 1.0])
    result = emd(sig_a, sig_b, ground_distances, extra_mass_penalty=2.5)
    emd_assert(result, 4.5)
Beispiel #18
0
def hist_emd(reference_hist_df, compare_hist_df, key, distance_matrix=None):
    """Return the Earth Mover's Distance between two histogram columns.

    The two frames are outer-joined on their indices so both histograms
    share the same bins; bins missing from one side count as zero.

    Args:
        reference_hist_df: DataFrame holding the reference histogram.
        compare_hist_df: DataFrame holding the histogram to compare.
        key: Column name; the merged frame is read at ``key + '_x'``
            (reference) and ``key + '_y'`` (comparison).
        distance_matrix: Optional ground-distance matrix. When omitted it is
            built by ``calc_distance_matrix`` from the merged index.

    Returns:
        The value returned by ``emd`` for the two aligned columns.
    """
    # Merge the two columns on the union of bins.
    merged_df = pd.merge(reference_hist_df, compare_hist_df, how='outer', left_index=True, right_index=True)
    merged_df.fillna(0., inplace=True)  # Treat missing values as zero

    ref_merged_key = key + '_x'
    comp_merged_key = key + '_y'

    # Identity test: ``== None`` on an ndarray compares element-wise and the
    # resulting array cannot be used as an ``if`` condition.
    if distance_matrix is None:
        # Unspecified, calculate
        distance_matrix = calc_distance_matrix(merged_df.index, merged_df.index)

    return emd(merged_df[ref_merged_key].values, merged_df[comp_merged_key].values, distance_matrix)
Beispiel #19
0
 def __sub__(self, other):
     """
     Earth-mover's distance (EMD) between two histograms.
     Calculated for channels separately and summed up.

     Channels of ``self`` and ``other`` are paired in order, and every pair
     is compared with ``Histogram._L1_DISTANCE_MATRIX`` as ground distance.
     """
     # np.float64 replaces the removed ``np.float`` alias (same dtype).
     return sum(
         emd(
             mine.astype(np.float64),
             theirs.astype(np.float64),
             Histogram._L1_DISTANCE_MATRIX
         )
         for mine, theirs in zip(self.channels, other.channels)
     )
Beispiel #20
0
def word_movers(doc1, doc2, metric='cosine'):
    """
    Measure the semantic similarity between two documents using Word Movers
    Distance.

    Args:
        doc1 (``textacy.Doc`` or ``spacy.Doc``)
        doc2 (``textacy.Doc`` or ``spacy.Doc``)
        metric ({'cosine', 'euclidean', 'l1', 'l2', 'manhattan'})

    Returns:
        float: similarity between `doc1` and `doc2` in the interval [0.0, 1.0],
            where larger values correspond to more similar documents

    References:
        Ofir Pele and Michael Werman, "A linear time histogram metric for improved
            SIFT matching," in Computer Vision - ECCV 2008, Marseille, France, 2008.
        Ofir Pele and Michael Werman, "Fast and robust earth mover's distances,"
            in Proc. 2009 IEEE 12th Int. Conf. on Computer Vision, Kyoto, Japan, 2009.
        Kusner, Matt J., et al. "From word embeddings to document distances."
            Proceedings of the 32nd International Conference on Machine Learning
            (ICML 2015). 2015. http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
    """
    # A fresh store local to this call; word ids below are relative to it.
    stringstore = StringStore()

    # First pass over the words of both documents: collect one vector per
    # distinct word, in order of first appearance.
    n = 0
    word_vecs = []
    for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
        if word.has_vector:
            # NOTE(review): this presumably relies on the lookup handing an
            # unseen string the next sequential id, so the comparison holds
            # only the first time a word is met -- confirm for the
            # StringStore version in use.
            if stringstore[word.text] - 1 == n:  # stringstore[0] always empty space
                word_vecs.append(word.vector)
                n += 1
    # Pairwise distances between the collected vectors, scaled so that the
    # largest entry is 1.
    distance_mat = pairwise_distances(np.array(word_vecs), metric=metric).astype(np.double)
    distance_mat /= distance_mat.max()

    # Count how often each word id occurs in doc1 (words with vectors only).
    vec1 = collections.Counter(
        stringstore[word.text] - 1
        for word in extract.words(doc1)
        if word.has_vector)
    # Dense count vector over ids 0 .. len(stringstore) - 1.
    vec1 = np.array([vec1[word_idx] for word_idx in range(len(stringstore))]).astype(np.double)
    vec1 /= vec1.sum()  # normalize word counts

    # Same construction for doc2.
    vec2 = collections.Counter(
        stringstore[word.text] - 1
        for word in extract.words(doc2)
        if word.has_vector)
    vec2 = np.array([vec2[word_idx] for word_idx in range(len(stringstore))]).astype(np.double)
    vec2 /= vec2.sum()  # normalize word counts

    # Turn the distance into a similarity.
    return 1.0 - emd(vec1, vec2, distance_mat)
Beispiel #21
0
    def wordMoversDistance(self, s1, s2):
        """Return the Word Mover's Distance between two raw text strings.

        Both strings are vectorised with an English stop-word filter. Word
        counts are passed to ``emd`` un-normalised, together with the
        Euclidean distances between the words' rows of ``self.W``.

        Args:
            s1: First document as a string.
            s2: Second document as a string.

        Returns:
            The value returned by ``emd``.
        """
        vect = CountVectorizer(stop_words="english").fit([s1, s2])

        v_1, v_2 = vect.transform([s1, s2])
        v_1 = v_1.toarray().ravel()
        v_2 = v_2.toarray().ravel()

        vocab = self.vocab_dict
        # Out-of-vocabulary words borrow the row of the first vocabulary key.
        # next(iter(...)) picks that key on Python 2 and 3 alike, whereas
        # ``keys()[0]`` fails on Python 3 where keys() is not subscriptable.
        fallback_row = vocab[next(iter(vocab))]
        rows = [vocab[w] if w in vocab else fallback_row
                for w in vect.get_feature_names()]
        W_ = self.W[rows]
        D_ = euclidean_distances(W_)

        # pyemd needs double precision input
        v_1 = v_1.astype(np.double)
        v_2 = v_2.astype(np.double)
        D_ = D_.astype(np.double)

        return emd(v_1, v_2, D_)
def emd_distance(x, y, distance_scaling=1.0):
    """Return the EMD between two 1-D histograms on an integer grid.

    The ground distance between bins i and j is ``|i - j| /
    distance_scaling``. The shorter histogram is zero-padded to the length
    of the longer one.

    Args:
        x: 1-D NumPy array with the first histogram.
        y: 1-D NumPy array with the second histogram.
        distance_scaling: Divisor applied to the ground distances.

    Returns:
        The value returned by ``pyemd.emd``.
    """
    support_size = max(len(x), len(y))
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    d_mat = toeplitz(range(support_size)).astype(np.float64)
    distance_mat = d_mat / distance_scaling

    # convert histogram values x and y to float, and make them equal len
    x = x.astype(np.float64)
    y = y.astype(np.float64)
    if len(x) < len(y):
        x = np.hstack((x, [0.0] * (support_size - len(x))))
    elif len(y) < len(x):
        y = np.hstack((y, [0.0] * (support_size - len(y))))

    return pyemd.emd(x, y, distance_mat)
Beispiel #23
0
    def distance(self, stat, distance_type):
        """Return the distance between this histogram and ``stat``.

        For ``DistanceType.EARTH_MOVER`` the EMD is computed with the
        absolute difference between bin centres as ground distance; every
        other distance type is delegated to ``Distrib.distance``.
        """
        assert isinstance(stat, Histogram)
        if distance_type == DistanceType.EARTH_MOVER:
            # Bin centres: midpoint of each pair of consecutive edges.
            bin_locs = np.mean([self.bin_edges[:-1], self.bin_edges[1:]], axis=0)

            # ground[i, j] = |centre_i - centre_j|, built by broadcasting.
            ground = np.abs(bin_locs[:, np.newaxis] - bin_locs[np.newaxis, :])
            n_rows = len(ground)
            assert n_rows == len(ground[0])
            assert self.data.shape[0] <= n_rows
            assert stat.data.shape[0] <= n_rows

            mine = self.data.astype(np.float64)
            theirs = stat.data.astype(np.float64)
            return emd(mine, theirs, ground.astype(np.float64))
        else:
            return Distrib.distance(self, stat, distance_type)
Beispiel #24
0
def earth_movers_distance(distance_matrix, image1, image2):
    """Return the Earth Mover's Distance between image1 and image2.

    Per the original description, distance_matrix is an N x N matrix with
    N = x * y * z for images of shape (x, y, z), and distance_matrix[i][j]
    is the distance between the ith and jth element of an unravelled image
    (see numpy.ravel() for the flattening order).

    Each image is clipped to the range [0, 999] and divided by its sum
    before the comparison.
    """
    # Turn voxel activations into distributions that sum to 1.
    distributions = []
    for img in (image1, image2):
        clipped = np.clip(img, 0, 999)
        distributions.append(clipped / np.sum(clipped))
    first, second = distributions

    return pyemd.emd(first.ravel(), second.ravel(), distance_matrix)
Beispiel #25
0
def wmd(s1, s2):
    """Return the Word Mover's Distance between two raw text strings.

    Both strings are vectorised with an English stop-word filter. Word
    counts are passed to ``emd`` un-normalised, together with the Euclidean
    distances between the words' rows of the module-level ``W``.

    Args:
        s1: First document as a string.
        s2: Second document as a string.

    Returns:
        The value returned by ``emd``.
    """
    vect = CountVectorizer(stop_words="english").fit([s1, s2])

    v_1, v_2 = vect.transform([s1, s2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()

    # Out-of-vocabulary words borrow the row of the first vocabulary key.
    # next(iter(...)) picks that key on Python 2 and 3 alike, whereas
    # ``keys()[0]`` fails on Python 3 where keys() is not subscriptable.
    fallback_row = vocab_dict[next(iter(vocab_dict))]
    rows = [vocab_dict[w] if w in vocab_dict else fallback_row
            for w in vect.get_feature_names()]
    W_ = W[rows]

    D_ = euclidean_distances(W_)

    # pyemd needs double precision input
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    D_ = D_.astype(np.double)

    return emd(v_1, v_2, D_)
Beispiel #26
0
def similarity(doc1, doc2):
    """Return the Word Mover's Distance between two documents.

    Adapted from
    http://vene.ro/blog/word-movers-distance-in-python.html

    Each feature with an embedding gets weight 1 in a document's vector
    when it occurs in that document; both vectors are normalised to sum to
    1 and compared under max-normalised Euclidean embedding distances.

    Args:
        doc1: First document as a string.
        doc2: Second document as a string.

    Returns:
        The value returned by ``emd``.
    """
    W, vocab_dict = wordEmbedding()
    vect = CountVectorizer(stop_words="english").fit([doc1, doc2])

    # Some features are missing from vocab_dict; simply ignore them.
    newFeature = []
    for w in vect.get_feature_names():
        folded = unidecode(w)
        if folded in vocab_dict:
            newFeature.append(folded)

    v_1 = numpy.zeros(len(newFeature), dtype=numpy.double)
    v_2 = numpy.zeros(len(newFeature), dtype=numpy.double)

    # NOTE(review): ``w in doc`` is a substring test on the raw text, so it
    # is case-sensitive and also matches inside longer words -- confirm
    # this is acceptable.
    for wNo, w in enumerate(newFeature):
        if w in doc1:
            v_1[wNo] = 1
        # Independent test (was ``elif``): a word occurring in both
        # documents must carry weight in both vectors, otherwise the second
        # vector loses every shared word.
        if w in doc2:
            v_2[wNo] = 1

    W_ = W[[vocab_dict[w] for w in newFeature]]
    D_ = euclidean_distances(W_)

    v_1 /= v_1.sum()
    v_2 /= v_2.sum()

    D_ = D_.astype(numpy.double)
    D_ /= D_.max()

    return emd(v_1, v_2, D_)
def WMD(document1, document2, model):
    '''
    Compute WMD.

    Input:
    document1:      List of words.
    document2:      List of words.
    model:          Word2vec model, providing the word embeddings
                    (``model[word]``) and the vocabulary (``model.vocab``).

    Returns:        WMD between documents, float. NaN when either document
                    has no in-vocabulary words.
    '''

    # Remove out-of-vocabulary words.
    len_pre_oov1 = len(document1)
    len_pre_oov2 = len(document2)
    document1 = [token for token in document1 if token in model.vocab]
    document2 = [token for token in document2 if token in model.vocab]
    logging.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', len_pre_oov1 - len(document1), len_pre_oov2 - len(document2))

    if len(document1) == 0 or len(document2) == 0:
        logging.info('At least one of the documents had no words that were in the vocabulary. Aborting (returning NaN).')
        return float('nan')

    vocab = set(document1 + document2)

    # Compute nBOW representation of documents.
    d1 = np.array(nBOW(document1, vocab))
    d2 = np.array(nBOW(document2, vocab))

    # Sets give O(1) membership tests inside the double loop below.
    tokens1 = set(document1)
    tokens2 = set(document2)

    vocab_len = len(vocab)
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.float64)
    for i, t1 in enumerate(vocab):
        if t1 not in tokens1:
            # Only compute the distances that we need.
            continue
        vec1 = model[t1]
        for j, t2 in enumerate(vocab):
            if t2 not in tokens2:
                continue
            # Compute Euclidean distance between word vectors.
            # TODO: this matrix is (and should be) symmetric, so we can save some computation here.
            # TODO: why not cosine distance?
            distance_matrix[i][j] = np.sqrt(np.sum((vec1 - model[t2])**2))

    # Return WMD.
    return emd(d1, d2, distance_matrix)
Beispiel #28
0
def gaussian_emd(x, y, sigma=1.0, distance_scaling=1.0):
    ''' Gaussian kernel with squared distance in exponential term replaced by EMD
    Args:
      x, y: 1D pmf of two distributions with the same support
        (NumPy arrays; the shorter one is zero-padded)
      sigma: standard deviation
      distance_scaling: divisor applied to the |i - j| ground distances
    Returns:
      exp(-emd^2 / (2 * sigma^2))
    '''
    support_size = max(len(x), len(y))
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    d_mat = toeplitz(range(support_size)).astype(np.float64)
    distance_mat = d_mat / distance_scaling

    # convert histogram values x and y to float, and make them equal len
    x = x.astype(np.float64)
    y = y.astype(np.float64)
    if len(x) < len(y):
        x = np.hstack((x, [0.0] * (support_size - len(x))))
    elif len(y) < len(x):
        y = np.hstack((y, [0.0] * (support_size - len(y))))

    emd = pyemd.emd(x, y, distance_mat)
    return np.exp(-emd * emd / (2 * sigma * sigma))
Beispiel #29
0
def wmd(edu_vecs, i, j, D_embed):
    """Compute the Word Mover's Distance between two EDUs.

    Parameters
    ----------
    edu_vecs : sparse matrix
        One row per EDU.
    i : int
        Index of the first EDU.
    j : int
        Index of the second EDU.
    D_embed : dense matrix of np.double
        Distance matrix between each pair of word embeddings.

    Returns
    -------
    s : np.double
        Word Mover's Distance between EDUs i and j.

    Notes
    -----
    This is an example implementation for a single pair of EDUs. Per the
    original author's advice, when speed and memory matter, copy the
    function into your script, drop `edu_vecs` and `D_embed` from the
    signature and read them from a wider scope (e.g. module globals, even
    if defined under `if __name__ == "__main__"`), so that joblib.Parallel
    does not have to pickle them. Memmapping the parameters as described in
    the joblib.Parallel documentation is an alternative, reported as an
    order of magnitude slower.
    """
    first_edu = edu_vecs[i].toarray().ravel()
    second_edu = edu_vecs[j].toarray().ravel()
    # NB: emd() also accepts extra_mass_penalty; per the original note,
    # pyemd defaults it to -1, i.e. the max value in the distance matrix.
    return emd(first_edu, second_edu, D_embed)
def worker(antigen):
    """Return the EMD between samples A and B for one antigen.

    Both samples are restricted to the backbone antigens plus ``antigen``
    and to their erythroid mask, clustered with k-means, and compared by
    the EMD between the normalised cluster sizes. The ground distance is
    the city-block distance between the cluster outputs of A and B.

    Args:
        antigen: Name of the antigen column added to ``backbone_antigens``.

    Returns:
        The value returned by ``emd``.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3; the
    # bare print statement is a syntax error on Python 3.
    print(antigen)
    atg_list = backbone_antigens+[antigen]
    A_dat= A.data[atg_list][A.erythroid_mask]
    B_dat = B.data[atg_list][B.erythroid_mask]
    A_cluster = kmeans_clustering(A_dat,n_clusters=clusters,n_jobs=kmeans_jobs)

    B_cluster = kmeans_clustering(B_dat,n_clusters=clusters,n_jobs=kmeans_jobs)

    distance_matrix = cdist(A_cluster.output[atg_list],
                            B_cluster.output[atg_list],
                            'cityblock')

    # Cluster sizes as fractions of the total.
    XA = A_cluster.output["cluster_size"].values.astype(np.double)
    XA = XA/np.sum(XA)
    XB = B_cluster.output["cluster_size"].values.astype(np.double)
    XB = XB/np.sum(XB)
    cost = emd(XA,XB,distance_matrix)

    print("{} has emd of {}".format(antigen,cost))
    return cost
Beispiel #31
0
def Wmd_Distance(src_seq, cur_sent, embed_mat):
    """Return the Word Mover's Distance of two word-id lists, minus one.

    Args:
        src_seq: List of words (or word ids) of the source sequence.
        cur_sent: List of words (or word ids) of the current sentence.
        embed_mat: Embedding matrix handed to ``compute_distance``.

    Returns:
        ``emd(...) - 1`` for the normalised word frequencies of both inputs.
    """
    word_set = list(set(src_seq + cur_sent))
    vocab_len = len(word_set)

    # Semantic distance between every pair of words.
    distance_matrix = np.zeros((vocab_len, vocab_len))
    for r, row_word in enumerate(word_set):
        for c, col_word in enumerate(word_set):
            distance_matrix[r][c] = compute_distance(row_word, col_word,
                                                     embed_mat)

    # Normalised word-frequency distributions.
    d1 = compute_normalized_word_freq(src_seq, word_set)
    d2 = compute_normalized_word_freq(cur_sent, word_set)

    # Word mover's distance; per the original note a value in (0, 1),
    # smaller is better.
    wmd_distance = emd(d1, d2, distance_matrix)
    return wmd_distance - 1
Beispiel #32
0
def gaussian_emd(x, y, sigma=1.0, distance_scaling=1.0):
    '''
    Gaussian kernel with squared distance in exponential term replaced by EMD
    Args:
      x, y: 1D pmf of two distributions with the same support
        (NumPy arrays; the shorter one is zero-padded)
      sigma: standard deviation
      distance_scaling: divisor applied to the |i - j| ground distances
    Returns:
      exp(-emd^2 / (2 * sigma^2))
    '''
    support_size = max(len(x), len(y))
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    d_mat = toeplitz(range(support_size)).astype(np.float64)
    distance_mat = d_mat / distance_scaling

    # convert histogram values x and y to float, and make them equal len
    x = x.astype(np.float64)
    y = y.astype(np.float64)
    if len(x) < len(y):
        x = np.hstack((x, [0.0] * (support_size - len(x))))
    elif len(y) < len(x):
        y = np.hstack((y, [0.0] * (support_size - len(y))))

    emd = pyemd.emd(x, y, distance_mat)
    return np.exp(-emd * emd / (2 * sigma * sigma))
def calculate_emd(input_distribution, output_distribution):
    '''
    Earth Mover's Distance between two style distributions of equal length.

    Every entry of the ground-distance matrix is 1.

    Parameters
    ----------
    input_distribution : numpy.ndarray
        Probabilities assigned to style classes for an input text
    output_distribution : numpy.ndarray
        Probabilities assigned to style classes for an output text,
        e.g. of a style transfer model

    Returns
    -------
    Earth Mover's Distance (float) between the two given style distributions

    '''
    n_classes = len(input_distribution)
    uniform_cost = np.ones(shape=(n_classes, n_classes))
    return emd(input_distribution, output_distribution, uniform_cost)
Beispiel #34
0
def pos_distance(p1, p2, pos, P_, dat):
    """Return a weighted EMD between the POS-tag histograms of two inputs.

    Args:
        p1: Iterable of items of the first input; each is looked up in ``pos``.
        p2: Iterable of items of the second input.
        pos: Lookup from an item to its tag.
        P_: Ground-distance matrix between the tags listed in ``pos_word``.
        dat: Scalar weight applied to the EMD.

    Returns:
        ``dat * emd(h1, h2, P_)`` where ``h1`` and ``h2`` are the tag
        histograms over ``pos_word``, each normalised to sum to 1 unless it
        is all zeros.
    """
    def _tag_histogram(items):
        # Count only the tags that are listed in pos_word.
        counts = dict.fromkeys(pos_word, 0)
        for item in items:
            tag = pos[item]
            if tag in counts:
                counts[tag] += 1
        hist = np.zeros(len(pos_word), dtype=np.double)
        for slot, tag in enumerate(pos_word):
            hist[slot] = counts[tag]
        total = hist.sum()
        if total != 0:
            return hist / total
        return hist

    hist_1 = _tag_histogram(p1)
    hist_2 = _tag_histogram(p2)
    return dat * emd(hist_1, hist_2, P_)
Beispiel #35
0
def wmdistance(model, words1, words2, all_distances):
    """Return the Word Mover's Distance between two token lists.

    Args:
        model: Embedding model forwarded to ``create_distance_matrix``.
        words1: Tokens of the first document.
        words2: Tokens of the second document.
        all_distances: Forwarded unchanged to ``create_distance_matrix``.

    Returns:
        The value returned by ``emd`` for the two length-normalised
        bag-of-words vectors.
    """
    dictionary = gensim.corpora.Dictionary(documents=[words1, words2])
    vocab_len = len(dictionary)

    def create_bow(doc):
        # Token counts divided by the document length.
        weights = np.zeros(vocab_len, dtype=np.double)
        doc_len = float(len(doc))
        for token_id, count in dictionary.doc2bow(doc):
            weights[token_id] = count / doc_len
        return weights

    bow1 = create_bow(words1)
    bow2 = create_bow(words2)

    distances = create_distance_matrix(model, dictionary, set(words2),
                                       all_distances)

    return emd(bow1, bow2, distances)
Beispiel #36
0
    def calc_wmd(self, doc1, doc2):
        """Return the Word Mover's Distance between two raw text strings.

        Features without an entry in ``self.word2vec.vocab`` get a random
        vector drawn uniformly from [-1, 1) with ``self.w2v_len``
        components. Word counts are normalised to sum to 1 and the distance
        matrix is divided by its maximum.
        """
        vect = CountVectorizer().fit([doc1, doc2])
        counts_1, counts_2 = vect.transform([doc1, doc2])

        # One embedding per feature, in feature order.
        W = [
            self.word2vec[w] if w in self.word2vec.vocab
            else np.random.uniform(-1.0, 1.0, self.w2v_len)
            for w in vect.get_feature_names()
        ]

        # pyemd needs double precision input
        D = euclidean_distances(W).astype(np.double)
        D /= D.max()

        vec1 = counts_1.toarray().ravel().astype(np.double)
        vec2 = counts_2.toarray().ravel().astype(np.double)
        vec1 /= vec1.sum()
        vec2 /= vec2.sum()
        return emd(vec1, vec2, D)
Beispiel #37
0
def TopicDistance(topicA, topicB):
    """
    Word Mover's Distance between two topics, weighting each word by the
    value stored for it in the topic.

    Only words present in ``wvmodel`` take part. A word missing from one
    topic gets weight 0 there.
    """
    # Shared vocabulary: the words of either topic that have an embedding.
    known_a = [w for w in topicA if w in wvmodel]
    known_b = [w for w in topicB if w in wvmodel]
    vocab = set(known_a + known_b)

    # Weight vector of each topic over the shared vocabulary.
    weights_a = np.array([topicA[w] if w in topicA else 0 for w in vocab],
                         dtype=np.float64)
    weights_b = np.array([topicB[w] if w in topicB else 0 for w in vocab],
                         dtype=np.float64)

    # Pairwise Euclidean distances between the embeddings.
    embeddings = [wvmodel[w] for w in vocab]
    condensed = pdist(embeddings, 'euclidean')
    D_Mat = squareform(condensed).astype(np.float64)

    return emd(weights_a, weights_b, D_Mat)
Beispiel #38
0
def get_wmd_distance(d1, d2, min_vocab=7, verbose=False):
    """Return the Word Mover's Distance between two raw text strings.

    Args:
        d1: First document as a string.
        d2: Second document as a string.
        min_vocab: Minimum size of the shared vocabulary; with fewer usable
            words the function returns 1 without calling ``emd``.
        verbose: When true, print the vocabulary and both count vectors.

    Returns:
        1 when the vocabulary is too small, otherwise the value returned by
        ``emd`` on the normalised count vectors.
    """
    # Usable words: in the model's vocabulary and not an English stop word.
    candidates = set(d1.lower().split() + d2.lower().split())
    vocabulary = [w for w in candidates if w in model.vocab and w not in stop_words.ENGLISH_STOP_WORDS]
    if len(vocabulary) < min_vocab:
        return 1

    vect = CountVectorizer(vocabulary=vocabulary, stop_words='english').fit([d1, d2])
    counts_1, counts_2 = vect.transform([d1, d2])

    W_ = np.array([model[w] for w in vect.get_feature_names() if w in model])
    D_ = euclidean_distances(W_).astype(np.double)
    D_ /= D_.max()  # just for comparison purposes

    # pyemd needs double precision input
    v_1 = counts_1.toarray().ravel().astype(np.double)
    v_2 = counts_2.toarray().ravel().astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()

    if verbose:
        print(vocabulary)
        print(v_1, v_2)
    return emd(v_1, v_2, D_)
Beispiel #39
0
def emd(x_vals, estimated_func_vals, true_func):
    """Return the EMD between estimated values and a reference function.

    Args:
        x_vals: Sample positions; the ground distance between two samples
            is the absolute difference of their positions.
        estimated_func_vals: Estimated value at each position (same length
            as ``x_vals``).
        true_func: Callable evaluated at every position to obtain the
            reference values.

    Returns:
        The value returned by ``pyemd.emd``.
    """
    n = len(x_vals)
    assert n == len(estimated_func_vals)

    # Reference values at the sample positions.
    reference_vals = np.zeros(n)
    for idx, position in enumerate(x_vals):
        reference_vals[idx] = true_func(position)

    # ground[row, col] = |x_row - x_col|
    ground = np.zeros((n, n))
    for row in range(n):
        for col in range(n):
            ground[row, col] = abs(x_vals[row] - x_vals[col])

    # Accept a plain list for the estimated values.
    estimated = np.array(estimated_func_vals)

    # Expensive computation
    return pyemd.emd(estimated, reference_vals, ground)
Beispiel #40
0
def EMD_between_two_models_on_board(model1_name,
                                    input_plains_num_1,
                                    i1,
                                    model2_name,
                                    input_plains_num_2,
                                    i2,
                                    board1,
                                    board2,
                                    width=6,
                                    height=6,
                                    use_gpu=True):
    """Return the EMD between the move probabilities of two saved policies.

    Each policy is loaded from
    ``.../models/<model name>/current_policy_<i>.model`` and evaluated on
    its own board; the ground distance comes from
    ``generate_matrix_dist_metric(width)``.

    Args:
        model1_name, model2_name: Model directory names.
        input_plains_num_1, input_plains_num_2: Forwarded to
            ``PolicyValueNet`` as ``input_plains_num``.
        i1, i2: Checkpoint numbers used in the model file names.
        board1, board2: Boards evaluated by the first / second policy.
        width, height: Board size forwarded to ``PolicyValueNet``.
        use_gpu: Forwarded to ``PolicyValueNet``.

    Returns:
        The value returned by ``emd``.
    """
    model_file_1 = f'/home/lirontyomkin/AlphaZero_Gomoku/models/{model1_name}/current_policy_{i1}.model'
    model_file_2 = f'/home/lirontyomkin/AlphaZero_Gomoku/models/{model2_name}/current_policy_{i2}.model'

    policy_1 = PolicyValueNet(width, height,
                              model_file=model_file_1,
                              input_plains_num=input_plains_num_1,
                              use_gpu=use_gpu)
    policy_2 = PolicyValueNet(width, height,
                              model_file=model_file_2,
                              input_plains_num=input_plains_num_2,
                              use_gpu=use_gpu)

    # The returned states are not used; the calls are kept as they were.
    board1.current_state(last_move=True, is_random_last_turn=False)
    board2.current_state(last_move=True, is_random_last_turn=False)

    # policy_value_fn(...)[0] yields (action, probability) pairs; only the
    # probabilities are needed.
    _, probas_policy1 = zip(*policy_1.policy_value_fn(board1)[0])
    _, probas_policy2 = zip(*policy_2.policy_value_fn(board2)[0])

    dist_matrix = generate_matrix_dist_metric(width)

    first = np.asarray(probas_policy1, dtype='float64')
    second = np.asarray(probas_policy2, dtype='float64')
    return emd(first, second, dist_matrix)
Beispiel #41
0
def perform_analysis(rv, fns, mn, mx):
    """Compute pairwise EMDs between image intensity curves and save them.

    Args:
        rv: Per-file records; items 1 and 2 of ``rv[i]`` are the image and
            an optional mask (pixels where ``mask == 1`` are kept).
        fns: File names; only their number and order are used.
        mn: Lower bound of the histogram range.
        mx: Upper bound of the histogram range.

    Side effects:
        Writes the EMD matrix to ``emd_results.npz`` and prints progress.
    """
    nbins = 128
    hist = []

    edges = []
    for i, _ in enumerate(fns):
        img, mask = rv[i][1:3]
        if mask is not None:
            img = img[mask == 1]

        # 128 bins with a consistent range
        counts, edges = np.histogram(img, bins=nbins, range=(mn, mx))

        # Cumulative sum of the histogram; per the original note this
        # smooths the noise present in a histogram of quantized data.
        cdf = np.cumsum(counts * np.diff(edges))

        # Normalize, removing effect of size
        cdf /= np.sum(cdf)

        hist.append(cdf)

    rv2 = np.zeros([len(rv), len(rv)])

    # Ground distance: absolute difference between bin centres.
    bin_centers = (edges[1:] + edges[:-1]) / 2
    centers_col = bin_centers.reshape(-1, 1)
    w = euclidean_distances(centers_col, centers_col)

    n_files = len(fns)
    for i in range(n_files):
        print(f'{i + 1}/{len(fns)}...')
        for j in range(n_files):
            rv2[i, j] = emd(hist[i], hist[j], w)
    np.savez('emd_results.npz', rv2)
    print('To plot the EMD results run ./old/plot_emd_mat.py')
    def points_best_cluster(self, centroids, dataPoint):
        """Return the index of the centroid closest to ``dataPoint``.

        Closeness is the EMD under a fixed 3x3 ground-distance matrix. On
        ties the first centroid wins; ``None`` is returned when
        ``centroids`` is empty.

        Args:
          centroids: The list of centroids.
          dataPoint: The data point to assign to a centroid.
        """
        third = 1 / 3.0
        matrix = numpy.array([[0, third, 2 / 3.0],
                              [third, 0, third],
                              [2 / 3.0, third, 0]])
        point = numpy.array(dataPoint)

        closestCentroid = None
        leastDistance = None
        for index, centroid in enumerate(centroids):
            distance = emd(point, numpy.array(centroid), matrix)
            if leastDistance is None or distance < leastDistance:
                closestCentroid, leastDistance = index, distance

        return closestCentroid
Beispiel #43
0
def WMDsimilarity(q1, q2):
    """Compute the Word Mover's Distance between question 1 and question 2.

    Relies on names defined outside this function: ``W`` (indexed by row to
    obtain one embedding per vocabulary word), ``vocab_dictionary`` (maps a
    word to its row in ``W``), ``CountVectorizer``, ``euclidean_distances``
    and ``emd``.

    Args:
        q1: text of question 1.
        q2: text of question 2.

    Returns:
        The value returned by ``emd`` for the two normalised bag-of-words
        vectors, using pairwise embedding distances scaled to a maximum of 1.
        ``0.0`` when every pairwise distance is zero (for example a
        single-word shared vocabulary).
    """
    # fit() is an instance method: the vectorizer must be instantiated first,
    # otherwise the document list is bound as ``self`` and the call fails.
    vect = CountVectorizer().fit([q1, q2])

    # Look up the embedding of every vocabulary word; words missing from
    # vocab_dictionary fall back to row 0 of W.
    rows = [vocab_dictionary.get(w, 0) for w in vect.get_feature_names()]
    W_value = W[rows]
    data_vectors = euclidean_distances(W_value)

    # Turn both questions into dense count vectors over the shared vocabulary.
    vector_1, vector_2 = vect.transform([q1, q2])
    vector_1 = vector_1.toarray().ravel()
    vector_2 = vector_2.toarray().ravel()

    # pyemd needs double precision input, hence the conversion.
    vector_1 = vector_1.astype(np.double)
    vector_2 = vector_2.astype(np.double)
    vector_1 /= vector_1.sum()
    vector_2 /= vector_2.sum()
    data_vectors = data_vectors.astype(np.double)

    largest = data_vectors.max()
    if largest == 0:
        # All ground distances are zero, so moving mass costs nothing.
        # Returning here also avoids the 0/0 normalisation below.
        return 0.0
    data_vectors /= largest
    return emd(vector_1, vector_2, data_vectors)
Beispiel #44
0
    def solver_pyemd(self):
        """Iterate ``self.dist_matrix`` until it stops changing.

        Each sweep visits every (source state, target state) pair. For the
        source state only the action chosen by
        ``self.src_agent.get_best_action`` is used; for the target state every
        action index in ``range(self.action_space)`` is tried. The smallest
        combined value becomes the new entry of ``self.tmp_dist_matrix``.

        The sweep reads ``self.dist_matrix`` and writes ``self.tmp_dist_matrix``;
        the two are swapped only after a full sweep. Iteration ends once the
        mean absolute change falls below ``self.opts.threshold``.
        """
        while True:
            for _, s1_state in self.src_env.state2idx.items():
                a = self.src_agent.get_best_action(s1_state,
                                                   self.src_possible_actions)
                for _, s2_state in self.tgt_env.state2idx.items():
                    for b in range(self.action_space):
                        # Transport cost between the two transition rows under
                        # the current state-distance estimate (pyemd).
                        kd = emd(self.src_env.tp_matrix[s1_state, a],
                                 self.tgt_env.tp_matrix[s2_state, b],
                                 self.dist_matrix)
                        reward_term = self.reward_matrix_tmp[s1_state, a,
                                                             s2_state, b]
                        # Results are stored in slot 0 of the source-action
                        # axis regardless of which action `a` is.
                        self.d[s1_state, 0, s2_state, b] = (
                            self.opts.discount_r * reward_term +
                            self.opts.discount_kd * kd)
                    self.tmp_dist_matrix[s1_state, s2_state] = np.min(
                        self.d[s1_state, 0, s2_state])

            # Measure the change before adopting the new estimate.
            change = np.mean(np.abs(self.dist_matrix - self.tmp_dist_matrix))
            converged = change < self.opts.threshold
            self.dist_matrix = self.tmp_dist_matrix.copy()
            if converged:
                break
Beispiel #45
0
def compute_wasserstein_PDF(NNeigh, nR, R):
    """Compute the EMD between each pair of consecutive rows of ``NNeigh``.

    The ground distance between bins ``i`` and ``j`` is ``|i - j| / Ar``,
    where ``Ar`` is the length of the first row of ``NNeigh``.

    Args:
        NNeigh: sequence of at least ``nR`` equally long numeric rows.
        nR: number of rows of ``NNeigh`` to use.
        R: sequence indexed by row number; ``R[i]`` is stored alongside the
            distance computed for row ``i``.

    Returns:
        A list of ``[R[i], D]`` pairs for ``i`` in ``1 .. nR - 1``, where
        ``D`` is ``emd(row i, row i - 1, ground distance)``.
    """
    # from:
    # https://github.com/wmayner/pyemd
    dPDF = []
    Ar = len(NNeigh[0])
    KF = 1.0 / float(Ar)

    # Vectorised |i - j| * KF; same values as the element-wise double loop.
    bins = numpy.arange(Ar)
    dmx = numpy.abs(bins[:, None] - bins[None, :]) * KF
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("dmx =  %d %d" % (len(dmx), len(dmx[0])))

    # numpy.float was only an alias of the builtin float (i.e. float64) and
    # has been removed from NumPy; float64 is also what pyemd requires.
    V = [numpy.array(NNeigh[i], dtype=numpy.float64) for i in range(nR)]
    print("V =  %d %d" % (len(V), len(V[0])))

    for i in range(1, nR):
        print("i =  %d" % i)
        D = emd(V[i], V[i - 1], dmx)
        dPDF.append([R[i], D])
    return dPDF
Beispiel #46
0
def wmd(i, j):
    """Compute the Word Mover's Distance between two EDUs.

    This presupposes the existence of two global variables:
    * `edu_vecs` is a sparse 2-dimensional ndarray where each row
    corresponds to the vector representation of an EDU,
    * `D_common` is a dense 2-dimensional ndarray that contains
    the euclidean distance between each pair of word embeddings.

    Parameters
    ----------
    i : int
        Index of the first EDU.
    j : int
        Index of the second EDU.

    Returns
    -------
    s : np.double
        Word Mover's Distance between EDUs i and j; 0.0 when both
        vectors are all zeros.
    """
    # EMD is extremely sensitive to the number of dimensions it has to
    # work with; keep only the dimensions where at least one of the
    # two vectors is != 0
    union_idx = np.union1d(edu_vecs[i].indices, edu_vecs[j].indices)
    # EMD segfaults on incorrect parameters:
    # * if both vectors (and thus the distance matrix) are all zeros,
    # return 0.0 (consider they are the same).
    # Test the *number* of indices, not their values: np.any(union_idx)
    # would also be False when the only non-zero dimension is index 0.
    if union_idx.size == 0:
        return 0.0
    D_minimal = D_common[np.ix_(union_idx, union_idx)]
    # .toarray() is the explicit spelling of the `.A` shorthand, which
    # recent SciPy releases no longer provide.
    bow_i = edu_vecs[i, union_idx].toarray().ravel()
    bow_j = edu_vecs[j, union_idx].toarray().ravel()
    # NB: emd() has an additional named parameter: extra_mass_penalty
    # pyemd by default sets it to -1, i.e. the max value in the distance
    # matrix
    return emd(bow_i, bow_j, D_minimal)
Beispiel #47
0
def wasserstein_distance(checkpoint_1, checkpoint_2):
    """Return the Wasserstein ("Earth Mover's") distance between the fixed
    points of two checkpoints.

    Each checkpoint must provide ``'cluster_means'`` (one row per cluster)
    and ``'cluster_labels'``. A cluster's weight is the number of labels
    equal to its index. The two weight vectors are padded with zeros onto a
    shared set of bins (checkpoint 1's clusters first, then checkpoint 2's),
    and the ground distance is ``norm`` of the difference between cluster
    means.
    """
    means_a = checkpoint_1['cluster_means']
    labels_a = checkpoint_1['cluster_labels']
    weights_a = np.array([
        len(np.where(labels_a == k)[0]) for k in range(means_a.shape[0])
    ])

    means_b = checkpoint_2['cluster_means']
    labels_b = checkpoint_2['cluster_labels']
    weights_b = np.array([
        len(np.where(labels_b == k)[0]) for k in range(means_b.shape[0])
    ])

    # Disjoint supports: the first histogram only has mass on A's bins and
    # the second only on B's bins.
    hist1 = np.concatenate([weights_a, np.zeros_like(weights_b)],
                           axis=0).astype(np.float64)
    hist2 = np.concatenate([np.zeros_like(weights_a), weights_b],
                           axis=0).astype(np.float64)

    n_bins = len(weights_a) + len(weights_b)
    all_means = np.concatenate([means_a, means_b], axis=0)

    # Full pairwise ground-distance matrix over every cluster mean.
    distances = np.zeros((n_bins, n_bins))
    for row, col in np.ndindex(n_bins, n_bins):
        distances[row, col] = norm(all_means[row] - all_means[col])

    return emd(hist1, hist2, distances)
Beispiel #48
0
def emd(p1,p2,dmat):
    """
    Compute the sparsified Earth-Mover distance between two discretised
    distributions p1 and p2, with the metric described by dmat.

    Sparsification: only the points where p1 + p2 exceeds ``tol`` are kept.
    Two reduced distributions are built from those points and their EMD is
    computed, on the reasoning that the EMD depends only on the points where
    p1 or p2 is not negligible. ``tol`` is a module-level name that is not
    defined in this function.

    Input:
        p1: 1d non-negative np.array of size (N,) of a flattened distribution
        p2: 1d non-negative np.array of size (N,) of a flattened distribution
        dmat: np.array of size (N,N), dmat[i,j] is the distance between the i-th
              and j-th points

    Output:
        The Earth-Mover distance between p1, p2 with metric dmat
    """
    msk = (p1 + p2) > tol
    usdp1 = p1[msk]
    usdp2 = p2[msk]
    # Select the kept rows and columns in one step. The previous
    # transpose-then-index form returned the *transposed* sub-matrix, which
    # only equals the intended one when dmat is symmetric.
    usdd = dmat[np.ix_(msk, msk)]
    return pyemd.emd(usdp1, usdp2, usdd)
Beispiel #49
0
def quantile_emd(column1: CorrelationClusteringColumn,
                 column2: CorrelationClusteringColumn,
                 quantiles: int = 256):
    """
    Earth Mover's Distance (EMD) between the quantile histograms of two columns.

    When `quantiles` is not supplied, 256 is used, the default of the paper
    "Automatic Discovery of Attributes in Relational Databases".

    Parameters
    ---------
    column1 : Column
        The first column; its own histogram is the reference
    column2 : Column
        The second column; its quantile histogram is built against the
        first column's histogram
    quantiles: int, optional
        The number of quantiles that the histograms are split on (default is 256)

    Returns
    -------
    float
        the EMD value between column1 and column2, or infinity when either
        column has size 0 or the second histogram is empty
    """
    # Nothing to compare if either column is empty.
    if column1.size == 0:
        return math.inf
    if column2.size == 0:
        return math.inf

    reference = column1.get_histogram()
    candidate = QuantileHistogram(column2.long_name,
                                  column2.ranks,
                                  column2.size,
                                  quantiles,
                                  reference_hist=reference)
    if candidate.is_empty:
        return math.inf

    # The reference histogram supplies the ground-distance matrix.
    return emd(reference.get_values, candidate.get_values,
               reference.dist_matrix)
Beispiel #50
0
def component_merge_emd(net1, net2, metric_space):
    """EMD between two networks after collapsing their connected components.

    Each network is replaced by the graph that ``build_induced_graph`` builds
    from its connected components. The diagonal of that graph's adjacency
    matrix, normalised to sum to 1, is used as its mass distribution. The two
    distributions are zero-padded onto a shared set of bins (net1's first,
    then net2's). The ground-distance matrix holds the output of
    ``network_merge_distance`` in its off-diagonal blocks and zeros in its
    diagonal blocks.
    """
    graph_a = nx.from_scipy_sparse_matrix(net1.adjacency_matrix)
    graph_b = nx.from_scipy_sparse_matrix(net2.adjacency_matrix)

    components_a = list(nx.connected_components(graph_a))
    components_b = list(nx.connected_components(graph_b))

    net1 = build_induced_graph(net1, components_a, metric_space)
    net2 = build_induced_graph(net2, components_b, metric_space)

    mass_a = np.diag(net1.adjacency_matrix.toarray())
    mass_a = mass_a / mass_a.sum()
    mass_b = np.diag(net2.adjacency_matrix.toarray())
    mass_b = mass_b / mass_b.sum()

    # Disjoint supports: A's mass on the leading bins, B's on the trailing.
    padded_a = np.concatenate([mass_a, np.zeros_like(mass_b)])
    padded_b = np.concatenate([np.zeros_like(mass_a), mass_b])

    cross = network_merge_distance(net1, net2, metric_space)

    n_a = mass_a.shape[0]
    n_b = mass_b.shape[0]
    # Block matrix [[0, cross], [cross.T, 0]].
    upper = np.hstack([np.zeros((n_a, n_a)), cross])
    lower = np.hstack([cross.T, np.zeros((n_b, n_b))])
    ground = np.vstack([upper, lower])
    return pyemd.emd(padded_a, padded_b, ground)
Beispiel #51
0
    def solver_pyemd(self):
        """Iterate ``self.dist_matrix`` until it stops changing.

        Each sweep visits every (source state, target state) pair and every
        pair of action indices in ``range(self.action_space)``, filling
        ``self.d`` with ``discount_r * reward difference + discount_kd * EMD``.
        The new state distance is the larger of the two directed max-min
        values over the resulting action-by-action table.

        The sweep reads ``self.dist_matrix`` and writes ``self.tmp_dist_matrix``;
        the two are swapped only after a full sweep. Iteration ends once the
        mean absolute change falls below ``self.opts.threshold``.
        """
        while True:
            for _, s1_state in self.src_env.state2idx.items():
                for _, s2_state in sorted(self.tgt_env.state2idx.items()):
                    for a in range(self.action_space):
                        for b in range(self.action_space):
                            # Transport cost between the two transition rows
                            # under the current state distances (pyemd).
                            kd = emd(self.src_env.tp_matrix[s1_state, a],
                                     self.tgt_env.tp_matrix[s2_state, b],
                                     self.dist_matrix)
                            reward_term = self.reward_matrix_tmp[s1_state, a,
                                                                 s2_state, b]
                            self.d[s1_state, a, s2_state, b] = (
                                self.opts.discount_r * reward_term +
                                self.opts.discount_kd * kd)
                    table = self.d[s1_state, :, s2_state, :]
                    # Best response per source action and per target action;
                    # keep the worse (larger) of the two worst cases.
                    worst_over_src = np.max(np.min(table, axis=1))
                    worst_over_tgt = np.max(np.min(table, axis=0))
                    self.tmp_dist_matrix[s1_state, s2_state] = max(
                        worst_over_src, worst_over_tgt)

            # Measure the change before adopting the new estimate.
            change = np.mean(np.abs(self.dist_matrix - self.tmp_dist_matrix))
            converged = change < self.opts.threshold
            self.dist_matrix = self.tmp_dist_matrix.copy()
            if converged:
                break
Beispiel #52
0
	def wmd(self, sent1="this is sentence", sent2="this is sentence") -> float:
		'''
			Calculates the word movers distance between two
			sentences.

			sent1,sent2: Two input sentences in text form

			Returns the value of pyemd.emd for the two sentence vectors,
			restricted to the indices used by at least one of them, or
			None when either sentence cannot be converted by
			_sent_to_sparse.
		'''
		sp1 = self._sent_to_sparse(sent1)
		sp2 = self._sent_to_sparse(sent2)
		if sp1 is None or sp2 is None:
			return None

		# Only keep the dimensions that occur in at least one sentence.
		union_idx = np.union1d(sp1.indices, sp2.indices)

		# Ground distance: euclidean distance between the kept embeddings,
		# cast to the double precision pyemd requires.
		W = sklearn.metrics.euclidean_distances(self.embedding[union_idx])
		W = W.astype("float64")

		# .toarray() is the explicit spelling of the `.A` shorthand, which
		# recent SciPy releases no longer provide.
		sp1 = sp1[:, union_idx].toarray().ravel()
		sp2 = sp2[:, union_idx].toarray().ravel()
		return pyemd.emd(sp1, sp2, W)
Beispiel #53
0
def worker(antigen):
    """Compute the EMD between the k-means clusterings of samples A and B.

    Both samples are restricted to ``backbone_antigens`` plus ``antigen`` and
    to their erythroid mask, then clustered with ``kmeans_clustering``. The
    normalised cluster sizes are the masses, and the city-block distance
    between cluster rows is the ground distance.

    Relies on names defined outside this function: ``A``, ``B``,
    ``backbone_antigens``, ``clusters``, ``kmeans_jobs``,
    ``kmeans_clustering``, ``cdist`` and ``emd``.

    Args:
        antigen: name of the extra antigen column to include.

    Returns:
        The EMD between the two cluster-size distributions.
    """
    # Single-argument print call: valid on both Python 2 and Python 3.
    print(antigen)
    atg_list = backbone_antigens + [antigen]
    A_dat = A.data[atg_list][A.erythroid_mask]
    B_dat = B.data[atg_list][B.erythroid_mask]
    A_cluster = kmeans_clustering(A_dat,
                                  n_clusters=clusters,
                                  n_jobs=kmeans_jobs)

    B_cluster = kmeans_clustering(B_dat,
                                  n_clusters=clusters,
                                  n_jobs=kmeans_jobs)

    XA = A_cluster.output["cluster_size"].values.astype(np.double)
    XA = XA / np.sum(XA)
    XB = B_cluster.output["cluster_size"].values.astype(np.double)
    XB = XB / np.sum(XB)

    # pyemd treats both histograms as living on the SAME bins and assumes the
    # distance matrix is a metric over those bins (zero diagonal, symmetric).
    # An A-versus-B cross-distance matrix is neither. Instead put every
    # cluster of A and B on one shared axis, give each histogram mass only on
    # its own clusters, and use the full pairwise distances between them.
    points = np.vstack([
        np.asarray(A_cluster.output[atg_list], dtype=np.double),
        np.asarray(B_cluster.output[atg_list], dtype=np.double),
    ])
    distance_matrix = cdist(points, points, 'cityblock')
    hist_a = np.concatenate([XA, np.zeros(len(XB))])
    hist_b = np.concatenate([np.zeros(len(XA)), XB])
    cost = emd(hist_a, hist_b, distance_matrix)

    print("{} has emd of {}".format(antigen, cost))
    return cost
Beispiel #54
0
        raise AnnoyingError('F**k this')

    H1, _, _ = np.histogram2d(*D1.T, bins=(bx, by))
    H2, _, _ = np.histogram2d(*D2.T, bins=(bx, by))

    H1 /= H1.sum()
    H2 /= H2.sum()

    _x, _y = np.indices(H1.shape)

    coords = np.array(zip(_x.ravel(), _y.ravel()))

    D = distance(coords, coords)

    return emd(H1.ravel(), H2.ravel(), D)


def _calculate_emd_1D(D1, D2, bins=40):
    """
    Args:
    -----
        D1, D2: two np arrays with potentially differing 
            numbers of rows, but two columns. The empirical 
            distributions you want a similarity over
        bins: number of bins in each dim
    """

    D1 = D1[np.isnan(D1).sum(axis=-1) < 1]
    D2 = D2[np.isnan(D2).sum(axis=-1) < 1]
def word_mover_score(ngram, refs, hyps, batch_size=256, device='cuda:0'):
    """Score hypotheses against references as ``1 - EMD`` over embeddings.

    ``refs`` and ``hyps`` are consumed in lockstep, ``batch_size`` items at a
    time. For each pair, the n-gram embeddings of reference and hypothesis
    are stacked onto one shared axis and their pairwise distances form the
    ground-distance matrix. The normalised idf weights, zero-padded onto that
    axis, are the two mass distributions.

    Relies on names defined outside this function: ``model``, ``tokenizer``,
    ``get_bert_embedding``, ``load_ngram``, ``pairwise_distances``,
    ``_safe_divide`` and ``emd``.

    Args:
        ngram: forwarded to ``load_ngram``.
        refs: sequence of references.
        hyps: sequence of hypotheses, aligned with ``refs``.
        batch_size: number of pairs embedded per call to
            ``get_bert_embedding``.
        device: forwarded to ``get_bert_embedding``.

    Returns:
        A list with one ``1 - emd(...)`` score per scored pair, in input
        order.
    """
    # Every token gets an idf weight of 1.0 unless the dict says otherwise.
    idf_dict_ref = defaultdict(lambda: 1.)
    idf_dict_hyp = defaultdict(lambda: 1.)
    preds = []
    for batch_start in range(0, len(refs), batch_size):
        batch_refs = refs[batch_start:batch_start + batch_size]
        batch_hyps = hyps[batch_start:batch_start + batch_size]

        ref_embedding, ref_lens, ref_masks, ref_idf, ref_tokens = get_bert_embedding(
            batch_refs, model, tokenizer, idf_dict_ref, device=device)
        hyp_embedding, hyp_lens, hyp_masks, hyp_idf, hyp_tokens = get_bert_embedding(
            batch_hyps, model, tokenizer, idf_dict_hyp, device=device)

        # Normalise every embedding vector to unit length, in place.
        ref_embedding.div_(torch.norm(ref_embedding, dim=-1).unsqueeze(-1))
        hyp_embedding.div_(torch.norm(hyp_embedding, dim=-1).unsqueeze(-1))

        # Pool the last five slices along dim 0 with min, mean and max.
        # NOTE(review): presumably dim 0 indexes model layers; confirm
        # against get_bert_embedding.
        ref_embedding_max, _ = torch.max(ref_embedding[-5:], dim=0, out=None)
        hyp_embedding_max, _ = torch.max(hyp_embedding[-5:], dim=0, out=None)

        ref_embedding_min, _ = torch.min(ref_embedding[-5:], dim=0, out=None)
        hyp_embedding_min, _ = torch.min(hyp_embedding[-5:], dim=0, out=None)

        ref_embedding_avg = ref_embedding[-5:].mean(0)
        hyp_embedding_avg = hyp_embedding[-5:].mean(0)

        # Concatenate the three pooled views along the feature axis.
        ref_embedding = torch.cat(
            [ref_embedding_min, ref_embedding_avg, ref_embedding_max], -1)
        hyp_embedding = torch.cat(
            [hyp_embedding_min, hyp_embedding_avg, hyp_embedding_max], -1)

        num_refs = len(ref_embedding)

        for i in range(num_refs):
            ref_ids = range(0, len(ref_tokens[i]))
            hyp_ids = range(0, len(hyp_tokens[i]))
            ref_embedding_i, ref_idf_i = load_ngram(ref_ids, ref_embedding[i],
                                                    ref_idf[i], ngram, 1)
            hyp_embedding_i, hyp_idf_i = load_ngram(hyp_ids, hyp_embedding[i],
                                                    hyp_idf[i], ngram, 1)

            # Shared axis: reference n-grams first, hypothesis n-grams after.
            raw = torch.cat([ref_embedding_i, hyp_embedding_i], 0)
            # The small epsilon avoids dividing by a zero norm.
            raw.div_(torch.norm(raw, dim=-1).unsqueeze(-1) + 0.000001)

            distance_matrix = pairwise_distances(raw, raw)

            n_ref = len(ref_idf_i)
            n_hyp = len(hyp_idf_i)
            c1 = np.zeros(n_ref + n_hyp, dtype=np.double)
            c2 = np.zeros(n_ref + n_hyp, dtype=np.double)

            c1[:n_ref] = ref_idf_i
            # Slice from the start offset rather than with a negative length:
            # c2[-0:] would select the WHOLE array when the hypothesis has no
            # weights, and the assignment would then fail.
            c2[n_ref:] = hyp_idf_i

            c1 = _safe_divide(c1, np.sum(c1))
            c2 = _safe_divide(c2, np.sum(c2))
            score = 1 - emd(c1, c2, distance_matrix.double().cpu().numpy())
            preds.append(score)
    return preds


#system = ['This is test summary']
#references = ['This is ref summary two','this is test summary']
#score = word_mover_score(2, references, system * len(references))
#print(score)
Beispiel #56
0
def myfun1(pair, datai, dataj, W_dist):
    """Return ``(emd, pair)`` for the two data objects.

    The EMD is computed by ``pyemd.emd`` on ``datai.values`` and
    ``dataj.values`` with ``W_dist`` as the ground-distance matrix; ``pair``
    is passed through unchanged so the caller can tell results apart.
    """
    first_hist = datai.values
    second_hist = dataj.values
    return pyemd.emd(first_hist, second_hist, W_dist), pair
        f_t = np.load(feature_dir + '%s.npy' % td)
        w_s = np.load(feature_dir + '%s_weight.npy' % sd)
        w_t = np.load(feature_dir + '%s_weight.npy' % td)

        f_s = f_s[idx, :]
        w_s = w_s[idx]

        # Make sure two histograms have the same length and distance matrix is square.
        data = np.float64(np.append(f_s, f_t, axis=0))
        w_1 = np.zeros((len(w_s) + len(w_t), ), np.float64)
        w_2 = np.zeros((len(w_s) + len(w_t), ), np.float64)
        w_1[:len(w_s)] = w_s / np.sum(w_s)
        w_2[len(w_s):] = w_t / np.sum(w_t)
        D = euclidean_distances(data, data)

        emd = pyemd.emd(np.float64(w_1), np.float64(w_2), np.float64(D))
        domain_similarity = np.exp(-gamma * emd)
        similarity_matrix[sd][td] = domain_similarity
        print('EMD: %.3f    Domain Similarity: %.3f\n' %
              (emd, domain_similarity))
np.save('cui_similarity_mat.npy', similarity_matrix)
print('Elapsed time: %.3fs' % (time.time() - tic))
np.fill_diagonal(similarity_matrix, np.nan)


def draw_figure_to_plt(distance_matrix, names, label_size=14):
    """Draw ``distance_matrix`` as an image on a new square pyplot figure.

    The figure side grows with the number of ``names`` (0.6 units per name)
    and one x tick is placed per name.

    NOTE(review): the visible body ends after the x ticks are set and
    ``label_size`` is never read; the remainder of the function appears to
    be missing.
    """
    n_names = len(names)
    side = 15 / 25. * n_names
    fig = plt.figure(figsize=(side, side))
    ax = plt.gca()

    # Reversed colour map: small values are drawn with the bright end.
    plt.imshow(distance_matrix, cmap='viridis_r')
    ax.set_xticks(np.arange(n_names))
Beispiel #58
0
def two_sentence_dis(sentence1, sentence2):
    """Compute the Word Mover's Distance between two tokenised sentences.

    Tokens are mapped to ids through ``model_dic['word_index']``; tokens
    without an id are dropped. The ground distance between two ids is the
    euclidean distance between their rows of ``model_dic['embedding']``,
    which is loaded from ``BASE_DIR + 'embedding.pkl'`` on first use.

    Args:
        sentence1: sequence of tokens of the first sentence.
        sentence2: sequence of tokens of the second sentence.

    Returns:
        The value of ``pyemd.emd`` for the two normalised bag-of-words
        vectors, or ``float('inf')`` when either sentence has no
        in-vocabulary token or the distance matrix is all zeros.
    """
    # `in` works on both Python 2 and 3; dict.has_key() is Python 2 only.
    if 'embedding' not in model_dic:
        # NOTE: pickle.load executes arbitrary code from the file; the
        # embedding file must come from a trusted source.
        with open(BASE_DIR + 'embedding.pkl', 'rb') as vocab:
            model_dic['embedding'] = pickle.load(vocab)

    len_sentence1 = len(sentence1)
    len_sentence2 = len(sentence2)

    # Remove out-of-vocabulary words.
    word_index = model_dic['word_index']
    sentence1 = [word_index[token] for token in sentence1
                 if token in word_index]
    sentence2 = [word_index[token] for token in sentence2
                 if token in word_index]

    diff1 = len_sentence1 - len(sentence1)
    diff2 = len_sentence2 - len(sentence2)
    if diff1 > 0 or diff2 > 0:
        # Interpolate the counts; print() does not apply %-formatting itself.
        print('Removed %d and %d OOV words from document 1 and 2 '
              '(respectively).' % (diff1, diff2))

    if len(sentence1) == 0 or len(sentence2) == 0:
        print('At least one of the documents had no words that were '
              'in the vocabulary. Aborting (returning inf).')
        return float('inf')

    # Shared bin axis: index -> word id, over the ids used by either sentence.
    dictionary = dict(enumerate(set(sentence1 + sentence2)))
    vocab_len = len(dictionary)

    sen_set1 = set(sentence1)
    sen_set2 = set(sentence2)

    # Compute the distance between every word of sentence 1 and every word of
    # sentence 2. Both [i, j] and [j, i] are filled so the matrix is
    # symmetric, as pyemd assumes its distance matrix to be a metric.
    embedding = model_dic['embedding']
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in dictionary.items():
        if t1 not in sen_set1:
            continue
        for j, t2 in dictionary.items():
            if t2 not in sen_set2 or distance_matrix[i, j] != 0.0:
                continue
            distance_matrix[i, j] = distance_matrix[j, i] = sqrt(
                np_sum((embedding[t1] - embedding[t2])**2))

    if np_sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        print('The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    # Reverse lookup: word id -> position on the shared bin axis.
    position_of = {word_id: idx for idx, word_id in dictionary.items()}

    def nbow(document):
        """Return the normalised bag-of-words vector of ``document``."""
        d = zeros(vocab_len, dtype=double)
        counts = {}
        for word_id in document:
            counts[word_id] = counts.get(word_id, 0) + 1
        doc_len = len(document)
        for word_id, freq in counts.items():
            # Normalized word frequencies.
            d[position_of[word_id]] = float(freq) / float(doc_len)
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(sentence1)
    d2 = nbow(sentence2)

    # Compute WMD.
    return pyemd.emd(d1, d2, distance_matrix)
Beispiel #59
0
    d1 = f.read().replace('\n', '')

# Read the second document and strip its newlines.
with open('/home/avinash/Downloads/autonomous_soft_robot.txt') as f:

    d2 = f.read().replace('\n', '')
#d2 = "The President addresses the press in Chicago"

# Build a shared vocabulary over both documents, ignoring English stop words.
vect = CountVectorizer(stop_words="english").fit([d1, d2])
print("Features:", ", ".join(vect.get_feature_names()))

# Dense count vector of each document over the shared vocabulary.
v_1, v_2 = vect.transform([d1, d2])
v_1 = v_1.toarray().ravel()
v_2 = v_2.toarray().ravel()

print(v_1, v_2)
print("cosine(doc_1, doc_2) = {:.2f}".format(cosine(v_1, v_2)))

# Rows of W for the vocabulary words, and their pairwise euclidean distances.
# NOTE(review): a word missing from vocab_dict raises KeyError here.
W_ = W[[vocab_dict[w] for w in vect.get_feature_names()]]
D_ = euclidean_distances(W_)

# pyemd needs double precision input
v_1 = v_1.astype(np.double)
v_2 = v_2.astype(np.double)
# Normalise the counts so each vector sums to 1.
v_1 /= v_1.sum()
v_2 /= v_2.sum()
D_ = D_.astype(np.double)
D_ /= D_.max()  # just for comparison purposes

print("d(doc_1, doc_2) = {:.2f}".format(emd(v_1, v_2, D_)))
Beispiel #60
0
def compute_d(least_fixed_iters=10, threshold=0.00001, emd_func='cv2', use_manhattan_as_d=False):
    """
    Computes state-action and state-state bisimulation metric

    Relies on names defined outside this function: src_env, tgt_env,
    src_state_space, tgt_state_space, action_space, distance and emd.

    Parameters:
    least_fixed_iters (int): Number of outer iterations; each one runs the
                solver to convergence
    threshold (float): Threshold value for stopping the solver for distance matrix
    emd_func (str): Specify which function to use for calculating the Earth Mover's
                distance or Wasserstein distance or Kantorovich metric
                Options: ['scipy', 'cv2', 'opt', 'pyemd']
                NOTE(review): in this function the value only selects the
                array dtypes and initialisation; the distance itself is
                always computed by calling `emd`.
    use_manhattan_as_d (bool): If True, use manhattan distance as distance matrix
                NOTE(review): this flag is never read in this function.

    Returns:
    d_final: Bisimulation state-action metric with dim (S1 x a x S2 x b)
    dist_matrix_final: Bisimulation state-state metric with dim (S1 x S2)
    """

    print ("EMD computed using: ", emd_func)
    print ("Number of lfp iterations: ", least_fixed_iters)
    print ("Threshold value: ", threshold)
    print ("Source size: ", src_state_space, " Target size: ", tgt_state_space)

    # Allocate the reward and result arrays: default (double) precision for
    # pyemd, float32 for every other backend, which also casts both
    # transition matrices to float32 in place.
    if emd_func == 'pyemd':
        reward_matrix_tmp = np.zeros((src_state_space, action_space, tgt_state_space, action_space))
        reward_matrix = np.zeros((src_state_space, tgt_state_space))
        dist_matrix_final = np.zeros((src_state_space, tgt_state_space))
        d_final = np.zeros((src_state_space, action_space, tgt_state_space, action_space))
    else:
        reward_matrix_tmp = np.zeros((src_state_space, action_space, tgt_state_space, action_space)).astype(np.float32)
        reward_matrix = np.zeros((src_state_space, tgt_state_space)).astype(np.float32)
        src_env.tp_matrix = src_env.tp_matrix.astype(np.float32)
        tgt_env.tp_matrix = tgt_env.tp_matrix.astype(np.float32)
        dist_matrix_final = np.zeros((src_state_space, tgt_state_space)).astype(np.float32)
        d_final = np.zeros((src_state_space, action_space, tgt_state_space, action_space)).astype(np.float32)

    # Record |reward_a - reward_b| for every (s1, a, s2, b). Each environment
    # is placed on the state before stepping and put back afterwards, so
    # every action is taken from the same position.
    for s1_pos, s1_state in src_env.state2idx.items():
        src_env.position = s1_pos
        src_env.start_position = s1_pos
        for s2_pos, s2_state in tgt_env.state2idx.items():
            tgt_env.position = s2_pos
            tgt_env.start_position = s2_pos
            for a in range(action_space):
                next_state, reward_a, done, next_possible_states = src_env.step(a)
                src_env.start_position = s1_pos
                src_env.position = s1_pos
                for b in range(action_space):
                    next_state, reward_b, done, next_possible_states = tgt_env.step(b)
                    reward_matrix_tmp[s1_state, a, s2_state, b] = math.fabs(reward_a - reward_b)
                    tgt_env.start_position = s2_pos
                    tgt_env.position = s2_pos

    # Largest reward difference over all action pairs, per state pair.
    # NOTE(review): reward_matrix is filled here but not read again below.
    for s1_pos, s1_state in src_env.state2idx.items():
        for s2_pos, s2_state in tgt_env.state2idx.items():
            reward_matrix[s1_state, s2_state] = np.max(reward_matrix_tmp[s1_state,:,s2_state,:])

    # Supply Manhattan distance as an alternative to reward distance for calculation of EMD
    # DO NOT USE when S1 and S2 are of different sizes
    # NOTE(review): manhattan_distance is computed but not used afterwards.
    manhattan_distance = np.zeros((src_env.state_space, tgt_env.state_space))
    for s1_pos, s1_state in src_env.state2idx.items():
        for s2_pos, s2_state in tgt_env.state2idx.items():
            manhattan_distance[s1_state, s2_state] = distance.cityblock(s1_pos, s2_pos)

    # Large initial values, so the first printed "Loss" is large.
    dist_matrix_final.fill(1000.0)
    d_final.fill(1000.0)
    # Working arrays for the solver: a constant 0.01 start for pyemd, random
    # float32 values otherwise.
    if emd_func == 'pyemd':
        d = np.zeros((src_state_space, action_space, tgt_state_space, action_space))
        dist_matrix = np.zeros((src_state_space, tgt_state_space))
        dist_matrix.fill(0.01)
        tmp_dist_matrix = np.zeros((src_state_space, tgt_state_space))
    else:
        d = np.zeros((src_state_space, action_space, tgt_state_space, action_space)).astype(np.float32)
        dist_matrix = np.random.rand(src_state_space, tgt_state_space).astype(np.float32)
        tmp_dist_matrix = np.random.rand(src_state_space, tgt_state_space).astype(np.float32)

    for i in range(least_fixed_iters):
        print ("Iteration: ", i, "/", least_fixed_iters, " Loss: ", np.mean(np.abs(dist_matrix_final - dist_matrix)))
        dist_matrix_final = dist_matrix.copy()
        # NOTE(review): ctr is assigned but never used.
        ctr = 0
        # Repeat full sweeps until the mean absolute change of the state
        # distances drops below `threshold`. Each sweep reads dist_matrix
        # and writes tmp_dist_matrix.
        while True:
            for s1_pos, s1_state in src_env.state2idx.items():
                for s2_pos, s2_state in sorted(tgt_env.state2idx.items()):
                    for a in range(action_space):
                        for b in range(action_space):
                            kd = emd(src_env.tp_matrix[s1_state,a], tgt_env.tp_matrix[s2_state,b], dist_matrix) # pyemd
                            # Fixed weights: 0.1 on the reward difference,
                            # 0.9 on the transport cost.
                            new_val = 0.1 * reward_matrix_tmp[s1_state, a, s2_state, b] + 0.9 * kd
                            d[s1_state, a, s2_state, b] = new_val
                            # Action-by-action table for this state pair;
                            # re-bound on every inner iteration.
                            d_st = d[s1_state, :, s2_state, :]
                    # Larger of the two directed max-min values over d_st.
                    val = max(np.max(np.min(d_st, axis=1)), np.max(np.min(d_st, axis=0)))
                    tmp_dist_matrix[s1_state, s2_state] = val

            if np.mean(np.abs(dist_matrix - tmp_dist_matrix)) < threshold:
                dist_matrix = tmp_dist_matrix.copy()
                break

            dist_matrix = tmp_dist_matrix.copy()

    # Return copies of the last solver state.
    dist_matrix_final = dist_matrix.copy()
    d_final = d.copy()

    return d_final, dist_matrix_final