Example 1
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances as ed


def assign(x1, x2, c1, c2):
    # Distances from each view of the data to its own set of centers.
    r = ed(x1, c1)
    r2 = ed(x2, c2)
    # Pick the closest center across both sets, then map the winning index
    # back into the range of its own set.
    r3 = np.concatenate((r, r2), axis=1)
    z = np.argmin(r3, axis=1)
    res = np.array([zz if zz < c1.shape[0] else zz - c1.shape[0] for zz in z])
    # The second return value flags points whose two views disagree on the
    # closest center.
    return res, np.argmin(r, axis=1) != np.argmin(r2, axis=1)
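A minimal usage sketch on synthetic data, assuming x1/x2 are two aligned views of the same points and c1/c2 are the candidate centers for each view:

import numpy as np

rng = np.random.default_rng(0)
x1 = rng.normal(size=(10, 3))  # view 1 of 10 points
x2 = rng.normal(size=(10, 3))  # view 2 of the same 10 points
c1 = rng.normal(size=(4, 3))   # 4 candidate centers for view 1
c2 = rng.normal(size=(5, 3))   # 5 candidate centers for view 2

labels, disagree = assign(x1, x2, c1, c2)
print(labels)    # index of the winning center within its own set
print(disagree)  # True where the two views prefer different centers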
Example 2
        def get_similarity_values(q1_csc, q2_csc):
            # Assumes the aliases cs = cosine_similarity, md = manhattan_distances,
            # ed = euclidean_distances (sklearn.metrics.pairwise), plus jsc (a
            # Jaccard score) and minkowski_dis (a DistanceMetric), defined elsewhere.
            cosine_sim = []
            manhattan_dis = []
            euclidean_dis = []
            jaccard_dis = []
            minkowski_distances = []

            for i, j in zip(q1_csc, q2_csc):
                cosine_sim.append(cs(i, j)[0][0])
                manhattan_dis.append(md(i, j)[0][0])
                euclidean_dis.append(ed(i, j)[0][0])
                # Jaccard and Minkowski need dense input; fall back to 0 when
                # the Jaccard score cannot be computed.
                i_ = i.toarray()
                j_ = j.toarray()
                try:
                    jaccard_dis.append(jsc(i_, j_))
                except Exception:
                    jaccard_dis.append(0)
                minkowski_distances.append(minkowski_dis.pairwise(i_, j_)[0][0])
            return cosine_sim, manhattan_dis, euclidean_dis, jaccard_dis, minkowski_distances
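A sketch of the context this helper appears to assume, treating get_similarity_values as available at top level; the jsc and minkowski_dis bindings are guesses, not confirmed by the snippet:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.metrics.pairwise import manhattan_distances as md
from sklearn.metrics.pairwise import euclidean_distances as ed
from sklearn.metrics import jaccard_score as jsc  # assumption
from sklearn.metrics import DistanceMetric  # lived in sklearn.neighbors in older versions

q1 = ["how do i learn python", "what is machine learning"]
q2 = ["how to learn python fast", "what is deep learning"]

vec = TfidfVectorizer().fit(q1 + q2)
q1_csc = vec.transform(q1).tocsc()
q2_csc = vec.transform(q2).tocsc()
minkowski_dis = DistanceMetric.get_metric('minkowski', p=3)

print(get_similarity_values(q1_csc, q2_csc))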
Example 3
def compare_heatmap(y11, y12, y21, y22, mata, matb):

    from lapsolver import solve_dense
    from sklearn.metrics.pairwise import euclidean_distances as ed
    from sklearn.metrics import adjusted_rand_score as rand  # assumption: rand = adjusted_rand_score
    from natto.process.hungutil import make_canvas_and_spacemaps
    from natto.out.draw import quickdoubleheatmap

    # Match the rows of the two datasets by solving a linear assignment
    # problem on their pairwise Euclidean distances.
    distances = ed(mata, matb)
    hungmatch = solve_dense(distances)

    def prephtmap(y1, y2):
        # returns: canvas, y1map2, y2map2, row, col
        a, b, c = make_canvas_and_spacemaps(y1, y2, hungmatch, normalize=False)
        d, e = solve_dense(c)
        return c, a, b, d, e

    comp1 = prephtmap(y11, y12)
    comp2 = prephtmap(y21, y22)

    quickdoubleheatmap(comp1, comp2)

    def calc_mismatches(stuff):
        # Zero out the matched cells; whatever mass remains is misplaced.
        canv = stuff[0]
        r, c = stuff[-2:]
        for rr, cc in zip(r, c):
            canv[rr, cc] = 0
        return canv.sum()

    print("clust1 misplaced:", calc_mismatches(comp1))
    print("clust2 misplaced:", calc_mismatches(comp2))
    print("set1 randindex:", rand(y11, y21))
    print("set2 randindex:", rand(y12, y22))
Example 4
def euclidean_distance(w1, w2, wv):
    v1 = vector_getter(w1, wv)
    v2 = vector_getter(w2, wv)
    distance = None
    # vector_getter returns an int sentinel for out-of-vocabulary words,
    # and a (1, 300) row vector otherwise.
    if not isinstance(v1, int) and not isinstance(v2, int):
        distance = ed(v1, v2)[0][0]
    return distance
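The helper vector_getter is not shown in this snippet; the following is a hypothetical reconstruction, assuming wv is a word-to-vector mapping such as a gensim KeyedVectors and that an int sentinel marks out-of-vocabulary words:

import numpy as np


def vector_getter(word, wv):
    # Hypothetical: look up a 300-dimensional embedding and return it as a
    # (1, 300) row vector; return an int sentinel when the word is unknown.
    try:
        return np.array(wv[word]).reshape(1, -1)
    except KeyError:
        return 0  # out of vocabulary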
Example 5
def get_similarity_values(res_csc, jd_csc):
    # Variant of the helper above that compares every row of res_csc against
    # a single reference vector jd_csc. Assumes the same cs/md/ed aliases.
    cosine_sim = []
    manhattan_dis = []
    euclidean_dis = []

    j = jd_csc
    for i in res_csc:
        cosine_sim.append(cs(i, j)[0][0])
        manhattan_dis.append(md(i, j)[0][0])
        euclidean_dis.append(ed(i, j)[0][0])

    return cosine_sim, manhattan_dis, euclidean_dis
Example 6
        def bm25_dist(row, dist_type, bm25_model, average_idf, feature_dim):
            assert dist_type in ['cs', 'ed', 'md'], 'unknown dist_type: %s' % dist_type
            # Score both questions against the corpus with BM25 and compare
            # the resulting score vectors.
            q1 = row['q1_w'].split()
            q2 = row['q2_w'].split()
            q1_bm25 = bm25_model.get_scores(q1, average_idf)
            q2_bm25 = bm25_model.get_scores(q2, average_idf)
            q1_bm25 = np.reshape(np.array(q1_bm25), (-1, feature_dim))
            q2_bm25 = np.reshape(np.array(q2_bm25), (-1, feature_dim))

            if dist_type == 'cs':
                score = cs(q1_bm25, q2_bm25).flatten()[0]
            elif dist_type == 'ed':
                score = ed(q1_bm25, q2_bm25).flatten()[0]
            else:  # 'md', guaranteed by the assert above
                score = md(q1_bm25, q2_bm25).flatten()[0]
            return score
Example 7
def kmeans(features, k, num_iters=100):
    """ Use kmeans algorithm to group features into k clusters.
    
    K-Means algorithm can be broken down into following steps:
        1. Randomly initialize cluster centers
        2. Assign each point to the closest center
        3. Compute new center of each cluster
        4. Stop if cluster assignments did not change
        5. Go to step 2

    Args:
        features - Array of N feature vectors. Each row represents a feature
                vector.
        k - Number of clusters to form.
        num_iters - Maximum number of iterations the algorithm will run.

    Returns:
        assignments - Array representing cluster assignment of each point.
            (e.g. i-th point is assigned to cluster assignments[i])
    """
    N, D = features.shape
    assert N >= k, 'Number of clusters cannot be greater than number of points'

    # Randomly initialize cluster centers
    idxs = np.random.choice(N, size=k, replace=False)
    centers = features[idxs]
    assignments = np.zeros(N, dtype=int)

    for n in range(num_iters):
        ### YOUR CODE HERE
        # Distance of every point to every current center.
        matrix = ed(features, centers)

        # Assign each point to its nearest center.
        tmp = np.argmin(matrix, axis=1)

        # Stop once assignments no longer change.
        if np.all(tmp == assignments):
            break
        assignments = tmp
        new_centers = np.zeros_like(centers)
        for i in range(k):
            # Recompute each center as the mean of its assigned points.
            assigned_i = features[assignments == i]
            new_centers[i] = np.mean(assigned_i, axis=0)
            #m = ed(assigned_i, assigned_i)
            #new_centers[i] = assigned_i[np.argmin(np.sum(m, axis = 0))]

        centers = new_centers
        ### END YOUR CODE
    return assignments
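A quick smoke test for the kmeans above on two synthetic blobs (a sketch; assumes no cluster goes empty during the iterations):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances as ed

rng = np.random.default_rng(0)
features = np.vstack([rng.normal(0.0, 0.5, size=(50, 2)),
                      rng.normal(5.0, 0.5, size=(50, 2))])

labels = kmeans(features, k=2)
print(np.bincount(labels.astype(int)))  # roughly [50, 50]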
Example 8
    def extract_tfidf_feature(self, df):
        q1_w_vec = self.tfidf_vectorizer.transform(df['q1_w'].values.tolist())
        q2_w_vec = self.tfidf_vectorizer.transform(df['q2_w'].values.tolist())

        def rowwise(metric, a, b, dim=None):
            # Apply a pairwise metric row by row and return a flat array.
            if dim is not None:
                return np.concatenate([
                    metric(a[i].reshape(-1, dim), b[i].reshape(-1, dim)).flatten()
                    for i in range(a.shape[0])
                ])
            return np.concatenate([
                metric(a[i], b[i]).flatten() for i in range(a.shape[0])
            ])

        df['tfidf_cs'] = rowwise(cs, q1_w_vec, q2_w_vec)
        df['tfidf_ed'] = rowwise(ed, q1_w_vec, q2_w_vec)
        df['tfidf_md'] = rowwise(md, q1_w_vec, q2_w_vec)

        corpus_tfidf = np.concatenate(
            [q1_w_vec.toarray(), q2_w_vec.toarray()], axis=0)

        # 5-topic SVD embedding of the stacked corpus.
        svd_model = TruncatedSVD(n_components=5)
        svd_model.fit(corpus_tfidf)
        svd_topic = svd_model.transform(corpus_tfidf)
        q1_w_svd_feature = svd_topic[:q1_w_vec.shape[0]]
        q2_w_svd_feature = svd_topic[q1_w_vec.shape[0]:]

        df['svd_cs'] = rowwise(cs, q1_w_svd_feature, q2_w_svd_feature, dim=5)
        df['svd_ed'] = rowwise(ed, q1_w_svd_feature, q2_w_svd_feature, dim=5)
        df['svd_md'] = rowwise(md, q1_w_svd_feature, q2_w_svd_feature, dim=5)

        # 5-topic LDA embedding of the same corpus.
        lda_model = LatentDirichletAllocation(n_components=5, random_state=0)
        lda_model.fit(corpus_tfidf)
        lda_topic = lda_model.transform(corpus_tfidf)
        q1_w_lda_feature = lda_topic[:q1_w_vec.shape[0]]
        q2_w_lda_feature = lda_topic[q1_w_vec.shape[0]:]

        df['lda_cs'] = rowwise(cs, q1_w_lda_feature, q2_w_lda_feature, dim=5)
        df['lda_ed'] = rowwise(ed, q1_w_lda_feature, q2_w_lda_feature, dim=5)
        df['lda_md'] = rowwise(md, q1_w_lda_feature, q2_w_lda_feature, dim=5)
Example 9
    def similarity_matrix(self, sim='cos'):
        features = self.features
        if features is None:
            return None

        if sim == 'dot':
            sim = np.dot(features, features.T)
        elif sim == 'cos':
            norm = np.linalg.norm(features, axis=1)[np.newaxis]
            sim = np.dot(features, features.T) / np.dot(norm.T, norm)
        elif sim == 'kmeans':
            cluster = kmeans(features, K=2)[np.newaxis]
            cluster[cluster == 0] = -1
            sim = np.dot(cluster.T, cluster)
        elif sim == 'comm':
            N = len(self.clusters)
            #sim = np.repeat(np.array(self.clusters)[np.newaxis], N, 0)
            theta, _ = self.get_params()
            sim = theta.dot(theta.T)
            sim = (sim == sim.T) * 1
            sim[sim < 1] = -1
        elif sim == 'euclide_old':
            from sklearn.metrics.pairwise import euclidean_distances as ed
            #from plot import kmeans_plus
            #kmeans_plus(features, K=4)
            print(features)
            dist = ed(features)
            K = self.parameters_['k']
            devs = self.parameters_['devs'][0]
            sim = np.zeros(dist.shape)
            sim[dist <= 2.0 * devs / K] = 1
            sim[dist > 2.0 * devs / K] = -1
        elif sim == 'euclide_abs':
            from sklearn.metrics.pairwise import euclidean_distances as ed
            #from plot import kmeans_plus
            #kmeans_plus(features, K=4)
            N = len(features)
            K = self.parameters_['k']
            devs = self.parameters_['devs'][0]

            a = np.repeat(features[:, 0][None], N, 0).T
            b = np.repeat(features[:, 0][None], N, 0)
            sim1 = np.abs(a - b)
            a = np.repeat(features[:, 1][None], N, 0).T
            b = np.repeat(features[:, 1][None], N, 0)
            sim2 = np.abs(a - b)

            sim3 = np.zeros((N, N))
            sim3[sim1 <= 2.0 * devs / K] = 1
            sim3[sim1 > 2.0 * devs / K] = -1
            sim4 = np.zeros((N, N))
            sim4[sim2 <= 2.0 * devs / K] = 1
            sim4[sim2 > 2.0 * devs / K] = -1
            sim = sim4 + sim3
            sim[sim >= 0] = 1
            sim[sim < 0] = -1

        elif sim == 'euclide_dist':
            from sklearn.metrics.pairwise import euclidean_distances as ed
            #from plot import kmeans_plus
            #kmeans_plus(features, K=4)
            N = len(features)
            K = self.parameters_['k']
            devs = self.parameters_['devs'][0]

            # Per-coordinate distances: with a repeated column, ed gives
            # sqrt(2) * |x_i - x_j| for that coordinate.
            sim1 = ed(np.repeat(features[:, 0][None], 2, 0).T)
            sim2 = ed(np.repeat(features[:, 1][None], 2, 0).T)  # was [:, 0] twice, an apparent copy-paste slip

            sim3 = np.zeros((N, N))
            sim3[sim1 <= 2.0 * devs / K] = 1
            sim3[sim1 > 2.0 * devs / K] = -1
            sim4 = np.zeros((N, N))
            sim4[sim2 <= 2.0 * devs / K] = 1
            sim4[sim2 > 2.0 * devs / K] = -1
            sim = sim4 + sim3
            sim[sim >= 0] = 1
            sim[sim < 0] = -1
        return sim
Example 10
import numpy as np
import pickle  # the original used Python 2's cPickle

from sklearn.metrics.pairwise import euclidean_distances as ed

from tqdm import tqdm

featureDic = {}
with open('item_feature.dat', 'rb') as f:
    featureDic = pickle.load(f)
y = []
pred_y = []
dataDir = '/home/zhaokui/research/KDD/data/taobao/'
testData = open(dataDir + 'pro_test_set.txt').readlines()

# Debug block: inspect the first 10 stored feature vectors, then quit.
i = 0
for item in featureDic:
    print(featureDic[item])
    i += 1
    if i == 10:
        exit()

# Unreachable while the debug block above calls exit(): print the pairwise
# distance for the first 10 test pairs.
for line in tqdm(testData):
    tmp = line.split()
    itemA = tmp[0].split('/')[1].split('.')[0]
    itemB = tmp[1].split('/')[1].split('.')[0]
    #print(featureDic[itemA], featureDic[itemB])
    print(ed(featureDic[itemA], featureDic[itemB]))
    i += 1
    if i == 10:
        break
Example 11
def my_euclidean_distance(word1, word2, wv):
    # ed returns a 1x1 matrix for two row vectors; extract the scalar before
    # rounding (round() on a NumPy array raises TypeError).
    distance = ed(my_vector_getter(word1, wv), my_vector_getter(word2, wv))[0][0]
    return round(distance, 4)
Example 12
# Script excerpt: assumes caffe, copy, numpy as np, pickle, tqdm, the ed
# alias for euclidean_distances, a loaded net, and auc (presumably
# sklearn's roc_auc_score) are all set up earlier in the file.
itemDir = '/home/zhaokui/research/KDD/data/taobao/'
itemData = open(itemDir + 'pro_test_item.txt').readlines()

# Extract a feature vector for every test image with the Caffe net.
featureDic = {}
for item in tqdm(itemData):
    name = item.split('/')[1].split('.')[0]
    image = caffe.io.load_image(itemDir + item.split()[0])
    net.predict([image], False)
    featureDic[name] = copy.deepcopy(net.blobs['loss3f'].data[0])

y = []
pred_y = []
dataDir = '/home/zhaokui/research/KDD/data/taobao/'
testData = open(dataDir + 'pro_test_set.txt').readlines()
for line in tqdm(testData):
    tmp = line.split()
    itemA = tmp[0].split('/')[1].split('.')[0]
    itemB = tmp[1].split('/')[1].split('.')[0]
    y.append(int(tmp[2]))
    # Score for a pair is the Euclidean distance between its features.
    pred_y.append(
        ed(
            np.array(featureDic[itemA]).reshape(1, -1),
            np.array(featureDic[itemB]).reshape(1, -1))[0][0])

with open('predict.dat', 'wb') as f:  # 'w' in the original (Python 2)
    pickle.dump((y, pred_y), f)

with open('item_feature.dat', 'wb') as f:  # 'w' in the original (Python 2)
    pickle.dump(featureDic, f)

print(auc(y, pred_y))
Example 13
def euclidean_similarity(ratings):
    # Note: despite the name, this returns pairwise Euclidean *distances*;
    # smaller values mean more similar rows.
    return ed(ratings)
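How that behaves on a small ratings matrix, plus one common distance-to-similarity transform (the 1/(1+d) step is an addition for illustration, not part of the original):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances as ed

ratings = np.array([[5, 3, 0],
                    [4, 3, 1],
                    [1, 0, 5]])

dist = euclidean_similarity(ratings)  # (3, 3) pairwise distance matrix
sim = 1.0 / (1.0 + dist)              # map distances into (0, 1]
print(np.round(sim, 3))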
Example 14
def hierarchical_clustering(features, k):
    """ Run the hierarchical agglomerative clustering algorithm.

    The algorithm is conceptually simple:

    Assign each point to its own cluster
    While the number of clusters is greater than k:
        Compute the distance between all pairs of clusters
        Merge the pair of clusters that are closest to each other

    We will use Euclidean distance to define distance between clusters.

    Recomputing the centroids of all clusters and the distances between all
    pairs of centroids at each step of the loop would be very slow. Thankfully
    most of the distances and centroids remain the same in successive
    iterations of the outer loop; therefore we can speed up the computation by
    only recomputing the centroid and distances for the new merged cluster.

    Even with this trick, this algorithm will consume a lot of memory and run
    very slowly when clustering a large set of points. In practice, you probably
    do not want to use this algorithm to cluster more than 10,000 points.

    Args:
        features - Array of N feature vectors. Each row represents a feature
            vector.
        k - Number of clusters to form.

    Returns:
        assignments - Array representing cluster assignment of each point.
            (e.g. i-th point is assigned to cluster assignments[i])
    """

    N, D = features.shape

    assert N >= k, 'Number of clusters cannot be greater than number of points'

    # Assign each point to its own cluster
    assignments = np.arange(N)  # labels always stay in [0, n_clusters)
    centers = np.copy(features)  # always holds exactly n_clusters centers
    n_clusters = N
    matrix = ed(centers, centers)  # pairwise center distances; shrinks as clusters merge
    from scipy.spatial.distance import cdist
    while n_clusters > k:
        ### YOUR CODE HERE

        r = np.arange(matrix.shape[0])
        # Mask the diagonal and lower triangle so argmin sees each pair once.
        matrix[r[:, None] >= r] = np.max(matrix)
        # Indices of the two closest centers.
        center1, center2 = np.unravel_index(np.argmin(matrix), matrix.shape)

        if center1 > center2:
            center1, center2 = center2, center1
        # Merge cluster center2 into center1, then shift labels down so they
        # always stay in [0, n_clusters).
        assignments[assignments == center2] = center1
        assignments[assignments > center2] -= 1

        # Delete row and column center2 to shrink the distance matrix.
        matrix = np.delete(np.delete(matrix, center2, 1), center2, 0)
        centers = np.delete(centers, center2, 0)
        centers[center1] = np.mean(features[assignments == center1], axis=0)

        # Recompute only the merged cluster's row and column of distances.
        matrix[center1] = cdist(centers[center1].reshape(1, D), centers)
        matrix[:, center1] = matrix[center1]
        n_clusters -= 1
        ### END YOUR CODE

    return assignments
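A small check of the agglomerative clustering above on synthetic blobs (a sketch; assumes numpy and the ed alias are importable as below):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances as ed

rng = np.random.default_rng(1)
features = np.vstack([rng.normal(0.0, 0.5, size=(20, 2)),
                      rng.normal(4.0, 0.5, size=(20, 2))])

labels = hierarchical_clustering(features, k=2)
print(np.bincount(labels))  # two clusters of roughly 20 points each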
Example 15
def euclideanDistance(a, b):
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html
    # Reshape the 1-D inputs into single-row matrices, then take the scalar
    # out of the 1x1 result.
    aa = a.reshape(1, len(a))
    bb = b.reshape(1, len(b))
    return ed(aa, bb)[0][0]
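A one-line check (assuming NumPy array inputs):

import numpy as np

print(euclideanDistance(np.array([0.0, 0.0]), np.array([3.0, 4.0])))  # 5.0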