Code Example #1
File: sandwich.py Project: EdwardBetts/metaviro
def visualize_class_separation(X, labels):
  _, (ax1,ax2) = pyplot.subplots(ncols=2)
  label_order = np.argsort(labels)
  ax1.imshow(pairwise_distances(X[label_order]), interpolation='nearest')
  ax2.imshow(pairwise_distances(labels[label_order,None]),
             interpolation='nearest')
  pyplot.show()
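A minimal usage sketch (my addition, not from the source project), assuming the module-level imports the function relies on (`numpy as np`, `matplotlib.pyplot as pyplot`, `sklearn.metrics.pairwise_distances`) plus scikit-learn's iris data:

import numpy as np
from matplotlib import pyplot
from sklearn.datasets import load_iris
from sklearn.metrics import pairwise_distances

iris = load_iris()
# left panel: pairwise distances with samples sorted by class;
# right panel: pairwise "distances" between the labels themselves, i.e. the ideal block structure
visualize_class_separation(iris.data, iris.target)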
Code Example #2
File: hdbscan_.py Project: xsongx/hdbscan
def _hdbscan_generic(X, min_samples=5, alpha=1.0,
                     metric='minkowski', p=2, leaf_size=None, gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')

        distance_matrix = pairwise_distances(X, metric=metric, p=p)
    else:
        distance_matrix = pairwise_distances(X, metric=metric)

    mutual_reachability_ = mutual_reachability(distance_matrix,
                                               min_samples, alpha)

    min_spanning_tree = mst_linkage_core(mutual_reachability_)

    if gen_min_span_tree:
        result_min_span_tree = min_spanning_tree.copy()
        for index, row in enumerate(result_min_span_tree[1:], 1):
            candidates = np.where(np.isclose(mutual_reachability_[row[1]], row[2]))[0]
            candidates = np.intersect1d(candidates, min_spanning_tree[:index, :2].astype(int))
            candidates = candidates[candidates != row[1]]
            assert (len(candidates) > 0)
            row[0] = candidates[0]
    else:
        result_min_span_tree = None

    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, result_min_span_tree
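This routine depends on hdbscan internals (`mutual_reachability`, `mst_linkage_core`, `label`), so it is not directly callable on its own. For orientation only, a hedged sketch of the public API that wraps this kind of internal path (assuming the standalone `hdbscan` package is installed):

import numpy as np
import hdbscan

X = np.random.RandomState(0).rand(200, 2)
labels = hdbscan.HDBSCAN(min_samples=5, metric='euclidean').fit_predict(X)
print(np.unique(labels))  # -1 marks noise points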
Code Example #3
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    D = pairwise_distances(X, metric='euclidean')
    # Given that the actual labels are used, we can assume that S would be
    # positive.
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
    # Test without calculating D
    silhouette_metric = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(silhouette, silhouette_metric)
    # Test with sampling
    silhouette = silhouette_score(D, y, metric='precomputed',
                                  sample_size=int(X.shape[0] / 2),
                                  random_state=0)
    silhouette_metric = silhouette_score(X, y, metric='euclidean',
                                         sample_size=int(X.shape[0] / 2),
                                         random_state=0)
    assert(silhouette > 0)
    assert(silhouette_metric > 0)
    assert_almost_equal(silhouette_metric, silhouette)
    # Test with sparse X
    X_sparse = csr_matrix(X)
    D = pairwise_distances(X_sparse, metric='euclidean')
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
Code Example #4
def smart_initialize(data, k, seed=None):
    """
    Use k-means++ to initialize a good set of centroids
    :param data: whole dataset
    :param k: number of centroids
    :param seed: random seed
    :return: initial centroids
    """
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
    centroids = np.zeros((k, data.shape[1]))

    # Randomly choose the first centroid.
    # Since we have no prior knowledge, choose uniformly at random
    idx = np.random.randint(data.shape[0])
    centroids[0] = data[idx, :].toarray()
    # Compute distances from the first centroid chosen to all the other data points
    distances = pairwise_distances(data, centroids[0:1], metric='euclidean').flatten()

    for i in range(1, k):
        # Choose the next centroid randomly, so that the probability for each data point to be chosen
        # is directly proportional to its squared distance from the nearest centroid.
        # Roughly speaking, a new centroid should be as far from the existing centroids as possible.
        idx = np.random.choice(data.shape[0], 1, p=distances ** 2 / np.sum(distances ** 2))
        centroids[i] = data[idx, :].toarray()
        # Now compute distances from the centroids to all data points
        distances = np.min(pairwise_distances(data, centroids[0:i + 1], metric='euclidean'), axis=1)

    return centroids
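A hedged usage sketch (my own, not from the source notebook). Because the function calls `.toarray()` on row slices, `data` is assumed to be a SciPy sparse matrix:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances

data = csr_matrix(np.random.RandomState(0).rand(100, 20))
initial_centroids = smart_initialize(data, k=3, seed=0)
print(initial_centroids.shape)  # (3, 20)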
Code Example #5
def _rsl_small_kdtree(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2):

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')

        distance_matrix = pairwise_distances(X, metric=metric, p=p)
    else:
        distance_matrix = pairwise_distances(X, metric=metric)

    mutual_reachability_ = kdtree_mutual_reachability(X,
                                                      distance_matrix,
                                                      metric,
                                                      p=p,
                                                      min_points=k,
                                                      alpha=alpha)

    min_spanning_tree = mst_linkage_core(mutual_reachability_)
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree
Code Example #6
File: hdbscan_.py Project: rbkreisberg/hdbscan
def _hdbscan_small_kdtree(X, min_cluster_size=5, min_samples=None, 
                          metric='minkowski', p=2):

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')

        distance_matrix = pairwise_distances(X, metric=metric, p=p)
    else:
        distance_matrix = pairwise_distances(X, metric=metric)

    mutual_reachability_ = kdtree_mutual_reachability(X, 
                                                      distance_matrix,
                                                      metric,
                                                      p=p,
                                                      min_points=min_samples)

    min_spanning_tree = mst_linkage_core(mutual_reachability_)
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    
    single_linkage_tree = label(min_spanning_tree)
    condensed_tree = condense_tree(single_linkage_tree,
                                   min_cluster_size)
    stability_dict = compute_stability(condensed_tree)
    cluster_list = get_clusters(condensed_tree, stability_dict)
    
    labels = -1 * np.ones(X.shape[0], dtype=int)
    for index, cluster in enumerate(cluster_list):
        labels[cluster] = index
    return labels, condensed_tree, single_linkage_tree, min_spanning_tree
Code Example #7
def class_separation(X, labels):
  unique_labels, label_inds = np.unique(labels, return_inverse=True)
  ratio = 0
  for li in xrange(len(unique_labels)):
    Xc = X[label_inds==li]
    Xnc = X[label_inds!=li]
    ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc,Xnc).mean()
  return ratio / len(unique_labels)
Code Example #8
File: density_estimation.py Project: BTY2684/astroML
    def eval(self, X):
        """Evaluate the kernel density estimation

        Parameters
        ----------
        X : array_like
            array of points at which to evaluate the KDE.  Shape is
            (n_points, n_dim), where n_dim matches the dimension of
            the training points.

        Returns
        -------
        dens : ndarray
            array of shape (n_points,) giving the density at each point.
            The density will be normalized for metric='gaussian' or
            metric='tophat', and will be unnormalized otherwise.
        """
        X = np.atleast_2d(X)
        if X.ndim != 2:
            raise ValueError('X must be two-dimensional')

        if X.shape[1] != self.X_.shape[1]:
            raise ValueError('dimensions of X do not match training dimension')

        if self.metric == 'gaussian':
            # wrangle gaussian into scikit-learn's 'rbf' kernel
            gamma = 0.5 / self.h / self.h
            D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma)
            D /= np.sqrt(2 * np.pi * self.h ** (2 * X.shape[1]))
            dens = D.sum(1)

        elif self.metric == 'tophat':
            # use Ball Tree to efficiently count neighbors
            bt = BallTree(self.X_)
            counts = bt.query_radius(X, self.h,
                                     count_only=True)
            dens = counts / n_volume(self.h, X.shape[1])

        elif self.metric == 'exponential':
            D = pairwise_distances(X, self.X_)
            dens = np.exp(-abs(D) / self.h)
            dens = dens.sum(1)
            dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1])

        elif self.metric == 'quadratic':
            D = pairwise_distances(X, self.X_)
            dens = (1 - (D / self.h) ** 2)
            dens[D > self.h] = 0
            dens = dens.sum(1)
            dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2)

        else:
            D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs)
            dens = D.sum(1)

        return dens
Code Example #9
    def __call__(self, X_train, X_test, y_train, y_test):
        X = np.vstack([X_train, X_test])
        y = np.hstack([y_train, y_test])
        unique_labels, label_inds = np.unique(y, return_inverse=True)
        ratio = 0
        for li in range(len(unique_labels)):
            Xc = X[label_inds == li]
            Xnc = X[label_inds != li]
            ratio += pairwise_distances(Xc).mean() \
                / pairwise_distances(Xc, Xnc).mean()

        return -ratio / len(unique_labels)
Code Example #10
def outlier_clusters_ward(x, y, skill=None, memory=None):
    # TODO: incorporate skill
    data = np.vstack((x, y)).T

    if len(data) == 0:
        # uh.
        print 'clustering: NO cluster members!'
        cluster_centers = np.array([[-1, -1]])
        cluster_labels = []
        labels = []
        n_clusters = 0
        dist_within = np.array([])

    elif len(data) == 1:
        print 'clustering: only 1 data point!'
        cluster_centers = data
        cluster_labels = [0]
        labels = np.array([0])
        n_clusters = 1
        dist_within = np.array([0])

    else:
        dist_within = 1000
        dist_max = 75
        n_clusters = 0
        n_clusters_max = 10

        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                memory=memory)

        # while dist_within > dist_max, keep adding clusters
        while (dist_within > dist_max) * (n_clusters < n_clusters_max):
            # iterate n_clusters
            n_clusters += 1
            clusterer.set_params(n_clusters=n_clusters)

            # cluster
            labels = clusterer.fit_predict(data)

            # get cluster_centers
            cluster_labels = range(n_clusters)
            cluster_centers = np.array([np.mean(data[labels == i], axis=0)
                                        for i in cluster_labels])

            # find dist_within: the maximum pairwise distance inside a cluster
            dist_within = np.max([np.max(pairwise_distances(
                                  data[labels == i]))
                                  for i in cluster_labels])

    dist_within_final = np.array([np.max(pairwise_distances(
            data[labels == i])) for i in cluster_labels])

    return cluster_centers, cluster_labels, labels, n_clusters, dist_within_final
Code Example #11
def test_precomputed(random_state=42):
    """Tests unsupervised NearestNeighbors with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    for method in ['kneighbors']:
        # TODO: also test radius_neighbors, but requires different assertion

        # As a feature matrix (n_samples by n_features)
        nbrs_X = neighbors.NearestNeighbors(n_neighbors=3)
        nbrs_X.fit(X)
        dist_X, ind_X = getattr(nbrs_X, method)(Y)

        # As a dense distance matrix (n_samples by n_samples)
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check auto works too
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check X=None in prediction
        dist_X, ind_X = getattr(nbrs_X, method)(None)
        dist_D, ind_D = getattr(nbrs_D, method)(None)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Must raise a ValueError if the matrix is not of correct shape
        assert_raises(ValueError, getattr(nbrs_D, method), X)

    target = np.arange(X.shape[0])
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        print(Est)
        est = Est(metric='euclidean')
        est.radius = est.n_neighbors = 1
        pred_X = est.fit(X, target).predict(Y)
        est.metric = 'precomputed'
        pred_D = est.fit(DXX, target).predict(DYX)
        assert_array_almost_equal(pred_X, pred_D)
Code Example #12
 def find_distance_matrix(self, metric='cosine'):
     '''
     compute distance matrix between topics using cosine or euclidean
     distance (default=cosine distance)
     '''
     if metric == 'cosine':
         self.distance_matrix = pairwise_distances(self.topics,
                                                   metric='cosine')
         # diagonals should be exactly zero, so remove rounding errors
         numpy.fill_diagonal(self.distance_matrix, 0)
     if metric == 'euclidean':
         self.distance_matrix = pairwise_distances(self.topics,
                                                   metric='euclidean')
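The same idea outside the class (a sketch of mine; `topics` stands in for `self.topics`, here a random topic-term matrix):

import numpy as np
from sklearn.metrics import pairwise_distances

topics = np.random.RandomState(0).rand(10, 50)          # e.g. 10 topics in a 50-term space
distance_matrix = pairwise_distances(topics, metric='cosine')
np.fill_diagonal(distance_matrix, 0)                    # diagonals should be exactly zero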
Code Example #13
File: DES.py Project: hippozhu/dcs
  def update_clfs_M(self, clfs, M):
    self.clfs = clfs
    self.M = M

    self.knn_test_dist, self.knn_test =  NearestNeighbors(self.k,  algorithm='brute', metric='mahalanobis', VI=self.M).fit(self.X_train).kneighbors(self.X_test)
    self.preds_train = np.array([e.predict(self.X_train) for e in clfs]).T
    self.preds_proba_train = np.array([e.predict_proba(self.X_train) for e in clfs]).swapaxes(0,1)
    self.preds_proba_train_smoothed = self.preds_proba_train + 0.01
    self.preds_test = np.array([e.predict(self.X_test) for e in clfs]).T
    self.preds_proba_test = np.array([e.predict_proba(self.X_test) for e in clfs]).swapaxes(0,1)
    self.pp_train = np.array([pt==yt for pt,yt in itertools.izip(self.preds_train, self.y_train)])
    self.pp_test = np.array([pt==yt for pt,yt in itertools.izip(self.preds_test, self.y_test)])
    self.pd_pp_test = pairwise_distances(self.pp_test, self.pp_train, metric='hamming')
    self.pd_preds_test = pairwise_distances(self.preds_test, self.preds_train, metric='hamming')
Code Example #14
File: lmnn_pp.py Project: hippozhu/dcs
 def update_input(self, clf):
   preds_train = np.array([e.predict(self.X_train) for e in clf.estimators_]).T
   self.pp_train = np.array([pt==yt for pt,yt in itertools.izip(preds_train, self.y_train)])
   preds_test = np.array([e.predict(self.X_test) for e in clf.estimators_]).T
   self.pp_test = np.array([pt==yt for pt,yt in itertools.izip(preds_test, self.y_test)])
   self.G = np.zeros(self.M.shape)
   self.active_set = None
   self.ij = []
   self.ijl = []
   self.loss = np.inf
   self.pd_pp = pairwise_distances(self.pp_train, metric='hamming')
   np.fill_diagonal(self.pd_pp, np.inf)
   self.pd_pp_test = pairwise_distances(self.pp_test, self.pp_train, metric='hamming')
   self.step_size = self.alpha
   self.step_size_break = False
Code Example #15
File: ds_outlier.py Project: baothien/tiensy
def ds_clustering(clusters,support_vectors, f_values, new_element):
    '''
    clustering the new element 
    Efficient Out-of-Sample extension of Dominant set clusters
    Massimiliano et al., NIPS 2004
    for all h in S: if sum(a(h,i)*x(h)) > f(x), then i is assigned to S
    '''
    if clusters ==None or support_vectors==None or new_element == None:
        return None
    sum_axs = []
    for i in np.arange(len(clusters)):
        S = clusters[i]
        S_old = S.copy()
        x = support_vectors[i]
        
        #print 'len S ', len(S), 'len x', len(x)
        
        from sklearn.metrics import euclidean_distances , pairwise_distances
        #euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
        new_arr = [new_element]
        dis = pairwise_distances(new_arr,S, metric='sqeuclidean')
        sigma2 = np.median(dis)    
        a_hj = np.exp(-dis / sigma2)      

        #print dis, a_hj
        sum_ax = 0.        
        for h in np.arange(len(S_old)):            
            sum_ax = sum_ax + a_hj[0][h]*x[h]
        #print 'i =',i,' sum_ax', sum_ax, 'f_values ', f_values[i]   
        sum_axs.append(sum_ax)
        
    #print np.argmax(sum_axs), '  ', np.max(sum_axs)   
    if np.max(sum_axs) >= 0.5*f_values[np.argmax(sum_axs)]:
            return np.argmax(sum_axs)
    return None
Code Example #16
def display_single_tf_idf_cluster(cluster, df_map):
    '''df_map: DataFrame specifying the mapping between words and column indices'''

    wiki_subset = cluster['dataframe']
    tf_idf_subset = cluster['matrix']
    centroid = cluster['centroid']

    # Print top 5 words with largest TF-IDF weights in the cluster
    idx = centroid.argsort()[::-1]
    for i in range(5):
        print('{0:s}:{1:.3f}'.format(df_map[df_map['idx'] == idx[i]]['word'].values[0], centroid[idx[i]]))
    print('')

    # Compute distances from the centroid to all data points in the cluster.
    distances = pairwise_distances(tf_idf_subset, [centroid], metric='euclidean').flatten()
    # compute nearest neighbors of the centroid within the cluster.
    nearest_neighbors = distances.argsort()
    # For 8 nearest neighbors, print the title as well as first 180 characters of text.
    # Wrap the text at 80-character mark.
    for i in range(8):
        text = ' '.join(wiki_subset.iloc[nearest_neighbors[i]]['text'].split(None, 25)[0:25])
        print('* {0:50s} {1:.5f}\n  {2:s}\n  {3:s}'.format(wiki_subset.iloc[nearest_neighbors[i]]['name'],
              distances[nearest_neighbors[i]], text[:90], text[90:180] if len(text) > 90 else ''))
        print(text)
    print()
Code Example #17
File: test_incremental.py Project: ckanu13k/graphs
 def test_l1_precomputed(self):
   dist = pairwise_distances(self.pts, metric='l1')
   k_range = range(1, 5)
   incr_gen = incremental_neighbor_graph(dist, precomputed=True, k=k_range)
   for k, G in zip_longest(k_range, incr_gen):
     expected = ngraph(dist, precomputed=True, k=k)
     assert_array_almost_equal(G.matrix(dense=True), expected)
Code Example #18
File: test_spectral.py Project: osdf/scikit-learn
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, mode="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers), random_state=0, mode="amg")
Code Example #19
def visualize_document_clusters(wiki, tf_idf, centroids, cluster_assignment, k,
                                map_index_to_word, display_content=True):
    '''wiki: original dataframe
       tf_idf: data matrix, sparse matrix format
       map_index_to_word: SFrame specifying the mapping between words and column indices
       display_content: if True, display 8 nearest neighbors of each centroid'''
    
    print('==========================================================')

    # Visualize each cluster c
    for c in xrange(k):
        # Cluster heading
        print('Cluster {0:d}    '.format(c)),
        # Print top 5 words with largest TF-IDF weights in the cluster
        idx = centroids[c].argsort()[::-1]
        for i in xrange(5): # Print each word along with the TF-IDF weight
            print('{0:s}:{1:.3f}'.format(map_index_to_word['category'][idx[i]], centroids[c,idx[i]])),
        print('')
        
        if display_content:
            # Compute distances from the centroid to all data points in the cluster,
            # and compute nearest neighbors of the centroids within the cluster.
            distances = pairwise_distances(tf_idf, [centroids[c]], metric='euclidean').flatten()
            distances[cluster_assignment!=c] = float('inf') # remove non-members from consideration
            nearest_neighbors = distances.argsort()
            # For 8 nearest neighbors, print the title as well as first 180 characters of text.
            # Wrap the text at 80-character mark.
            for i in xrange(8):
                text = ' '.join(wiki[nearest_neighbors[i]]['text'].split(None, 25)[0:25])
                print('\n* {0:50s} {1:.5f}\n  {2:s}\n  {3:s}'.format(wiki[nearest_neighbors[i]]['name'],
                    distances[nearest_neighbors[i]], text[:90], text[90:180] if len(text) > 90 else ''))
        print('==========================================================')
Code Example #20
File: cbpktst.py Project: smkia/cbpktst
def precompute_gaussian_kernels(XX, YY, verbose=False):
    """For each unit, precompute Gaussian kernel between the trials of
    the two samples XX and YY. Estimate each sigma2 parameter as median
    distance between the trials of each sample.
    """
    if verbose: print("Pre-computing the kernel matrix for each unit.")
    n_units = XX.shape[1] # or YY.shape[1]
    Ks = [] # here we store all the kernel matrices
    sigma2s = np.zeros(n_units) # here we store all the sigma2s, one per unit
    m = XX.shape[0]
    n = YY.shape[0]
    for i in range(n_units):
        if verbose: print("Unit %s" % i),
        X = XX[:,i,:].copy()
        Y = YY[:,i,:].copy()
        if verbose: print("Computing Gaussian kernel."),
        dm = pairwise_distances(np.vstack([X, Y]), metric='sqeuclidean')
        # Heuristic: sigma2 is the median value among all pairwise
        # distances between X and Y. Note: should we use just
        # dm[:m,m:] or all dm?
        sigma2 = np.median(dm[:m,m:])**2 
        sigma2s[i] = sigma2
        if verbose: print("sigma2 = %s" % sigma2)
        K = np.exp(-dm / sigma2)
        Ks.append(K)

    return Ks, sigma2s
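A usage sketch with random data (my addition; the trial arrays are assumed to have shape (n_trials, n_units, n_features)):

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
XX = rng.rand(20, 4, 8)   # 20 trials, 4 units, 8 features per trial
YY = rng.rand(25, 4, 8)
Ks, sigma2s = precompute_gaussian_kernels(XX, YY)
print(len(Ks), Ks[0].shape, sigma2s.shape)   # 4 kernels of shape (45, 45), 4 sigma2 values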
Code Example #21
def assign_clusters(data, centroids):
    # Compute distances between each data point and the set of centroids:
    distances_from_centroids = pairwise_distances(data, centroids, metric='euclidean')
    
    # Compute cluster assignments for each data point:
    cluster_assignment = np.apply_along_axis(np.argmin, axis = 1, arr = distances_from_centroids)
    return cluster_assignment
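A short usage sketch (my own), pairing the assignment step with a random choice of centroids; `np` and `pairwise_distances` are assumed to be imported where `assign_clusters` lives:

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
data = rng.rand(50, 4)
centroids = data[rng.choice(len(data), size=3, replace=False)]
labels = assign_clusters(data, centroids)
print(np.bincount(labels))   # number of points assigned to each of the 3 centroids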
Code Example #22
 def sim_calc(self):
     nt = self.corpora[0]
     self.scores = {}
     for corp in self.corpora:
         i_nt = []
         i_c2 = []
         rows = self.ekk_rows[corp[0]]
         for i, word in enumerate(self.ekk_rows['NT']):
             if word in rows:
                 i_nt.append(i)
                 i_c2.append(self.ekk_rows[corp[0]].index(word))
         d_c2 = np.memmap(
             '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format(
                 self.base, corp[0], corp[1], corp[2], self.english, self.prefix, self.svd),
             dtype='float32', shape=(len(rows), len(rows)))[i_c2]
         d_c2 = d_c2[:, i_c2]
         d_nt = np.memmap(
             '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format(
                 self.base, nt[0], nt[1], nt[2], self.english, self.prefix,
                 self.svd), dtype='float32',
             shape=(len(self.ekk_rows['NT']), len(self.ekk_rows['NT'])))[
             i_nt]
         d_nt = d_nt[:, i_nt]
         self.scores['{0}_{1}'.format('NT', corp[0])] = np.average(np.diag(
             1 - pairwise_distances(d_nt, d_c2, metric='cosine',
                                    n_jobs=12)))
Code Example #23
def logpdf_diagonal_gaussian(x, mean, cov):
    """
    Compute log-pdf of a multivariate Gaussian distribution with diagonal covariance at a given point x.
    A multivariate Gaussian distribution with a diagonal covariance is equivalent
    to a collection of independent Gaussian random variables.

    The log-pdf will be computed for each row of x.
    mean and cov should be given as 1D numpy arrays.

    :param x: a sparse matrix
    :param mean: means of variables
    :param cov: covariances of variables
    :return: log-pdf of a multivariate Gaussian distribution
    """
    n = x.shape[0]
    dim = x.shape[1]
    assert(dim == len(mean) and dim == len(cov))

    # multiply each i-th column of x by 1/(sqrt(2)*sigma_i), where sigma_i is the sqrt of the variance of the i-th variable.
    scaled_x = x.dot(diag(1. / np.sqrt(2 * cov)))
    # multiply each i-th entry of mean by 1/(sqrt(2)*sigma_i)
    scaled_mean = mean / np.sqrt(2 * cov)

    # the squared Euclidean distance to the scaled mean gives SUM[(x_i - mean_i)^2/(2*sigma_i^2)]
    dist_sqr = pairwise_distances(scaled_x, [scaled_mean], 'euclidean').flatten() ** 2
    return -np.sum(np.log(np.sqrt(2 * np.pi * cov))) - dist_sqr
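A cross-check sketch (mine). It assumes `diag` in the function body is `numpy.diag` and, per the docstring, that `x` is a sparse matrix; the result should match summing the per-dimension `scipy.stats.norm` log-densities:

import numpy as np
from numpy import diag
from scipy.sparse import csr_matrix
from scipy.stats import norm
from sklearn.metrics import pairwise_distances

x = csr_matrix(np.array([[1.0, 2.0], [0.0, 0.5]]))
mean = np.array([0.5, 1.0])
cov = np.array([1.0, 4.0])
print(logpdf_diagonal_gaussian(x, mean, cov))
# cross-check: sum of per-dimension Gaussian log-densities
print(norm.logpdf(x.toarray(), loc=mean, scale=np.sqrt(cov)).sum(axis=1))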
Code Example #24
File: swiss_roll.py Project: all-umass/graphs
def main():
  X, theta = swiss_roll(8, 500, return_theta=True)
  D = pairwise_distances(X)
  graph_info = [
    _c('5-NN', neighbor_graph, D, k=6, precomputed=True),
    _c('b-matching', b_matching, D, 6),
    _c('gabriel', gabriel_graph, X),
    _c('rel. neighborhood', relative_neighborhood_graph,D,metric='precomputed'),
    _c('manifold spanning', manifold_spanning_graph, X, 2),
    _c('L1', sparse_regularized_graph, X, kmax=10, sparsity_param=0.0005),
    _c('SMCE', _smce_symm_dist, X, kmax=25, sparsity_param=5),
    _c('SAFFRON', saffron, X, q=15, k=5, tangent_dim=2),
    _c('MST', mst, D, metric='precomputed'),
    _c('dMST', disjoint_mst, D, metric='precomputed'),
  ]

  print('Plotting graphs & embeddings')
  fig1, axes1 = plt.subplots(nrows=3, ncols=3, subplot_kw=dict(projection='3d'))
  fig2, axes2 = plt.subplots(nrows=3, ncols=3)
  fig1.suptitle('Original Coordinates')
  fig2.suptitle('Isomap Embeddings')

  for ax1, ax2, info in zip(axes1.flat, axes2.flat, graph_info):
    label, G, gg, emb, mask = info
    G.plot(X, ax=ax1, title=label, vertex_style=dict(c=theta))
    gg.plot(emb, ax=ax2, title=label, vertex_style=dict(c=theta[mask]))
    ax1.view_init(elev=5, azim=70)
    ax1.set_axis_off()
    ax2.set_axis_off()
  plt.show()
Code Example #25
File: classifiers.py Project: dnrb/categorization
    def discriminate_activation_distance(self):
        testset = A([ self.get_input_item(t) for t in 
                      self.data.discrimination_stimuli ])
        f = self.data.nT
        activation = np.array([np.linalg.norm(self.map[:,:,f:] - t[f:], 
                                            ord = 2, axis = 2).reshape(1,-1)[0]
                             for t in testset])
        distances = pairwise_distances(activation, metric = 'euclidean')
        dn, tn = self.data.dirname, self.data.discrimination_data
        tz = self.predict_terms(self.data.discrimination_stimuli).argmax(1)
        terms = self.data.terms[tz]

        d_fn = '%s/AD_discrimination_terms_%s.csv' % (dn, tn)
        dc_fn = '%s/AD_discrimination_confusability_%s.csv' % (dn, tn)
        with open(d_fn, 'a') as o:
            if os.path.getsize(d_fn) == 0: 
                o.write('simulation,time,stimulus,term\n')
            for i,t in enumerate(terms):
                o.write('%d,%d,%d,%s\n' % (self.simulation, self.time, i, t))
        with open(dc_fn, 'a') as o:
            if os.path.getsize(dc_fn) == 0: 
                o.write('simulation,time,stimulus.1,stimulus.2,')
                o.write('term.1,term.2,distance\n')
            for i in range(distances.shape[0]):
                for j in range(i+1, distances.shape[0]):
                    o.write('%d,%d,%d,%d,%s,%s,%.3f\n' %
                            (self.simulation, self.time, i, j, terms[i], 
                             terms[j], distances[i,j]))
        return
Code Example #26
File: app.py Project: acatwang/tweetsCluster
    def rankInCluster(self,labels,centers_features,K,X,tweets=None):
        clusters = dict((clusId,{'all':[],'best':"",'first':"",'words':"","n":0,'sentiment':0}) for clusId in range(K))
        if not tweets:
            tweets = self.tweets
        # In each cluster, do the following :
        # 1) sort tweets by created time in descending order
        # 2) get the first tweet (in terms of time)
        # 3) find the tweet that is closest to the cluster centroid (best tweet)

        for i,label in enumerate(labels):
            clusters[label]['all'].append(tweets[i])
            clusters[label]['n'] += 1

        for label in labels:
            clusters[label]['all'] = sorted(clusters[label]['all'], key=lambda x:x.time, reverse=True)
            clusters[label]['first'] = clusters[label]['all'][-1].printTweet()

        # Find the best tweet and avg sentiment in each cluster
        for clusId in xrange(K):
            print "{} tweets in cluster {}".format(len(clusters[clusId]['all']), clusId)
            tweetIdxInClus = np.where(labels == clusId)
            clusters[clusId]['sentiment'] = np.mean(X[tweetIdxInClus,-1])
            if not clusters[clusId]["n"]:
                break
            #print tweetIdxInClus
            centerCoord = centers_features[clusId].reshape(1,-1)
            distToCtr = pairwise_distances(X[tweetIdxInClus], centerCoord)  # dimension: (n_tweets, 1)


            # Calculate tweet popularity/quality feature
            popularity = []
            for i,t in enumerate(tweets):
                if i in tweetIdxInClus[0]:
                    popularity.append([t.retweetCnt,t.favCnt,t.isRetweet,t.followers])
            popularity = np.array(popularity)  # n_tweet X 5
            coef = np.array([.5,.5,-.8,.2]) # hard-coded coefficient
            #print "popularity:{}".format(popularity.dot(coef).shape)
            norm_popularity = normalize(popularity).dot(coef).reshape(-1,1)
            #print norm_popularity
            #print norm_popularity.shape

            feat = np.add(distToCtr, norm_popularity)
            bestTweetId = np.argmax(feat)
            clusters[clusId]['best'] = tweets[tweetIdxInClus[0][bestTweetId]].printTweet()

        # Get the top words in each cluster
        sorted_centers_features = centers_features.argsort()[:, ::-1]
        for ctr in xrange(K):
            top3words = []
            found = 0
            for field in sorted_centers_features[ctr]: # Get the top 3 common words
                try:
                    top3words.append(self.tfidfDict[field].encode('utf-8', 'ignore'))
                    if found == 2:
                        break
                    found +=1
                except IndexError:
                    continue
            clusters[ctr]['words'] = "/".join(top3words)
        return clusters
Code Example #27
File: featurevector.py Project: hsensoy/uparse
def binned():
    #reader, pointer, dialect = csvreader("german.embeddings",verbose=True)
    #dialect.quoting = csv.QUOTE_NONE

    global featureMatrix
    global wordVector

    wordVector = []
    features = []
    for record in reader("german.embeddings", verbose=True, delimeter="\t", nrows=2000):
        word, featurelst = record[0], [float(f) for f in record[1:]]

        if VERBOSE:
            print >> sys.stderr, word

        features.append(featurelst)
        wordVector.append(word)

    featureMatrix = np.array(features)
    #featureMatrix = StandardScaler().fit_transform(featureMatrix)

    #print featureMatrix
    #print featureMatrix.shape
    euclideandist = pairwise_distances(featureMatrix)

    #print max(euclideandist.flatten())

    #print euclideandist

    if VERBOSE:
        print >> sys.stderr, wordVector[1:10]
        print >> sys.stderr, featureMatrix
        print >> sys.stderr, featureMatrix.shape

    from multiprocessing import Pool

    #p = Pool(3)

    #p.map(kmeans, range(8, 100, 2), 2)

    def asymmetric_n(min, max):
        next = min

        while next < max:
            yield next

            next += int(math.log(next, 2)) - 1

    #for n in range(16,23):
    #    docluster(n,algorithm='kmeans')

    from itertools import product

    results = []
    for eps, MinPts in product([5.5], [8]):
        sil, n = docluster(algorithm='AffinityPropagation', showClusters=True, eps_in=eps, MinPts_in=MinPts)

        results.append((sil, n, eps, MinPts))

    print sorted(results, key=lambda x: x[0], reverse=True)
Code Example #28
File: classifiers.py Project: dnrb/categorization
 def discriminate(self):
     # calculates the between-cell map distance for the BMUs of an array of 
     # stimuli used in Beekhuizen & Stevenson 2016.
     # only uses sinij/goluboj or blue
     t_set = { '111' : ['BU'], '112' : ['sinij', 'goluboj'] }
     lg = self.parameters['target language']
     t_ix = [i for i,t in enumerate(self.data.terms) if t in t_set[lg]]
     testset = A([ self.get_input_item(t) for t in 
                   self.data.discrimination_stimuli ])
     positions = A([ A(self.get_bmu_ix(t)) for t in testset ])
     distances = pairwise_distances(positions, metric = 'euclidean')
     dn, tn = self.data.dirname, self.data.discrimination_data
     pt = self.predict_terms(self.data.discrimination_stimuli)
     terms = [t_set[lg][t] for t in pt[:,t_ix].argmax(1)]
     #print(terms)
     d_fn = '%s/discrimination_terms_%s_bo.csv' % (dn, tn)
     dc_fn = '%s/discrimination_confusability_%s_bo.csv' % (dn, tn)
     with open(d_fn, 'a') as o:
         if os.path.getsize(d_fn) == 0: 
             o.write('simulation,time,stimulus,term\n')
         for i,t in enumerate(terms):
             o.write('%d,%d,%d,%s\n' % (self.simulation, self.time, i, t))
     with open(dc_fn, 'a') as o:
         if os.path.getsize(dc_fn) == 0: 
             o.write('simulation,time,stimulus.1,stimulus.2,')
             o.write('term.1,term.2,distance\n')
         for i in range(distances.shape[0]):
             for j in range(i+1, distances.shape[0]):
                 o.write('%d,%d,%d,%d,%s,%s,%.3f\n' %
                         (self.simulation, self.time, i, j, terms[i], 
                          terms[j], distances[i,j]))
     return
Code Example #29
File: ITML.py Project: nihaoCC/DRP
    def train(self, reactions, predictor_headers, response_headers, filename):
        print "Preparing arrays"
        data, labels = self._prepareArrays(reactions, predictor_headers, response_headers)
        old_settings = np.seterr(divide='raise') # we don't want division by zero to pass

        # This is how metric learn determines bounds internally
        # but the lower bound can be zero this way (especially for low-dimensional data)
        # which causes divide by zero errors
        print "Calculating bounds"
        pair_dists = pairwise_distances(data)
        bounds = np.percentile(pair_dists, (5, 95))
        # the extra check ensures against divide-by-zero errors later
        if bounds[0] == 0:
            bounds[0] = min(pair_dists[np.nonzero(pair_dists)])
            print "Lowerbound was 0. Set to {}".format(bounds[0])
        
        print "Preparing {} constraints with bounds of ({}, {})".format(self.num_constraints, bounds[0], bounds[1])
        constraints = self.metric_object.prepare_constraints(labels, data.shape[0], self.num_constraints)
        print "Fitting"
        self.metric_object.fit(data, constraints, bounds=bounds)
        
        self.save(filename)
        np.seterr(**old_settings)
        
        print "Transforming training set"
        return self.metric_object.transform()
Code Example #30
File: itml.py Project: ChihChengLiang/metric_learn
 def __init__(self, X, constraints, bounds=None, A0=None):
   """
   X: (n x d) data matrix - each row corresponds to a single instance
   A0: [optional] (d x d) initial regularization matrix, defaults to identity
   constraints: tuple of arrays: (a,b,c,d) indices into X, such that:
     d(X[a],X[b]) < d(X[c],X[d])
   bounds: (pos,neg) pair of bounds on similarity, such that:
     d(X[a],X[b]) < pos
     d(X[c],X[d]) > neg
   """
   self.X = X
   # check to make sure that no two constrained vectors are identical
   a,b,c,d = constraints
   ident = _vector_norm(self.X[a] - self.X[b]) > 1e-9
   a, b = a[ident], b[ident]
   ident = _vector_norm(self.X[c] - self.X[d]) > 1e-9
   c, d = c[ident], d[ident]
   self.C = a,b,c,d
   # init bounds
   if bounds is None:
     self.bounds = np.percentile(pairwise_distances(X), (5, 95))
   else:
     assert len(bounds) == 2
     self.bounds = bounds
    # initialize metric
   if A0 is None:
     self.A = np.identity(self.X.shape[1])
   else:
     self.A = A0
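The bound heuristic on its own (a small sketch of mine, using only NumPy and scikit-learn):

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(40, 3)
pos_bound, neg_bound = np.percentile(pairwise_distances(X), (5, 95))
# similar pairs are constrained to be closer than pos_bound, dissimilar pairs farther than neg_bound
print(pos_bound, neg_bound)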
Code Example #31
    def compute_neighbors(
            self,
            n_neighbors: int = 30,
            knn: bool = True,
            n_pcs: Optional[int] = None,
            use_rep: Optional[str] = None,
            method: _Method = 'umap',
            random_state: AnyRandom = 0,
            write_knn_indices: bool = False,
            metric: _Metric = 'euclidean',
            metric_kwds: Mapping[str, Any] = MappingProxyType({}),
    ) -> None:
        """\
        Compute distances and connectivities of neighbors.

        Parameters
        ----------
        n_neighbors
             Use this number of nearest neighbors.
        knn
             Restrict result to `n_neighbors` nearest neighbors.
        {n_pcs}
        {use_rep}

        Returns
        -------
        Writes sparse graph attributes `.distances` and `.connectivities`.
        Also writes `.knn_indices` and `.knn_distances` if
        `write_knn_indices==True`.
        """
        from sklearn.metrics import pairwise_distances
        start_neighbors = logg.debug('computing neighbors')
        if n_neighbors > self._adata.shape[0]:  # very small datasets
            n_neighbors = 1 + int(0.5 * self._adata.shape[0])
            logg.warning(
                f'n_obs too small: adjusting to `n_neighbors = {n_neighbors}`')
        if method == 'umap' and not knn:
            raise ValueError('`method = \'umap\' only with `knn = True`.')
        if method == 'rapids' and metric != 'euclidean':
            raise ValueError(
                "`method` 'rapids' only supports the 'euclidean' `metric`.")
        if method not in {'umap', 'gauss', 'rapids'}:
            raise ValueError(
                "`method` needs to be 'umap', 'gauss', or 'rapids'.")
        if self._adata.shape[0] >= 10000 and not knn:
            logg.warning(
                'Using high n_obs without `knn=True` takes a lot of memory...')
        # do not use the cached rp_forest
        self._rp_forest = None
        self.n_neighbors = n_neighbors
        self.knn = knn
        X = _choose_representation(self._adata, use_rep=use_rep, n_pcs=n_pcs)
        # neighbor search
        use_dense_distances = (metric == 'euclidean'
                               and X.shape[0] < 8192) or knn == False
        if use_dense_distances:
            _distances = pairwise_distances(X, metric=metric, **metric_kwds)
            knn_indices, knn_distances = _get_indices_distances_from_dense_matrix(
                _distances, n_neighbors)
            if knn:
                self._distances = _get_sparse_matrix_from_indices_distances_numpy(
                    knn_indices, knn_distances, X.shape[0], n_neighbors)
            else:
                self._distances = _distances
        elif method == 'rapids':
            knn_indices, knn_distances = compute_neighbors_rapids(
                X, n_neighbors)
        else:
            # non-euclidean case and approx nearest neighbors
            if X.shape[0] < 4096:
                X = pairwise_distances(X, metric=metric, **metric_kwds)
                metric = 'precomputed'
            knn_indices, knn_distances, forest = compute_neighbors_umap(
                X,
                n_neighbors,
                random_state,
                metric=metric,
                metric_kwds=metric_kwds)
            # very cautious here
            try:
                if forest:
                    self._rp_forest = _make_forest_dict(forest)
            except:
                pass
        # write indices as attributes
        if write_knn_indices:
            self.knn_indices = knn_indices
            self.knn_distances = knn_distances
        start_connect = logg.debug('computed neighbors', time=start_neighbors)
        if not use_dense_distances or method in {'umap', 'rapids'}:
            # we need self._distances also for method == 'gauss' if we didn't
            # use dense distances
            self._distances, self._connectivities = _compute_connectivities_umap(
                knn_indices,
                knn_distances,
                self._adata.shape[0],
                self.n_neighbors,
            )
        # overwrite the umap connectivities if method is 'gauss'
        # self._distances is unaffected by this
        if method == 'gauss':
            self._compute_connectivities_diffmap()
        logg.debug('computed connectivities', time=start_connect)
        self._number_connected_components = 1
        if issparse(self._connectivities):
            from scipy.sparse.csgraph import connected_components
            self._connected_components = connected_components(
                self._connectivities)
            self._number_connected_components = self._connected_components[0]
Code Example #32
x = x.reshape((numSamples, -1)).astype(np.float) / 255.0
x_r = x_r.reshape((numSamples, -1))
x_nr = x_nr.reshape((numSamples, -1))

locations = [np.where(y == i)[0] for i in range(10)]

result = []

for i in range(10):
    sampleMat = x[locations[i], ...]
    sampleMat_r = x_r[locations[i], ...]
    sampleMat_nr = x_nr[locations[i], ...]

    # indices of the diagonal (self-distance) entries in the flattened n x n distance matrix
    toDelete = [j * (sampleMat.shape[0] + 1) for j in range(sampleMat.shape[0])]

    dist = np.delete(pairwise_distances(sampleMat).flatten(), toDelete)
    mean_ = np.mean(dist)
    std_ = np.std(dist)
    norm_ = norm(mean_, std_)

    dist_r = np.delete(pairwise_distances(sampleMat_r).flatten(), toDelete)
    mean_r = np.mean(dist_r)
    std_r = np.std(dist_r)
    norm_r = norm(mean_r, std_r)

    dist_nr = np.delete(pairwise_distances(sampleMat_nr).flatten(), toDelete)
    mean_nr = np.mean(dist_nr)
    std_nr = np.std(dist_nr)
    norm_nr = norm(mean_nr, std_nr)

    # show histogram
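A small sketch (my addition) showing that deleting those flattened diagonal indices does remove the zero self-distances:

import numpy as np
from sklearn.metrics import pairwise_distances

A = np.random.RandomState(0).rand(4, 3)
n = A.shape[0]
diag_idx = [j * (n + 1) for j in range(n)]                     # self-distance positions in the flattened matrix
off_diag = np.delete(pairwise_distances(A).flatten(), diag_idx)
print(off_diag.size, off_diag.min() > 0)                       # 12 entries left, all strictly positive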
Code Example #33
# In[14]:

subset0 = X[y[:] == 0]

# In[15]:

subset1 = X[y[:] == 1]

# #### 1E -- Genuine vs. Imposter Calculations

# In[16]:

# create symmetric matrices representing distances between all genuine zeros, where the diagonal is 0 (same value)
gen0 = pairwise_distances(subset0, subset0)
gen1 = pairwise_distances(subset1, subset1)
#create matrix representing the distance between all 1s and all 0s
imp = pairwise_distances(subset0, subset1)

# In[17]:

gen0DiffVals = []
# create an array of the lower-triangle values of the genuine-0 distance matrix
for i in range(len(gen0)):
    for j in range(len(gen0)):
        if j >= i:
            continue
        else:
            gen0DiffVals.append(gen0[i][j])
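For reference, the same lower-triangle extraction can be done without the nested loop (my addition, assuming `numpy as np` is imported):

gen0_lower = gen0[np.tril_indices(len(gen0), k=-1)]
assert np.allclose(gen0_lower, gen0DiffVals)   # identical to the loop result above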
Code Example #34
 def compute_fitness(self, X):
     if len(X.shape) == 1:
         X = X.reshape(1, -1)
     distances = pairwise_distances(X, self.L, metric=self.distance)
     membership = argmin(distances, axis=1)
     return (-distances[range(distances.shape[0]), membership].sum())
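The same computation outside the class (my sketch; `L` plays the role of `self.L` and `'euclidean'` of `self.distance`):

import numpy as np
from numpy import argmin
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X = rng.rand(30, 2)        # points being evaluated
L = rng.rand(4, 2)         # prototypes / centroids
distances = pairwise_distances(X, L, metric='euclidean')
membership = argmin(distances, axis=1)
fitness = -distances[range(distances.shape[0]), membership].sum()   # negative total distance to nearest prototype
print(fitness)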
Code Example #35
File: app.py Project: copquesz/livedwine-python-api
def get_recommendations_system_collaborative():
    # Key to works.
    try:
        wine_key = request.args.get('wine_key')
    except KeyError:
        return "Wine Key cannot be null"

    # Get database connection
    conn = get_database_connection()

    # Prepare data frames.
    df = load_database_with_wines_and_ratings_features(conn)
    wines = load_database_with_wines_features(conn)
    ratings = load_database_with_ratings_features(conn)

    # Close connection.
    close_database_connection(conn)

    # Average wine ratings
    wines_df_stats = df.groupby('wine_name').agg(
        {'rating': [np.size, np.mean]})

    # Filter the rating statistics to wines that have at least 3 ratings.
    min_10 = wines_df_stats['rating']['size'] >= 3
    wines_df_stats[min_10].sort_values([('rating', 'mean')], ascending=False)

    # Pivot Table
    matrix_df = ratings.pivot_table(index=['wine_id'],
                                    columns=['user_id'],
                                    values=['rating']).reset_index(drop=True)
    matrix_df.fillna(0, inplace=True)

    # Calculate Cosine Similarity.
    wines_similarity = 1 - pairwise_distances(matrix_df.to_numpy(),
                                              metric='cosine')
    np.fill_diagonal(wines_similarity, 0)

    # Set Similarities to Matrix data frame.
    matrix_df = pd.DataFrame(wines_similarity)

    # Recommendation System
    try:
        wine = wines[wines['wine_name'] == wine_key].index.tolist()
        wine = wine[0]

        wines['similarity'] = matrix_df.iloc[wine]
        wines.drop_duplicates('wine_id')
        wines.drop(columns=[
            'alcohol_content', 'country', 'grape', 'harmonization', 'harvest',
            'producer', 'region', 'service', 'type', 'volume'
        ],
                   axis=1,
                   inplace=True)

        result = pd.DataFrame(
            wines.sort_values(['similarity'], ascending=False))

        return jsonify(json.loads(result[0:int(10)].to_json(orient='records')))

    except:
        return "Wine not found."
Code Example #36
from sklearn.metrics import pairwise_distances
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

# --- READING DATA ---
digits = load_digits().data

# --- BUILDING SIMILARITY MATRIX ---
SMatrix = pairwise_distances(digits, metric='sqeuclidean')
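A possible continuation (my addition, not part of the original snippet), just to look at the matrix that was built:

plt.figure(figsize=(6, 6))
plt.imshow(SMatrix, cmap='viridis')
plt.title('Squared Euclidean distances between digit samples')
plt.colorbar()
plt.show()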
Code Example #37
np.random.seed(100)

# set the number of observed data points
n = 200

# draw random points from a log-normal and a Gaussian distribution
Y = np.random.lognormal(mean=2, sigma=0.3, size=n)
X = np.random.normal(loc=np.exp(2.0 + 0.3**2 / 2.0),
                     scale=0.3 * np.exp(2.0),
                     size=n)

XX = X[:, np.newaxis]
YY = Y[:, np.newaxis]

# compute MMD/K-S two sample tests and their null distributions
sigma2 = np.median(pairwise_distances(XX, YY, metric='euclidean'))**2 * 2.0
mmd2u, mmd2u_null, p_value = two_sample_test(XX,
                                             YY,
                                             model='MMD',
                                             kernel_function='rbf',
                                             gamma=1.0 / sigma2,
                                             iterations=5000,
                                             verbose=True,
                                             n_jobs=1)

ks, ks_null, ks_p_value = two_sample_test(XX,
                                          YY,
                                          model='KS',
                                          iterations=5000,
                                          verbose=True,
                                          n_jobs=1)
Code Example #38
    def _test(self, args, epoch, disc, gen, test_loader, test_output_dir):
        mse_criterion = nn.MSELoss()
        cls_criterion = nn.CrossEntropyLoss()
        _loss_g, _loss_cls_gen, _loss_adv_gen = 0., 0., 0.
        _loss_d, _loss_cls, _loss_adv = 0., 0., 0.
        _loss_recon, _loss_mse = 0., 0.
        _loss = 0.
        ypred, ypred_gen = [], []
        ytrue, ytrue_gen = [], []
        cls_count = [0] * 10

        class_featmaps = np.zeros((10, 1000, 256 * 14 * 14))
        class_featmaps_gen = np.zeros((10, 1000, 256 * 14 * 14))
        class_idx = [0] * 10

        for i, (inputs, featmaps, targets, indexes) in enumerate(test_loader):
            inputs, featmaps, targets = inputs.to(args.device), featmaps.to(
                args.device), targets.to(args.device)

            feats, gen_targets = self._sample_vecs_index(inputs.shape[0])
            feats, gen_targets = feats.to(args.device), gen_targets.to(
                args.device)
            gen_image = gen(feats.unsqueeze(2).unsqueeze(3).detach())

            for j, target in enumerate(
                    targets.detach().cpu().numpy().astype(int)):
                class_featmaps[target, class_idx[target]] = featmaps[j].view(
                    256 * 14 * 14).detach().cpu().numpy()
                class_featmaps_gen[target,
                                   class_idx[target]] = gen_image[j].view(
                                       256 * 14 * 14).detach().cpu().numpy()
                class_idx[target] += 1

        print(class_idx)
        plt.figure(figsize=(12, 12))
        plt.imshow(
            pairwise_distances(np.vstack(class_featmaps[0:10]), metric='l2'))
        plt.colorbar()
        plt.savefig("self.png")
        plt.close()

        print(class_idx)
        plt.figure(figsize=(12, 12))
        plt.imshow(
            pairwise_distances(np.vstack(class_featmaps_gen[0:10]),
                               metric='l2'))
        plt.colorbar()
        plt.savefig("self_gen.png")
        plt.close()

        plt.figure(figsize=(12, 12))
        plt.imshow(
            pairwise_distances(np.vstack(class_featmaps[0:10]),
                               np.vstack(class_featmaps_gen[0:10]),
                               metric='l2'))
        plt.colorbar()
        plt.savefig("pair.png")
        plt.close()

        plt.figure(figsize=(12, 12))
        plt.imshow(
            pairwise_distances(np.vstack(class_featmaps[0:10]),
                               metric='cosine'))
        plt.colorbar()
        plt.savefig("self_cosine.png")
        plt.close()

        plt.figure(figsize=(12, 12))
        plt.imshow(
            pairwise_distances(np.vstack(class_featmaps_gen[0:10]),
                               metric='cosine'))
        plt.colorbar()
        plt.savefig("self_gen_cosine.png")
        plt.close()

        plt.figure(figsize=(12, 12))
        plt.imshow(
            pairwise_distances(np.vstack(class_featmaps[0:10]),
                               np.vstack(class_featmaps_gen[0:10]),
                               metric='cosine'))
        plt.colorbar()
        plt.savefig("pair_cosine.png")
        plt.close()

        for i, (images, featmaps, targets, indexes) in enumerate(test_loader):
            loss = 0
            images, featmaps, targets = images.to(args.device), featmaps.to(
                args.device), targets.to(args.device)
            if args.data == "image":
                inputs = (images * 2) - 1
            else:
                inputs = featmaps
            feats, logits_cls, logits_adv = disc(inputs)
            loss_cls = cls_criterion(logits_cls, targets.long())
            loss = loss_cls
            _loss_cls += loss_cls.item()

            preds = F.softmax(logits_cls,
                              dim=1).argmax(dim=1).cpu().numpy().tolist()
            ypred.extend(preds)
            ytrue.extend(targets)

            feats, gen_targets = self._sample_vecs_index(inputs.shape[0])
            feats, gen_targets = feats.to(args.device), gen_targets.to(
                args.device)
            gen_image = gen(feats.unsqueeze(2).unsqueeze(3).detach())
            feats_gen, logits_cls_gen, logits_adv_gen = disc(gen_image)
            loss_cls_gen = cls_criterion(logits_cls_gen, gen_targets.long())
            loss += args.cls_w * loss_cls_gen
            _loss_cls_gen += loss_cls_gen.item()

            if args.adv:
                loss_adv = (adversarial_loss(logits_adv,
                                             is_real=True,
                                             is_disc=True,
                                             type_=args.adv_type) +
                            adversarial_loss(logits_adv_gen,
                                             is_real=False,
                                             is_disc=True,
                                             type_=args.adv_type))
                _loss_adv += loss_adv.item()
                loss += args.adv_w * loss_adv.clone() / 2.

                loss_adv_gen = adversarial_loss(logits_adv_gen,
                                                is_real=True,
                                                is_disc=False,
                                                type_=args.adv_type)
                _loss_adv_gen += loss_adv_gen.item()
                loss += args.adv_w * loss_adv_gen.clone()
            if args.recon:
                loss_recon = (1 - nn.CosineSimilarity(dim=1, eps=1e-6)(
                    feats_gen, feats).mean())
                loss += args.adv_r * loss_recon.clone()
                _loss_recon += loss_recon.item()
            if args.mse:
                loss_mse = nn.MSELoss()(gen_image, inputs)
                loss += args.mse_w * loss_mse.clone()
                _loss_mse += args.mse_w * loss_mse.item()

            preds_gen = F.softmax(logits_cls_gen,
                                  dim=1).argmax(dim=1).cpu().numpy().tolist()
            ypred_gen.extend(preds_gen)
            ytrue_gen.extend(gen_targets)
            _loss += loss.item()

            if i % 10 == 0:
                visualize(inputs[0],
                          gen_image[0],
                          out_dir=test_output_dir + str(epoch) + "_" + str(i) +
                          ".jpg",
                          featmap=(args.data == "featmap"))

            if sum(cls_count) < 50:
                cls_count = visualize_classes(inputs, gen_image, gen_targets,
                                              cls_count, test_output_dir,
                                              args.data == "featmap")

        acc = round((np.array(ypred) == np.array(ytrue)).sum() / len(ytrue), 4)
        acc_gen = round((np.array(ypred_gen) == np.array(ytrue_gen)).sum() /
                        len(ytrue_gen), 4)

        print("Test Set Epoch {}, Training Iteration {}".format(epoch, i))
        print("Accuracy: {}, Accuracy gen: {}".format(acc, acc_gen))
        print("Loss: {}, Loss_cls: {}, Loss_cls_gen: {}".format(
            _loss / (i + 1), _loss_cls / (i + 1), _loss_cls_gen / (i + 1)))
        if args.adv:
            print("Loss_adv: {}, Loss_adv_gen: {}".format(
                _loss_adv / (i + 1), _loss_adv_gen / (i + 1)))
        if args.mse:
            print("Loss_mse: {}".format(_loss_mse / (i + 1)))
        return return_statement(i, acc, acc_gen, _loss_cls, _loss_cls_gen,
                                _loss_adv, _loss_adv_gen, _loss_recon,
                                _loss_mse)
Code Example #39
def multi_component_layout(
    data,
    graph,
    n_components,
    component_labels,
    dim,
    random_state,
    metric="euclidean",
    metric_kwds={},
):
    """Specialised layout algorithm for dealing with graphs with many connected components.
    This will first find relative positions for the components by spectrally embedding
    their centroids, then spectrally embed each individual connected component positioning
    them according to the centroid embeddings. This provides a decent embedding of each
    component while placing the components in good relative positions to one another.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data -- required so we can generate centroids for each
        connected component of the graph.

    graph: sparse matrix
        The adjacency matrix of the graph to be embedded.

    n_components: int
        The number of distinct components to be laid out.

    component_labels: array of shape (n_samples)
        For each vertex in the graph the label of the component to
        which the vertex belongs.

    dim: int
        The chosen embedding dimension.

    random_state: numpy RandomState or equivalent
        A state capable of being used as a numpy random state.

    metric: string or callable (optional, default 'euclidean')
        The metric used to measure distances among the source data points.

    metric_kwds: dict (optional, default {})
        Keyword arguments to be passed to the metric function.


    Returns
    -------
    embedding: array of shape (n_samples, dim)
        The initial embedding of ``graph``.
    """

    result = np.empty((graph.shape[0], dim), dtype=np.float32)

    if n_components > 2 * dim:
        meta_embedding = component_layout(
            data,
            n_components,
            component_labels,
            dim,
            random_state,
            metric=metric,
            metric_kwds=metric_kwds,
        )
    else:
        k = int(np.ceil(n_components / 2.0))
        base = np.hstack([np.eye(k), np.zeros((k, dim - k))])
        meta_embedding = np.vstack([base, -base])[:n_components]

    for label in range(n_components):
        component_graph = graph.tocsr()[component_labels == label, :].tocsc()
        component_graph = component_graph[:, component_labels == label].tocoo()

        distances = pairwise_distances([meta_embedding[label]], meta_embedding)
        data_range = distances[distances > 0.0].min() / 2.0

        if component_graph.shape[0] < 2 * dim:
            result[component_labels == label] = (random_state.uniform(
                low=-data_range,
                high=data_range,
                size=(component_graph.shape[0], dim),
            ) + meta_embedding[label])
            continue

        diag_data = np.asarray(component_graph.sum(axis=0))
        # standard Laplacian
        # D = scipy.sparse.spdiags(diag_data, 0, graph.shape[0], graph.shape[0])
        # L = D - graph
        # Normalized Laplacian
        I = scipy.sparse.identity(component_graph.shape[0], dtype=np.float64)
        D = scipy.sparse.spdiags(
            1.0 / np.sqrt(diag_data),
            0,
            component_graph.shape[0],
            component_graph.shape[0],
        )
        L = I - D * component_graph * D

        k = dim + 1
        num_lanczos_vectors = max(2 * k + 1,
                                  int(np.sqrt(component_graph.shape[0])))
        try:
            eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
                L,
                k,
                which="SM",
                ncv=num_lanczos_vectors,
                tol=1e-4,
                v0=np.ones(L.shape[0]),
                maxiter=graph.shape[0] * 5,
            )
            order = np.argsort(eigenvalues)[1:k]
            component_embedding = eigenvectors[:, order]
            expansion = data_range / np.max(np.abs(component_embedding))
            component_embedding *= expansion
            result[component_labels == label] = (component_embedding +
                                                 meta_embedding[label])
        except scipy.sparse.linalg.ArpackError:
            warn(
                "WARNING: spectral initialisation failed! The eigenvector solver\n"
                "failed. This is likely due to too small an eigengap. Consider\n"
                "adding some noise or jitter to your data.\n\n"
                "Falling back to random initialisation!")
            result[component_labels == label] = (random_state.uniform(
                low=-data_range,
                high=data_range,
                size=(component_graph.shape[0], dim),
            ) + meta_embedding[label])

    return result
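A hedged usage sketch for the function above, assuming the surrounding module's imports (numpy, scipy, scikit-learn) are available, the adjacency matrix is a SciPy sparse matrix, and component labels come from scipy.sparse.csgraph.connected_components; the toy graph and variable names are illustrative, not part of the original module.

import numpy as np
import scipy.sparse
from scipy.sparse.csgraph import connected_components
from sklearn.utils import check_random_state

# toy symmetric adjacency with two obvious connected components: {0,1,2} and {3,4,5}
rows = [0, 1, 1, 2, 3, 4, 4, 5]
cols = [1, 0, 2, 1, 4, 3, 5, 4]
vals = np.ones(len(rows))
graph = scipy.sparse.coo_matrix((vals, (rows, cols)), shape=(6, 6)).tocsr()

n_components, component_labels = connected_components(graph, directed=False)
data = np.random.default_rng(0).normal(size=(6, 4)).astype(np.float32)

init = multi_component_layout(
    data, graph, n_components, component_labels,
    dim=2, random_state=check_random_state(42),
)
print(init.shape)  # (6, 2): one initial position per vertex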
コード例 #40
0
from io import BytesIO
import sframe as sf

import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal, norm
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

multivariate_normal.pdf(1, mean=0, cov=1)

print(multivariate_normal.pdf([10, 5], mean=[3, 4], cov=3))
print(norm.pdf(3))

data_pts = pd.DataFrame({'X': [10, 2, 3], 'Y': [5, 1, 7]})
data_pts = np.array(data_pts)
clusters = np.array([[3, 4], [6, 3], [4, 6]])

dist = pairwise_distances(data_pts, clusters, metric='euclidean')
np.argmin(dist, axis=1)

# likelihood of each data point under each cluster (isotropic covariance)
l = list()
for i in clusters:
    l.append(multivariate_normal.pdf(data_pts, mean=i, cov=[[3, 0], [0, 3]]))

l = np.array(l)                     # shape (3, 3): clusters x points
c_wts = [1. / 3, 1. / 3, 1. / 3]
l = l * c_wts
l = l.T
res = normalize(l, norm='l1', axis=1)
c_wts_1 = np.sum(res, axis=0) / np.sum(res)
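The scratchpad above is effectively one E-step of a Gaussian mixture: per-cluster likelihoods weighted by equal priors, then L1-normalised per point. A compact, hedged restatement with the same toy numbers:

import numpy as np
from scipy.stats import multivariate_normal
from sklearn.preprocessing import normalize

data_pts = np.array([[10, 5], [2, 1], [3, 7]])
clusters = np.array([[3, 4], [6, 3], [4, 6]])
weights = np.array([1. / 3, 1. / 3, 1. / 3])

# likelihood of each point under each cluster, shape (n_points, n_clusters)
lik = np.column_stack([
    multivariate_normal.pdf(data_pts, mean=mu, cov=[[3, 0], [0, 3]])
    for mu in clusters
])

# responsibilities: weight by the priors, then L1-normalise each row
resp = normalize(lik * weights, norm='l1', axis=1)

# updated mixing proportions (the M-step for the cluster weights)
new_weights = resp.sum(axis=0) / resp.sum()
print(resp)
print(new_weights)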
コード例 #41
0
ファイル: leastsq.py プロジェクト: r4lv/VIP
def _leastsq_patch(ayxyx,  pa_thresholds, angles, metric, dist_threshold,
                   solver, tol):
    """ Helper function for _leastsq_ann.

    Parameters
    ----------
    ayxyx : tuple
        This tuple contains all per-segment data: the annulus index, the
        segment pixel coordinates and the opt-segment pixel coordinates.
    pa_thresholds : list of list
        This is a per-annulus list of thresholds. 
    angles, metric, dist_threshold, solver, tol
        These parameters are the same for each annulus or segment.
    """
    iann, yy, xx, yy_opt, xx_opt = ayxyx
    pa_threshold = pa_thresholds[iann]
    
    values = ARRAY[:, yy, xx]  # n_frames x n_pxs_segment
   
    values_opt = ARRAY[:, yy_opt, xx_opt]

    n_frames = ARRAY.shape[0]

    if dist_threshold < 100:
        mat_dists_ann_full = pairwise_distances(values, metric=metric)
    else:
        mat_dists_ann_full = np.ones((values.shape[0], values.shape[0]))

    if pa_threshold > 0:
        mat_dists_ann = np.zeros_like(mat_dists_ann_full)
        for i in range(n_frames):
            ind_fr_i = _find_indices_adi(angles, i, pa_threshold, None, False)
            mat_dists_ann[i][ind_fr_i] = mat_dists_ann_full[i][ind_fr_i]
    else:
        mat_dists_ann = mat_dists_ann_full

    matrix_res = np.zeros((values.shape[0], yy.shape[0]))
    for i in range(n_frames):
        vector = pn.DataFrame(mat_dists_ann[i])
        if vector.sum().values > 0:
            ind_ref = np.where(~np.isnan(vector))[0]
            A = values_opt[ind_ref]
            b = values_opt[i]
            if solver == 'lstsq':
                coef = sp.linalg.lstsq(A.T, b, cond=tol)[0]     # SVD method
            elif solver == 'nnls':
                coef = sp.optimize.nnls(A.T, b)[0]
            elif solver == 'lsq':   # TODO
                coef = sp.optimize.lsq_linear(A.T, b, bounds=(0, 1),
                                              method='trf',
                                              lsq_solver='lsmr')['x']
            else:
                raise ValueError("`solver` not recognized")
        else:
            msg = "No frames left in the reference set. Try increasing "
            msg += "`dist_threshold` or decreasing `delta_rot`."
            raise RuntimeError(msg)

        recon = np.dot(coef, values[ind_ref])
        matrix_res[i] = values[i] - recon

    return matrix_res, yy, xx
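As a hedged aside (not part of the VIP code), the three `solver` branches above fit the same reference-frame combination with different constraints on the coefficients; a toy comparison:

import numpy as np
import scipy as sp
import scipy.linalg
import scipy.optimize

rng = np.random.default_rng(1)
A = rng.normal(size=(4, 50))   # 4 reference frames x 50 segment pixels (toy data)
b = rng.normal(size=50)        # target frame

coef_lstsq = sp.linalg.lstsq(A.T, b, cond=1e-2)[0]            # unconstrained SVD solution
coef_nnls = sp.optimize.nnls(A.T, b)[0]                       # non-negative coefficients
coef_lsq = sp.optimize.lsq_linear(A.T, b, bounds=(0, 1),
                                  method='trf',
                                  lsq_solver='lsmr')['x']     # coefficients bounded in [0, 1]

for name, c in [('lstsq', coef_lstsq), ('nnls', coef_nnls), ('lsq_linear', coef_lsq)]:
    print(name, np.round(c, 3),
          'residual:', round(float(np.linalg.norm(A.T @ c - b)), 3))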
コード例 #42
0
def myfunction():
    #data from task 1
    randomData, stratifiedData, anotherX, targetForStrat, targetForRand, targetForOrg, attributeNames, latitude,longitude, stratLat, stratLong, numberInEachState, avArray = Task1.task1()
    #additional data
    stateFinancesData = ReadCSVFiles.readStates()
    energyData = ReadCSVFiles.statePopulation()
    medianHouseholdIncomeData = ReadCSVFiles.medianHouseholdIncome()
    unemploymentData = ReadCSVFiles.unemployment()
    averageIQData = ReadCSVFiles.averageIQ()
    educationData = ReadCSVFiles.educationLevel()

    bigData, avArray = ReadCSVFiles.getAllArray(stateFinancesData, energyData, medianHouseholdIncomeData, unemploymentData,averageIQData, educationData)


    # print(anotherX)
    # print(len(anotherX))
    # for i in range(0,20):
    #     print(anotherX[i])

    superTempArray = [None]*997
    #link state with other attributes
    for i in range(0,999):

        if(i<742):
            tempArray = [None]*22
            for j in range(0,9):
                tempArray[j] = anotherX[i][j]
            #get big Data items associated with state
            # tempArray = [None]*12

            tempNum = int(anotherX[i][9])-1
            for j in range(1,14):
                tempArray[j+8] = bigData[j][tempNum]

            superTempArray[i] = tempArray
        elif((i>742)and(i<923)):
            tempArray = [None]*22
            for j in range(0,9):
                tempArray[j] = anotherX[i][j]
            #get big Data items associated with state
            # tempArray = [None]*12

            tempNum = int(anotherX[i][9])-1
            for j in range(1,14):
                tempArray[j+8] = bigData[j][tempNum]

            superTempArray[i-1] = tempArray
        elif(i>923):
            tempArray = [None]*22
            for j in range(0,9):
                tempArray[j] = anotherX[i][j]
            #get big Data items associated with state
            # tempArray = [None]*12

            tempNum = int(anotherX[i][9])-1
            for j in range(1,14):
                tempArray[j+8] = bigData[j][tempNum]
            
            superTempArray[i-2] = tempArray


    anotherX = np.array(superTempArray)

    orgData_std = StandardScaler().fit_transform(anotherX)
 

    # print(orgData_std)
    # for i in range(0,len(orgData_std)):
    #     for j in range(0, len(orgData_std[i])):
    #         if(math.isnan(orgData_std[i][j])==True):
    #             print("TRUEE")
    # print(math.isnan(orgData_std))
    #pca = decomposition.PCA(n_components=3)
    pcaOrg = decomposition.PCA()

    # I transform the data and get respective eigenvalues
    sklearn_pcaOrg = pcaOrg.fit_transform(orgData_std)

    orgEigVal = pcaOrg.explained_variance_


    sumOfOrgEig =0

    contSumOfOrgEig = [None]*22

    #calculate sum of all eigenval
    for i in range(0,22):
        sumOfOrgEig = orgEigVal[i] + sumOfOrgEig

        contSumOfOrgEig[i] = sumOfOrgEig
        
    
    orgVarArray = [None]*22

    #calculate variance array
    for i in range(0,22):
        orgVarArray[i] = orgEigVal[i]/sumOfOrgEig

    sumOrgVar =0

    #get the sum of variances (total variance)
    for i in range(0,22):
        sumOrgVar = sumOrgVar + orgVarArray[i]

    tempOrgVarSum =0

    orgIntrDimCount = 0

    #get when 75% of the total variance occurred
    for i in range(0,22):
        tempOrgVarSum = tempOrgVarSum + orgVarArray[i]
        if((tempOrgVarSum >(sumOrgVar*.75))and(orgIntrDimCount==0)):
            orgIntrDimCount = i

 
    #calculate loading factors
    pcaOrgNew = decomposition.PCA(n_components=orgIntrDimCount)

    sklearn_pcaOrg = pcaOrgNew.fit_transform(orgData_std)

    orgLoadFact = pcaOrgNew.components_.T * np.sqrt(pcaOrgNew.explained_variance_)


    orgSumOfSquaredLoad = [[0 for i in range(0,2)] for j in range(0,22)]

    #get attributes with highest PCA loading (sum of squared loadings per attribute)

    for i in range(0,22):
        for j in range(0, orgIntrDimCount):
            orgSumOfSquaredLoad[i][0] = orgSumOfSquaredLoad[i][0] + (orgLoadFact[i][j])**2
        orgSumOfSquaredLoad[i][1] = i

    #I sort the arrays
    orgSumOfSquaredLoad.sort(key=lambda x: x[0])
    

    #I get the highest 3 attributes
    orgThreeHighAttr = np.array(orgSumOfSquaredLoad[-3:])

    orgThreeHighAttrData = [[0 for i in range(0,3)] for j in range(0,997)]

    #I get the data associated with the three highest attributes
    
    for j in range(0,997):
        for i in range(0,3):
            orgThreeHighAttrData[j][i] = orgData_std[j][int(orgThreeHighAttr[i][1])]

    #names of the three highest attributes
    orgColumns = [None]*3

    attributeNames = ['Change in Rank', 'Revenue', 'Revenue Change', 'Profit', 'Profit Change', 'Assets', 'Market Value', 'Employees', 'Years on Fortune 500 List',
                    'Total Revenue', 'Federal Revenue','State Revenue','Total Expenditure','Instruction Expenditure','GDP','Census','Median Household Income',
                    'Unemployment Rate','Average IQ','High School Graduate','Bachelors Graduate','Masters Graduate']
    for i in range(0,3):
        orgColumns[i] = (attributeNames[int(orgThreeHighAttr[i][1])])
    
    org3Data = pd.DataFrame(data = orgThreeHighAttrData, columns = orgColumns)

    targetForOrg2 = pd.DataFrame(data=targetForOrg, columns = ['Target'])

    #create the array with data points for 3 attr and cluster associated with that
    org3DataFinal = pd.concat([org3Data, targetForOrg2[['Target']]], axis=1)

    #create an array with coordinates for 3 attr scatter plot
    bigOrg3Array = [[0 for i in range(0,9)] for j in range (0,997)]


    for m in range(0,997):
        count =0
        for j in range(0,3):
            for i in range(0,3):
                bigOrg3Array[m][count]=([org3DataFinal.values[m][i],org3DataFinal.values[m][j]])
                count = count +1
    # print(bigOrg3Array)
    #to visualize data on top 2 pcaVectors
    pcaVisOrg = decomposition.PCA(n_components=2)

    principalDFOrg = pd.DataFrame(data = pcaVisOrg.fit_transform(orgData_std), columns = ['Principal Component 1', 'Principal Component 2'])

    # targetForStrat2 = pd.DataFrame(data=targetForStrat, columns = ['Target'])
    # targetForRand2 = pd.DataFrame(data=targetForRand, columns = ['Target'])
    targetForOrg2 = pd.DataFrame(data=targetForOrg, columns = ['Target'])
    #print(targetForStrat)
    #last row will show the cluster associated w/ each data point

    finalDFOrg = pd.concat([principalDFOrg, targetForOrg2[['Target']]], axis=1)

    
    #mds
    mds_dataOrg = manifold.MDS(n_components=2, dissimilarity='precomputed')

    #mds with euclidean
    orgSimEuc = pairwise_distances(orgData_std, metric= 'euclidean')
    # print("printing org sim euc")
    # print(orgSimEuc)

    orgDEuc = mds_dataOrg.fit_transform(orgSimEuc)

    orgMDSdatEuc = pd.DataFrame(orgDEuc)

    finalMDSOrgDataEuc = pd.concat([orgMDSdatEuc, targetForOrg2[['Target']]], axis=1)

    #mds with corr
    orgSimCor = pairwise_distances(orgData_std, metric= 'correlation')

    orgDCor = mds_dataOrg.fit_transform(orgSimCor)

    orgMDSdatCor = pd.DataFrame(orgDCor)

    finalMDSOrgDataCor = pd.concat([orgMDSdatCor, targetForOrg2[['Target']]], axis=1)



    #json data --> to export to front end
    data = {}

    data['orgEigVal'] = orgEigVal.tolist()
    data['orgLoadFact'] = orgLoadFact.tolist()
    data['orgSigNum'] = orgIntrDimCount  
    data['sumOfOrgEig'] = contSumOfOrgEig

    finalMDSOrgDataEuc = np.array(finalMDSOrgDataEuc).tolist()
    finalMDSOrgDataCor= np.array(finalMDSOrgDataCor).tolist()
    org3DataFinal = np.array(org3DataFinal).tolist()
    finalDFOrg = np.array(finalDFOrg).tolist()
    for i in range(0,2):
        finalMDSOrgDataEuc.pop()
        finalMDSOrgDataCor.pop()
        org3DataFinal.pop()
        finalDFOrg.pop()
    # print(finalMDSOrgDataCor)
    # print(org3DataFinal)
    # print(contSumOfOrgEig)

    # print("\n\n")

    # print(bigOrg3Array)
    data['pca2OrgValues'] = np.array(finalDFOrg).tolist()
    data['orgMDSDataEuc'] = finalMDSOrgDataEuc
    data['orgMDSDataCor'] = finalMDSOrgDataCor
    data['org3LoadData'] = org3DataFinal
    data['org3AttrNames'] = orgColumns
    data['bigOrg3Array'] = bigOrg3Array

    # print(bigOrg3Array)
    # print(data)
    json_data2 = json.dumps(data)
    # print(json_data2)
    return json_data2
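The eigenvalue bookkeeping above (accumulate explained variance until 75% is reached, then re-fit PCA and rank attributes by squared loadings) can be written compactly with explained_variance_ratio_ and np.cumsum; a hedged sketch with stand-in data and illustrative names:

import numpy as np
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 22))            # stand-in for the 22-attribute table
X_std = StandardScaler().fit_transform(X)

pca = decomposition.PCA().fit(X_std)
cum_var = np.cumsum(pca.explained_variance_ratio_)

# number of components needed to reach 75% of the total variance
intrinsic_dim = int(np.searchsorted(cum_var, 0.75) + 1)
print(intrinsic_dim, cum_var[intrinsic_dim - 1])

# loading factors for exactly that many components
pca_k = decomposition.PCA(n_components=intrinsic_dim).fit(X_std)
loadings = pca_k.components_.T * np.sqrt(pca_k.explained_variance_)
top3 = np.argsort((loadings ** 2).sum(axis=1))[-3:]
print(top3)   # indices of the three attributes with the largest squared loadings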
コード例 #43
0
    paths = new_paths

print(f"Number of paths after subsampling: {len(paths)}")

# %% [markdown]
# ##

embedder = AdjacencySpectralEmbed(n_components=None, n_elbows=2)
embed = embedder.fit_transform(pass_to_ranks(adj))
embed = np.concatenate(embed, axis=-1)
pairplot(embed, labels=labels)

# %% [markdown]
# ## Show 2 dimensions of pairwise cosine embedding

pdist = pairwise_distances(embed, metric="cosine")

# %% [markdown]
# ##
manifold = TSNE(metric="precomputed")
# manifold = ClassicalMDS(n_components=2, dissimilarity="precomputed")
cos_embed = manifold.fit_transform(pdist)

# %% [markdown]
# ##
plot_df = pd.DataFrame(data=cos_embed)
plot_df["labels"] = labels

fig, axs = plt.subplots(1, 2, figsize=(20, 10))
ax = axs[0]
sns.scatterplot(
コード例 #44
0

def topic_result(tt):
    predicted_topics=[np.argsort(each)[::-1][0] for each in tt]
    if predicted_topics==[0]:
        return 'Family House'
    elif predicted_topics==[1]:
        return 'Tourism and Event'
    elif predicted_topics==[2]:
        return 'Peaceful Vacation'
    elif predicted_topics==[3]:
        return 'Young Life Style'
    else:
        return 'Specialty Needs'

similar_indices=pairwise_distances(tt,doc_topic,metric='cosine').argsort()[0][0:num]

similar_id = [(listings['id'].iloc[i]) for i in similar_indices]

similar_name = [(listings['name'].iloc[i]) for i in similar_indices]

similar_desc = [(listings['description'].iloc[i][0:165]) for i in similar_indices]

similar_url = [(listings['host_url'].iloc[i]) for i in similar_indices]

similar_pic = [(listings['picture_url'].iloc[i]) for i in similar_indices]

# def recommend(text,num):
#     a=('Recommending ' + str(num) + ' Airbnb products for ' + str(text))
#     b=('------------------------------------')
#     for i in list(range(0,num)):
コード例 #45
0
def magnitude_and_difference(matrix):
    magnitude = np.sqrt(matrix.multiply(matrix).sum(1))
    magnitude_diff = pairwise_distances(magnitude, metric='manhattan')
    return magnitude, magnitude_diff
コード例 #46
0
    def predict(self, X, categorical_mapping=None):

        # Scale the test set. Select whether to use the reference or
        # just the train set weights
        doRefWeightsScaling = False
        doSymmetricalWeights = True

        #if self.is_fitted:
        X_test = X.copy()

        # Prepare for the Euclidean distance
        r_test = X_test.shape[0]
        t_test = self.X_train.shape[0]
        X_temp = np.concatenate([X_test, self.X_train], axis=0)

        # Normalisation. If not z-score, normalise from 0 to 1
        if self.normalisation_type == 'z-score':
            x_normed = StandardScaler().fit_transform(X_temp)
        else:
            x_normed = (X_temp - X_temp.min(0)) / np.maximum(X_temp.ptp(0), 1)

        if doSymmetricalWeights:
            print('...Symmetrical Weights')
            # scale using Cholesky's so W = Q.T Q
            V = np.diag(self.x_combined_weights)
            try:
                Q = np.linalg.cholesky(V)
                weights_Q = np.diag(Q)
            except np.linalg.LinAlgError as err:
                print(err)
                doSymmetricalWeights = False

        if doRefWeightsScaling:
            X_test_scaled = np.multiply(x_normed[0:r_test, :],
                                        np.sqrt(self.x_ref_weights))
        elif doSymmetricalWeights:
            X_test_scaled = np.multiply(x_normed[0:r_test, :], weights_Q)
        else:
            X_test_scaled = np.multiply(x_normed[0:r_test, :],
                                        np.sqrt(self.x_weights))

        # Scale the training set
        if doSymmetricalWeights:
            X_train_scaled = np.multiply(x_normed[r_test:(r_test + t_test), :],
                                         weights_Q)
        else:
            X_train_scaled = np.multiply(x_normed[r_test:(r_test + t_test), :],
                                         np.sqrt(self.x_weights))

        y_k_all_list = []
        y_k_list = []
        y_k_weighted_list = []
        y_delta_list = []
        y_idx_closest_promos = []
        y_distances_closest_promos = []
        y_weights = []
        testSize = X_test.shape[0]

        for idx_test in range(0, testSize):

            # Select the closest promotions. Try scaling...
            current_promo_scaled = X_test_scaled[idx_test].reshape(1, -1)

            # >> Euclidean distances
            euclidean = np.squeeze(
                pairwise_distances(X_train_scaled, current_promo_scaled))
            idxSorted = np.argsort(euclidean)[0:self.num_neighbours]

            x_A = self.X_train[idxSorted]
            x_B = np.tile(X_test[idx_test], (self.num_neighbours, 1))
            X_AB_test = np.concatenate([x_A, x_B], axis=1)

            # differences regarding the reference promotions
            xgb_frc = self.regressor.predict(X_AB_test)

            # Get the average
            y_delta_list.append(xgb_frc)
            y_k_hat_all = xgb_frc + self.y_train[idxSorted]
            y_k_hat = np.mean(y_k_hat_all)

            # Weighted by the Euclidean distances
            w_distance = 1.0 / np.maximum(euclidean[idxSorted], 1e-3)
            y_k_hat_distances = \
              w_distance.dot(y_k_hat_all.T)/np.sum(w_distance)
            y_k_weighted_list.append(y_k_hat_distances)

            # Append to the list
            y_k_all_list.append(y_k_hat_all)
            y_k_list.append(y_k_hat)
            y_idx_closest_promos.append(idxSorted)
            y_distances_closest_promos.append(euclidean[idxSorted])
            y_weights.append(w_distance)

        # Arrange the forecast as np-arrays
        y_hat = np.array(y_k_list)
        y_hat_weighted = np.array(y_k_weighted_list)

        # Arrange the outputs
        self.results = {
            'y_idx_closest_promos': y_idx_closest_promos,
            'y_hat': y_hat,
            'y_hat_weighted': y_hat_weighted,
            'y_delta_list': y_delta_list,
            'y_k_all_list': y_k_all_list,
            'y_distances_closest_promos': y_distances_closest_promos,
            'y_weights': y_weights,
            'feat_importances': self.feat_importances,
            'internal_var_names': self.int_vars
        }

        # Sort out feature importances
        if not doRefWeightsScaling:
            idx_importances = np.argsort(self.x_combined_weights)[::-1]
            int_var_names = self.int_vars[0:self.numFeatures]
        else:
            idx_importances = np.argsort(self.feat_importances)[::-1]
            int_var_names = self.int_vars

        # If provided
        if categorical_mapping:
            # Get the feature importances as a DataFrame
            inputVars_plain = [
                categorical_mapping.get(iVar, iVar) for iVar in
                [int_var_names[this_idx] for this_idx in idx_importances]
            ]
            df_feat_importances = pd.DataFrame(
                self.feat_importances[idx_importances], index=inputVars_plain)
        else:
            inputVars_plain = [
                int_var_names[this_idx] for this_idx in idx_importances
            ]
            df_feat_importances = pd.DataFrame(
                self.x_combined_weights[idx_importances],
                index=inputVars_plain)

        self.results['df_feat_importances'] = df_feat_importances

        self.valid_predictions = True

        return y_hat_weighted
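A hedged, standalone sketch of the neighbour-weighting step above: predictions from the closest training rows are averaged with weights proportional to the inverse Euclidean distance (toy data, illustrative names):

import numpy as np
from sklearn.metrics import pairwise_distances

X_train = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [5.0, 5.0]])
y_train = np.array([1.0, 2.0, 3.0, 10.0])
x_query = np.array([[0.2, 0.1]])
num_neighbours = 3

euclidean = np.squeeze(pairwise_distances(X_train, x_query))
idx_sorted = np.argsort(euclidean)[:num_neighbours]

w = 1.0 / np.maximum(euclidean[idx_sorted], 1e-3)   # guard against zero distances
y_plain = y_train[idx_sorted].mean()
y_weighted = w.dot(y_train[idx_sorted]) / w.sum()
print(y_plain, y_weighted)   # the weighted estimate leans towards the nearest point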
コード例 #47
0
    # Load table
    disp('Loading table...')
    df = pd.read_csv(inputpath, index_col=0)
    disp('%s samples and %s features detected' % (df.shape[1], df.shape[0]))
    samples = df.columns

    # Compute pairwise distances
    if 'unifrac' in args.m:
        features = df.index
        metric = setup_unifrac(args.d, df.index, args.m)
    elif (args.m in _METRICS_NAMES) & (
            args.m not in PAIRWISE_DISTANCE_FUNCTIONS.keys()):
        import scipy.spatial.distance as sd
        metric = getattr(sd, args.m)
    else:
        metric = args.m
    disp('Calculating pairwise distances...')
    dists = pairwise_distances(df.T, metric=metric, n_jobs=args.t)

    # Place into dataframe
    dist_df = pd.DataFrame(dists, samples, samples)

    # Save
    disp('Saving pairwise distances to: %s' % outputpath)

    # Create output directory if it doesn't exist
    outdir = os.path.dirname(outputpath)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    dist_df.to_csv(outputpath)
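A hedged illustration of the metric-resolution logic above: when the requested metric is a SciPy distance name rather than one of scikit-learn's built-in pairwise functions, the SciPy function object itself can be handed to pairwise_distances as a callable (toy data, illustrative only):

import numpy as np
import scipy.spatial.distance as sd
from sklearn.metrics import pairwise_distances

X = np.random.default_rng(0).random((5, 10))
# a SciPy metric resolved via getattr, passed as a callable
D = pairwise_distances(X, metric=sd.braycurtis, n_jobs=1)
print(D.shape)   # (5, 5)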
コード例 #48
0
def rotate(X, angle):
    theta = np.deg2rad(angle)
    R = [[np.cos(theta), np.sin(theta)], [-np.sin(theta), np.cos(theta)]]
    return np.dot(X, R)


X2 = rotate(X, 20) + 5
plt.scatter(X2[:, 0], X2[:, 1], **colorize)
plt.axis('equal')
plt.show()

# calculated distance matrix

from sklearn.metrics import pairwise_distances
D = pairwise_distances(X)
print(D.shape)

plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest')
plt.colorbar()
plt.show()

# MDS: distance matrix ---- coordinate representation

from sklearn.manifold import MDS
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
out = model.fit_transform(D)
plt.scatter(out[:, 0], out[:, 1], **colorize)
plt.axis('equal')
plt.show()
コード例 #49
0
    def __getitem__(self, item):

        if self.cache_data:

            if item in self.data_dict.keys():
                return self.data_dict[item]

            else:
                pass       
 
        
        pdbid, pose, affinity = self.data_list[item]

        node_feats, coords = None, None
        with h5py.File(self.data_file, "r") as f:

            if (
                not self.dataset_name
                in f[
                    "{}/{}/{}".format(
                        pdbid, self.feature_type, self.preprocessing_type
                    )
                ].keys()
            ):
                print(pdbid)
                return None

            if self.use_docking:
                # TODO: the next line will cause a runtime error because poses are not selected
                data = f[
                    "{}/{}/{}/{}".format(
                        pdbid,
                        self.feature_type,
                        self.preprocessing_type,
                        self.dataset_name,
                    )
                ][pose]["data"]
                vdw_radii = (
                    f[

                        "{}/{}/{}/{}".format(
                            pdbid,
                            self.feature_type,
                            self.preprocessing_type,
                            self.dataset_name,
                        )
                    ][pose]
                    .attrs["van_der_waals"]
                    .reshape(-1, 1)
                )

            else:
                data = f[
                    "{}/{}/{}/{}".format(
                        pdbid,
                        self.feature_type,
                        self.preprocessing_type,
                        self.dataset_name,
                    )
                ]["data"]
                vdw_radii = (
                    f[

                        "{}/{}/{}/{}".format(
                            pdbid,
                            self.feature_type,
                            self.preprocessing_type,
                            self.dataset_name,
                        )

                    ]
                    .attrs["van_der_waals"]
                    .reshape(-1, 1)
                )

            if self.feature_type == "pybel":
                coords = data[:, 0:3]
                node_feats = np.concatenate([vdw_radii, data[:, 3:22]], axis=1)

            else:
                raise NotImplementedError

        # account for the vdW radii in distance calculations (consider each atom as a sphere, distance between spheres)

        dists = pairwise_distances(coords, metric="euclidean")

        edge_index, edge_attr = dense_to_sparse(torch.from_numpy(dists).float())

        x = torch.from_numpy(node_feats).float()

        y = torch.FloatTensor(affinity).view(-1, 1)
        data = Data(
            x=x, edge_index=edge_index, edge_attr=edge_attr.view(-1, 1), y=y
        )


        if self.cache_data:

            if self.output_info:
                self.data_dict[item] = (pdbid, pose, data)

            else:
                self.data_dict[item] = data

            return self.data_dict[item]

        else:
            if self.output_info:
                return (pdbid, pose, data)
            else:
                return data
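The comment above describes treating each atom as a sphere, while the call below it computes plain centre-to-centre Euclidean distances. One possible adjustment (an assumption for illustration, not part of this dataset class) would subtract the van der Waals radii to approximate sphere-surface separations:

import numpy as np
from sklearn.metrics import pairwise_distances

coords = np.array([[0.0, 0.0, 0.0], [3.0, 0.0, 0.0], [0.0, 4.0, 0.0]])
vdw_radii = np.array([[1.2], [1.5], [1.7]])              # one radius per atom, shape (n, 1)

centre_dists = pairwise_distances(coords, metric="euclidean")
surface_dists = centre_dists - vdw_radii - vdw_radii.T   # sphere-surface separation
np.fill_diagonal(surface_dists, 0.0)
surface_dists = np.clip(surface_dists, 0.0, None)        # overlapping spheres -> 0
print(surface_dists)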
コード例 #50
0
def get_distances_centers(centers):
    return pairwise_distances(centers)
コード例 #51
0
from sklearn.metrics import pairwise_distances

from src.datasets.datasets import Spheres

dataset = Spheres()
dataset_l, labels_l = dataset.sample(n_samples=8)

DD = pairwise_distances(dataset_l, dataset_l)

print(DD.max())
コード例 #52
0
def test_dunn():
    kmeans = KMeans(n_clusters=2, random_state=0)
    labels = kmeans.fit_predict(iris)
    d_val = dunn(pairwise_distances(iris), labels)
    assert .05 < d_val < .1
コード例 #53
0
def get_cluster_diameter(cluster):
    distance = pairwise_distances(cluster)
    return max(map(lambda x: x[len(distance) - 1], distance))
コード例 #54
0
user_book = user_book.replace(np.nan, 0)

# In[29]:

user_book

# In[30]:

#Calculating Cosine Similarity between Users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

# In[31]:

user_sim = 1 - pairwise_distances(user_book.values, metric='cosine')

# In[32]:

user_sim

# In[33]:

#Store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)

# In[34]:

#Set the index and column names to user ids
user_sim_df.index = book_df["User.ID"].unique()
user_sim_df.columns = book_df["User.ID"].unique()
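A quick, hedged check of the identity relied on above: pairwise_distances(..., metric='cosine') returns 1 - cos(A, B), so subtracting it from 1 recovers the cosine similarity:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

ratings = np.array([[5.0, 0.0, 3.0],
                    [4.0, 0.0, 3.5],
                    [0.0, 5.0, 0.0]])

sim_via_distance = 1 - pairwise_distances(ratings, metric='cosine')
sim_direct = cosine_similarity(ratings)
print(np.allclose(sim_via_distance, sim_direct))   # True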
コード例 #55
0
def get_prediction(vecs, pics):
    dists = pairwise_distances(vecs, pics, metric='cosine')
    return dists.T.argsort(1)
コード例 #56
0
# Generate a stratified sample
split = StratifiedShuffleSplit(n_splits=1, test_size=0.45, random_state=42)
for train_index, test_index in split.split(df_users_to_cluster, df_users_to_cluster['divisionGiro']):
    strat_test_set = df_users_to_cluster.iloc[train_index]
    users_clustered = df_users_to_cluster.iloc[test_index]

print('------------STRATIFIED SAMPLE-----------')
print(users_clustered['divisionGiro'].value_counts() / len(users_clustered))

print('Number of records to process', len(users_clustered))

# Compute the distance matrix with the Gower metric
start = time.time()
gower_mat = metrics.pairwise_distances(
    gower.gower_matrix(users_clustered),
    metric="precomputed")
end = time.time()
print('Distance matrix computation time', end - start)

# Compare the distance matrix against the first record
first_row = gower_mat[0]
print('Base record: ', users_clustered[:1], "\n")
print('Most similar record: ', users_clustered[first_row == min(first_row[first_row != min(first_row)])])
print('Least similar record: ', users_clustered[first_row == max(first_row)])

# Fit the DBSCAN model on the precomputed distance matrix
start = time.time()
model = DBSCAN(eps=0.11, min_samples=40, metric="precomputed")
model.fit(gower_mat)
end = time.time()
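A hedged follow-up, assuming `model` and `users_clustered` as defined above: inspect how many users each DBSCAN cluster absorbed and how many were labelled as noise:

import numpy as np

labels = model.labels_
unique, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique.tolist(), counts.tolist())))    # -1 is the DBSCAN noise label

users_clustered = users_clustered.assign(cluster=labels)
print(users_clustered.groupby('cluster')['divisionGiro'].value_counts().head(10))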
コード例 #57
0
    def compute_neighbors(
        self,
        n_neighbors: int = 30,
        knn: bool = True,
        n_pcs: Optional[int] = None,
        use_rep: Optional[str] = None,
        method: str = 'umap',
        random_state: Optional[Union[RandomState, int]] = 0,
        write_knn_indices: bool = False,
        metric: str = 'euclidean',
        metric_kwds: Mapping[str, Any] = {}
    ) -> None:
        """\
        Compute distances and connectivities of neighbors.

        Parameters
        ----------
        n_neighbors
             Use this number of nearest neighbors.
        knn
             Restrict result to `n_neighbors` nearest neighbors.
        {n_pcs}
        {use_rep}

        Returns
        -------
        Writes sparse graph attributes `.distances` and `.connectivities`.
        Also writes `.knn_indices` and `.knn_distances` if
        `write_knn_indices==True`.
        """
        if n_neighbors > self._adata.shape[0]:  # very small datasets
            n_neighbors = 1 + int(0.5*self._adata.shape[0])
            logg.warn('n_obs too small: adjusting to `n_neighbors = {}`'
                      .format(n_neighbors))
        if method == 'umap' and not knn:
            raise ValueError('`method = \'umap\'` only with `knn = True`.')
        if method not in {'umap', 'gauss'}:
            raise ValueError('`method` needs to be \'umap\' or \'gauss\'.')
        if self._adata.shape[0] >= 10000 and not knn:
            logg.warn(
                'Using high n_obs without `knn=True` takes a lot of memory...')
        self.n_neighbors = n_neighbors
        self.knn = knn
        X = choose_representation(self._adata, use_rep=use_rep, n_pcs=n_pcs)
        # neighbor search
        use_dense_distances = (metric == 'euclidean' and X.shape[0] < 8192) or not knn
        if use_dense_distances:
            # standard euclidean case for relatively small matrices
            self._distances, knn_indices, knn_distances = compute_neighbors_numpy(
                X, n_neighbors, knn=knn)
        else:
            # non-euclidean case and approx nearest neighbors
            if X.shape[0] < 4096:
                X = pairwise_distances(X, metric=metric, **metric_kwds)
                metric = 'precomputed'
            knn_indices, knn_distances = compute_neighbors_umap(
                X, n_neighbors, random_state, metric=metric, metric_kwds=metric_kwds)
        # write indices as attributes
        if write_knn_indices:
            self.knn_indices = knn_indices
            self.knn_distances = knn_distances
        logg.msg('computed neighbors', t=True, v=4)
        if not use_dense_distances or method == 'umap':
            # we need self._distances also for method == 'gauss' if we didn't
            # use dense distances
            self._distances, self._connectivities = compute_connectivities_umap(
                knn_indices, knn_distances, self._adata.shape[0], self.n_neighbors)
        # overwrite the umap connectivities if method is 'gauss'
        # self._distances is unaffected by this
        if method == 'gauss':
            self._compute_connectivities_diffmap()
        logg.msg('computed connectivities', t=True, v=4)
        self._number_connected_components = 1
        if issparse(self._connectivities):
            from scipy.sparse.csgraph import connected_components
            self._connected_components = connected_components(self._connectivities)
            self._number_connected_components = self._connected_components[0]
コード例 #58
0
ファイル: distances.py プロジェクト: pancitysim/trackintel
def calculate_distance_matrix(X,
                              Y=None,
                              dist_metric='haversine',
                              n_jobs=0,
                              **kwds):
    """
    Calculate a distance matrix based on a specific distance metric.
    
    If only X is given, the pair-wise distances between all elements in X are calculated. If X and Y are given, the
    distances between all combinations of X and Y are calculated. Distances between elements of X and X, and distances
    between elements of Y and Y are not calculated.

    Parameters
    ----------
    X : GeoDataFrame (as trackintel staypoints or triplegs)
        
    Y : GeoDataFrame (as trackintel staypoints or triplegs), optional
        
    dist_metric: {'haversine', 'euclidean', 'dtw', 'frechet'}
        The distance metric to be used for calculating the matrix. This function wraps around the
        ``pairwise_distances`` function from scikit-learn if only `X` is given and wraps around the
        ``scipy.spatial.distance.cdist`` function if X and Y are given. Therefore the following metrics 
        are also accepted:
        
        via ``scikit-learn``: `[‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’]`
        
        via ``scipy.spatial.distance``: `[‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’,
        ‘kulsinski’, ‘mahalanobis’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’,
        ‘sokalsneath’, ‘sqeuclidean’, ‘yule’]`
        
        triplegs can only be used in combination with `['dtw', 'frechet']`.
        
    n_jobs: int
        Number of cores to use: 'dtw', 'frechet' and all distance metrics from `pairwise_distances` (only available 
        if only X is given) are parallelized.
         
    **kwds: 
        optional keywords passed to the distance functions.

    Returns
    -------
    np.array
        matrix of shape (len(X), len(X)) or of shape (len(X), len(Y))
        
    """
    geom_type = X.geometry.iat[0].geom_type
    y_is_x = Y is None
    if y_is_x:
        Y = X
    assert geom_type == Y.geometry.iat[0].geom_type, "X and Y need the same geometry type " \
                                                     "(only the first geometry is checked)"

    if geom_type == 'Point':
        x1 = X.geometry.x.values
        y1 = X.geometry.y.values
        x2 = Y.geometry.x.values
        y2 = Y.geometry.y.values

        if dist_metric == 'haversine':
            # create point pairs for distance calculation
            nx = len(X)
            ny = len(Y)

            # if y != x they could have different dimensions
            if ny >= nx:
                ix_1, ix_2 = np.triu_indices(nx, k=1, m=ny)
                trilix = np.tril_indices(nx, k=-1, m=ny)
            else:
                ix_1, ix_2 = np.tril_indices(nx, k=-1, m=ny)
                trilix = np.triu_indices(nx, k=1, m=ny)

            x1 = x1[ix_1]
            y1 = y1[ix_1]
            x2 = x2[ix_2]
            y2 = y2[ix_2]

            d = haversine_dist(x1, y1, x2, y2)

            D = np.zeros((nx, ny))
            D[(ix_1, ix_2)] = d

            # mirror triangle matrix to be conform with scikit-learn format and to
            # allow for non-symmetric distances in the future
            D[trilix] = D.T[trilix]

        else:
            xy1 = np.concatenate((x1.reshape(-1, 1), y1.reshape(-1, 1)),
                                 axis=1)

            if not y_is_x:
                xy2 = np.concatenate((x2.reshape(-1, 1), y2.reshape(-1, 1)),
                                     axis=1)
                D = cdist(xy1, xy2, metric=dist_metric, **kwds)
            else:
                # only X was given: use scikit-learn's parallel pairwise_distances
                D = pairwise_distances(xy1, metric=dist_metric, n_jobs=n_jobs)

        return D

    elif geom_type == 'LineString':

        if dist_metric in ['dtw', 'frechet']:
            # these are the preparation steps for all distance functions based only on coordinates

            if dist_metric == 'dtw':
                d_fun = partial(dtw, **kwds)

            elif dist_metric == 'frechet':
                d_fun = partial(frechet_dist, **kwds)

            # get combinations of distances that have to be calculated
            nx = len(X)
            ny = len(Y)

            if ny >= nx:
                ix_1, ix_2 = np.triu_indices(nx, k=1, m=ny)
                trilix = np.tril_indices(nx, k=-1, m=ny)
            else:
                ix_1, ix_2 = np.tril_indices(nx, k=-1, m=ny)
                trilix = np.triu_indices(nx, k=1, m=ny)

            left = list(X.iloc[ix_1].geometry)
            right = list(Y.iloc[ix_2].geometry)

            # map the combinations to the distance function
            if n_jobs == -1 or n_jobs > 1:
                if n_jobs == -1:
                    n_jobs = multiprocessing.cpu_count()
                with multiprocessing.Pool(processes=n_jobs) as pool:
                    left_right = list(zip(left, right))
                    d = np.array(list(pool.starmap(d_fun, left_right)))
            else:
                d = np.array(list(map(d_fun, left, right)))

            # write results to (symmetric) distance matrix
            D = np.zeros((nx, ny))
            D[(ix_1, ix_2)] = d
            D[trilix] = D.T[trilix]
            return D

        else:
            raise AttributeError(
                "Metric unknown. We only support ['dtw', 'frechet'] for LineStrings. "
                f"You passed {dist_metric}")
    else:
        raise AttributeError(
            f"We only support 'Point' and 'LineString'. Your geometry is {geom_type}"
        )
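A hedged illustration of the index bookkeeping used in the symmetric branches above: only the upper-triangle pairs are computed explicitly and then mirrored into the lower triangle (square case shown here; the rectangular case uses the `m=` argument of the index helpers):

import numpy as np

n = 4
ix_1, ix_2 = np.triu_indices(n, k=1)      # pairs (i, j) with i < j
trilix = np.tril_indices(n, k=-1)

points = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 2.0]])
d = np.linalg.norm(points[ix_1] - points[ix_2], axis=1)   # stand-in for haversine_dist

D = np.zeros((n, n))
D[(ix_1, ix_2)] = d
D[trilix] = D.T[trilix]     # mirror to obtain a symmetric matrix
print(np.allclose(D, D.T))  # True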
コード例 #59
0
def component_layout(
    data,
    n_components,
    component_labels,
    dim,
    random_state,
    metric="euclidean",
    metric_kwds={},
):
    """Provide a layout relating the separate connected components. This is done
    by taking the centroid of each component and then performing a spectral embedding
    of the centroids.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data -- required so we can generate centroids for each
        connected component of the graph.

    n_components: int
        The number of distinct components to be laid out.

    component_labels: array of shape (n_samples)
        For each vertex in the graph the label of the component to
        which the vertex belongs.

    dim: int
        The chosen embedding dimension.

    random_state: numpy RandomState or equivalent
        A state capable of being used as a numpy random state.

    metric: string or callable (optional, default 'euclidean')
        The metric used to measure distances among the source data points.

    metric_kwds: dict (optional, default {})
        Keyword arguments to be passed to the metric function.
        If metric is 'precomputed', 'linkage' keyword can be used to specify
        'average', 'complete', or 'single' linkage. Default is 'average'

    Returns
    -------
    component_embedding: array of shape (n_components, dim)
        The ``dim``-dimensional embedding of the ``n_components``-many
        connected components.
    """
    if data is None:
        # We don't have data to work with; just guess
        return np.random.random(size=(n_components, dim)) * 10.0

    component_centroids = np.empty((n_components, data.shape[1]),
                                   dtype=np.float64)

    if metric == "precomputed":
        # cannot compute centroids from precomputed distances
        # instead, compute centroid distances using linkage
        distance_matrix = np.zeros((n_components, n_components),
                                   dtype=np.float64)
        linkage = metric_kwds.get("linkage", "average")
        if linkage == "average":
            linkage = np.mean
        elif linkage == "complete":
            linkage = np.max
        elif linkage == "single":
            linkage = np.min
        else:
            raise ValueError("Unrecognized linkage '%s'. Please choose from "
                             "'average', 'complete', or 'single'" % linkage)
        for c_i in range(n_components):
            dm_i = data[component_labels == c_i]
            for c_j in range(c_i + 1, n_components):
                dist = linkage(dm_i[:, component_labels == c_j])
                distance_matrix[c_i, c_j] = dist
                distance_matrix[c_j, c_i] = dist
    else:
        for label in range(n_components):
            component_centroids[label] = data[component_labels == label].mean(
                axis=0)

        if scipy.sparse.isspmatrix(component_centroids):
            warn(
                "Forcing component centroids to dense; if you are running out of "
                "memory then consider increasing n_neighbors.")
            component_centroids = component_centroids.toarray()

        if metric in SPECIAL_METRICS:
            distance_matrix = pairwise_special_metric(
                component_centroids,
                metric=metric,
                kwds=metric_kwds,
            )
        elif metric in SPARSE_SPECIAL_METRICS:
            distance_matrix = pairwise_special_metric(
                component_centroids,
                metric=SPARSE_SPECIAL_METRICS[metric],
                kwds=metric_kwds,
            )
        else:
            if callable(metric) and scipy.sparse.isspmatrix(data):
                function_to_name_mapping = {
                    v: k
                    for k, v in sparse_named_distances.items()
                }
                try:
                    metric_name = function_to_name_mapping[metric]
                except KeyError:
                    raise NotImplementedError(
                        "Multicomponent layout for custom "
                        "sparse metrics is not implemented at "
                        "this time.")
                distance_matrix = pairwise_distances(component_centroids,
                                                     metric=metric_name,
                                                     **metric_kwds)
            else:
                distance_matrix = pairwise_distances(component_centroids,
                                                     metric=metric,
                                                     **metric_kwds)

    affinity_matrix = np.exp(-(distance_matrix**2))

    component_embedding = SpectralEmbedding(
        n_components=dim, affinity="precomputed",
        random_state=random_state).fit_transform(affinity_matrix)
    component_embedding /= component_embedding.max()

    return component_embedding
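A hedged sketch of the last two steps above: centroid distances are turned into a Gaussian affinity and passed to a precomputed-affinity spectral embedding (toy centroids, illustrative only):

import numpy as np
from sklearn.manifold import SpectralEmbedding
from sklearn.metrics import pairwise_distances

rng = np.random.default_rng(0)
component_centroids = rng.normal(size=(6, 3))          # 6 toy component centroids

distance_matrix = pairwise_distances(component_centroids, metric="euclidean")
affinity_matrix = np.exp(-(distance_matrix ** 2))      # Gaussian affinity

component_embedding = SpectralEmbedding(
    n_components=2, affinity="precomputed",
    random_state=0).fit_transform(affinity_matrix)
component_embedding /= component_embedding.max()
print(component_embedding.shape)                       # (6, 2)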
コード例 #60
0
len(book.User_ID.unique())
len(book.Book_Title.unique())

# convert data into n * p matrix
Book_pivot = book.pivot_table(index='User_ID',
                              columns='Book_Title',
                              values='Book_Rating').reset_index(drop=True)

# replace index of pivot with user id values
Book_pivot.index = book.User_ID.unique()

#Impute NaNs with 0 values
Book_pivot.fillna(0, inplace=True)

# calculate distance
# pairwise_distances with metric='cosine' returns 1 - cos(A, B), so subtract from 1 to recover the similarity
user_distance = 1 - pairwise_distances(Book_pivot.values, metric='cosine')

distance_matrix = pd.DataFrame(user_distance)

#Set the index and column names to user ids
distance_matrix.index = book.User_ID.unique()
distance_matrix.columns = book.User_ID.unique()

# fill the diagonal with 0s, since self-similarity is always 1

np.fill_diagonal(distance_matrix.values, 0)

#Most Similar Users
distance_matrix.idxmax(axis=1)[0:5]
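idxmax above reports only the single most similar user per row; a hedged extension (assuming `distance_matrix` from above) listing the five most similar users for one user:

import numpy as np

user_id = distance_matrix.index[0]
row = distance_matrix.loc[user_id]
top5 = row.sort_values(ascending=False).head(5)
print(top5)        # the 5 users with the highest cosine similarity to user_id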