Example #1
def test_kneighbors_regressor_sparse(n_samples=40,
                                     n_features=5,
                                     n_test_pts=10,
                                     n_neighbors=5,
                                     random_state=0):
    # Test k-neighbors regression on sparse matrices
    # Like the above, but with various types of sparse matrices
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    y = ((X ** 2).sum(axis=1) < .25).astype(int)  # np.int was removed in recent NumPy

    for sparsemat in SPARSE_TYPES:
        knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                            algorithm='auto')
        knn.fit(sparsemat(X), y)

        knn_pre = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                                metric='precomputed')
        knn_pre.fit(pairwise_distances(X, metric='euclidean'), y)

        for sparsev in SPARSE_OR_DENSE:
            X2 = sparsev(X)
            assert_true(np.mean(knn.predict(X2).round() == y) > 0.95)

            X2_pre = sparsev(pairwise_distances(X, metric='euclidean'))
            if issparse(sparsev(X2_pre)):
                assert_raises(ValueError, knn_pre.predict, X2_pre)
            else:
                assert_true(
                    np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95)
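The test relies on module-level helpers (SPARSE_TYPES, SPARSE_OR_DENSE) that are not shown in this snippet; below is a hedged sketch of plausible definitions in the style of scikit-learn's neighbors test suite (treat the exact tuples as assumptions).

import numpy as np
from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix,
                          dok_matrix, lil_matrix)

# converters applied to X so the estimator is exercised on every sparse format
SPARSE_TYPES = (bsr_matrix, coo_matrix, csc_matrix, csr_matrix,
                dok_matrix, lil_matrix)
# the same converters plus a dense pass-through
SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,)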
Example #2
def trustworthiness(X, X_embedded, n_neighbors=5, precomputed=False):
    """Expresses to what extent the local structure is retained.

    The trustworthiness is within [0, 1]. It is defined as

    .. math::

        T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1}
            \sum_{j \in U^{(k)}_i} (r(i, j) - k)

    where :math:`r(i, j)` is the rank of the embedded datapoint j
    according to the pairwise distances between the embedded datapoints,
    :math:`U^{(k)}_i` is the set of points that are in the k nearest
    neighbors in the embedded space but not in the original space.

    * "Neighborhood Preservation in Nonlinear Projection Methods: An
      Experimental Study"
      J. Venna, S. Kaski
    * "Learning a Parametric Embedding by Preserving Local Structure"
      L.J.P. van der Maaten

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row.

    X_embedded : array, shape (n_samples, n_components)
        Embedding of the training data in low-dimensional space.

    n_neighbors : int, optional (default: 5)
        Number of neighbors k that will be considered.

    precomputed : bool, optional (default: False)
        Set this flag if X is a precomputed square distance matrix.

    Returns
    -------
    trustworthiness : float
        Trustworthiness of the low-dimensional embedding.
    """
    if precomputed:
        dist_X = X
    else:
        dist_X = pairwise_distances(X, squared=True)
    dist_X_embedded = pairwise_distances(X_embedded, squared=True)
    ind_X = np.argsort(dist_X, axis=1)
    ind_X_embedded = np.argsort(dist_X_embedded, axis=1)[:, 1:n_neighbors + 1]

    n_samples = X.shape[0]
    t = 0.0
    ranks = np.zeros(n_neighbors)
    for i in range(n_samples):
        for j in range(n_neighbors):
            ranks[j] = np.where(ind_X[i] == ind_X_embedded[i, j])[0][0]
        ranks -= n_neighbors
        t += np.sum(ranks[ranks > 0])
    t = 1.0 - t * (2.0 / (n_samples * n_neighbors *
                          (2.0 * n_samples - 3.0 * n_neighbors - 1.0)))
    return t
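A minimal usage sketch (hypothetical data; assumes numpy and pairwise_distances are imported exactly as the function above expects).

rng = np.random.RandomState(0)
X = rng.rand(50, 10)
X_embedded = X[:, :2]  # toy "embedding": keep only the first two features
score = trustworthiness(X, X_embedded, n_neighbors=5)
print(round(score, 3))  # value in [0, 1]; higher means local structure is better preserved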
Example #3
def pairwise_distances(X, Y=None, index=None, metric="euclidean"):
    '''
    Compute the distance matrix from a vector array X and optional Y.
    This method takes either a vector array or a distance matrix,
    and returns a distance matrix. If the input is a vector array,
    the distances are computed. If the input is a distance matrix,
    it is returned instead.
    This method provides a safe way to take a distance matrix as input,
    while preserving compatibility with many other algorithms that take
    a vector array.

    :param X:  array [n_samples_a, n_samples_a]
        Array of pairwise distances between samples, or a feature array.
    :param Y:   array [n_samples_b, n_features]
        A second feature array only if X has shape [n_samples_a, n_features].
    :param index:  int, the index of element in X array
    :param metric: The metric to use when calculating distances between instances in a feature array.
        If metric == 'rmsd', the distances are computed with MDTraj.
    :return: The distances
    '''
    if metric == "rmsd":
        if Y is None:
            distances_ = md.rmsd(X, X, index, parallel=True, precentered=True)
        else:
            #distances_ = np.empty((len(X), len(Y)), dtype=np.float32)
           # for i in xrange(len(Y)):
            distances_ = md.rmsd(X, Y, index, parallel=True, precentered=True)
        return distances_
    else:
        if Y is None:
            print("if Y is None")
            return sp.pairwise_distances(X, X[index], metric=metric)
        if index is None:
            print("if index is None, pairwise XX")
            return sp.pairwise_distances(X, X, metric=metric)
Example #4
def make_rbf(x,sigma,metric='euclidean', x2=None):
    if x.ndim == 1:
        x = np.expand_dims(x, 1)
    if x2 is None:
        x2 = x
    if metric == 'cosine':
        #This code may be faster for some matrices
        # Code from http://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat
        '''
        tic()
        #x = x.toarray()
        #similarity = np.dot(x, x.T)
        similarity = (x.dot(x.T)).toarray()
        square_mag = np.diag(similarity)
        inv_square_mag = 1 / square_mag
        inv_square_mag[np.isinf(inv_square_mag)] = 0
        inv_mag = np.sqrt(inv_square_mag)
        W = similarity * inv_mag
        W = W.T * inv_mag
        W = 1 - W
        toc()
        tic()
        W2 = pairwise.pairwise_distances(x,x,metric)
        toc()
        '''
        W = pairwise.pairwise_distances(x,x2,metric)
    else:
        #tic()
        W = pairwise.pairwise_distances(x,x2,metric)
        #toc()
    W = np.square(W)
    W = -sigma * W
    W = np.exp(W)
    return W
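make_rbf returns the RBF affinity W[i, j] = exp(-sigma * d(x_i, x_j)**2). A small hedged sketch (hypothetical data; assumes numpy as np and sklearn.metrics.pairwise imported as pairwise, as the function requires).

rng = np.random.RandomState(0)
x = rng.rand(5, 3)
W = make_rbf(x, sigma=2.0)
# self-similarity is exp(0) = 1 and every affinity lies in (0, 1]
assert np.allclose(np.diag(W), 1.0)
assert np.all((W > 0) & (W <= 1))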
Example #5
def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test, indices_dict):
    ## stats parameters 
    quantiles_range = np.arange(0, 1.5, 0.5)
    stats_func = [ np.mean, np.std ]
    stats_feat_num = len(quantiles_range) + len(stats_func)
    n_class_relevance = 13
    
    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num*n_class_relevance), dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test), stats_feat_num*n_class_relevance), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)

    print("pairwise_distances generated!")
    for i in range(len(ids_test)):
        id = ids_test[i]
        for j in range(n_class_relevance):
            key = j
            if key in indices_dict:
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ ind for ind in inds if id != ids_train[ind] ]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [ func(sim_tmp) for func in stats_func ]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i,j*stats_feat_num:(j+1)*stats_feat_num] = feat
    return stats_feat
Example #6
def dunn(max_nc, all_labels, dataset):
    dunn = []
    print("DUNN (MAX)...")
    for nc in range(2, max_nc + 1):
        dn = 0.0
        max_intra = 0.0
        for cluster_i in range(nc):
            instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
            pairwise_matrix_intra = pairwise_distances(instances_i, n_jobs=1)
            new_max_intra = np.amax(pairwise_matrix_intra)
            if new_max_intra > max_intra:
                max_intra = new_max_intra
        for cluster_i in range(nc):
            instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
            for cluster_j in range(nc):
                if cluster_j > cluster_i:
                    instances_j = dataset[np.where(all_labels[nc - 2] == cluster_j)[0]]
                    pairwise_matrix_inter = pairwise_distances(instances_i, instances_j, n_jobs=1)
                    min_inter = np.amin(pairwise_matrix_inter)

                    if dn == 0.0:
                        dn = min_inter / max_intra
                    elif min_inter / max_intra < dn:
                        dn = min_inter / max_intra
        print('DUNN for k = ' + str(nc) + ' is ' + str(dn) + ' ...')
        dunn += [dn]
    return dunn
Example #7
def bipartite_clustering(D2W,word_cluster_num,doc_cluster_num,metric,criteria):
	W2D = D2W.transpose()
	W2WC = kmean(W2D,word_cluster_num,criteria)
	#word_cluster_num = np.amax(W2WC)+1
	#print "wc:",word_cluster_num
	for loop in range(4):
		#D2WC = D2W.dot(transform_from_index_array(W2WC,W2WC.size,word_cluster_num))
		#print D2WC
		#print loop
		new_centroids = get_new_centroids(W2D,W2WC)
		new_distance_matrix = pairwise_distances(W2D,new_centroids,metric=metric) #how to calculate distance? maybe 1-matrix?
		#print new_distance_matrix
		D2WC = D2W.dot(new_distance_matrix)
		if loop==0:
			D2DC = kmean(D2WC,doc_cluster_num,criteria)
		else:
			new_centroids = get_new_centroids(D2WC,D2DC)
			D2DC = kmean(D2WC,doc_cluster_num,criteria,new_centroids)
		#doc_cluster_num = np.amax(D2DC)+1
		#print "dc:",doc_cluster_num
		new_centroids = get_new_centroids(D2W,D2DC)
		new_distance_matrix = pairwise_distances(D2W,new_centroids,metric=metric) 
		W2DC = W2D.dot(new_distance_matrix)
		new_centroids = get_new_centroids(W2DC,W2WC)
		W2WC = kmean(W2DC,word_cluster_num,criteria,new_centroids)
		#word_cluster_num = np.amax(W2WC)+1
		#print "wc:",word_cluster_num 
	return D2DC,W2WC
Example #8
def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test, indices_dict, qids_test=None):
    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num * n_classes), dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test), stats_feat_num * n_classes), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)

    for i in range(len(ids_test)):
        id = ids_test[i]
        if qids_test is not None:
            qid = qids_test[i]
        for j in range(n_classes):
            key = (qid, j + 1) if qids_test is not None else j + 1
            if key in indices_dict:  # dict.has_key was removed in Python 3
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ind for ind in inds if id != ids_train[ind]]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [func(sim_tmp) for func in stats_func]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i, j * stats_feat_num:(j + 1) * stats_feat_num] = feat
    return stats_feat
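This variant references module-level settings (stats_func, quantiles_range, stats_feat_num, n_classes) that are not shown; a hedged sketch of plausible definitions, patterned after the ones spelled out in Example #5 (the exact values are assumptions).

import numpy as np
import pandas as pd

# assumed module-level configuration, mirroring Example #5
quantiles_range = np.arange(0, 1.5, 0.5)                  # 0%, 50% and 100% quantiles
stats_func = [np.mean, np.std]
stats_feat_num = len(quantiles_range) + len(stats_func)
n_classes = 13                                            # hypothetical number of classes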
Example #9
def test_no_data_conversion_warning():
    # No warnings issued if metric is not a boolean distance function
    rng = np.random.RandomState(0)
    X = rng.randn(5, 4)
    with pytest.warns(None) as records:
        pairwise_distances(X, metric="minkowski")
    assert len(records) == 0
Example #10
    def getSimMat(self, type = 'euclidean', ftr_type = 'data', orderFlag = True, pca_dim=20):
        if ftr_type == 'ftr':
            #use input features
            self.slctData = [ts for ts in self.slctData if ((ts.ftr is not None) and (len(ts.ftr) > 0))]
            dataMat = [ts.ftr for ts in self.slctData]
        elif ftr_type == 'data':
            #use input data
            dataMat = [ts.val for ts in self.slctData]
        else:
            print('unknown ftr_type for ftr_type:', ftr_type)
        if pca_dim > len(dataMat):
            pca_dim = int(math.ceil(len(dataMat)/2.0))

        if type  == 'euclidean': #euclidean distance based on time series data
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type == 'pca_euc': #extract feature based on PCA, then use Euclidean distance
            pca = skd.PCA(n_components=pca_dim)
            dataMat = pca.fit_transform(dataMat)
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type == 'nmf_euc': #extract feature based on NMF, then use Euclidean distance
            nmf = skd.NMF(n_components=pca_dim)
            dataMat = nmf.fit_transform(dataMat)
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type =='ica_euc': #extract feature based on ICA, then use Euclidean distance
            ica = skd.FastICA(n_components=pca_dim)
            dataMat = ica.fit_transform(dataMat)
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type =='cosine':
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        elif type == 'pca_cos': #extract feature based on PCA, then use cosine distance
            pca = skd.PCA(n_components=pca_dim)
            dataMat = pca.fit_transform(dataMat)
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        elif type == 'nmf_cos': #extract feature based on NMF, then use cosine distance
            nmf = skd.NMF(n_components=pca_dim)
            dataMat = nmf.fit_transform(dataMat)
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        elif type =='ica_cos': #extract feature based on ICA, then use cosine distance
            ica = skd.FastICA(n_components=pca_dim)
            dataMat = ica.fit_transform(dataMat)
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        else:
            print('unknown type for similarity matrix: ', type)

        #rearrange the order of data in simMat
        self.slctDataMat = dataMat
        if orderFlag:
            link = spc.hierarchy.linkage(self.simMat)
            dend = spc.hierarchy.dendrogram(link, no_plot=True)
            order = dend['leaves']
            self.slctData = [self.slctData[i] for i in order] #rearrange order
            self.simMat = [self.simMat[i] for i in order]
            for i in range(len(self.simMat)):
                self.simMat[i] = [self.simMat[i][j] for j in order]
            self.slctDataMat = [self.slctDataMat[i] for i in order]
        # self.patchOrdering = [ts.ptchNm for ts in self.slctData] #record new ordering
        self.patchOrdering = JSONifyData(self.slctData) # Deok wants all the data for each patch in the response
        self.clstData = self.slctData
        self.clstSimMat = self.simMat
Example #11
def test_radius_neighbors():
    # Checks whether returned distances are less than `radius`
    # At least one point should be returned when the `radius` is set
    # to mean distance from the considering point to other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with the `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
Example #12
def predict(dialogue_session, line):
    lowest = ('x',1)
    data = dataDict[dialogue_session][1][line,:]
    for vector in vDict:
        predictor = vDict[vector]
        if pair.pairwise_distances(predictor,data,'cosine') < lowest[1]:
            lowest = (vector, pair.pairwise_distances(predictor,data,'cosine'))
    return lowest
Example #13
    def cramer_statistic(self, n_jobs=1):
        '''
        Applies the Cramer Statistic to the datasets.

        Parameters
        ----------
        n_jobs : int, optional
            Sets the number of cores to use to calculate
            pairwise distances. Default is 1.
        '''
        # Adjust what we call n,m based on the larger dimension.
        # Then the looping below is valid.
        if self.data_matrix1.shape[0] >= self.data_matrix2.shape[0]:
            m = self.data_matrix1.shape[0]
            n = self.data_matrix2.shape[0]
            larger = self.data_matrix1
            smaller = self.data_matrix2
        else:
            n = self.data_matrix1.shape[0]
            m = self.data_matrix2.shape[0]
            larger = self.data_matrix2
            smaller = self.data_matrix1

        pairdist11 = pairwise_distances(larger, metric="euclidean",
                                        n_jobs=n_jobs)
        pairdist22 = pairwise_distances(smaller, metric="euclidean",
                                        n_jobs=n_jobs)
        pairdist12 = pairwise_distances(larger, smaller,
                                        metric="euclidean", n_jobs=n_jobs)

        # Take sqrt of each
        # We default to using the Cramer kernel in Baringhaus & Franz (2004)
        # \phi(dist) = sqrt(dist) / 2.
        # The normalization values below reflect this
        pairdist11 = np.sqrt(pairdist11)
        pairdist12 = np.sqrt(pairdist12)
        pairdist22 = np.sqrt(pairdist22)

        term1 = 0.0
        term2 = 0.0
        term3 = 0.0
        for i in range(m):
            for j in range(n):
                term1 += pairdist12[i, j]
            for ii in range(m):
                term2 += pairdist11[i, ii]

            if i < n:
                for jj in range(n):
                    term3 += pairdist22[i, jj]

        m, n = float(m), float(n)

        term1 *= (1 / (m * n))
        term2 *= (1 / (2 * m ** 2.))
        term3 *= (1 / (2 * n ** 2.))

        self._distance = (m * n / (m + n)) * (term1 - term2 - term3)
Example #14
    def run_step(self, run_number, step_size, howlong):
        dfslot = self.get_input_slot("df")
        df = dfslot.data()
        dfslot.update(run_number)
        if dfslot.has_updated() or dfslot.has_deleted():
            dfslot.reset()
            logger.info("Reseting history because of changes in the input df")
            dfslot.update(run_number, df)
            # TODO: be smarter with changed values

        m = step_size

        indices = dfslot.next_created(m)
        m = indices_len(indices)

        i = None
        j = None
        Si = self._buf.matrix()

        arrayslot = self.get_input_slot("array")
        if arrayslot is not None and arrayslot.data() is not None:
            array = arrayslot.data()
            logger.debug("Using array instead of DataFrame columns")
            if Si is not None:
                i = array[self._last_index]
            j = array[indices]
        if j is None:
            if self.columns is None:
                self.columns = df.columns.delete(np.where(df.columns == Module.UPDATE_COLUMN))
            elif not isinstance(self.columns, pd.Index):
                self.columns = pd.Index(self.columns)
            rows = df[self.columns]
            if Si is not None:
                i = rows.loc[self._last_index]
                assert len(i) == len(self._last_index)
            j = rows.loc[fix_loc(indices)]
            assert len(j) == indices_len(indices)

        Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
        if Si is None:
            mat = self._buf.resize(Sj.shape[0])
            mat[:, :] = Sj
            self._last_index = dfslot.last_index[indices]
        else:
            Sij = pairwise_distances(i, j, metric=self._metric, n_jobs=self._n_jobs)
            n0 = i.shape[0]
            n1 = n0 + j.shape[0]
            mat = self._buf.resize(n1)
            mat[0:n0, n0:n1] = Sij
            mat[n0:n1, 0:n0] = Sij.T
            mat[n0:n1, n0:n1] = Sj
            self._last_index = self._last_index.append(df.index[indices])
            # truth = pairwise_distances(array[0:n1], metric=self._metric)
            # import pdb
            # pdb.set_trace()
            # assert np.allclose(mat,truth)
        return self._return_run_step(dfslot.next_state(), steps_run=m)
Example #15
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors is less than or equal to approximate
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
Example #16
	def fit(self, X, y=None, c=None):
		"""Fit the model using X as training data.

		Parameters
		----------
		X : array, shape (n_samples, n_features) or (n_samples, n_samples)
			If the metric is 'precomputed' X must be a square distance
			matrix. Otherwise it contains a sample per row.
		"""
		X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64)
		random_state = check_random_state(self.random_state)

		if self.early_exaggeration < 1.0:
			raise ValueError("early_exaggeration must be at least 1, but is "
							 "%f" % self.early_exaggeration)

		if self.n_iter < 200:
			raise ValueError("n_iter should be at least 200")

		if self.metric == "precomputed":
			if self.init == 'pca':
				raise ValueError("The parameter init=\"pca\" cannot be used "
								 "with metric=\"precomputed\".")
			if X.shape[0] != X.shape[1]:
				raise ValueError("X should be a square distance matrix")
			distances = X
		else:
			if self.verbose:
				print("[t-SNE] Computing pairwise distances...")

			if self.metric == "euclidean":
				distances = pairwise_distances(X, metric=self.metric, squared=True)
			else:
				distances = pairwise_distances(X, metric=self.metric)

		# Degrees of freedom of the Student's t-distribution. The suggestion
		# alpha = n_components - 1 comes from "Learning a Parametric Embedding
		# by Preserving Local Structure" Laurens van der Maaten, 2009.
		alpha = max(self.n_components - 1.0, 1)
		n_samples = X.shape[0]
		self.training_data_ = X

		P = _joint_probabilities(distances, self.perplexity, self.verbose)
		self.P = deepcopy(P)
		if self.init == 'pca':
			pca = RandomizedPCA(n_components=self.n_components,
								random_state=random_state)
			X_embedded = pca.fit_transform(X)
		elif self.init == 'random':
			X_embedded = None
		else:
			raise ValueError("Unsupported initialization scheme: %s"
							 % self.init)

		self.embedding_ = self._tsne(P, alpha, n_samples, random_state,
									 X_embedded=X_embedded, c=c)
Example #17
    def multiQuadricKernel(X, X2=None, offset=1.0, jobs=1, *args, **kwargs):

        offset = float(offset)
        if X2 is not None:
            distanceMatrix = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
        else:
            distanceMatrix = pairwise.pairwise_distances(X, n_jobs=jobs)
        result = np.sqrt(distanceMatrix**2 + offset**2)

        return result
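The helper builds a multiquadric kernel from a Euclidean distance matrix, K = sqrt(d**2 + offset**2). A hedged standalone sketch of the same computation (hypothetical data; it does not assume the surrounding class).

import numpy as np
from sklearn.metrics import pairwise

rng = np.random.RandomState(0)
X = rng.rand(4, 3)
D = pairwise.pairwise_distances(X)   # Euclidean distances between rows of X
K = np.sqrt(D ** 2 + 1.0 ** 2)       # multiquadric kernel with offset = 1.0
# self-distances are zero, so the diagonal equals the offset
assert np.allclose(np.diag(K), 1.0)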
Example #18
def knn_dist(x, x_ctrl, s=100, p=1):
    x_tmp = random_subsample(x_ctrl, 200000, replace=False)
    xs = kmeans_subsample(x_tmp, s)
    if p == 1:
        min_dist = np.min(pairwise_distances(X=x, Y=xs, metric="l1"), axis=1)
    elif p == 2:
        min_dist = np.min(pairwise_distances(X=x, Y=xs, metric="l2"), axis=1)
    assert len(min_dist) == x.shape[0]

    return min_dist
Example #19
def kpca_cluster(data,nclusters=100,ncomponents=40,topwhat=10,zscored=False):
    '''

    Computes clustering of bag-of-words vectors of articles

    INPUT
    data        iterable of article texts (bag-of-words input)
    nclusters   number of clusters

    '''
    from sklearn.cluster import KMeans
    # filtering out some noise words
    stops = map(lambda x:x.lower().strip(),open('stopwords.txt').readlines()[6:])

    # vectorize non-stopwords 
    bow = TfidfVectorizer(min_df=2,stop_words=stops)
    X = bow.fit_transform(data)

    # creating bow-index-to-word map
    idx2word = dict(zip(bow.vocabulary_.values(),bow.vocabulary_.keys()))

    # using now stopwords and filtering out digits
    print('Computing pairwise distances')
    K = pairwise_distances(X,metric='l2',n_jobs=1)
    perc = 50.0
    width = percentile(K.flatten(),perc)

    # KPCA transform bow vectors
    Xc = KernelPCA(n_components=ncomponents,kernel='rbf',gamma=width).fit_transform(X)
    
    if zscored:
        Xc = zscore(Xc)
    
    # compute clusters
    km = KMeans(n_clusters=nclusters).fit(Xc)
    Xc = km.predict(Xc)

    clusters = []
    for icluster in range(nclusters):
        nmembers = (Xc==icluster).sum()
        if True:#nmembers < len(data) / 5.0 and nmembers > 1: # only group clusters big enough but not too big
            members = (Xc==icluster).nonzero()[0]
            topwordidx = array(X[members,:].sum(axis=0))[0].argsort()[-topwhat:][::-1]
            topwords = ' '.join([idx2word[wi] for wi in topwordidx])
            meanDist = triu(pairwise_distances(X[members,:],metric='l2',n_jobs=1)).sum()
            meanDist = meanDist / (len(members) + (len(members)**2 - len(members))/2.0)
            # print u'Cluster %d'%icluster + u' %d members'%nmembers + u' mean Distance %f'%meanDist + u'\n\t'+topwords
            clusters.append({
                'name':'Cluster-%d'%icluster,
                'description': topwords,
                'members': list(members),
                'meanL2Distances': meanDist
                })

    return clusters
Example #20
    def cauchyKernel(X, X2, sigma=1.0, jobs=1, *args, **kwargs):

        sigma = float(sigma)

        if X2 is not None:
            distanceMatrix = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
        else:
            distanceMatrix = pairwise.pairwise_distances(X, n_jobs=jobs)
        result = 1 / (1 + distanceMatrix**2 / sigma**2)

        return result
Example #21
    def cramer_statistic(self, n_jobs=1):
        '''
        Applies the Cramer Statistic to the datasets.

        Parameters
        ----------

        n_jobs : int, optional
            Sets the number of cores to use to calculate
            pairwise distances
        '''
        # Adjust what we call n,m based on the larger dimension.
        # Then the looping below is valid.
        if self.data_matrix1.shape[0] >= self.data_matrix2.shape[0]:
            m = self.data_matrix1.shape[0]
            n = self.data_matrix2.shape[0]
            larger = self.data_matrix1
            smaller = self.data_matrix2
        else:
            n = self.data_matrix1.shape[0]
            m = self.data_matrix2.shape[0]
            larger = self.data_matrix2
            smaller = self.data_matrix1

        pairdist11 = pairwise_distances(
            larger, metric="euclidean", n_jobs=n_jobs)
        pairdist22 = pairwise_distances(
            smaller, metric="euclidean", n_jobs=n_jobs)
        pairdist12 = pairwise_distances(
            larger, smaller,
            metric="euclidean", n_jobs=n_jobs)

        term1 = 0.0
        term2 = 0.0
        term3 = 0.0
        for i in range(m):
            for j in range(n):
                term1 += pairdist12[i, j]
            for ii in range(m):
                term2 += pairdist11[i, ii]

            if i < n:
                for jj in range(n):
                    term3 += pairdist22[i, jj]

        m, n = float(m), float(n)

        term1 *= (1 / (m * n))
        term2 *= (1 / (2 * m ** 2.))
        term3 *= (1 / (2 * n ** 2.))

        self.distance = (m * n / (m + n)) * (term1 - term2 - term3)

        return self
Example #22
    def rationalQuadraticKernel(X, X2=None, offset=1.0, jobs=1,
                                *args, **kwargs):

        assert (offset > 0)
        offset = float(offset)

        if X2 is not None:
            distanceMatrix = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
        else:
            distanceMatrix = pairwise.pairwise_distances(X, n_jobs=jobs)
        result = 1 - distanceMatrix**2 / (distanceMatrix**2 + offset)

        return result
Example #23
    def fit(self, X, y):
        X,y=check_X_y(X,y)
        # X, y = check_arrays(X, y, sparse_format="csr")
        # y = column_or_1d(y, warn=True)
        n_samples, n_features = X.shape
        classes = np.unique(y)
        self.classes_ = classes
        n_classes = classes.size
        if n_classes < 2:
            raise ValueError('y has fewer than 2 classes')

        if len(self.centers_)>0:
            assert len(self.centers_[0])==n_features

        radii=[]
        count=[]
        # first pass - only need the radii, because the vectors and the targets are already stored
        pass_number=0
        i=0
        for v,t in zip(X,y):  
            v=v.reshape(1, -1)

            D=pairwise_distances(v,X).ravel()
            r=max(D[y!=t].min()-1e-10,1e-10)
            radii.append(r)
    
            within=D[y==t]<=r
            count.append(within.sum())
    
            i+=1

        radii=np.array(radii)
        count=np.array(count)

        # second pass
        for v,t in zip(X,y): # Go through all of the data points
            #Select the sphere that contains that point, 
            # and the largest number of other points, 
            # and add it to the final spheres list
            v=v.reshape(1, -1)
            D=pairwise_distances(v,X).ravel()
            within_centers=(D<=radii)
            matched=(t==y) & (within_centers)
            idx=np.arange(len(y))
            idx_matched=idx[matched]
            best=idx_matched[np.argmax(count[matched])]
    
    
            self._add_center(X[best],radii[best],y[best])
        
            pass_number+=1
Example #24
    def laplacianKernel(X, X2=None, sigma=1.0, cutoff=None, jobs=1,
                        *args, **kwargs):

        sigma = float(sigma)

        if X2 is not None:
            distanceMatrix = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
        else:
            distanceMatrix = pairwise.pairwise_distances(X, n_jobs=jobs)
        result = np.exp(-distanceMatrix / sigma)
        if cutoff is not None:
            result[result < cutoff] = 0

        return result
Example #25
def kneighbors(X, n_neighbors, metric):
    """Finds the K-neighbors of each point.
    Based on sklearn.
    Returns the neighbor indices (neigh_ind).
    """
    if metric == 'abs_correlation':
        dist = pairwise_distances(X, metric='correlation')
        dist = np.sqrt(-np.log(np.abs(1 - dist)))
    else:
        dist = pairwise_distances(X, metric=metric)

    neigh_ind = dist.argsort(axis=1)
    neigh_ind = neigh_ind[:, :n_neighbors]
    return neigh_ind
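A quick hedged usage sketch (hypothetical data; assumes numpy and pairwise_distances are imported as the function expects). Each row's own index comes first, since its self-distance is zero.

rng = np.random.RandomState(0)
X = rng.rand(6, 8)
neigh_ind = kneighbors(X, n_neighbors=3, metric='abs_correlation')
print(neigh_ind.shape)  # (6, 3): the three nearest row indices per sample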
Example #26
def jaccard_distance(X, Y=None, n_jobs=-1, **kwds):
    """ Computes the Jaccard distance between all the pairs of vectors in X.
    If X is not sparse the function defaults to sklearn.metrics.pairwise.pairwise_distances
    If Y is given distances between X and Y are computed

    The Jaccard index is defined as the intersection / union of items in the vector (that is 
    non-sparse indices, regardless of their magnitude)

    Parameters:
    -----------
    X: an array (dims: samples X features)
    Y: an optional array (dims: samples_2 X features)
    n_jobs: optionally run on multiple cores
    **kwds: additional parameters to sklearn.metrics.pairwise.pairwise_distances

    Returns a square distance matrix. All elements are in [0, 1]. 
    One may use 1-R for similarity (where R is the return value)

    Examples:
        >>> from scipy.sparse import csr_matrix
        >>> from numpy import matrix
        >>> d = matrix([[1, 1, 1, 0, 0], [0, 1, 1, 0, 1]])
        >>> s = csr_matrix(d)
        >>> jaccard_distance(s)
        matrix([[ 0. ,  0.5],
                [ 0.5,  0. ]])
        >>> jaccard_distance(s, [1, 1, 1, 0, 1])
        matrix([[ 0.25],
                [ 0.25]])
    """
    if Y is None:
        Y = X
    if sparse.issparse(X):
        if not sparse.issparse(Y):
            Y = sparse.csr_matrix(Y)
        mmx = (X!=0)
        mmy = (Y!=0)
        mx = sparse.csr_matrix((np.ones_like(mmx.data, dtype=np.double), mmx.indices, mmx.indptr), shape=mmx.shape)
        if X is Y:
            my = mx
        else:
            my = sparse.csr_matrix((np.ones_like(mmy.data, dtype=np.double), mmy.indices, mmy.indptr), shape=mmy.shape)
        m_int = mx * my.T
        m_uni = pairwise_distances(mx, my, metric='manhattan', n_jobs=n_jobs, **kwds)
        m_uni += m_int
        return 1.0 - (m_int / m_uni)
    else:
        return pairwise_distances(X, metric='jaccard', n_jobs=n_jobs, **kwds)
Example #27
def find_reference(raw, n_cluster, pick_types=None, copy=True,
                   flat_threshold=1e-15, n_split=100, plot=True):
    """ Computes covariance on splits of the raw data, and apply KMeans
    clustering to find the number of disjoint references.
    n_cluster is found with PCA if float
    """
    import matplotlib.pyplot as plt
    from pyriemann.estimation import Covariances
    from sklearn.cluster import KMeans
    from sklearn.metrics.pairwise import pairwise_distances

    if copy:
        raw = raw.copy()
    # Remove flat lines
    flat = np.where(np.std(raw._data, axis=1) < flat_threshold)[0]
    for ch in flat:
        raw.info['bads'] += [raw.ch_names[ch]]

    # Pick data channels only
    if pick_types is None:
        pick_types = dict(seeg=True, exclude='bads')
    raw.pick_types(**pick_types)

    # Compute covariance on data splits
    n_time = len(raw.times)
    t_max = raw.times[n_time - n_time % n_split - 1]
    raw.crop(0, t_max, copy=False)  # ensure regularly sized splits
    X = np.array(np.array_split(raw._data, n_split, axis=1))
    covs = Covariances().fit_transform(X)

    # Compute cluster for each data split
    cluster = KMeans(n_cluster)
    all_kmeans = list()
    for cov in covs:
        dist = pairwise_distances(cov)
        all_kmeans.append(cluster.fit_predict(dist))

    # Combine clusters
    dist = pairwise_distances(np.array(all_kmeans).T)
    idx = cluster.fit_predict(dist)

    if plot:
        idx_ = np.argsort(idx)
        cov = np.median(covs, axis=0)
        plt.matshow(np.log10(cov)[idx_, :][:, idx_])

    clusters = [np.array(raw.ch_names)[idx == ii] for ii in np.unique(idx)]
    return clusters
Example #28
def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
            n_candidates=500, n_estimators=t)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
Example #29
def fuzzy_c_means(points, num_centers, m=2., tol=1e-4, max_iter=100,
                  verbose=False):
  '''Uses Fuzzy C-Means to downsample `points`.
  m : aggregation parameter >1, larger implies smoother clusters
  Returns indices of downsampled points.
  '''
  num_points = points.shape[0]
  if num_centers >= num_points:
    return np.arange(num_points)
  # randomly initialize cluster assignments matrix
  assn = np.random.random((points.shape[0], num_centers))
  # iterate assignments until they converge
  for i in range(max_iter):
    # compute centers
    w = assn ** m
    w /= w.sum(axis=0)
    centers = w.T.dot(points)
    # calculate new assignments
    d = pairwise_distances(points, centers)
    d **= 2. / (m - 1)
    np.maximum(d, 1e-10, out=d)
    new_assn = 1. / np.einsum('ik,ij->ik', d, 1./d)
    # check for convergence
    change = np.linalg.norm(new_assn - assn)
    if verbose:
      print('At iteration %d: change = %g' % (i+1, change))
    if change < tol:
      break
    assn = new_assn
  else:
    warnings.warn("fuzzy_c_means didn't converge in %d iterations" % max_iter)
  # find points closest to the selected cluster centers
  return d.argmin(axis=0)
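A quick hedged usage sketch (hypothetical data; assumes numpy, warnings, and pairwise_distances are imported as the function expects).

rng = np.random.RandomState(0)
points = np.vstack([rng.randn(30, 2), rng.randn(30, 2) + 5.0])  # two loose blobs
keep = fuzzy_c_means(points, num_centers=2)
print(keep)  # indices of the two points closest to the fitted cluster centers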
Example #30
def fh_dist_lines(li1, li2):
    """
    Compute a cheap distance (based on hausdorff-distance) between
    *li1* and *li2*, two LineString.

    Parameters
    ----------
    li1: shapely.geometry.LineString
    li2: shapely.geometry.LineString

    Returns
    -------
    max_dist: Float of the distance between li1 and li2.

    """
    coord_li1 = np.array([i for i in zip(li1.coords.xy[0], li1.coords.xy[1])])
    coord_li2 = np.array([i for i in zip(li2.coords.xy[0], li2.coords.xy[1])])
    if len(coord_li2) > len(coord_li1):  # ensure coord_li1 is the longer of the two
        coord_li1, coord_li2 = coord_li2, coord_li1
    dist_mat = pairwise_distances(
        coord_li1, coord_li2, metric='euclidean', n_jobs=2
        )
    chkl = round(len(coord_li1)/len(coord_li2))
    return max(
        [dist_mat[i, j] for i, j in zip(
            list(range(len(coord_li1))),
            list(nrepeat(range(len(coord_li2)), chkl))[:len(coord_li1)])]
        )
Example #31
    def _fit(self, X, skip_num_points=0):
        """Fit the model using X as training data.

        Note that sparse arrays can only be handled by method='exact'.
        It is recommended that you convert your sparse array to dense
        (e.g. `X.toarray()`) if it fits in memory, or otherwise to use a
        dimensionality reduction technique (e.g. TruncatedSVD).

        Parameters
        ----------
        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
            If the metric is 'precomputed' X must be a square distance
            matrix. Otherwise it contains a sample per row. Note that when
            method='barnes_hut', X cannot be a sparse array and, if need be,
            will be converted to a 32 bit float array. Method='exact' allows
            sparse arrays and 64bit floating point inputs.

        skip_num_points : int (optional, default:0)
            This does not compute the gradient for points with indices below
            `skip_num_points`. This is useful when computing transforms of new
            data where you'd like to keep the old data fixed.
        """
        if self.method not in ['barnes_hut', 'exact']:
            raise ValueError("'method' must be 'barnes_hut' or 'exact'")
        if self.angle < 0.0 or self.angle > 1.0:
            raise ValueError("'angle' must be between 0.0 - 1.0")
        if self.method == 'barnes_hut' and sp.issparse(X):
            raise TypeError('A sparse matrix was passed, but dense '
                            'data is required for method="barnes_hut". Use '
                            'X.toarray() to convert to a dense numpy array if '
                            'the array is small enough for it to fit in '
                            'memory. Otherwise consider dimensionality '
                            'reduction techniques (e.g. TruncatedSVD)')
        else:
            X = check_array(X,
                            accept_sparse=['csr', 'csc', 'coo'],
                            dtype=np.float64)
        random_state = check_random_state(self.random_state)

        if self.early_exaggeration < 1.0:
            raise ValueError("early_exaggeration must be at least 1, but is "
                             "%f" % self.early_exaggeration)

        if self.n_iter < 200:
            raise ValueError("n_iter should be at least 200")

        if self.metric == "precomputed":
            if isinstance(self.init, string_types) and self.init == 'pca':
                raise ValueError("The parameter init=\"pca\" cannot be used "
                                 "with metric=\"precomputed\".")
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square distance matrix")
            distances = X
        else:
            if self.verbose:
                print("[t-SNE] Computing pairwise distances...")

            if self.metric == "euclidean":
                distances = pairwise_distances(X,
                                               metric=self.metric,
                                               squared=True)
            else:
                distances = pairwise_distances(X, metric=self.metric)

        if not np.all(distances >= 0):
            raise ValueError("All distances should be positive, either "
                             "the metric or precomputed distances given "
                             "as X are not correct")

        # Degrees of freedom of the Student's t-distribution. The suggestion
        # degrees_of_freedom = n_components - 1 comes from
        # "Learning a Parametric Embedding by Preserving Local Structure"
        # Laurens van der Maaten, 2009.
        degrees_of_freedom = max(self.n_components - 1.0, 1)
        n_samples = X.shape[0]
        # the number of nearest neighbors to find
        k = min(n_samples - 1, int(3. * self.perplexity + 1))

        neighbors_nn = None
        if self.method == 'barnes_hut':
            if self.verbose:
                print("[t-SNE] Computing %i nearest neighbors..." % k)
            if self.metric == 'precomputed':
                # Use the precomputed distances to find
                # the k nearest neighbors and their distances
                neighbors_nn = np.argsort(distances, axis=1)[:, :k]
            elif self.rho >= 1:
                # Find the nearest neighbors for every point
                bt = BallTree(X)
                # LvdM uses 3 * perplexity as the number of neighbors
                # And we add one to not count the data point itself
                # In the event that we have very small # of points
                # set the neighbors to n - 1
                distances_nn, neighbors_nn = bt.query(X, k=k + 1)
                neighbors_nn = neighbors_nn[:, 1:]
            elif self.rho < 1:
                # Use pyFLANN to find the nearest neighbors
                myflann = FLANN()
                testset = X
                params = myflann.build_index(testset,
                                             algorithm="autotuned",
                                             target_precision=self.rho,
                                             log_level='info')
                neighbors_nn, distances = myflann.nn_index(
                    testset, k + 1, checks=params["checks"])
                neighbors_nn = neighbors_nn[:, 1:]

            P = _joint_probabilities_nn(distances, neighbors_nn,
                                        self.perplexity, self.verbose)
        else:
            P = _joint_probabilities(distances, self.perplexity, self.verbose)
        assert np.all(np.isfinite(P)), "All probabilities should be finite"
        assert np.all(P >= 0), "All probabilities should be zero or positive"
        assert np.all(P <= 1), ("All probabilities should be less "
                                "or then equal to one")

        if isinstance(self.init, np.ndarray):
            X_embedded = self.init
        elif self.init == 'pca':
            pca = PCA(n_components=self.n_components,
                      svd_solver='randomized',
                      random_state=random_state)
            X_embedded = pca.fit_transform(X)
        elif self.init == 'random':
            X_embedded = None
        else:
            raise ValueError("Unsupported initialization scheme: %s" %
                             self.init)

        return self._tsne(P,
                          degrees_of_freedom,
                          n_samples,
                          random_state,
                          X_embedded=X_embedded,
                          neighbors=neighbors_nn,
                          skip_num_points=skip_num_points)
Example #32
def build_all():
    # For each class, we build all the trees and save them in CSVs
    nar_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/narrative')
    write_tree_in_csv(nar_trees)    
    
    arg_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/argumentative/')
    write_tree_in_csv(arg_trees) 
     
    inf_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative/')
    write_tree_in_csv(inf_trees) 
    
    des_trees = []
    #des_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative/')
    #write_tree_in_csv(des_trees) 
    
    
    # Note: contains pairs of (tree, tree_ID), where tree_ID is the file name.
    all_trees = nar_trees + arg_trees + inf_trees + des_trees
    int2cl = {0:'narrative', 1:'argumentative', 2:'informative',3:'descriptive'}

    path_to_save = '~/Documents/s2/tal/discourseAnalysis/data/'
    y_nar = [0 for t in nar_trees]
    y_arg = [1 for t in arg_trees]
    y_inf = [2 for t in inf_trees]
    y_des = [3 for t in des_trees]
    y = np.array( y_nar + y_arg + y_inf + y_des )
    pickle.dump(y,open(path_to_save+'labels_test.pkl','wb'))

    T = [t[0] for t in all_trees]
    pickle.dump(T,open(path_to_save+'trees_test.pkl','wb'))
    
    index = ['bin','count','norm','height','tfid']

    #Dicts
    D_bin = vectorizers.build_bin_vects(T)
    D_count = vectorizers.build_count_vects(T)
    D_norm = vectorizers.build_norm_vects(T)
    D_height = vectorizers.build_height_vects(T)
    D_tfid = vectorizers.build_tfid_vects(T)
    
    D_df = pd.DataFrame([D_bin,D_count,D_norm,D_height,D_tfid],index=index)
    D_df = D_df.transpose()
    D_df.to_pickle(path_to_save+'dicts_test.pkl')
    

    #Vects
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_bin = vectorizer.fit_transform(D_bin)
    V_count = vectorizer.fit_transform(D_count)
    V_norm = vectorizer.fit_transform(D_norm)
    V_height = vectorizer.fit_transform(D_height)
    V_tfid = vectorizer.fit_transform(D_tfid)

    V_all = np.zeros((len(index),V_bin.shape[0],V_bin.shape[1]))
    V_all = np.array([V_bin,V_count,V_norm,V_height,V_tfid])
    V_df = []
    for i in range(V_all.shape[1]):
        d = {}
        for j,v in enumerate(V_all[:,i]):
            d[index[j]]=v
        V_df.append(d)
    V_df = pd.DataFrame(V_df)
    V_df.to_pickle(path_to_save+'vects_test.pkl')
    
    #euclidean distance
    K_bin_eucl_dist = pairwise.pairwise_distances(V_bin,metric='euclidean')
    K_count_eucl_dist = pairwise.pairwise_distances(V_count,metric='euclidean')
    K_norm_eucl_dist = pairwise.pairwise_distances(V_norm,metric='euclidean')
    K_height_eucl_dist = pairwise.pairwise_distances(V_height,metric='euclidean')
    K_tfid_eucl_dist = pairwise.pairwise_distances(V_tfid,metric='euclidean')
    K_all_eucl_dist = [K_bin_eucl_dist, K_count_eucl_dist, K_norm_eucl_dist, K_height_eucl_dist, K_tfid_eucl_dist]
    
    K_all = {'eucl_dist':K_all_eucl_dist}
    pickle.dump(K_all,open(path_to_save+'kernels_test.pkl','wb'))
Example #33
    def _convert_to_similarity_matrix(self, embeddings, metric=None):
        # if self.metric.lower() == 'euclidean':
        if metric is None:
            metric = self.metric.lower()
        # return squareform(pdist(embeddings, metric=metric))
        return pairwise_distances(embeddings, metric=metric)
Example #34
ratings = pd.DataFrame(df_train.groupby('title')['rating'].mean())
ratings['number_of_ratings'] = df_train.groupby('title')['rating'].count()
movie_matrix = df_train.pivot_table(index='user_id',
                                    columns='title',
                                    values='rating')

#Calculate similarity
train_data_matrix = np.zeros((n_users, n_items))
for row in df_train.itertuples():
    train_data_matrix[row[1] - 1, row[2] - 1] = row[3]
test_data_matrix = np.zeros((n_users, n_items))
for line in df_test.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")

#Calculate popularity
item_popular = {}
for i in range(n_items):
    if np.sum(train_data_matrix[:, i]) != 0:
        item_popular[i] = np.sum(train_data_matrix[:, i] != 0)
item_count = len(item_popular)

#Similarity compensates for each person's rating habits and the popularity of movies
rate = train_data_matrix.mean(axis=1)
rate2 = (train_data_matrix - rate[:, np.newaxis])
pred_user = rate[:, np.newaxis] + user_similarity.dot(rate2) / np.array(
    [np.abs(user_similarity).sum(axis=1)]).T
pred_item = train_data_matrix.dot(item_similarity) / np.array(
    [np.abs(item_similarity).sum(axis=1)])
Example #35
    resultfile.close()
header1 = ['user_id', 'item_id', 'rating']

df = pd.read_csv('D://recommendation//new//movielens1M.txt', sep='\t', names=header1,index_col=False)
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 

from sklearn import cross_validation as cv
dataset,_ = cv.train_test_split(df, test_size=0)
originalMatrix = np.zeros((n_users, n_items))
for line in dataset.itertuples():
    originalMatrix[int(line[1])-1, int(line[2])-1] = line[3]  

from sklearn.metrics.pairwise import pairwise_distances
cosine_distance = pairwise_distances(originalMatrix.T, metric='cosine')


sim = 1-cosine_distance




#based on similarity
label = np.array(
    [
        # ... (large inline array of integer labels, values 0-19, collapsed to a placeholder) ...

    ]

)
Ejemplo n.º 36
0
def Euc_to_fst(vector_lib,
               n_comp=5,
               pop_max=8,
               Iter=20,
               bias_range=[20, 300],
               Eigen=False,
               Scale=False,
               Centre=True):
    ### Select pre and post processing measures.

    length_haps = vector_lib.shape[1]

    print('length haps: {}, N iterations: {}, range pops: {}'.format(
        length_haps, Iter, pop_max))

    #### Predict
    predicted = []

    #def controled_fsts(vector_lib,Eigen,length_haps,Scale,Center,N_pops,n_comp,Iter,N_sims,MixL,MixP,Pairs):
    lengths_vector = []

    ### store distances between centroids
    biased_pairwise = []

    ### store PC projection:
    dist_PC_corrected = {x: [] for x in range(n_comp)}

    ### store fsts
    fst_store = []

    ### proceed.

    for rep in range(Iter):

        N_pops = np.random.choice(range(3, pop_max), 1, replace=False)[0]

        ## Population Sizes and labels
        bias_scheme = np.random.choice(range(bias_range[0], bias_range[1]),
                                       N_pops,
                                       replace=False)

        bias_labels = np.repeat(np.array([x for x in range(N_pops)]),
                                bias_scheme)

        ### triangular matrices extract.
        iu1 = np.triu_indices(N_pops, 1)  # for centroid comparison

        iu_bias = np.triu_indices(sum(bias_scheme), 1)

        iu_control = np.triu_indices(2, 1)

        Pops = np.random.choice(vector_lib.shape[0], N_pops, replace=False)
        #print('Iter: {}, vectors selected: {}, hap length: {}'.format(rep,Pops,length_haps))
        ########## FST

        freqs_selected = vector_lib[Pops, :length_haps]
        Pairwise = Ste.return_fsts2(freqs_selected)

        #fsts_compare = scale(Pairwise.fst)
        fsts_compare = Pairwise.fst

        fst_store.extend(fsts_compare)

        ## lengths
        lengths_vector.extend([length_haps] * len(fsts_compare))

        #### generate data and perform PCA
        data = []

        for k in range(N_pops):

            probs = vector_lib[Pops[k], :]

            m = bias_scheme[k]
            Haps = [[
                np.random.choice([1, 0], p=[1 - probs[x], probs[x]])
                for x in range(length_haps)
            ] for acc in range(m)]

            data.extend(Haps)

        data2 = np.array(data)

        if Scale:
            data2 = scale(data2)

        pca = PCA(n_components=n_comp, whiten=False,
                  svd_solver='randomized').fit(data2)

        feat_bias = pca.transform(data2)

        if Eigen:
            feat_bias = feat_bias * pca.explained_variance_ratio_

        #### Centroid distances

        bias_centroids = [
            np.mean(feat_bias[
                [y for y in range(feat_bias.shape[0])
                 if bias_labels[y] == z], :],
                    axis=0) for z in range(N_pops)
        ]
        bias_centroids = np.array(bias_centroids)

        bias_pair_dist = pairwise_distances(bias_centroids, metric='euclidean')
        bias_pair_dist = bias_pair_dist[iu1]
        #bias_pair_dist= scale(bias_pair_dist)

        biased_pairwise.extend(bias_pair_dist)

    Size = length_haps
    fst_lm_range = [0, .3]

    Lindexes = [
        x for x in range(len(lengths_vector)) if lengths_vector[x] == Size
        and fst_store[x] >= fst_lm_range[0] and fst_store[x] <= fst_lm_range[1]
    ]
    y_true = [np.log(biased_pairwise[x]) for x in Lindexes]
    fst_x = [np.log(fst_store[x]) for x in Lindexes]
    m_coeff, b = np.polyfit(y_true, fst_x, 1)
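    # np.polyfit(x, y, 1) fits y ~ m*x + b, so here log(Fst) ~ m_coeff * log(centroid distance) + b;
    # Fst_predict (Ejemplo n.º 47 below) applies this same fitted relation to new centroid distances.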

    return m_coeff, b, fst_x, y_true
Ejemplo n.º 37
0
def eval_topk(train_file, test_file, k, batch_size, dist_function, gamma,
              threads):

    train_data = np.load(train_file)
    test_data = np.load(test_file)

    train_z = train_data['z']
    train_labels = train_data['labels']
    n_train, dz = train_z.shape
    print(train_z.shape)

    test_z = test_data['z']
    test_labels = test_data['labels']
    test_pred_probs = test_data['pred_probs']
    test_confs = test_data['confs']
    print(test_confs.shape)

    n_test, n_classes = test_pred_probs.shape

    # scatter the labels
    if len(train_labels.shape) == 1 or train_labels.shape[1] == 1:
        temp = np.zeros((n_train, n_classes), dtype=int)
        temp[np.arange(n_train), train_labels] = 1
        train_labels = temp

    if len(test_labels.shape) == 1 or test_labels.shape[1] == 1:
        temp = np.zeros((n_test, n_classes), dtype=int)
        temp[np.arange(n_test), test_labels] = 1
        test_labels = temp

    sparsity = []
    correct = 0
    agreed = 0
    total_points = 0
    mae = 0.0
    n_batches = int(np.ceil(n_test / batch_size))
    for b in range(n_batches):
        if b < n_batches - 1:
            indices = np.arange(b * batch_size, (b + 1) * batch_size)
        else:
            indices = np.arange(b * batch_size, n_test)
        batch_size_b = len(indices)
        test_points = test_z[indices, :]

        if dist_function == 'Gauss':
            dists = pairwise_distances(test_points,
                                       train_z,
                                       metric='sqeuclidean',
                                       n_jobs=threads)
            dists = np.exp(-gamma * dists)
        elif dist_function == 'Laplace':
            dists = pairwise_distances(test_points,
                                       train_z,
                                       metric='l1',
                                       n_jobs=threads)
            dists = 0.5 * np.exp(-1.0 * dists)
        elif dist_function == 'InverseQuad':
            dists = pairwise_distances(test_points,
                                       train_z,
                                       metric='sqeuclidean',
                                       n_jobs=threads)
            dists = 1.0 / (dists + gamma)
        else:
            raise ValueError("Distance function not recognized.")

        # sort each row by weight (smallest first)
        order = np.argsort(dists, axis=1)

        # trying alternate masking
        mask = np.zeros_like(dists)
        for j in range(len(indices)):
            mask[np.ones(k, dtype=int) * j, order[j, -k:]] = 1

        # hopefully, we get exactly k points per row
        print("{:d}/{:d}".format(b, n_batches), np.min(mask.sum(1)),
              np.max(mask.sum(1)))

        # compute the weighted sums per class using only these points
        class_dists = np.dot(mask * dists, train_labels)
        class_dist_sums = class_dists.sum(1)
        topk_probs = class_dists / class_dist_sums.reshape((batch_size_b, 1))

        # measure accuracy
        correct += np.sum(
            class_dists.argmax(1) == test_labels[indices, :].argmax(1))
        # also measure agreement with predicted labels
        agreed += np.sum(
            class_dists.argmax(1) == test_pred_probs[indices, :].argmax(1))
        total_points += batch_size_b
        test_preds = test_pred_probs[indices, :].argmax(1)
        mae += np.sum(
            np.abs(topk_probs[np.arange(batch_size_b), test_preds] -
                   test_pred_probs[indices, test_preds])) / float(batch_size_b)

    print(total_points, n_test)
    acc = correct / float(total_points)
    print(acc)
    agreement = agreed / float(total_points)
    print(agreement)
    return acc, agreement
Ejemplo n.º 38
0
def reliefF(X, y, mode="rank", **kwargs):
    """
    This function implements the reliefF feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    kwargs: {dictionary}
        parameters of reliefF:
        k: {int}
            choices for the number of neighbors (default k = 5)

    Output
    ------
    score: {numpy array}, shape (n_features,)
        reliefF score for each feature

    Reference
    ---------
    Robnik-Sikonja, Marko et al. "Theoretical and empirical analysis of relieff and rrelieff." Machine Learning 2003.
    Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013.
    """
    def feature_ranking(score):
        """
        Rank features in descending order according to reliefF score; the higher the
        reliefF score, the more important the feature.
        """
        idx = np.argsort(score, 0)
        return idx[::-1]

    if "k" not in list(kwargs.keys()):
        k = 5
    else:
        k = kwargs["k"]
    n_samples, n_features = X.shape

    # calculate pairwise distances between instances
    distance = pairwise_distances(X, metric='manhattan')

    score = np.zeros(n_features)

    # the number of sampled instances is equal to the number of total instances
    for idx in range(n_samples):
        near_hit = []
        near_miss = dict()

        self_fea = X[idx, :]
        c = np.unique(y).tolist()

        stop_dict = dict()
        for label in c:
            stop_dict[label] = 0
        del c[c.index(y[idx])]

        p_dict = dict()
        p_label_idx = float(len(y[y == y[idx]])) / float(n_samples)

        for label in c:
            p_label_c = float(len(y[y == label])) / float(n_samples)
            p_dict[label] = p_label_c / (1 - p_label_idx)
            near_miss[label] = []

        distance_sort = []
        distance[idx, idx] = np.max(distance[idx, :])

        for i in range(n_samples):
            distance_sort.append([distance[idx, i], int(i), y[i]])
        distance_sort.sort(key=lambda x: x[0])

        for i in range(n_samples):
            # find k nearest hit points
            if distance_sort[i][2] == y[idx]:
                if len(near_hit) < k:
                    near_hit.append(distance_sort[i][1])
                elif len(near_hit) == k:
                    stop_dict[y[idx]] = 1
            else:
                # find k nearest miss points for each label
                if len(near_miss[distance_sort[i][2]]) < k:
                    near_miss[distance_sort[i][2]].append(distance_sort[i][1])
                else:
                    if len(near_miss[distance_sort[i][2]]) == k:
                        stop_dict[distance_sort[i][2]] = 1
            stop = True
            for (key, value) in list(stop_dict.items()):
                if value != 1:
                    stop = False
            if stop:
                break

        # update reliefF score
        near_hit_term = np.zeros(n_features)
        for ele in near_hit:
            near_hit_term = np.array(
                abs(self_fea - X[ele, :])) + np.array(near_hit_term)

        near_miss_term = dict()
        for (label, miss_list) in list(near_miss.items()):
            near_miss_term[label] = np.zeros(n_features)
            for ele in miss_list:
                near_miss_term[label] = np.array(
                    abs(self_fea - X[ele, :])) + np.array(
                        near_miss_term[label])
            score += near_miss_term[label] / (k * p_dict[label])
        score -= near_hit_term / k
    if mode == 'raw':
        return score
    elif mode == 'index':
        return feature_ranking(score)
    elif mode == 'rank':
        return reverse_argsort(feature_ranking(score), X.shape[1])
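
# Hedged usage sketch (not from the original snippet): ranking the columns of a small
# synthetic data set with the reliefF implementation above. Assumes numpy (np) and
# pairwise_distances are already imported as in the snippet; mode="index" is used so the
# external reverse_argsort helper required by mode="rank" is not needed.
rng_demo = np.random.RandomState(0)
X_demo = rng_demo.rand(50, 8)
y_demo = (X_demo[:, 0] > 0.5).astype(int)   # class depends only on feature 0
ranking = reliefF(X_demo, y_demo, mode="index", k=5)
print(ranking)                              # feature indices, most relevant first
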
def PearsonCorrelation(UserItemMatrix):
    similarity = 1 - pairwise_distances(UserItemMatrix, metric='correlation')

    similarity[np.isnan(similarity)] = 0
    return similarity
Ejemplo n.º 40
0
appendix_col = np.zeros(
    (R_train.shape[0], unique_movie_num - R_train.shape[1]))
R_train = np.concatenate((R_train, appendix_col), axis=1)

# Computes the R matrix on testing data
R_test = test_df.pivot(index='userId', columns='movieId',
                       values='rating').fillna(0).values
appendix_row = np.zeros((unique_user_num - R_test.shape[0], R_test.shape[1]))
R_test = np.concatenate((R_test, appendix_row), axis=0)
appendix_col = np.zeros((R_test.shape[0], unique_movie_num - R_test.shape[1]))
R_test = np.concatenate((R_test, appendix_col), axis=1)

########### user-based collaborative filtering ################################

# Computes user-based similarity matrix
S_user = pairwise_distances(R_train, metric='euclidean')

# Training set prediction
#mean_user_rating = R_train.mean(axis = 1)
mean_user_rating = np.zeros(R_train.shape[0])
for i in range(R_train.shape[0]):
    mean_user_rating[i] = R_train[i].mean()
mean_user_rating_train = mean_user_rating[:, np.newaxis]
difference_train = R_train - mean_user_rating_train
numerator_train_ub = np.dot(S_user, difference_train)
denominator_train_ub = np.abs(S_user).sum(axis=1)[:, np.newaxis]
prediction_train_ub = mean_user_rating[:, np.newaxis] + numerator_train_ub\
                        / denominator_train_ub

# Computes root mean square error of training set
prediction_specified_train_ub = prediction_train_ub[R_train.nonzero()]
def printPrediction(model, smilesData):
    # FIXME hardcoded

    smilesDf = pd.DataFrame(smilesData, columns=[cc.exp['params']['data']['smiles']])

    input = data.formatSequentialInput(smilesDf)

    output = model.predict(input)

    for i, smiles in enumerate(smilesData):
        print 'Prediction for {}'.format(smiles)
        print output[i]

    distanceMatrixCosine = pairwise_distances(output, metric='cosine')
    distanceMatrixCorrel = pairwise_distances(output, metric='correlation')
    distanceMatrixEuclid = pairwise_distances(output, metric='euclidean')

    print 'Distance matrix cosine'
    print distanceMatrixCosine
    print 'Distance matrix correlation'
    print distanceMatrixCorrel
    print 'Distance matrix euclid'
    print distanceMatrixEuclid

    '''

    layerIdx = 1
    cfg = model.get_config()[:layerIdx+1]
    cfg[0]['config']['dropout_U'] = 0
    cfg[0]['config']['dropout_W'] = 0

    print cfg[0]
    print cfg[1]
    # del cfg[1]
    # layerIdx -= 1
    # print cfg
    cfg[layerIdx]['config']['return_sequences'] = True
    '''


    layerIdx = 2
    cfg = model.get_config()[:layerIdx+1]
    del cfg[1]
    layerIdx -= 1
    # print cfg
    cfg[layerIdx]['config']['return_sequences'] = True

    seqModel = Sequential.from_config(cfg)
    seqModel.set_weights(model.get_weights())
    seqModel.layers[layerIdx].return_sequences = True


    outputFunction = K.function([seqModel.layers[0].input],
              [seqModel.layers[layerIdx].output])

    outputSymbols = outputFunction([input])[0]

    outputLastSymbol = outputSymbols[:,outputSymbols.shape[1]-1,:]

    distanceMatrixLastSymbolCorrel = np.corrcoef(outputLastSymbol)

    print 'Distance matrix last symbol correlation'
    print distanceMatrixLastSymbolCorrel
Ejemplo n.º 42
0
def test_knn_imputer_weight_distance(na):
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])

    # Test with "distance" weight
    nn = KNeighborsRegressor(metric="euclidean", weights="distance")
    X_rows_idx = [0, 2, 3, 4, 5, 6]
    nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0])
    knn_imputed_value = nn.predict(X[1:2, 1:])[0]

    # Manual calculation
    X_neighbors_idx = [0, 2, 3, 4, 5]
    dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    weights = 1 / dist[:, X_neighbors_idx].ravel()
    manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights)

    X_imputed_distance1 = np.array(
        [[0, 0], [manual_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
    )

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array(
        [[0, 0], [knn_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
    )

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance1)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance2)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array(
        [
            [na, 0, 0],
            [2, 1, 2],
            [3, 2, 3],
            [4, 5, 5],
        ]
    )

    # neighbors are rows 1 and 2; the nan_euclidean_distances are:
    dist_0_1 = np.sqrt((3 / 2) * ((1 - 0) ** 2 + (2 - 0) ** 2))
    dist_0_2 = np.sqrt((3 / 2) * ((2 - 0) ** 2 + (3 - 0) ** 2))
    imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])

    X_imputed = np.array(
        [
            [imputed_value, 0, 0],
            [2, 1, 2],
            [3, 2, 3],
            [4, 5, 5],
        ]
    )

    imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test with varying missingness patterns
    X = np.array(
        [
            [1, 0, 0, 1],
            [0, na, 1, na],
            [1, 1, 1, na],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
            [1, 0, 1, 1],
            [10, 10, 10, 10],
        ]
    )

    # Get weights of donor neighbors
    dist = nan_euclidean_distances(X, missing_values=na)
    r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
    r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
    r1c1_nbor_wt = 1 / r1c1_nbor_dists
    r1c3_nbor_wt = 1 / r1c3_nbor_dists

    r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
    r2c3_nbor_wt = 1 / r2c3_nbor_dists

    # Collect donor values
    col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array(
        [
            [1, 0, 0, 1],
            [0, r1c1_imp, 1, r1c3_imp],
            [1, 1, 1, r2c3_imp],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
            [1, 0, 1, 1],
            [10, 10, 10, 10],
        ]
    )

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    X = np.array(
        [
            [0, 0, 0, na],
            [1, 1, 1, na],
            [2, 2, na, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [na, 7, 7, 7],
        ]
    )

    dist = pairwise_distances(
        X, metric="nan_euclidean", squared=False, missing_values=na
    )

    # Calculate weights
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array(
        [
            [0, 0, 0, r0c3],
            [1, 1, 1, r1c3],
            [2, 2, r2c2, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [r7c0, 7, 7, 7],
        ]
    )

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)
def train(ocsvm, X_train, X_test, Y_test, kernel, nu, GridSearch=True, **kwargs):

    if X_train.ndim > 2:
        X_train_shape = X_train.shape
        X_train = X_train.reshape(X_train_shape[0], np.prod(X_train_shape[1:]))
    else:
        X_train = X_train

    if kernel in ('DegreeKernel', 'WeightedDegreeKernel'):
        # get_kernel_matrix(kernel=kernel, X_train=X_train, **kwargs)
        # svm.fit(K_train)
        print('unexpected behaviour')
    else:
        if GridSearch and kernel == 'rbf':

            # use grid search cross-validation to select gamma
            print("Using GridSearchCV for hyperparameter selection...")

            # sample small hold-out set from test set for hyperparameter selection. Save as val set.
            
            n_test_set = len(X_test)
            n_val_set = int(0.1 * n_test_set)
            n_test_out = 0
            n_test_norm = 0
            n_val_out = 0
            n_val_norm = 0
            while (n_test_out == 0) | (n_test_norm == 0) | (n_val_out == 0) | (n_val_norm ==0):
                perm = np.random.permutation(n_test_set)
                X_val = X_test[perm[:n_val_set]]
                y_val = Y_test[perm[:n_val_set]]
                # only accept the split if both the val set and the remaining test set
                # contain outliers as well as normal samples (so AUC can be computed on both)
                n_val_out = np.sum(Y_test[perm[:n_val_set]])
                n_val_norm = np.sum(Y_test[perm[:n_val_set]] == 0)
                n_test_out = np.sum(Y_test[perm[n_val_set:]])
                n_test_norm = np.sum(Y_test[perm[n_val_set:]] == 0)

            X_test = X_test[perm[n_val_set:]]
            Y_test = Y_test[perm[n_val_set:]]
            n_val = len(y_val)
            n_test_set = len(Y_test)

            val_scores = np.zeros((len(y_val), 1))
            test_scores = np.zeros((len(Y_test), 1))

            cv_auc = 0.0
            cv_acc = 0
            cv_f1 = 0

            g_best = 0.1
            for gamma in np.logspace(-10, -1, num=10, base=2):

                # train on selected gamma
                cv_svm = svm.OneClassSVM(kernel='rbf', nu=nu, gamma=gamma)
                cv_svm.fit(X_train)

                # predict on small hold-out set
                val_acc, _, _, _, val_f1_score, val_auc_roc = predict(cv_svm, X_val, y_val, kernel)

                # save model if AUC on hold-out set improved
                if val_f1_score > cv_f1:
                    # print('gamma set to: ', g_best)
                    ocsvm = cv_svm
                    g_best = gamma
                    cv_auc = val_auc_roc
                    cv_f1 = val_f1_score

            # save results of best cv run
            # diag['val']['auc'] = cv_auc
            # diag['val']['acc'] = cv_acc

            # re-initialize the model with the best gamma found and refit on the full training set
            ocsvm = svm.OneClassSVM(kernel='rbf', nu=nu, gamma=g_best)
            ocsvm.fit(X_train)

        else:
            # if rbf-kernel, re-initialize svm with gamma minimizing the
            # numerical error
            if kernel == 'rbf':
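                # heuristic: with gamma = 1 / (max pairwise distance)^2 the RBF kernel
                # values exp(-gamma * d^2) stay within [exp(-1), 1], avoiding underflow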
                gamma = 1 / (np.max(pairwise_distances(X_train)) ** 2)
                # ocsvm = svm.OneClassSVM(kernel='rbf', nu=nu, gamma=gamma)

            ocsvm.fit(X_train)

    return ocsvm
Ejemplo n.º 44
0
def test_agglomerative_clustering():
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        try:
            tempdir = mkdtemp()
            clustering = AgglomerativeClustering(n_clusters=10,
                                                 connectivity=connectivity,
                                                 memory=tempdir,
                                                 linkage=linkage)
            clustering.fit(X)
            labels = clustering.labels_
            assert_true(np.size(np.unique(labels)) == 10)
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert_true(np.size(np.unique(clustering.labels_)) == 10)
        # Check that a connectivity matrix of the wrong shape raises a ValueError
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]),
            linkage=linkage)
        assert_raises(ValueError, clustering.fit, X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity.toarray(),
                                         affinity="manhattan",
                                         linkage="ward")
    assert_raises(ValueError, clustering.fit, X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=np.ones(
                                                 (n_samples, n_samples)),
                                             affinity=affinity,
                                             linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(n_clusters=10,
                                              connectivity=None,
                                              affinity=affinity,
                                              linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_,
                                         clustering.labels_), 1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(n_clusters=10,
                                          connectivity=connectivity,
                                          affinity='precomputed',
                                          linkage="complete")
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
Ejemplo n.º 45
0
def run_cluster(compl, qfib, qsym, cfg):
    """
    """
    cl_radius = cfg.find_orientations.clustering.radius
    min_compl = cfg.find_orientations.clustering.completeness
    algorithm = cfg.find_orientations.clustering.algorithm

    start = time.clock()  # time this

    num_above = sum(np.array(compl) > min_compl)
    if num_above == 0:
        # nothing to cluster
        qbar = cl = np.array([])
    elif num_above == 1:
        # short circuit
        qbar = qfib[:, np.array(compl) > min_compl]
        cl = [1]
    else:
        # use compiled module for distance
        # just to be safe, must order qsym as C-contiguous
        qsym = np.array(qsym.T, order='C').T
        quat_distance = lambda x, y: xfcapi.quat_distance(
            np.array(x, order='C'), np.array(y, order='C'), qsym)

        qfib_r = qfib[:, np.array(compl) > min_compl]

        logger.info("Feeding %d orientations above %.1f%% to clustering",
                    qfib_r.shape[1], 100 * min_compl)

        if algorithm == 'dbscan' and not have_sklearn:
            algorithm = 'fclusterdata'
            logger.warning(
                "sklearn >= 0.14 required for dbscan, using fclusterdata")
        if algorithm == 'dbscan':
            pdist = pairwise_distances(qfib_r.T,
                                       metric=quat_distance,
                                       n_jobs=-1)
            core_samples, labels = dbscan(pdist,
                                          eps=np.radians(cl_radius),
                                          min_samples=1,
                                          metric='precomputed')
            cl = np.array(labels, dtype=int) + 1
        elif algorithm == 'fclusterdata':
            cl = cluster.hierarchy.fclusterdata(qfib_r.T,
                                                np.radians(cl_radius),
                                                criterion='distance',
                                                metric=quat_distance)
        else:
            raise RuntimeError("Clustering algorithm %s not recognized" %
                               algorithm)

        nblobs = len(np.unique(cl))

        qbar = np.zeros((4, nblobs))
        for i in range(nblobs):
            npts = sum(cl == i + 1)
            qbar[:,
                 i] = rot.quatAverage(qfib_r[:, cl == i + 1].reshape(4, npts),
                                      qsym).flatten()

    logger.info("clustering took %f seconds", time.clock() - start)
    logger.info(
        "Found %d orientation clusters with >=%.1f%% completeness"
        " and %2f misorientation", qbar.size / 4, 100. * min_compl, cl_radius)

    return np.atleast_2d(qbar), cl
Ejemplo n.º 46
0
# To my understanding, the correct way is as follows:
# u*sqrt(s) as the user-feature-array
# sqrt(s)*vt as the item-feature-array
# user_similarity = pairwise_distances(u*sqrt(s), metric='cosine')
# item_similarity = pairwise_distances(sqrt(s)*vt, metric='cosine')
# if a user/item that never appeared in training needs to be predicted:
#    A = U*S*V^T  ====>  u' = new * V^(-1) * S^(-1)
#    then compute the similarity between u' and the other users
#    (and do the same for items)

user_train_matrix = u.dot(np.sqrt(s_diag_matrix))
item_train_matrix = np.sqrt(s_diag_matrix).dot(vt)
print str(user_train_matrix.shape), '\tuser_train_matrix\t', type(user_train_matrix)
print str(item_train_matrix.shape), '\titem_train_matrix\t', type(item_train_matrix)
user_similarity = pairwise_distances(user_train_matrix, metric='cosine')
item_similarity = pairwise_distances(item_train_matrix.T, metric='cosine')
print str(user_similarity.shape), '\tuser_similarity\t\t', type(user_similarity)
print str(item_similarity.shape), '\titem_similarity\t\t', type(item_similarity)
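
# --- Hedged illustration (not part of the original snippet): the "fold-in" described in
# --- the comment block above, using the standard formula u' = a_new * V * S^(-1) as one
# --- reading of it. Assumes s_diag_matrix has no zero singular values so it is invertible;
# --- new_user_row is a stand-in ratings vector for a user absent from the training matrix.
new_user_row = np.ones((1, vt.shape[1]))
new_user_factors = new_user_row.dot(vt.T).dot(np.linalg.inv(s_diag_matrix))
new_user_features = new_user_factors.dot(np.sqrt(s_diag_matrix))
new_user_distance = pairwise_distances(
    np.vstack([user_train_matrix, new_user_features]), metric='cosine')[-1, :-1]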


### ==== 6) predict by diff type ==== ###
def rmse(prediction, ground_truth):
	prediction   = prediction[ground_truth.nonzero()].flatten()
	ground_truth = ground_truth[ground_truth.nonzero()].flatten()
	return sqrt(mean_squared_error(prediction, ground_truth))

def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)  ## mean rating of each user
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])  ## np.newaxis promotes shape (n,) to (n, 1)
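        ## (hedged sketch, not in the original snippet: the example is truncated here, so the
        ## lines below mirror the formulas from Ejemplo n.º 48 to finish a typical predict())
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) \
            / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred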
Ejemplo n.º 47
0
def Fst_predict(vector_lib,
                m_coeff,
                b,
                n_comp=5,
                pop_max=8,
                Iter=20,
                bias_range=[20, 300],
                Eigen=False,
                Scale=False,
                Centre=True):
    ### Select pre and post processing measures.

    length_haps = vector_lib.shape[1]

    print('length haps: {}, N iterations: {}, range pops: {}'.format(
        length_haps, Iter, pop_max))

    #### Predict
    predicted = []

    #def controled_fsts(vector_lib,Eigen,length_haps,Scale,Center,N_pops,n_comp,Iter,N_sims,MixL,MixP,Pairs):
    lengths_vector = []

    ### store distances between centroids
    biased_pairwise = []

    ### store PC projection:
    dist_PC_corrected = {x: [] for x in range(n_comp)}

    ### store fsts
    fst_store = []

    ### proceed.

    for rep in range(Iter):

        N_pops = np.random.choice(range(3, pop_max), 1, replace=False)[0]

        ## Population Sizes and labels
        bias_scheme = np.random.choice(range(bias_range[0], bias_range[1]),
                                       N_pops,
                                       replace=False)

        bias_labels = np.repeat(np.array([x for x in range(N_pops)]),
                                bias_scheme)

        ### triangular matrices extract.
        iu1 = np.triu_indices(N_pops, 1)  # for centroid comparison

        iu_bias = np.triu_indices(sum(bias_scheme), 1)

        iu_control = np.triu_indices(2, 1)

        Pops = np.random.choice(vector_lib.shape[0], N_pops, replace=False)
        #print('Iter: {}, vectors selected: {}, hap length: {}'.format(rep,Pops,length_haps))
        ########## FST

        freqs_selected = vector_lib[Pops, :length_haps]
        Pairwise = Ste.return_fsts2(freqs_selected)

        #fsts_compare = scale(Pairwise.fst)
        fsts_compare = Pairwise.fst

        fst_store.extend(fsts_compare)

        ## lengths
        lengths_vector.extend([length_haps] * len(fsts_compare))

        #### generate data and perform PCA
        data = []

        for k in range(N_pops):

            probs = vector_lib[Pops[k], :]

            m = bias_scheme[k]
            Haps = [[
                np.random.choice([1, 0], p=[1 - probs[x], probs[x]])
                for x in range(length_haps)
            ] for acc in range(m)]

            data.extend(Haps)

        data2 = np.array(data)

        if Scale:
            data2 = scale(data2)

        pca = PCA(n_components=n_comp, whiten=False,
                  svd_solver='randomized').fit(data2)

        feat_bias = pca.transform(data2)

        if Eigen:
            feat_bias = feat_bias * pca.explained_variance_ratio_

        #### Centroid distances

        bias_centroids = [
            np.mean(feat_bias[
                [y for y in range(feat_bias.shape[0])
                 if bias_labels[y] == z], :],
                    axis=0) for z in range(N_pops)
        ]
        bias_centroids = np.array(bias_centroids)

        bias_pair_dist = pairwise_distances(bias_centroids, metric='euclidean')
        bias_pair_dist = bias_pair_dist[iu1]
        #bias_pair_dist= scale(bias_pair_dist)

        fst_pred = [np.exp(m_coeff * np.log(x) + b) for x in bias_pair_dist]
        predicted.extend(fst_pred)

    fig = [go.Scatter(x=fst_store, y=predicted, mode='markers')]

    layout = go.Layout(title='test of prediction',
                       yaxis=dict(title='predicted Fst'),
                       xaxis=dict(title='observed Fst'))

    fig = go.Figure(data=fig, layout=layout)
    iplot(fig)
Ejemplo n.º 48
0
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances

header = ['user_id', 'sport_rating', 'moive_rating']
df = pd.read_csv('user.csv', names=header)
train = df.as_matrix()

user_similarity = pairwise_distances(train, metric='cosine')
# print user_similarity

item_similarity = pairwise_distances(train.T, metric='cosine')
# print item_similarity

mean_user_rating = train.mean(axis=1)
ratings_diff = (train - mean_user_rating[:, np.newaxis])
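# user-based prediction: each user's mean rating plus the user_similarity-weighted
# deviations of all users from their own means, normalized by the summed weights per user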
pred_user = mean_user_rating[:, np.newaxis] + user_similarity.dot(
    ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
# print pred_user

pred_item = train.dot(item_similarity) / np.array(
    [np.abs(item_similarity).sum(axis=1)])
# print pred_item

########################################################################

from boto3.dynamodb.conditions import Key, Attr
from boto3.session import Session
import os
import sys
import json
Ejemplo n.º 49
0
def load(hdf5_file_name, data, minPts, eps = None, quantile = 50, subsamples_matrix = None, samples_weights = None, 
metric = 'minkowski', p = 2, verbose = True):
    """Determines the radius 'eps' for DBSCAN clustering of 'data' in an adaptive, data-dependent way.
    Parameters
    ----------
    hdf5_file_name : file object or string
        The handle or name of an HDF5 data structure where any array needed for DBSCAN
        and too large to fit into memory is to be stored.
    data : array of shape (n_samples, n_features)
        An array of features retained from the data-set to be analysed. 
        Subsamples of this curated data-set can also be analysed by a call to DBSCAN by providing an appropriate 
        list of selected sample labels, stored in 'subsamples_matrix' (see below).
    subsamples_matrix : array of shape (n_runs, n_subsamples), optional (default = None)
        Each row of this matrix contains a set of indices identifying the samples selected from the whole data-set
        for each of 'n_runs' independent rounds of DBSCAN clusterings.
    minPts : int
        The number of points within an epsilon-radius hypersphere for that region to qualify as dense.
    eps : float, optional (default = None)
        Sets the maximum distance separating two data-points for those data-points to be considered 
        as part of the same neighborhood.
    quantile : int, optional (default = 50)
        If 'eps' is not provided by the user, it will be determined as the 'quantile' of the distribution 
        of the k-nearest distances to each sample, with k set to 'minPts'.
    samples_weights : array of shape (n_runs, n_samples), optional (default = None)
        Holds the weights of each sample. A sample with weight greater than 'minPts' is guaranteed to be
        a core sample; a sample with negative weight tends to prevent its 'eps'-neighbors from being core. 
        Weights are absolute and default to 1.
    metric : string or callable, optional (default = 'minkowski')
        The metric to use for computing the pairwise distances between samples 
        (each sample corresponds to a row in 'data'). If metric is a string or callable, it must be compatible 
        with metrics.pairwise.pairwise_distances.
    p : float, optional (default = 2)
        If a Minkowski metric is used, 'p' determines its power.
    verbose : Boolean, optional (default = True)
        Whether to display messages reporting the status of the computations and the time it took 
        to complete each major stage of the algorithm. 
    Returns
    -------
    eps : float
        The parameter of DBSCAN clustering specifying whether points are density-reachable.
        This is either a copy of the value provided at input or, if the user did not specify 'eps',
        the value determined from the k-distance graph of the data-set.
    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    
    data = np.array(data, copy = False)
    if data.ndim > 2:
        raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n" 
                         "the data array is of dimension %d. Please provide a two-dimensional "
                         "array instead.\n" % data.ndim)

    if subsamples_matrix is None:
        subsamples_matrix = np.arange(data.shape[0], dtype = int)
        subsamples_matrix = subsamples_matrix.reshape(1, -1)
 
    else:
        subsamples_matrix = np.array(subsamples_matrix, copy = False)

    if subsamples_matrix.ndim > 2:
        raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n"
                         "the array of subsampled indices is of dimension %d. "
                         "Please provide a two-dimensional array instead.\n" % subsamples_matrix.ndim)

    if (data.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(data.sum()) and not np.all(np.isfinite(data))):
        raise ValueError('\nERROR: DBSCAN_multiplex @ load:\n'
                         'the data vector contains at least one infinite or NaN entry.\n')

    if (subsamples_matrix.dtype.type is np.int_ and not np.isfinite(subsamples_matrix.sum()) and not np.all(np.isfinite(subsamples_matrix))):
        raise ValueError('\nERROR: DBSCAN_multiplex @ load:\n' 
                         'the array of subsampled indices contains at least one infinite or NaN entry.\n')

    if not np.all(subsamples_matrix >= 0):
        raise ValueError('\nERROR: DBSCAN_multiplex @ load:\n'
                         'the sampled indices should all be positive integers.\n') 

    N_samples = data.shape[0]
    N_runs, N_subsamples = subsamples_matrix.shape

    if N_subsamples > N_samples:
        raise ValueError('\nERROR: DBSCAN_multiplex @ load:\n'
                         'the number of sampled indices cannot exceed the total number of samples in the whole data-set.\n')

    for i in xrange(N_runs):
        subsamples_matrix[i] = np.unique(subsamples_matrix[i])
 
    if not isinstance(minPts, int):
        raise TypeError("\nERROR: DBSCAN_multiplex @ load:\n"
                        "the parameter 'minPts' must be an integer.\n")

    if minPts < 2:
        raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n"
                         "the value of 'minPts' must be larger than 1.\n")        

    if eps is None:
        # Determine the parameter 'eps' as a quantile (by default the median) of the
        # distribution of each sample's distance to its minPts-th nearest neighbor.
        if verbose:
            print("INFO: DBSCAN_multiplex @ load:\n"
                  "starting the determination of an appropriate value of 'eps' for this data-set"
                  " and for the other parameter of the DBSCAN algorithm set to {minPts}.\n"
                  "This might take a while.".format(**locals()))

        beg_eps = time.time()

        quantile = np.rint(quantile)
        quantile = np.clip(quantile, 0, 100)

        k_distances = kneighbors_graph(data, minPts, mode = 'distance', metric = metric, p = p).data
 
        radii = np.zeros(N_samples, dtype = float)
        for i in xrange(0, minPts):
            radii = np.maximum(radii, k_distances[i::minPts]) 
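        # k_distances stores minPts consecutive neighbor distances per sample (CSR .data order),
        # so the strided maximum above leaves radii[i] equal to the largest of sample i's
        # minPts stored neighbor distances (its "k-distance")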
             
        if quantile == 50:     
            eps = round(np.median(radii, overwrite_input = True), 4)
        else:
            eps = round(np.percentile(radii, quantile), 4)

        end_eps = time.time()

        if verbose:
            print("\nINFO: DBSCAN_multiplex @ load:\n"
                  "done with evaluating parameter 'eps' from the data-set provided."
                  " This took {} seconds. Value of epsilon: {}.".format(round(end_eps - beg_eps, 4), eps))

    else:
        if not (isinstance(eps, float) or isinstance(eps, int)):
            raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n"
                             "please provide a numeric value for the radius 'eps'.\n")

        if not eps > 0.0:
            raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n"
                             "the radius 'eps' must be positive.\n")

        eps = round(eps, 4)

    # For all samples with a large enough neighborhood, 'neighborhoods_indices' 
    # and 'neighborhoods_indptr' help us find the neighbors to every sample. Note
    # that this definition of neighbors leaves the original point in,
    # which will be considered later.
    if verbose:
        print("\nINFO: DBSCAN_multiplex @ load:\n"
             "identifying the neighbors within an hypersphere of radius {eps} around each sample,"
             " while at the same time evaluating the number of epsilon-neighbors for each sample.\n"
             "This might take a fair amount of time.".format(**locals()))

    beg_neigh = time.time()

    fileh = tables.open_file(hdf5_file_name, mode = 'r+')
    DBSCAN_group = fileh.create_group(fileh.root, 'DBSCAN_group')

    neighborhoods_indices = fileh.create_earray(DBSCAN_group, 'neighborhoods_indices', tables.Int32Atom(), (0,), 
                                                'Indices array for sparse matrix of neighborhoods', 
                                                expectedrows = int((N_samples ** 2) / 50))

    # 'neighborhoods_indptr' is such that for each of row i of the data-matrix 
    # neighborhoods_indices[neighborhoods_indptr[i]:neighborhoods_indptr[i+1]]
    # contains the column indices of row i from the array of 
    # 'eps'-neighborhoods.
    neighborhoods_indptr = np.zeros(1, dtype = np.int64)

    # For each sample, 'neighbors_counts' will keep a tally of the number 
    # of its  neighbors within a hypersphere of radius 'eps'. 
    # Note that the sample itself is counted as part of this neighborhood.
    neighbors_counts = fileh.create_carray(DBSCAN_group, 'neighbors_counts', tables.Int32Atom(), (N_runs, N_samples), 
                                           'Array of the number of neighbors around each sample of a set of subsampled points', 
                                           filters = None)   

    chunks_size = get_chunk_size(N_samples, 3)
    for i in xrange(0, N_samples, chunks_size):
        chunk = data[i:min(i + chunks_size, N_samples)]

        D = pairwise_distances(chunk, data, metric = metric, p = p, n_jobs = 1)
            
        D = (D <= eps)

        if samples_weights is None:
            for run in xrange(N_runs):
                x = subsamples_matrix[run]
                M = np.take(D, x, axis = 1)

                legit_rows = np.intersect1d(i + np.arange(min(chunks_size, N_samples - i)), x, assume_unique = True)
                M = np.take(M, legit_rows - i, axis = 0)
                
                neighbors_counts[run, legit_rows] = M.sum(axis = 1)

                del M
        else:
            for run in xrange(N_runs):
                x = subsamples_matrix[run]

                M = np.take(D, x, axis = 1)

                legit_rows = np.intersect1d(i + np.arange(min(chunks_size, N_samples - i)), x, assume_unique = True)
                M = np.take(M, legit_rows - i, axis = 0)

                neighbors_counts[run, legit_rows] = np.array([np.sum(samples_weights[x[row]]) for row in M])

                del M

        candidates = np.where(D == True)

        del D

        neighborhoods_indices.append(candidates[1])

        _, nbr = np.unique(candidates[0], return_counts = True)
        counts = np.cumsum(nbr) + neighborhoods_indptr[-1]

        del candidates

        neighborhoods_indptr = np.append(neighborhoods_indptr, counts)

    fileh.create_carray(DBSCAN_group, 'neighborhoods_indptr', tables.Int64Atom(), (N_samples + 1,), 
                        'Array of cumulative number of column indices for each row', filters = None)
    fileh.root.DBSCAN_group.neighborhoods_indptr[:] = neighborhoods_indptr[:]

    fileh.create_carray(DBSCAN_group, 'subsamples_matrix', tables.Int32Atom(), (N_runs, N_subsamples), 
                        'Array of subsamples indices', filters = None)
    fileh.root.DBSCAN_group.subsamples_matrix[:] = subsamples_matrix[:]

    fileh.close()

    end_neigh = time.time()

    if verbose:
        print("\nINFO: DBSCAN_multiplex @ load:\n"
              "done with the neighborhoods. This step took {} seconds.".format(round(end_neigh - beg_neigh, 4)))

    gc.collect()

    return eps
Ejemplo n.º 50
0
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)
    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)
    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Using size_threshold argument should raise
    # a deprecation warning
    assert_warns(DeprecationWarning,
                 manhattan_distances,
                 X,
                 Y,
                 size_threshold=10)
    # Test cosine as a string metric versus cosine callable
    # The string "cosine" uses sklearn.metric,
    # while the function cosine is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError,
                  pairwise_distances,
                  X,
                  Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
Ejemplo n.º 51
0
    def _calculateDistanceMatrix(self):
        return pairwise_distances(self.emb.x, metric='cosine')
Ejemplo n.º 52
0
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean scikit-learn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])
    D, E = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    D2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X,
                                         Y,
                                         metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X,
                                         Y,
                                         metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan")
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)

    # Test batch_size deprecation warning
    assert_warns_message(DeprecationWarning,
                         "version 0.22",
                         pairwise_distances_argmin_min,
                         X,
                         Y,
                         batch_size=500,
                         metric='euclidean')
Ejemplo n.º 53
0
def build_all_2():
    print('For each class, we build all the trees and save them in CSVs')
    path_to_save = '../data/test/try'
    """
    nar_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/narrative')
    write_tree_in_csv(nar_trees)    
    
    arg_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/argumentative')
    write_tree_in_csv(arg_trees) 
     
    inf_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative')
    write_tree_in_csv(inf_trees) 
    
    des_trees = []

    # Note: contains pairs of (tree, tree_ID), where tree_ID is the file name.
    all_trees = nar_trees + arg_trees + inf_trees + des_trees
    int2cl = {0:'narrative', 1:'argumentative', 2:'informative',3:'descriptive'}

    T = [t[0] for t in all_trees]
    pickle.dump(T,open(path_to_save+'trees.pkl','wb'))"""
    T = pickle.load(open('../data/trees_with_labels.pkl','rb'))
    T = [t[0] for t in T]

    """y_nar = [0 for t in nar_trees]
    y_arg = [1 for t in arg_trees]
    y_inf = [2 for t in inf_trees]
    y_des = [3 for t in des_trees]
    y = np.array( y_nar + y_arg + y_inf + y_des )
    pickle.dump(y,open(path_to_save+'labels.pkl','wb'))"""
    
    index = ['bin','count','norm','height','tfid']

    print('Dicts')
    D_bin = vectorizers.build_bin_vects(T)
    D_count = vectorizers.build_count_vects(T)
    D_norm = vectorizers.build_norm_vects(T)
    D_height = vectorizers.build_height_vects(T)
    D_tfid = vectorizers.build_tfid_vects(T)
    
    D_all = {'bin':D_bin ,'count': D_count,'norm': D_norm,'height': D_height,'tfid': D_tfid}
    pickle.dump(D_all,open(path_to_save+'dicts.pkl','wb'))
    

    print('Vects')
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_bin = vectorizer.fit_transform(D_bin)
    V_count = vectorizer.fit_transform(D_count)
    V_norm = vectorizer.fit_transform(D_norm)
    V_height = vectorizer.fit_transform(D_height)
    V_tfid = vectorizer.fit_transform(D_tfid)

    V_all = {'bin':V_bin ,'count': V_count,'norm': V_norm,'height': V_height,'tfid': V_tfid}
    pickle.dump(V_all,open(path_to_save+'vects.pkl','wb'))
    
    #Y = vectorizer.inverse_transform(V_bin)



    print('Kernels')

    ## tree kernels
    #max_depth = 15
    #T_p = [ctree.prune(t,max_depth) for t in T]
    #K_tree = kernels.compute_gram(T_p,T_p,kernels.tree_kernel)
    #pickle.dump(K_tree,open(path_to_save+'tree_kernel.pkl'))

    print('vector kernels')
    print('linear')
    K_bin_lin = pairwise.linear_kernel(V_bin)
    K_count_lin = pairwise.linear_kernel(V_count)
    K_norm_lin = pairwise.linear_kernel(V_norm)
    K_height_lin = pairwise.linear_kernel(V_height)
    K_tfid_lin = pairwise.linear_kernel(V_tfid)
    K_all_lin = {'bin':K_bin_lin, 'count':K_count_lin, 'norm':K_norm_lin, 'height':K_height_lin, 'tfid':K_tfid_lin}
    print('rbf')
    K_bin_rbf = pairwise.rbf_kernel(V_bin)
    K_count_rbf = pairwise.rbf_kernel(V_count)
    K_norm_rbf = pairwise.rbf_kernel(V_norm)
    K_height_rbf = pairwise.rbf_kernel(V_height)
    K_tfid_rbf = pairwise.rbf_kernel(V_tfid)
    K_all_rbf = {'bin':K_bin_rbf, 'count':K_count_rbf, 'norm':K_norm_rbf, 'height':K_height_rbf, 'tfid':K_tfid_rbf}
    print('cosine sim')
    K_bin_cos_sim = pairwise.cosine_similarity(V_bin)
    K_count_cos_sim = pairwise.cosine_similarity(V_count)
    K_norm_cos_sim = pairwise.cosine_similarity(V_norm)
    K_height_cos_sim = pairwise.cosine_similarity(V_height)
    K_tfid_cos_sim = pairwise.cosine_similarity(V_tfid)
    K_all_cos_sim = {'bin':K_bin_cos_sim, 'count':K_count_cos_sim, 'norm':K_norm_cos_sim, 'height':K_height_cos_sim, 'tfid':K_tfid_cos_sim}
    print('euclidean distance')
    K_bin_eucl_dist = pairwise.pairwise_distances(V_bin,metric='euclidean')
    K_count_eucl_dist = pairwise.pairwise_distances(V_count,metric='euclidean')
    K_norm_eucl_dist = pairwise.pairwise_distances(V_norm,metric='euclidean')
    K_height_eucl_dist = pairwise.pairwise_distances(V_height,metric='euclidean')
    K_tfid_eucl_dist = pairwise.pairwise_distances(V_tfid,metric='euclidean')
    K_all_eucl_dist = {'bin':K_bin_eucl_dist, 'count':K_count_eucl_dist, 'norm':K_norm_eucl_dist, 'height':K_height_eucl_dist, 'tfid':K_tfid_eucl_dist}
    print('minkowski distance')
    K_bin_mink_dist = pairwise.pairwise_distances(V_bin,metric='minkowski')
    K_count_mink_dist = pairwise.pairwise_distances(V_count,metric='minkowski')
    K_norm_mink_dist = pairwise.pairwise_distances(V_norm,metric='minkowski')
    K_height_mink_dist = pairwise.pairwise_distances(V_height,metric='minkowski')
    K_tfid_mink_dist = pairwise.pairwise_distances(V_tfid,metric='minkowski')
    K_all_mink_dist = {'bin':K_bin_mink_dist, 'count':K_count_mink_dist, 'norm':K_norm_mink_dist, 'height':K_height_mink_dist, 'tfid':K_tfid_mink_dist}


    K_all = {'lin':K_all_lin, 'rbf':K_all_rbf, 'cos_sim':K_all_cos_sim,'eucl_dist':K_all_eucl_dist,'mink_dist':K_all_mink_dist}
    pickle.dump(K_all,open(path_to_save+'vect_kernels.pkl','wb'))
    print "done"
Ejemplo n.º 54
0
def test_pairwise_callable_nonstrict_metric():
    # pairwise_distances should allow callable metric where metric(x, x) != 0
    # Knowing that the callable is a strict metric would allow the diagonal to
    # be left uncalculated and set to 0.
    assert_equal(pairwise_distances([[1.]], metric=lambda x, y: 5)[0, 0], 5)
Ejemplo n.º 55
0
def furthest_sample_pts(pts_input):
    D = pairwise_distances(pts_input, metric='euclidean')
    (perm, lambdas) = getGreedyPerm(D)
    return perm, lambdas
Ejemplo n.º 56
0
def gaus_feats(diags, centers, inertias, eps=1e-10):
    return np.exp(-pairwise.pairwise_distances(diags, Y=centers) /
                  (inertias + eps))
Ejemplo n.º 57
0
def lapl_feats(diags, centers, inertias, eps=1e-10):
    return np.exp(-np.sqrt(
        pairwise.pairwise_distances(diags, Y=centers) / (inertias + eps)))
Ejemplo n.º 58
0
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]


# In[91]:


pd.DataFrame(data=user_prediction).head()
pd.DataFrame(item_prediction).head()


# In[81]:


from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine') 
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')


# In[88]:


#for user based cf - prediction(u,i) = sigma(r(v,i)*similarity(u,v))/sigma(sim(u,v)) where u,v are users and i is an item
#for item based cf - prediction(u,i) = sigma(R(u,N)*similarity(i,N))/sigma(sim(i,N)) where N is the set of items similar to i

def predict(ratings, similarity, type='user'):    
    if type == 'user':        
        mean_user_rating = ratings.mean(axis=1)               
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])        
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T    
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred
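
# A usage sketch (not part of the original notebook cells): assuming the
# data_matrix, user_similarity and item_similarity computed in the cells
# above, the two prediction matrices referenced earlier can be obtained as:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')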
Ejemplo n.º 59
0
def construct_W(X, **kwargs):
    """
    Construct the affinity matrix W through different ways
    Notes
    -----
    if kwargs is null, use the default parameter settings;
    if kwargs is not null, construct the affinity matrix according to parameters in kwargs
    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        parameters to construct different affinity matrix W:
        y: {numpy array}, shape (n_samples, 1)
            the true label information needed under the 'supervised' neighbor mode
        metric: {string}
            choices for different distance measures
            'euclidean' - use euclidean distance
            'cosine' - use cosine distance (default)
        neighbor_mode: {string}
            indicates how to construct the graph
            'knn' - put an edge between two nodes if and only if they are among the
                    k nearest neighbors of each other (default)
            'supervised' - put an edge between two nodes if they belong to same class
                    and they are among the k nearest neighbors of each other
        weight_mode: {string}
            indicates how to assign weights for each edge in the graph
            'binary' - 0-1 weighting, every edge receives weight of 1 (default)
            'heat_kernel' - if nodes i and j are connected, put weight W_ij = exp(-||x_i - x_j||^2 / (2t^2))
                            this weight mode can only be used under 'euclidean' metric and you are required
                            to provide the parameter t
            'cosine' - if nodes i and j are connected, put weight cosine(x_i,x_j).
                        this weight mode can only be used under 'cosine' metric
        k: {int}
            choices for the number of neighbors (default k = 5)
        t: {float}
            parameter for the 'heat_kernel' weight_mode
        fisher_score: {boolean}
            indicates whether to build the affinity matrix in a fisher score way, in which W_ij = 1/n_l if yi = yj = l;
            otherwise W_ij = 0 (default fisher_score = false)
        reliefF: {boolean}
            indicates whether to build the affinity matrix in a reliefF way, NH(x) and NM(x,y) denotes a set of
            k nearest points to x with the same class as x, and a different class (the class y), respectively.
            W_ij = 1 if i = j; W_ij = 1/k if x_j \in NH(x_i); W_ij = -1/(c-1)k if x_j \in NM(x_i, y) (default reliefF = false)
    Output
    ------
    W: {sparse matrix}, shape (n_samples, n_samples)
        output affinity matrix W
    """

    # default metric is 'cosine'
    if 'metric' not in kwargs.keys():
        kwargs['metric'] = 'cosine'

    # default neighbor mode is 'knn' and default neighbor size is 5
    if 'neighbor_mode' not in kwargs.keys():
        kwargs['neighbor_mode'] = 'knn'
    if kwargs['neighbor_mode'] == 'knn' and 'k' not in kwargs.keys():
        kwargs['k'] = 5
    if kwargs['neighbor_mode'] == 'supervised' and 'k' not in kwargs.keys():
        kwargs['k'] = 5
    if kwargs['neighbor_mode'] == 'supervised' and 'y' not in kwargs.keys():
        print('Warning: label is required in the supervised neighborMode!!!')
        exit(0)

    # default weight mode is 'binary', default t in heat kernel mode is 1
    if 'weight_mode' not in kwargs.keys():
        kwargs['weight_mode'] = 'binary'
    if kwargs['weight_mode'] == 'heat_kernel':
        if kwargs['metric'] != 'euclidean':
            kwargs['metric'] = 'euclidean'
        if 't' not in kwargs.keys():
            kwargs['t'] = 1
    elif kwargs['weight_mode'] == 'cosine':
        if kwargs['metric'] != 'cosine':
            kwargs['metric'] = 'cosine'

    # default fisher_score and reliefF mode are 'false'
    if 'fisher_score' not in kwargs.keys():
        kwargs['fisher_score'] = False
    if 'reliefF' not in kwargs.keys():
        kwargs['reliefF'] = False

    n_samples, n_features = np.shape(X)

    # choose 'knn' neighbor mode
    if kwargs['neighbor_mode'] == 'knn':
        k = kwargs['k']
        if kwargs['weight_mode'] == 'binary':
            if kwargs['metric'] == 'euclidean':
                # compute pairwise euclidean distances
                D = pairwise_distances(X)
                D **= 2
                # sort the distance matrix D in ascending order
                dump = np.sort(D, axis=1)
                idx = np.argsort(D, axis=1)
                # choose the k-nearest neighbors for each instance
                idx_new = idx[:, 0:k + 1]
                G = np.zeros((n_samples * (k + 1), 3))
                G[:, 0] = np.tile(np.arange(n_samples), (k + 1, 1)).reshape(-1)
                G[:, 1] = np.ravel(idx_new, order='F')
                G[:, 2] = 1
                # build the sparse affinity matrix W
                W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                               shape=(n_samples, n_samples))
                bigger = np.transpose(W) > W
                W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
                return W

            elif kwargs['metric'] == 'cosine':
                # normalize the data first
                X_normalized = np.power(np.sum(X * X, axis=1), 0.5)
                for i in range(n_samples):
                    X[i, :] = X[i, :] / max(1e-12, X_normalized[i])
                # compute pairwise cosine distances
                D_cosine = np.dot(X, np.transpose(X))
                # sort the distance matrix D in descending order
                dump = np.sort(-D_cosine, axis=1)
                idx = np.argsort(-D_cosine, axis=1)
                idx_new = idx[:, 0:k + 1]
                G = np.zeros((n_samples * (k + 1), 3))
                G[:, 0] = np.tile(np.arange(n_samples), (k + 1, 1)).reshape(-1)
                G[:, 1] = np.ravel(idx_new, order='F')
                G[:, 2] = 1
                # build the sparse affinity matrix W
                W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                               shape=(n_samples, n_samples))
                bigger = np.transpose(W) > W
                W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
                return W

        elif kwargs['weight_mode'] == 'heat_kernel':
            t = kwargs['t']
            # compute pairwise euclidean distances
            D = pairwise_distances(X)
            D **= 2
            # sort the distance matrix D in ascending order
            dump = np.sort(D, axis=1)
            idx = np.argsort(D, axis=1)
            idx_new = idx[:, 0:k + 1]
            dump_new = dump[:, 0:k + 1]
            # compute the pairwise heat kernel distances
            dump_heat_kernel = np.exp(-dump_new / (2 * t * t))
            G = np.zeros((n_samples * (k + 1), 3))
            G[:, 0] = np.tile(np.arange(n_samples), (k + 1, 1)).reshape(-1)
            G[:, 1] = np.ravel(idx_new, order='F')
            G[:, 2] = np.ravel(dump_heat_kernel, order='F')
            # build the sparse affinity matrix W
            W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                           shape=(n_samples, n_samples))
            bigger = np.transpose(W) > W
            W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
            return W

        elif kwargs['weight_mode'] == 'cosine':
            # normalize the data first
            X_normalized = np.power(np.sum(X * X, axis=1), 0.5)
            for i in range(n_samples):
                X[i, :] = X[i, :] / max(1e-12, X_normalized[i])
            # compute pairwise cosine distances
            D_cosine = np.dot(X, np.transpose(X))
            # sort the distance matrix D in descending order
            dump = np.sort(-D_cosine, axis=1)
            idx = np.argsort(-D_cosine, axis=1)
            idx_new = idx[:, 0:k + 1]
            dump_new = -dump[:, 0:k + 1]
            G = np.zeros((n_samples * (k + 1), 3))
            G[:, 0] = np.tile(np.arange(n_samples), (k + 1, 1)).reshape(-1)
            G[:, 1] = np.ravel(idx_new, order='F')
            G[:, 2] = np.ravel(dump_new, order='F')
            # build the sparse affinity matrix W
            W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                           shape=(n_samples, n_samples))
            bigger = np.transpose(W) > W
            W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
            return W

    # choose supervised neighborMode
    elif kwargs['neighbor_mode'] == 'supervised':
        k = kwargs['k']
        # get true labels and the number of classes
        y = kwargs['y']
        label = np.unique(y)
        n_classes = np.unique(y).size
        # construct the weight matrix W in a fisherScore way, W_ij = 1/n_l if yi = yj = l, otherwise W_ij = 0
        if kwargs['fisher_score'] is True:
            W = lil_matrix((n_samples, n_samples))
            for i in range(n_classes):
                class_idx = (y == label[i])
                class_idx_all = (class_idx[:, np.newaxis]
                                 & class_idx[np.newaxis, :])
                W[class_idx_all] = 1.0 / np.sum(np.sum(class_idx))
            return W

        # construct the weight matrix W in a reliefF way, NH(x) and NM(x,y) denotes a set of k nearest
        # points to x with the same class as x, a different class (the class y), respectively. W_ij = 1 if i = j;
        # W_ij = 1/k if x_j \in NH(x_i); W_ij = -1/(c-1)k if x_j \in NM(x_i, y)
        if kwargs['reliefF'] is True:
            # when xj in NH(xi)
            G = np.zeros((n_samples * (k + 1), 3))
            id_now = 0
            for i in range(n_classes):
                class_idx = np.column_stack(np.where(y == label[i]))[:, 0]
                D = pairwise_distances(X[class_idx, :])
                D **= 2
                idx = np.argsort(D, axis=1)
                idx_new = idx[:, 0:k + 1]
                n_smp_class = (class_idx[idx_new[:]]).size
                if len(class_idx) <= k:
                    k = len(class_idx) - 1
                G[id_now:n_smp_class + id_now,
                  0] = np.tile(class_idx, (k + 1, 1)).reshape(-1)
                G[id_now:n_smp_class + id_now,
                  1] = np.ravel(class_idx[idx_new[:]], order='F')
                G[id_now:n_smp_class + id_now, 2] = 1.0 / k
                id_now += n_smp_class
            W1 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                            shape=(n_samples, n_samples))
            # when i = j, W_ij = 1
            for i in range(n_samples):
                W1[i, i] = 1
            # when x_j in NM(x_i, y)
            G = np.zeros((n_samples * k * (n_classes - 1), 3))
            id_now = 0
            for i in range(n_classes):
                class_idx1 = np.column_stack(np.where(y == label[i]))[:, 0]
                X1 = X[class_idx1, :]
                for j in range(n_classes):
                    if label[j] != label[i]:
                        class_idx2 = np.column_stack(
                            np.where(y == label[j]))[:, 0]
                        X2 = X[class_idx2, :]
                        D = pairwise_distances(X1, X2)
                        idx = np.argsort(D, axis=1)
                        idx_new = idx[:, 0:k]
                        n_smp_class = len(class_idx1) * k
                        G[id_now:n_smp_class + id_now,
                          0] = np.tile(class_idx1, (k, 1)).reshape(-1)
                        G[id_now:n_smp_class + id_now,
                          1] = np.ravel(class_idx2[idx_new[:]], order='F')
                        G[id_now:n_smp_class + id_now,
                          2] = -1.0 / ((n_classes - 1) * k)
                        id_now += n_smp_class
            W2 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                            shape=(n_samples, n_samples))
            bigger = np.transpose(W2) > W2
            W2 = W2 - W2.multiply(bigger) + np.transpose(W2).multiply(bigger)
            W = W1 + W2
            return W

        if kwargs['weight_mode'] == 'binary':
            if kwargs['metric'] == 'euclidean':
                G = np.zeros((n_samples * (k + 1), 3))
                id_now = 0
                for i in range(n_classes):
                    class_idx = np.column_stack(np.where(y == label[i]))[:, 0]
                    # compute pairwise euclidean distances for instances in class i
                    D = pairwise_distances(X[class_idx, :])
                    D **= 2
                    # sort the distance matrix D in ascending order for instances in class i
                    idx = np.argsort(D, axis=1)
                    idx_new = idx[:, 0:k + 1]
                    n_smp_class = len(class_idx) * (k + 1)
                    G[id_now:n_smp_class + id_now,
                      0] = np.tile(class_idx, (k + 1, 1)).reshape(-1)
                    G[id_now:n_smp_class + id_now,
                      1] = np.ravel(class_idx[idx_new[:]], order='F')
                    G[id_now:n_smp_class + id_now, 2] = 1
                    id_now += n_smp_class
                # build the sparse affinity matrix W
                W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                               shape=(n_samples, n_samples))
                bigger = np.transpose(W) > W
                W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
                return W

            if kwargs['metric'] == 'cosine':
                # normalize the data first
                X_normalized = np.power(np.sum(X * X, axis=1), 0.5)
                for i in range(n_samples):
                    X[i, :] = X[i, :] / max(1e-12, X_normalized[i])
                G = np.zeros((n_samples * (k + 1), 3))
                id_now = 0
                for i in range(n_classes):
                    class_idx = np.column_stack(np.where(y == label[i]))[:, 0]
                    # compute pairwise cosine distances for instances in class i
                    D_cosine = np.dot(X[class_idx, :],
                                      np.transpose(X[class_idx, :]))
                    # sort the distance matrix D in descending order for instances in class i
                    idx = np.argsort(-D_cosine, axis=1)
                    idx_new = idx[:, 0:k + 1]
                    n_smp_class = len(class_idx) * (k + 1)
                    G[id_now:n_smp_class + id_now,
                      0] = np.tile(class_idx, (k + 1, 1)).reshape(-1)
                    G[id_now:n_smp_class + id_now,
                      1] = np.ravel(class_idx[idx_new[:]], order='F')
                    G[id_now:n_smp_class + id_now, 2] = 1
                    id_now += n_smp_class
                # build the sparse affinity matrix W
                W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                               shape=(n_samples, n_samples))
                bigger = np.transpose(W) > W
                W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
                return W

        elif kwargs['weight_mode'] == 'heat_kernel':
            G = np.zeros((n_samples * (k + 1), 3))
            id_now = 0
            for i in range(n_classes):
                class_idx = np.column_stack(np.where(y == label[i]))[:, 0]
                # compute pairwise euclidean distances for instances in class i
                D = pairwise_distances(X[class_idx, :])
                D **= 2
                # sort the distance matrix D in ascending order for instances in class i
                dump = np.sort(D, axis=1)
                idx = np.argsort(D, axis=1)
                idx_new = idx[:, 0:k + 1]
                dump_new = dump[:, 0:k + 1]
                t = kwargs['t']
                # compute pairwise heat kernel distances for instances in class i
                dump_heat_kernel = np.exp(-dump_new / (2 * t * t))
                n_smp_class = len(class_idx) * (k + 1)
                G[id_now:n_smp_class + id_now,
                  0] = np.tile(class_idx, (k + 1, 1)).reshape(-1)
                G[id_now:n_smp_class + id_now,
                  1] = np.ravel(class_idx[idx_new[:]], order='F')
                G[id_now:n_smp_class + id_now, 2] = np.ravel(dump_heat_kernel,
                                                             order='F')
                id_now += n_smp_class
            # build the sparse affinity matrix W
            W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                           shape=(n_samples, n_samples))
            bigger = np.transpose(W) > W
            W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
            return W

        elif kwargs['weight_mode'] == 'cosine':
            # normalize the data first
            X_normalized = np.power(np.sum(X * X, axis=1), 0.5)
            for i in range(n_samples):
                X[i, :] = X[i, :] / max(1e-12, X_normalized[i])
            G = np.zeros((n_samples * (k + 1), 3))
            id_now = 0
            for i in range(n_classes):
                class_idx = np.column_stack(np.where(y == label[i]))[:, 0]
                # compute pairwise cosine distances for instances in class i
                D_cosine = np.dot(X[class_idx, :],
                                  np.transpose(X[class_idx, :]))
                # sort the distance matrix D in descending order for instances in class i
                dump = np.sort(-D_cosine, axis=1)
                idx = np.argsort(-D_cosine, axis=1)
                idx_new = idx[:, 0:k + 1]
                dump_new = -dump[:, 0:k + 1]
                n_smp_class = len(class_idx) * (k + 1)
                G[id_now:n_smp_class + id_now,
                  0] = np.tile(class_idx, (k + 1, 1)).reshape(-1)
                G[id_now:n_smp_class + id_now,
                  1] = np.ravel(class_idx[idx_new[:]], order='F')
                G[id_now:n_smp_class + id_now, 2] = np.ravel(dump_new,
                                                             order='F')
                id_now += n_smp_class
            # build the sparse affinity matrix W
            W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])),
                           shape=(n_samples, n_samples))
            bigger = np.transpose(W) > W
            W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
            return W
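
# A minimal usage sketch (not part of the original example), assuming numpy as
# np and the construct_W defined above, with pairwise_distances and csc_matrix
# already imported. It builds a heat-kernel kNN affinity matrix on toy data.
X_demo = np.random.RandomState(0).rand(20, 4)   # hypothetical (n_samples, n_features) data
W_demo = construct_W(X_demo, metric='euclidean', neighbor_mode='knn',
                     weight_mode='heat_kernel', k=3, t=1.0)
print(W_demo.shape)   # (20, 20) sparse affinity matrix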
Ejemplo n.º 60
0
def test_weighted_dbscan():
    # ensure sample_weight is validated
    with pytest.raises(ValueError):
        dbscan([[0], [1]], sample_weight=[2])
    with pytest.raises(ValueError):
        dbscan([[0], [1]], sample_weight=[2, 3, 4])

    # ensure sample_weight has an effect
    assert_array_equal(
        [], dbscan([[0], [1]], sample_weight=None, min_samples=6)[0]
    )
    assert_array_equal(
        [], dbscan([[0], [1]], sample_weight=[5, 5], min_samples=6)[0]
    )
    assert_array_equal(
        [0], dbscan([[0], [1]], sample_weight=[6, 5], min_samples=6)[0]
    )
    assert_array_equal(
        [0, 1], dbscan([[0], [1]], sample_weight=[6, 6], min_samples=6)[0]
    )

    # points within eps of each other:
    assert_array_equal(
        [0, 1],
        dbscan([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)[0],
    )
    # and effect of non-positive and non-integer sample_weight:
    assert_array_equal(
        [], dbscan([[0], [1]], sample_weight=[5, 0], eps=1.5, min_samples=6)[0]
    )
    assert_array_equal(
        [0, 1],
        dbscan([[0], [1]], sample_weight=[5.9, 0.1], eps=1.5, min_samples=6)[
            0
        ],  # noqa
    )
    assert_array_equal(
        [0, 1],
        dbscan([[0], [1]], sample_weight=[6, 0], eps=1.5, min_samples=6)[0],
    )
    assert_array_equal(
        [],
        dbscan([[0], [1]], sample_weight=[6, -1], eps=1.5, min_samples=6)[
            0
        ],  # noqa
    )

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert len(label1) == len(X)

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, _ = dbscan(X_repeated)
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[core_repeated] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[core1] = True
    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(
        D, sample_weight=sample_weight, metric="precomputed"
    )  # noqa
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = DBSCAN().fit(X, sample_weight=sample_weight)
    core4 = est.core_sample_indices_
    label4 = est.labels_
    assert_array_equal(core1, core4)
    assert_array_equal(label1, label4)

    est = DBSCAN()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    core5 = est.core_sample_indices_
    assert_array_equal(core1, core5)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)