コード例 #1
0
ファイル: test_hierarchy.py プロジェクト: ElDeveloper/scipy
def test_optimal_leaf_ordering():
    """Check ``optimal_leaf_ordering`` against the stored reference linkages.

    Two kinds of input are exercised: the distance vector ``ytdist`` and the
    observation matrix ``X``.
    """
    data = hierarchy_test_data

    # Case 1: the distances are supplied as the vector y.
    ordered = optimal_leaf_ordering(linkage(data.ytdist), data.ytdist)
    assert_allclose(ordered, data.linkage_ytdist_single_olo, atol=1e-10)

    # Case 2: the observation matrix X is supplied, clustered with 'ward'.
    ordered = optimal_leaf_ordering(linkage(data.X, 'ward'), data.X)
    assert_allclose(ordered, data.linkage_X_ward_olo, atol=1e-06)
コード例 #2
0
    def ordered_indices(self):
        """Return the template ordering, computing and caching it on first use.

        With more than one cell, the rows of ``self._similarities`` are compared
        with the correlation distance, clustered with single linkage, and the
        leaves are put in optimal order. Otherwise the identity ordering is used.
        The result is stored in ``self._ordered_indices`` and reused afterwards.
        """
        # WARNING: this is only meaningful when the detected templates are strictly equal to the
        # generated templates; otherwise clustering the similarity matrix makes no sense.
        assert self._cells_true.nb_cells == self._cells_pred.nb_cells
        cell_count = self._cells_true.nb_cells

        # A previous call already did the work.
        if self._ordered_indices is not None:
            return self._ordered_indices

        # Nothing to cluster: fall back to the identity ordering.
        if not cell_count > 1:
            self._ordered_indices = np.arange(0, cell_count)
            return self._ordered_indices

        distance_name = 'correlation'
        # Pairwise distances between the rows of the similarity matrix.
        pairwise = pdist(self._similarities, metric=distance_name)
        # Agglomerative clustering, then reordering of the leaves.
        tree = linkage(pairwise, method='single', metric=distance_name)
        ordered_tree = optimal_leaf_ordering(tree, pairwise, metric=distance_name)
        # Keep the flat leaf order for later calls.
        self._ordered_indices = leaves_list(ordered_tree)

        return self._ordered_indices
コード例 #3
0
ファイル: base.py プロジェクト: IsabelF98/PRJ_Vigilance_Smk02
def plot_fc_matrix(ts, labels, reorder='single', width=400, cmap='RdBu_r'):
    '''Plot the correlation matrix of a set of timeseries as a heatmap.

    The matrix can be re-ordered with hierarchical clustering followed by optimal
    leaf ordering, the same algorithm as nilearn.plotting.plot_matrix.

    INPUTS
    ------
    ts: ROI timeseries as pd.DataFrame (the matrix is ts.corr())
    labels: ROI names as string array
    reorder: linkage method for re-ordering the matrix. Possible values: single, complete,
             average (see nilearn help for more information). Any falsy value
             (False, None, '') leaves the matrix in its original order.
    width: frame width of the output figure
    cmap: colormap for the connectivity matrix

    OUTPUTS
    -------
    hv.heatmap with the connectivity matrix
    '''
    mat = ts.corr().values
    # Truthiness test: None or '' no longer fall through to linkage() and fail there.
    if reorder:
        linkage_matrix = linkage(mat, method=reorder)
        ordered_linkage = optimal_leaf_ordering(linkage_matrix, mat)
        index = leaves_list(ordered_linkage)
        # Fancy indexing already returns new objects, so the caller's labels
        # and the original matrix are left untouched without explicit copies.
        labels = np.asarray(labels)[index].tolist()
        mat = mat[index, :][:, index]
    fc_matrix = pd.DataFrame(mat, index=labels, columns=labels)
    return fc_matrix.hvplot.heatmap(
        cmap=cmap,
        aspect='square').redim.range(value=(-1, 1)).opts(xrotation=45,
                                                         frame_width=width)
コード例 #4
0
def global_cluster_linkage(metric='euclidean',
                           method='ward',
                           directionalised=True):
    """Cluster the columns of the loaded matrix and optimally order the leaves.

    Args:
        metric: distance metric forwarded to ``fastcluster.linkage`` and to
            ``hierarchy.optimal_leaf_ordering``.
        method: linkage method forwarded to ``fastcluster.linkage``.
        directionalised: when falsy, columns sharing the same value on column
            level 1 are averaged before clustering.

    Returns:
        The linkage matrix after optimal leaf ordering.
    """
    matrix, __, __ = load_matrix()
    if not directionalised:
        # ``DataFrame.mean(level=..., axis=1)`` was deprecated in pandas 1.3 and
        # removed in 2.0. Grouping the transposed frame is the equivalent
        # spelling; ``sort=False`` keeps the first-seen group order that the
        # old call produced.
        matrix = matrix.T.groupby(level=1, sort=False).mean().T

    # Columns of ``matrix`` are the observations being clustered.
    observations = matrix.T
    tree = fastcluster.linkage(observations, metric=metric, method=method)

    return hierarchy.optimal_leaf_ordering(tree, observations, metric=metric)
コード例 #5
0
def computeOrder(df,
                 optimal=True,
                 dist_method="euclidean",
                 cluster_method="average"):
    """Return the hierarchical-clustering leaf order of the rows of *df*.

    Args:
        df: observations, one per row (anything ``pdist`` accepts).
        optimal: when truthy, apply optimal leaf ordering to the linkage
            before reading off the leaves.
        dist_method: metric passed to ``scipy.spatial.distance.pdist``.
        cluster_method: method passed to ``scipy.cluster.hierarchy.linkage``.

    Returns:
        ndarray of row positions in dendrogram order.
    """
    dist_mat = pdist(df, metric=dist_method)
    link_mat = hierarchy.linkage(dist_mat, method=cluster_method)

    # Plain truthiness instead of ``== True`` so any truthy flag enables it.
    if optimal:
        link_mat = hierarchy.optimal_leaf_ordering(link_mat, dist_mat)
    return hierarchy.leaves_list(link_mat)
コード例 #6
0
ファイル: utils.py プロジェクト: zivzone/tensortools
def hclust_linearize(U):
    """Order the rows of a matrix with Ward clustering and optimal leaf ordering.

    Parameters:
        U (ndarray) : matrix of data

    Returns:
        prm (ndarray) : permutation of the rows
    """

    from scipy.cluster.hierarchy import leaves_list, optimal_leaf_ordering, ward

    tree = ward(U)
    ordered_tree = optimal_leaf_ordering(tree, U)
    prm = leaves_list(ordered_tree)
    return prm
コード例 #7
0
def abs_optimal_leaf_ordering(correlation_matrix):
    """Return the row labels of *correlation_matrix* in optimal leaf order.

    The rows are clustered with Ward linkage, the leaves are put in optimal
    order, and the resulting positions are mapped through
    ``correlation_matrix.index``.

    Args:
        correlation_matrix: object exposing ``.index`` (e.g. a pandas
            DataFrame) that scipy can also consume as an observation matrix.

    Returns:
        list of index labels in dendrogram order.
    """
    linkage_matrix = hierarchy.ward(correlation_matrix)
    ordered = hierarchy.optimal_leaf_ordering(linkage_matrix,
                                              correlation_matrix)
    new_order = hierarchy.leaves_list(ordered)
    return [correlation_matrix.index[i] for i in new_order]
コード例 #8
0
def _reorder_dendrogram(z, dists, leaf_ordering):
    """Return the leaf order of linkage *z* under the requested strategy.

    ``'optimal'`` applies optimal leaf ordering using *dists*; the four
    ``count_sort_*`` / ``distance_sort_*`` strategies read the leaves from a
    non-plotting ``dendrogram`` call with the matching sort option.

    Raises:
        ValueError: if *leaf_ordering* is none of the supported names.
    """
    if leaf_ordering == 'optimal':
        return leaves_list(optimal_leaf_ordering(z, dists))

    # (strategy name, dendrogram keyword, keyword value)
    sort_strategies = (
        ('count_sort_ascending', 'count_sort', 'ascending'),
        ('count_sort_descending', 'count_sort', 'descending'),
        ('distance_sort_ascending', 'distance_sort', 'ascending'),
        ('distance_sort_descending', 'distance_sort', 'descending'),
    )
    for strategy, keyword, direction in sort_strategies:
        if leaf_ordering == strategy:
            info = dendrogram(z,
                              get_leaves=True,
                              no_plot=True,
                              no_labels=True,
                              show_leaf_counts=False,
                              **{keyword: direction})
            return info['leaves']

    raise ValueError('Unsupported leaf ordering')
コード例 #9
0
def cluster_kmers(kmers):
    """
    Cluster k-mer count profiles and return the optimally ordered linkage.

    :param kmers: numpy.ndarray of kmer counts
    :return: ndarray linkage matrix
    """

    # see https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

    # Note: the distance, linkage and ordering steps are kept separate for clarity and
    # benchmarking; they could be folded into a single call to linkage.

    # pairwise correlation distance between sequences
    pairwise = pdist(kmers, metric='correlation')
    tree = linkage(pairwise, method='ward', optimal_ordering=False)

    return optimal_leaf_ordering(tree, pairwise)
コード例 #10
0
ファイル: visualize.py プロジェクト: ruggleslab/hypercluster
def compute_order(df,
                  dist_method: str = "euclidean",
                  cluster_method: str = "average"):
    """Return the row index of a DataFrame in optimally ordered clustering order.

    Args:
        df (DataFrame): DataFrame with rows to order.
        dist_method (str): Distance metric handed to scipy.spatial.distance.pdist.
        cluster_method (str): Linkage method handed to scipy.cluster.hierarchy.linkage.

    Returns (pandas.Index):
        Ordered row index.

    """
    pairwise = pdist(df, metric=dist_method)
    tree = hierarchy.linkage(pairwise, method=cluster_method)
    ordered_tree = hierarchy.optimal_leaf_ordering(tree, pairwise)
    leaf_positions = hierarchy.leaves_list(ordered_tree)

    return df.index[leaf_positions]
コード例 #11
0
ファイル: _inspection.py プロジェクト: danielschulz/facet
    def __sort_affinity_matrices(
        affinity_matrices: List[pd.DataFrame],
        symmetrical_affinity_matrices: np.ndarray,
    ) -> List[pd.DataFrame]:
        """Reorder rows and columns of each affinity matrix by clustering order.

        Each affinity matrix is paired with its symmetrical counterpart. A linkage
        is built from the symmetrical matrix, its leaves are put in optimal order,
        and the reversed leaf order is applied to both axes of the affinity matrix.
        """
        # short alias for a very long function name
        build_linkage = LearnerInspector.__linkage_matrix_from_affinity_matrix_for_output

        sorted_matrices = []
        for matrix, symmetrical_matrix in zip(affinity_matrices,
                                              symmetrical_affinity_matrices):
            ordered_linkage = optimal_leaf_ordering(
                Z=build_linkage(feature_affinity_matrix=symmetrical_matrix),
                y=symmetrical_matrix,
            )
            # reverse the leaf list so larger values tend to end up on top
            feature_order = leaves_list(Z=ordered_linkage)[::-1]
            sorted_matrices.append(matrix.iloc[feature_order, feature_order])

        return sorted_matrices
コード例 #12
0
def optimal_hierarchical_cluster(mat: np.array,
                                 method: str = "ward") -> np.array:
    """
    Reorders a correlation matrix by optimally ordered hierarchical clustering.

    The correlations are turned into distances (1 - correlation), a hierarchy is built
    from them, its leaves are put in optimal order, and the matrix is returned with rows
    and columns arranged in that order.

    It is reproduced with modifications from the following blog post:
    `Marti, G. (2020) TF 2.0 DCGAN for 100x100 financial correlation matrices [Online].
    Available at: https://marti.ai/ml/2019/10/13/tf-dcgan-financial-correlation-matrices.html.
    (Accessed: 17 Aug 2020)
    <https://marti.ai/ml/2019/10/13/tf-dcgan-financial-correlation-matrices.html>`_

    This method is a thin wrapper around the `scipy.cluster.hierarchy` module.
    `<https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html>`_

    :param mat: (np.array/pd.DataFrame) Correlation matrix.
    :param method: (str) Method to calculate the hierarchy clusters. Can take the values
        ["single", "complete", "average", "weighted", "centroid", "median", "ward"].
    :return: (np.array) Optimal hierarchy cluster matrix.
    """

    values = mat.values if isinstance(mat, pd.DataFrame) else mat

    # Correlation -> distance.
    dist = 1 - values

    # The strict upper triangle, read row by row, is the condensed distance vector.
    upper = np.triu_indices(len(values), k=1)
    tree = hierarchy.linkage(dist[upper], method=method)
    ordered_tree = hierarchy.optimal_leaf_ordering(tree, dist[upper])
    order = hierarchy.leaves_list(ordered_tree)

    # Undo the earlier "1 - x" so that correlations are returned, not distances.
    return 1 - dist[order, :][:, order]
コード例 #13
0
def _reorder_matrix(mat, labels, reorder):
    """Helper function for plot_matrix.

    Reorders the rows and columns of ``mat`` (and the matching ``labels``)
    using hierarchical clustering followed by optimal leaf ordering.

    Parameters
    ----------
    mat : 2-D numpy array
        Matrix to reorder.
    labels : list or ndarray of strings
        One label per row/column; must not be empty.
    reorder : str
        Linkage method passed to ``scipy.cluster.hierarchy.linkage``.

    Returns
    -------
    (mat, labels) : the reordered matrix and the reordered labels as a list.

    Raises
    ------
    ValueError
        If no labels are given.
    ImportError
        If scipy does not provide ``optimal_leaf_ordering``.
    """
    # ``not labels`` raises "truth value is ambiguous" for ndarrays holding
    # more than one element, so arrays are checked through their size.
    if isinstance(labels, np.ndarray):
        labels_missing = labels.size == 0
    else:
        labels_missing = not labels
    if labels_missing:
        raise ValueError("Labels are needed to show the reordering.")
    try:
        from scipy.cluster.hierarchy import (linkage, optimal_leaf_ordering,
                                             leaves_list)
    except ImportError as exc:
        raise ImportError("A scipy version of at least 1.0 is needed for "
                          "ordering the matrix with optimal_leaf_ordering."
                          ) from exc
    linkage_matrix = linkage(mat, method=reorder)
    ordered_linkage = optimal_leaf_ordering(linkage_matrix, mat)
    index = leaves_list(ordered_linkage)
    # Fancy indexing returns new objects, so the inputs are never modified
    # and no explicit copies are needed.
    labels = np.asarray(labels)[index].tolist()
    mat = mat[index, :][:, index]
    return mat, labels
コード例 #14
0
ファイル: utils_u.py プロジェクト: tnakada/DeepTCR
def rad_plot(X_2,
             sample_id,
             samples,
             labels,
             color_dict,
             self=None,
             pairwise_distances=None,
             gridsize=50,
             n_pad=5,
             lw=None,
             dg_radius=0.2,
             axes_radius=0.4,
             figsize=8,
             log_scale=False,
             linkage_method='complete',
             filename=None,
             sample_labels=False,
             gaussian_sigma=0.5,
             vmax=0.01):
    """Draw one smoothed 2-D density panel per sample plus a polar dendrogram.

    For every sample a normalised, Gaussian-smoothed 2-D histogram of its rows
    of ``X_2`` is built. The samples are ordered by hierarchical clustering
    (``linkage_method``) followed by optimal leaf ordering, and the panels are
    laid out at positions produced by ``pol2cart``. A dendrogram of the same
    linkage is handed to ``polar_dendrogram``.

    Parameters (as used by the code):
        X_2: 2-column array of coordinates (columns 0 and 1 are binned).
        sample_id: per-row sample identifier, compared against ``samples[i]``.
        samples: sample identifiers; indexed by position.
        labels: per-sample label, used as key into ``color_dict``.
        color_dict: mapping from label to the outline colour of a panel.
        self: object providing ``directory_results``; only read when
            ``filename`` is given.
        pairwise_distances: condensed distances between samples; when None,
            Jensen-Shannon distances between the flattened histograms are used.
        gridsize: number of histogram bins per axis over the data range.
        n_pad: number of extra bins added on each side of the data range.
        lw: outline line width; defaults to ``n_pad / 2``.
        dg_radius, log_scale: forwarded to ``polar_dendrogram``.
        axes_radius: radius passed to ``pol2cart`` for the panel positions.
        figsize: side length of the square figure.
        linkage_method: method passed to ``linkage``.
        filename: when given, the figure is saved under
            ``self.directory_results``.
        sample_labels: when true, the sample identifier is written in each panel.
        gaussian_sigma: sigma of the Gaussian smoothing.
        vmax: upper colour limit of the density panels.

    Returns:
        H: array of stacked histograms, one slice per sample along axis 2.

    NOTE(review): ``samples[i]`` is indexed for ``i < len(np.unique(samples))``,
    so this presumably expects ``samples`` to hold unique entries - confirm.
    NOTE(review): ``self`` defaults to None but ``self.directory_results`` is
    read whenever ``filename`` is given - callers must pass ``self`` then.
    """
    # set line width
    if lw is None:
        lw = n_pad / 2

    # number of samples
    n_s = len(np.unique(samples))

    # min max of input 2D data
    d_max = np.max(X_2, axis=0)
    d_min = np.min(X_2, axis=0)

    # set step and edges of bins for 2d hist
    # (gridsize bins over the data range plus n_pad bins of padding per side)
    x_step = (d_max[0] - d_min[0]) / gridsize
    x_edges = np.linspace(d_min[0] - (n_pad * x_step),
                          d_max[0] + (n_pad * x_step),
                          gridsize + (2 * n_pad) + 1)
    y_step = (d_max[1] - d_min[1]) / gridsize
    y_edges = np.linspace(d_min[1] - (n_pad * y_step),
                          d_max[1] + (n_pad * y_step),
                          gridsize + (2 * n_pad) + 1)
    # bin centres; note that Y receives the x-bin centres and X the y-bin centres
    Y, X = np.meshgrid(x_edges[:-1] + (np.diff(x_edges) / 2),
                       y_edges[:-1] + (np.diff(y_edges) / 2))

    # construct 2d smoothed histograms for each sample
    H = list()
    for i in range(n_s):
        # get sample instance data
        smp_d = X_2[sample_id == samples[i]]
        # get counts
        h, _ = np.histogramdd(smp_d, bins=[x_edges, y_edges])
        if log_scale:
            h = np.log(h + 1)
        # normalize and smooth
        H.append(ndi.gaussian_filter(h / np.sum(h), sigma=gaussian_sigma))
    # one histogram per sample along the last axis
    H = np.stack(H, axis=2)

    # center and radius of circle
    e_c = np.array([np.mean(X[:, 0]), np.mean(Y[0, :])])
    e_r = np.abs(
        np.array([Y[-n_pad + 2, 0] - e_c[1], X[0, -n_pad + 2] - e_c[0]]))
    # axis limits: the grid extent widened by two bin steps on each side
    xlim = [X[0, 0] - (y_step * 2), X[-1, 0] + (y_step * 2)]
    ylim = [Y[0, 0] - (x_step * 2), Y[0, -1] + (x_step * 2)]

    # default distances: Jensen-Shannon between the flattened per-sample histograms
    if pairwise_distances is None:
        pairwise_distances = pdist(H.reshape([-1, H.shape[2]]).T,
                                   metric='jensenshannon')

    # cluster the samples and put the dendrogram leaves in optimal order
    Z = optimal_leaf_ordering(
        linkage(pairwise_distances, method=linkage_method), pairwise_distances)
    dg_order = leaves_list(Z)

    fig = plt.figure(figsize=[figsize, figsize])
    # panel centres from pol2cart, shifted by 0.5
    # (presumably to the centre of the figure in figure coordinates - confirm)
    axes_pos = pol2cart(np.linspace(0, 2 * np.pi, n_s + 1),
                        rho=axes_radius) + 0.5
    axes_size = axes_radius * np.sin(0.5 * (2 * np.pi / n_s))
    ax = [None] * n_s

    cmap_viridis = plt.get_cmap('viridis')
    # values below vmin are drawn fully transparent
    cmap_viridis.set_under(color='white', alpha=0)
    # mask: True for grid cells at least 0.95 * (half the grid size) away from the grid centre
    c_mask = np.meshgrid(np.arange(2 * n_pad + gridsize),
                         np.arange(2 * n_pad + gridsize))
    c_mask = np.sqrt(((c_mask[0] - ((2 * n_pad + gridsize) / 2))**2) +
                     ((c_mask[1] -
                       ((2 * n_pad + gridsize) / 2))**2)) >= (0.95 * (
                           (2 * n_pad + gridsize) / 2))

    # panel i shows the sample at position i of the dendrogram order
    for i in range(n_s):
        ax[i] = fig.add_axes([
            axes_pos[i, 0] - axes_size, axes_pos[i, 1] - axes_size,
            2 * axes_size, 2 * axes_size
        ])

        if sample_labels:
            ax[i].text(.5,
                       0.2,
                       samples[dg_order[i]],
                       horizontalalignment='center',
                       transform=ax[i].transAxes)

        # density of the sample, with the masked cells hidden
        ax[i].pcolormesh(X,
                         Y,
                         np.ma.masked_array(H[:, :, dg_order[i]], c_mask),
                         cmap=cmap_viridis,
                         shading='gouraud',
                         vmin=0,
                         vmax=vmax)
        # outline coloured by the sample's label
        ax[i].add_artist(
            Ellipse(e_c,
                    width=2 * e_r[1],
                    height=2 * e_r[0],
                    color=color_dict[labels[dg_order[i]]],
                    fill=False,
                    lw=lw))
        ax[i].set(xticks=[], yticks=[], xlim=xlim, ylim=ylim, frame_on=False)

    # dendrogram data only (no plotting); the drawing is delegated to polar_dendrogram
    dg = dendrogram(Z, no_plot=True)
    polar_dendrogram(dg, fig, ax_radius=dg_radius, log_scale=log_scale)
    if filename is not None:
        plt.savefig(os.path.join(self.directory_results, filename))

    return H
コード例 #15
0
 def get_order(self, data):
     """Return the optimal leaf order of *data* after ``normalize(..., norm='l2')`` and Ward clustering."""
     scaled = normalize(data, norm='l2')
     tree = hierarchy.ward(scaled)
     ordered_tree = optimal_leaf_ordering(tree, scaled)
     return hierarchy.leaves_list(ordered_tree)
コード例 #16
0
def sample_from_corrgan(model_loc, dim=10, n_samples=1):
    # pylint: disable=import-outside-toplevel, disable=too-many-locals
    """
    Draws correlation matrices from a pre-trained CorrGAN generator.

    Reproduced with modifications from the following paper:
    `Marti, G., 2020, May. CorrGAN: Sampling Realistic Financial Correlation Matrices Using
    Generative Adversarial Networks. In ICASSP 2020-2020 IEEE International Conference on
    Acoustics, Speech and Signal Processing (ICASSP) (pp. 8459-8463). IEEE.
    <https://arxiv.org/pdf/1910.09504.pdf>`_

    The function picks the stored generator whose dimension is the smallest one that is at
    least ``dim``, feeds it random noise, and crops each output to ``dim`` x ``dim``. Every
    cropped matrix is symmetrized, projected onto the nearest positive semi-definite
    correlation matrix, and finally reordered by hierarchical clustering so that the sum of
    the similarities between adjacent leaves is maximized.

    The CorrGAN network was trained on the correlation profiles of the S&P 500 stocks, so
    the output keeps their properties. In particular it retains these 6 stylized facts:

    1. Distribution of pairwise correlations is significantly shifted to the positive.

    2. Eigenvalues follow the Marchenko-Pastur distribution, but for a very large first
    eigenvalue (the market).

    3. Eigenvalues follow the Marchenko-Pastur distribution, but for a couple of other
    large eigenvalues (industries).

    4. Perron-Frobenius property (first eigenvector has positive entries).

    5. Hierarchical structure of correlations.

    6. Scale-free property of the corresponding Minimum Spanning Tree (MST).

    :param model_loc: (str) Location of folder containing CorrGAN models.
    :param dim: (int) Dimension of correlation matrix to sample.
        In the range [2, 200].
    :param n_samples: (int) Number of samples to generate.
    :return: (np.array) Sampled correlation matrices of shape (n_samples, dim, dim).
    """
    # Deferred import: TensorFlow is only loaded when sampling is actually requested.
    import tensorflow as tf

    # Reject dimensions outside the supported range.
    if not 1 < dim <= 200:
        raise ValueError("Dimension not supported, {}".format(dim))

    # Every sub-folder is named after the generator dimension it holds; collect
    # those dimensions in ascending order and take the first one that fits.
    available_dimensions = np.sort([
        int(entry.split("_")[1][:-1]) for entry in listdir(model_loc)
        if not path.isfile(path.join(model_loc, entry))
    ])
    closest_dimension = next(
        size for size in available_dimensions if size >= dim)

    # Load the generator.
    model_path = "{}/generator_{}d".format(model_loc, closest_dimension)
    generator = tf.keras.models.load_model(model_path, compile=False)

    # The noise size is dictated by the network's input layer.
    latent_size = generator.layers[0].input_shape[1]
    latent = tf.random.normal([n_samples, latent_size])
    generated_mat = generator(latent, training=False)

    # Index pairs of the strict upper triangle.
    upper_rows, upper_cols = np.triu_indices(dim, k=1)

    # Turn each generated sample into a proper correlation matrix by projecting
    # it on the nearest correlation matrix (Higham's alternating projections).
    sampled = []
    for sample_idx in range(n_samples):
        # Crop the generator output to the requested dimension.
        raw = np.array(generated_mat[sample_idx, :dim, :dim, 0])

        # Unit diagonal; mirror the upper triangle into the lower one.
        np.fill_diagonal(raw, 1)
        raw[upper_cols, upper_rows] = raw[upper_rows, upper_cols]

        # Nearest positive semi-definite correlation matrix.
        nearest = corr_nearest(raw)

        # Restore the unit diagonal and exact symmetry after the projection.
        np.fill_diagonal(nearest, 1)
        nearest[upper_cols, upper_rows] = nearest[upper_rows, upper_cols]

        # Reorder via hierarchical clustering with optimal leaf ordering.
        dist = 1 - nearest
        tree = hierarchy.linkage(dist[upper_rows, upper_cols], method="ward")
        ordered_tree = hierarchy.optimal_leaf_ordering(
            tree, dist[upper_rows, upper_cols])
        order = hierarchy.leaves_list(ordered_tree)
        sampled.append(nearest[order, :][:, order])

    return np.array(sampled)
コード例 #17
0
    def hierarchical(self,
                     axis,
                     phenotypes=(),
                     metric='correlation',
                     method='average',
                     log_features=False,
                     optimal_ordering=False):
        '''Cluster samples or features hierarchically.

        Args:
            axis (string): It must be 'samples' or 'features'. The
                Dataset.counts matrix is used and either samples or features
                are clustered.
            phenotypes (iterable of strings): Phenotypes to add to the
                features for joint clustering.
            metric (string or matrix): A metric name accepted by
                scipy.spatial.distance.pdist, or an already computed square
                distance matrix with one row per clustered item (it is
                converted to condensed form with squareform).
            method (string): Clustering method. Must be a string accepted by
                scipy.cluster.hierarchy.linkage.
            log_features (bool): Whether to add pseudocounts and take a log
                of the feature counts before calculating distances.
            optimal_ordering (bool): Whether to reorder the linkage with
                optimal leaf ordering. This may take longer than the
                clustering itself.
        Returns:
            dict with the condensed distances ('distance'), the linkage
            ('linkage'), and the ordered index labels ('leaves').
        '''
        from scipy.spatial.distance import pdist, squareform
        from scipy.cluster.hierarchy import linkage, leaves_list, optimal_leaf_ordering

        table = self.dataset.counts

        if log_features:
            table = np.log10(self.dataset.counts.pseudocount + table)

        # Append the requested phenotypes as extra rows on a private copy.
        if phenotypes is not None:
            table = table.copy()
            for pheno in phenotypes:
                table.loc[pheno] = self.dataset.samplesheet.loc[:, pheno]

        # Rows of ``table`` are the items that get clustered.
        if axis == 'samples':
            table = table.T
        elif axis != 'features':
            raise ValueError('axis must be "samples" or "features"')

        if isinstance(metric, str):
            dists = pdist(table.values, metric=metric)
        else:
            square = np.asarray(metric)
            assert square.ndim == 2
            assert square.shape[0] == square.shape[1]
            assert square.shape[0] == table.shape[0]
            dists = squareform(square)

        # Some metrics (e.g. correlation) yield nan when a row has no
        # variation; treat those as zero distance (e.g. two features that are
        # both total dropouts).
        dists = np.nan_to_num(dists)

        tree = linkage(dists, method=method)
        if optimal_ordering:
            tree = optimal_leaf_ordering(tree, dists)

        return {
            'distance': dists,
            'linkage': tree,
            'leaves': table.index[leaves_list(tree)],
        }
コード例 #18
0
ファイル: _utils.py プロジェクト: AllonKleinLab/cospar
def heatmap(
        data_matrix,
        order_map_x=True,
        order_map_y=True,
        x_ticks=None,
        y_ticks=None,
        col_range=[0, 99],
        color_bar_label="",
        log_transform=False,
        color_map=plt.cm.Reds,
        vmin=None,
        vmax=None,
        fig_width=4,
        fig_height=6,
        color_bar=True,
        x_label=None,
        y_label=None,
        pseudo_count=10**(-10),
):
    """
    Plot a heat map of data_matrix, optionally re-ordering rows and columns
    by hierarchical clustering with optimal leaf ordering.

    Parameters
    ----------
    data_matrix: `np.array`
        The data matrix to be plotted
    order_map_x: `bool`
        Whether to re-order the x coordinate of the matrix or not.
        Only applied when the matrix has more than 2 columns.
    order_map_y: `bool`
        Whether to re-order the y coordinate of the matrix or not.
        Only applied when the matrix has more than 2 rows.
    x_ticks, y_ticks: `list`
        List of variable names for x and y ticks. When None, no ticks are drawn.
    col_range: `list`, optional (default: [0, 99])
        Lower and upper percentile of the plotted values used as color
        limits whenever vmin / vmax are not given. If None, the minimum and
        maximum of the plotted values are used instead.
    color_bar_label: `str`, optional (default: '')
        Color bar label
    log_transform: `bool`, optional (default: False)
        If true, plot log10(value + 1). This is needed when the data
        matrix has entries varying by several order of magnitude.
    color_map: matplotlib colormap, optional (default: plt.cm.Reds)
        Colormap of the image and of the color bar.
    vmin, vmax: `float`, optional (default: None)
        Explicit color limits; they take precedence over col_range.
    fig_width, fig_height: `float`, optional (default: 4, 6)
        Figure size in inches.
    color_bar: `bool`, optional (default: True)
        Whether to draw a color bar.
    x_label, y_label: `str`, optional (default: None)
        Axis labels; nothing is set when None.
    pseudo_count: `float`, optional (default: 10**(-10))
        Constant added to data_matrix before it is used for ordering.
        It is not added to the plotted values.

    Returns
    -------
    The matplotlib axes holding the image.
    """

    from matplotlib.colors import Normalize as mpl_Normalize

    x_array = np.arange(data_matrix.shape[1])
    y_array = np.arange(data_matrix.shape[0])
    # Column order.
    if order_map_x and (data_matrix.shape[1] > 2):
        if data_matrix.shape[0] != data_matrix.shape[1]:
            # Non-square input: cluster the output of tl.get_normalized_covariance
            # (presumably a column-by-column matrix - confirm against that helper).
            X = tl.get_normalized_covariance(data_matrix + pseudo_count,
                                             method="SW")
            Z = hierarchy.ward(X)
            order_x = hierarchy.leaves_list(
                hierarchy.optimal_leaf_ordering(Z, X))
        else:
            # Square input: the rows are clustered directly and the resulting
            # order is used for the columns.
            Z = hierarchy.ward(data_matrix + pseudo_count)
            order_x = hierarchy.leaves_list(
                hierarchy.optimal_leaf_ordering(Z, data_matrix + pseudo_count))
    else:
        order_x = x_array

    # Row order.
    if order_map_y and (data_matrix.shape[0] > 2):
        if data_matrix.shape[0] != data_matrix.shape[1]:
            # Non-square input: delegated to hf.get_hierch_order.
            order_y = hf.get_hierch_order(data_matrix + pseudo_count)
        else:
            Z = hierarchy.ward(data_matrix + pseudo_count)
            order_y = hierarchy.leaves_list(
                hierarchy.optimal_leaf_ordering(Z, data_matrix + pseudo_count))
    else:
        order_y = y_array

    # Apply both orders; optionally switch to log10(value + 1).
    if log_transform:
        new_data = np.log(data_matrix[order_y][:, order_x] + 1) / np.log(10)
        label_ = " (log10)"
    else:
        new_data = data_matrix[order_y][:, order_x]
        label_ = ""

    # Color limits: explicit vmin/vmax win, then percentiles, then min/max.
    col_data = new_data.flatten()
    if vmin is None:
        if col_range is None:
            vmin = np.min(col_data)
        else:
            vmin = np.percentile(col_data, col_range[0])

    if vmax is None:
        if col_range is None:
            vmax = np.max(col_data)
        else:
            vmax = np.percentile(col_data, col_range[1])
            # A zero upper percentile would flatten the color scale; fall back
            # to 1 (data reaching 1 or more) or to the data maximum.
            if (vmax == 0) & (np.max(col_data) >= 1):
                vmax = 1
            if (vmax == 0) & (np.max(col_data) <= 1):
                vmax = np.max(col_data)

    fig, ax = plt.subplots()
    ax.imshow(new_data, aspect="auto", cmap=color_map, vmin=vmin, vmax=vmax)

    # Tick labels follow the same re-ordering as the data.
    if x_ticks is None:
        plt.xticks([])
    else:
        plt.xticks(
            x_array + 0.4,
            np.array(x_ticks)[order_x],
            rotation=90,
            ha="right",
        )

    if y_ticks is None:
        plt.yticks([])
    else:
        plt.yticks(
            y_array + 0.4,
            np.array(y_ticks)[order_y],
            ha="right",
        )

    if x_label is not None:
        ax.set_xlabel(x_label)
    if y_label is not None:
        ax.set_ylabel(y_label)

    if color_bar:
        # The color bar is built from the same limits and colormap as the image.
        norm = mpl_Normalize(vmin=vmin, vmax=vmax)
        cbar = plt.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=color_map))
        cbar.set_label(f"{color_bar_label}{label_}", rotation=270, labelpad=20)
    plt.gcf().set_size_inches((fig_width, fig_height))
    return ax
コード例 #19
0
ファイル: utils_u.py プロジェクト: jude-hey/DeepTCR
def rad_plot(X_2,
             pairwise_distances,
             samples,
             labels,
             file_id,
             color_dict,
             self,
             gridsize=50,
             dg_radius=0.2,
             axes_radius=0.4,
             figsize=8,
             log_scale=False,
             linkage_method='complete',
             plot_type='hexbin',
             filename=None,
             sample_labels=False):
    """Draw one 2-D panel per sample plus a polar dendrogram.

    The samples are ordered by hierarchical clustering of
    ``pairwise_distances`` followed by optimal leaf ordering. Each panel shows
    the rows of ``X_2`` belonging to one sample, inside a circle coloured by
    the sample's label.

    Parameters (as used by the code):
        X_2: 2-column array of coordinates.
        pairwise_distances: distances between samples, passed to ``linkage``
            and ``optimal_leaf_ordering``.
        samples: sample identifiers; indexed by dendrogram position.
        labels: per-sample label, used as key into ``color_dict``.
        file_id: per-row sample identifier, compared against ``samples``.
        color_dict: mapping from label to circle colour.
        self: object providing ``directory_results``; only read when
            ``filename`` is given.
        gridsize: number of bin edges per axis, also the hexbin grid size.
        dg_radius, log_scale: forwarded to ``polar_dendrogram``.
        axes_radius: radius passed to ``pol2cart`` for the panel positions.
        figsize: side length of the square figure.
        linkage_method: method passed to ``linkage``.
        plot_type: 'hexbin', '2dhist', or anything else for a plain point plot.
        filename: when given, the figure is saved under
            ``self.directory_results``.
        sample_labels: when true, the sample identifier is written in each panel.
    """
    n_s = len(np.unique(samples))
    clim = np.array([0, .1])
    d_max = np.max(X_2, axis=0)
    d_min = np.min(X_2, axis=0)
    # Circle around all points: centred on the bounding box, radius 10% larger
    # than the farthest point from that centre.
    c_center = (d_max + d_min) / 2
    c_radius = np.max(
        np.sqrt(np.sum(np.power(X_2 - c_center[np.newaxis, :], 2),
                       axis=1))) * 1.1
    c_pos = pol2cart(np.linspace(0, 2 * np.pi, 200),
                     c_radius) + c_center[np.newaxis, :]

    # Bin edges and bin centres for the '2dhist' variant.
    x_edges = np.linspace(d_min[0], d_max[0], gridsize)
    y_edges = np.linspace(d_min[1], d_max[1], gridsize)
    Y, X = np.meshgrid(x_edges[:-1] + (np.diff(x_edges) / 2),
                       y_edges[:-1] + (np.diff(y_edges) / 2))

    # Cluster the samples and put the dendrogram leaves in optimal order.
    Z = optimal_leaf_ordering(
        linkage(pairwise_distances, method=linkage_method), pairwise_distances)
    dg_order = leaves_list(Z)

    fig = plt.figure(figsize=[figsize, figsize])
    axes_pos = pol2cart(np.linspace(0, 2 * np.pi, n_s + 1),
                        rho=axes_radius) + 0.5
    axes_size = axes_radius * np.sin(0.5 * (2 * np.pi / n_s))
    ax = [None] * n_s

    # Panel i shows the sample at position i of the dendrogram order.
    for i in range(n_s):
        ax[i] = fig.add_axes([
            axes_pos[i, 0] - axes_size, axes_pos[i, 1] - axes_size,
            2 * axes_size, 2 * axes_size
        ])
        ax[i].plot(c_pos[:, 0],
                   c_pos[:, 1],
                   '-',
                   linewidth=5.,
                   color=color_dict[labels[dg_order[i]]])
        if sample_labels:
            ax[i].text(.5,
                       0.2,
                       samples[dg_order[i]],
                       horizontalalignment='center',
                       transform=ax[i].transAxes)
        smp_d = X_2[file_id == samples[dg_order[i]], :]
        # Compare strings by value: ``is`` tests object identity and only
        # worked by accident of string interning.
        if plot_type == 'hexbin':
            ax[i].hexbin(smp_d[:, 0], smp_d[:, 1], gridsize=gridsize, mincnt=1)
        elif plot_type == '2dhist':
            h, _ = np.histogramdd(smp_d, [x_edges, y_edges])
            ax[i].pcolormesh(X,
                             Y,
                             h / np.sum(h),
                             shading='gouraud',
                             vmin=clim[0],
                             vmax=clim[1],
                             cmap='GnBu')
        else:
            ax[i].plot(smp_d, '.', markersize=1, alpha=0.5)
        ax[i].set(xticks=[], yticks=[], frame_on=False)

    # Dendrogram data only (no plotting); the drawing is delegated to polar_dendrogram.
    dg = dendrogram(Z, no_plot=True)
    polar_dendrogram(dg, fig, ax_radius=dg_radius, log_scale=log_scale)
    if filename is not None:
        plt.savefig(os.path.join(self.directory_results, filename))
コード例 #20
0
ファイル: matrix_plotting.py プロジェクト: zietAn/nilearn
def plot_matrix(mat,
                title=None,
                labels=None,
                figure=None,
                axes=None,
                colorbar=True,
                cmap=plt.cm.RdBu_r,
                tri='full',
                auto_fit=True,
                grid=False,
                reorder=False,
                **kwargs):
    """Plot the given matrix.

    Parameters
    ----------
    mat : 2-D numpy array
        Matrix to be plotted.
    title : string or None, optional
        A text to add in the upper right corner of the axes.
    labels : list, ndarray of strings, empty list, False, or None, optional
        The label of each row and column. Needs to be the same
        length as rows/columns of mat. If False, None, or an
        empty list, no labels are plotted.
    figure : figure instance, figsize tuple, or None
        Sets the figure used. This argument can be either an existing
        figure, or a pair (width, height) that gives the size of a
        newly-created figure.
        Specifying both axes and figure is not allowed.
    axes : None or Axes, optional
        Axes instance to be plotted on. Creates a new one if None.
        Specifying both axes and figure is not allowed.
    colorbar : boolean, optional
        If True, an integrated colorbar is added.
    cmap : matplotlib colormap, optional
        The colormap for the matrix. Default is RdBu_r.
    tri : {'lower', 'diag', 'full'}, optional
        Which triangular part of the matrix to plot:
        'lower' is the lower part, 'diag' is the lower including
        diagonal, and 'full' is the full matrix.
    auto_fit : boolean, optional
        If auto_fit is True, the axes are dimensioned to give room
        for the labels. This assumes that the labels are resting
        against the bottom and left edges of the figure.
    grid : color or False, optional
        If not False, a grid is plotted to separate rows and columns.
        The grid lines are currently always drawn in grey, whatever
        value is given.
    reorder : boolean or {'single', 'complete', 'average'}, optional
        If not False, reorders the matrix into blocks of clusters.
        Accepted linkage options for the clustering are 'single',
        'complete', and 'average'. True defaults to average linkage.

        .. note::
            This option is only available with SciPy >= 1.0.0.

        .. versionadded:: 0.4.1

    kwargs : extra keyword arguments
        Extra keyword arguments are sent to pylab.imshow

    Returns
    -------
    display : instance of matplotlib
        Axes image.

    Raises
    ------
    ValueError
        If the number of labels differs from the matrix size, if
        ``reorder`` is requested without labels or with an unsupported
        value, or if both ``figure`` and ``axes`` are given.
    ImportError
        If ``reorder`` is requested and SciPy does not provide
        ``optimal_leaf_ordering``.
    """
    # we need a list so an empty one will be cast to False
    if isinstance(labels, np.ndarray):
        labels = labels.tolist()
    if labels and len(labels) != mat.shape[0]:
        raise ValueError("Length of labels unequal to length of matrix.")

    if reorder:
        if not labels:
            raise ValueError("Labels are needed to show the reordering.")
        try:
            from scipy.cluster.hierarchy import (linkage,
                                                 optimal_leaf_ordering,
                                                 leaves_list)
        except ImportError:
            raise ImportError("A scipy version of at least 1.0 is needed "
                              "for ordering the matrix with "
                              "optimal_leaf_ordering.")
        valid_reorder_args = [True, 'single', 'complete', 'average']
        if reorder not in valid_reorder_args:
            raise ValueError("Parameter reorder needs to be "
                             "one of {}.".format(valid_reorder_args))
        if reorder is True:
            reorder = 'average'
        linkage_matrix = linkage(mat, method=reorder)
        ordered_linkage = optimal_leaf_ordering(linkage_matrix, mat)
        index = leaves_list(ordered_linkage)
        # make sure labels is an ndarray and copy it
        labels = np.array(labels).copy()
        mat = mat.copy()
        # and reorder labels and matrix
        labels = labels[index].tolist()
        mat = mat[index, :][:, index]

    # Mask everything outside the requested triangle. The builtin ``bool``
    # is used as dtype because the ``np.bool`` alias no longer exists in
    # recent NumPy releases.
    if tri == 'lower':
        mask = ~np.tri(mat.shape[0], k=-1, dtype=bool)
        mat = np.ma.masked_array(mat, mask)
    elif tri == 'diag':
        mask = ~np.tri(mat.shape[0], dtype=bool)
        mat = np.ma.masked_array(mat, mask)
    if axes is not None and figure is not None:
        raise ValueError("Parameters figure and axes cannot be specified "
                         "together. You gave 'figure=%s, axes=%s'" %
                         (figure, axes))
    if figure is not None:
        if isinstance(figure, plt.Figure):
            fig = figure
        else:
            fig = plt.figure(figsize=figure)
        axes = plt.gca()
        own_fig = True
    else:
        if axes is None:
            fig, axes = plt.subplots(1, 1, figsize=(7, 5))
            own_fig = True
        else:
            fig = axes.figure
            own_fig = False
    display = axes.imshow(mat,
                          aspect='equal',
                          interpolation='nearest',
                          cmap=cmap,
                          **kwargs)
    axes.set_autoscale_on(False)
    # Remember the y-limits set by imshow; they are restored after the
    # grid lines have been drawn.
    ymin, ymax = axes.get_ylim()
    if not labels:
        axes.xaxis.set_major_formatter(plt.NullFormatter())
        axes.yaxis.set_major_formatter(plt.NullFormatter())
    else:
        axes.set_xticks(np.arange(len(labels)))
        axes.set_xticklabels(labels, size='x-small')
        for label in axes.get_xticklabels():
            label.set_ha('right')
            label.set_rotation(50)
        axes.set_yticks(np.arange(len(labels)))
        axes.set_yticklabels(labels, size='x-small')
        for label in axes.get_yticklabels():
            label.set_ha('right')
            label.set_va('top')
            label.set_rotation(10)

    if grid is not False:
        size = len(mat)
        # Different grids for different layouts
        if tri == 'lower':
            for i in range(size):
                # Correct for weird mis-sizing
                i = 1.001 * i
                axes.plot([i + 0.5, i + 0.5], [size - 0.5, i + 0.5],
                          color='grey')
                axes.plot([i + 0.5, -0.5], [i + 0.5, i + 0.5], color='grey')
        elif tri == 'diag':
            for i in range(size):
                # Correct for weird mis-sizing
                i = 1.001 * i
                axes.plot([i + 0.5, i + 0.5], [size - 0.5, i - 0.5],
                          color='grey')
                axes.plot([i + 0.5, -0.5], [i - 0.5, i - 0.5], color='grey')
        else:
            for i in range(size):
                # Correct for weird mis-sizing
                i = 1.001 * i
                axes.plot([i + 0.5, i + 0.5], [size - 0.5, -0.5], color='grey')
                axes.plot([size - 0.5, -0.5], [i + 0.5, i + 0.5], color='grey')

    axes.set_ylim(ymin, ymax)

    if auto_fit:
        if labels:
            fit_axes(axes)
        elif own_fig:
            plt.tight_layout(pad=.1,
                             rect=((0, 0, .95, 1) if colorbar else
                                   (0, 0, 1, 1)))

    if colorbar:
        cax, _ = make_axes(axes,
                           location='right',
                           fraction=0.05,
                           shrink=0.8,
                           pad=.0)
        fig.colorbar(mappable=display, cax=cax)
        # make some room
        fig.subplots_adjust(right=0.8)
        # change current axis back to matrix
        plt.sca(axes)

    if title is not None:
        # Scale the font so the longest title line fits the axes width.
        # The length is clamped to 1 so an empty title cannot cause a
        # division by zero.
        text_len = max(max(len(t) for t in title.split('\n')), 1)
        size = axes.bbox.size[0] / text_len
        axes.text(0.95,
                  0.95,
                  title,
                  horizontalalignment='right',
                  verticalalignment='top',
                  transform=axes.transAxes,
                  size=size)

    return display
コード例 #21
0
def HDBSCAN(l2fcs,
            axis,
            min_cluster_size=2,
            min_samples=1,
            cluster_selection_epsilon=0.2,
            metric='cosine',
            cluster_selection_method='eom',
           ):
    """Cluster one axis of a table with HDBSCAN and order it for display.

    A pairwise distance matrix is computed along the requested axis, fed to
    ``hdbscan.HDBSCAN`` as a precomputed metric, and the resulting single
    linkage tree is rearranged with optimal leaf ordering.

    Parameters
    ----------
    l2fcs : pandas DataFrame
        Table to cluster (presumably log2 fold changes, judging by the
        name - confirm with the caller).
    axis : {'guides', 'outcomes'}
        'guides' clusters the columns of ``l2fcs``, 'outcomes' its rows.
    min_cluster_size, min_samples, cluster_selection_epsilon,
    cluster_selection_method
        Passed through unchanged to ``hdbscan.HDBSCAN``.
    metric : {'cosine', 'correlation', 'euclidean'}
        Distance used for clustering; the matching similarity is reported
        in the results.

    Returns
    -------
    dict
        'clustered_order': labels in dendrogram order;
        'cluster_assignments': cluster id of each label, in that order;
        'distances': distance DataFrame re-indexed to that order;
        'similarities': similarity matrix of the reordered data;
        'linkage': leaf-ordered linkage matrix;
        'original_order': labels in input order;
        'clusterer': the fitted ``hdbscan.HDBSCAN`` object.

    Raises
    ------
    ValueError
        If ``axis`` or ``metric`` is not one of the supported values.
    """
    # cosine_distances wants samples to be rows and features to be columns
    if axis == 'guides':
        to_cluster = l2fcs.T
    elif axis == 'outcomes':
        to_cluster = l2fcs
    else:
        raise ValueError(axis)

    if metric == 'cosine':
        distances = sklearn.metrics.pairwise.cosine_distances(to_cluster)
    elif metric == 'correlation':
        distances = 1 - to_cluster.T.corr()
    elif metric == 'euclidean':
        distances = ssd.squareform(ssd.pdist(to_cluster))
    else:
        # Fail early: without a distance matrix the clusterer would only
        # be handed an all-NaN frame and fail with an unrelated message.
        raise ValueError('unsupported metric: {!r}'.format(metric))

    labels = list(to_cluster.index.values)

    distances = pd.DataFrame(distances, index=labels, columns=labels)

    clusterer = hdbscan.HDBSCAN(metric='precomputed',
                                min_cluster_size=min_cluster_size,
                                min_samples=min_samples,
                                cluster_selection_epsilon=cluster_selection_epsilon,
                                cluster_selection_method=cluster_selection_method,
                               )
    clusterer.fit(distances)

    linkage = clusterer.single_linkage_tree_.to_numpy()
    linkage = sch.optimal_leaf_ordering(linkage, ssd.squareform(distances))
    dendro = sch.dendrogram(linkage,
                            no_plot=True,
                            labels=labels,
                           )

    clustered_order = dendro['ivl']
    cluster_ids = clusterer.labels_

    # Transform from original order into the order produced by dendrogram.
    # A label -> first position map replaces a linear search per label.
    first_position = {}
    for position, label in enumerate(labels):
        first_position.setdefault(label, position)
    cluster_assignments = [cluster_ids[first_position[label]]
                           for label in clustered_order]

    # `axis` was validated above, so anything that is not 'guides' is
    # 'outcomes' here.
    if axis == 'guides':
        l2fcs_reordered = l2fcs.loc[:, clustered_order]
    else:
        l2fcs_reordered = l2fcs.loc[clustered_order, :].T

    # `metric` was validated above, so the last branch is 'euclidean'.
    if metric == 'correlation':
        similarities = l2fcs_reordered.corr()
    elif metric == 'cosine':
        similarities = sklearn.metrics.pairwise.cosine_similarity(l2fcs_reordered.T)
    else:
        similarities = 1 / (1 + ssd.squareform(ssd.pdist(l2fcs_reordered.T)))

    results = {
        'clustered_order': clustered_order,
        'cluster_assignments': cluster_assignments,

        'distances': distances.loc[clustered_order, clustered_order],
        'similarities': similarities,
        'linkage': linkage,
        'original_order': labels,

        'clusterer': clusterer,
    }

    return results
コード例 #22
0
ファイル: 9.0.baselines.py プロジェクト: liusida/ds2_arxiv
# Script fragment: reorder a square matrix with hierarchical clustering and
# then with a one-component PCA, saving a picture after every step.
# `elements`, `default_rng`, `np`, `hierarchy`, `PCA` and `save_pic` are
# defined or imported before this fragment.

# Keep only index 0 along the third axis of `elements`.
elements = elements[:, :, 0]
# Seeded generator (presumably numpy.random.default_rng - confirm import).
rng = default_rng(seed=1)
# One random permutation of the row indices.
i = rng.permutation(np.arange(elements.shape[0]))
# j = rng.permutation(np.arange(elements.shape[0]))
# Apply the same permutation to the rows and then to the columns.
elements = elements[i]
elements = elements[:, i]

# NOTE(review): this rebinds `elements` to the matrix loaded from disk, so
# the shuffled array built above is discarded and the "randomized" picture
# shows the loaded matrix as stored - confirm this is intended.
elements = np.load("shared/author_similarity_matrix.npy")
print(elements.shape)

save_pic(elements, "randomized")

# Single pass; note that the loop variable rebinds the name `i` that held
# the permutation above.
for i in range(1):
    # NOTE(review): if `hierarchy` is scipy.cluster.hierarchy, a 2-D input
    # is treated as observation vectors rather than as a precomputed
    # distance matrix - confirm that is what is wanted here.
    Z = hierarchy.ward(elements)
    indices = hierarchy.leaves_list(
        hierarchy.optimal_leaf_ordering(Z, elements))

    # Reorder the rows, transpose, reorder the rows again and transpose
    # back: rows and columns both end up in `indices` order.
    elements = elements[indices].T
    elements = elements[indices].T

    save_pic(elements, f"processed_{i}_olo")

    # Fit a one-component PCA and sort by the flattened component values.
    pca = PCA(n_components=1)
    pca.fit(elements)
    print(pca.components_.shape)
    indices = np.argsort(pca.components_.flatten())

    # Same row-and-column reordering as above, with the PCA-derived order.
    elements = elements[indices].T
    elements = elements[indices].T

    save_pic(elements, f"processed_{i}_pca")