Exemple #1
 def cluster_groups(self):
     Returns a dictionary where keys are cluster labels and
     the values are all items under the key cluster.
     C = self._children
     C_groups = {}
     for k in C.keys():
         if len(C[k]) > 0:
             final_items = list(C[k])
             done = False
             while not (done):
                 new_final_items = []
                 done = True
                 for l in final_items:
                     if len(C[l]) > 1:
                         done = False
                         new_final_items = new_final_items + list(C[l])
                         new_final_items = new_final_items + [l]
                 final_items = new_final_items
             C_groups[self.clusters[k]] = _Group(
                 self.items[_np.array(final_items)], superset=self.items)
             C_groups[self.clusters[k]] = _Group(_np.array([self.items[k]]),
     return C_groups
def scatter3D(x, y, z, agg=None, layout=None, show_items=None, **kwargs):
    Generates a 3-dimensional scatter plot of given coordinate vectors; optionally plots them on separate traces based on an aggregation.

    x :             The x-coordinates of the data points.

    y :             The y-coordinates of the data points.

    z :             The z-coordinates of the data points.

    Keyword Arguments
    agg :           An Aggregation of the indices of x, y and z.

    show_items :    A one-dimensional array of which indices of x, y and z are to be shown.

    layout :        A dictionary for updating values for the Plotly Figure layout.

    **kwargs :      Keyword arguments for the Plotly Scatter3d trace.
                    If an attribute is given as a single string or float, will be applied to all data points. 
                    If as an array of length x.shape[0], will be applied separately to each data point.
                    If as an array of length agg.clusters.size, will be applied separately to each cluster.

    fig :           A Plotly Figure containing the scatter plot.
    if agg is None:
        agg = _Aggregation(_Group(_np.arange(x.shape[0])),
                           _Group(_np.array([0])), {0: _np.arange(x.shape[0])})
    specific_keywords = [{} for i in range(agg.clusters.size)]
    for k, v in kwargs.items():
        if hasattr(v, '__len__') and not (isinstance(v, str)):
            if len(v) == len(agg.clusters):
                for i in range(agg.clusters.size):
                    specific_keywords[i][k] = v[i]
            elif len(v) == len(agg.items):
                for i in range(agg.clusters.size):
                    specific_keywords[i][k] = v[agg._aggregations[i]]
            for i in range(agg.clusters.size):
                specific_keywords[i][k] = v
    if kwargs.get('name', None) is None:
        for i in range(agg.clusters.size):
            specific_keywords[i]['name'] = str(agg.clusters.elements[i])
    fig = _go.Figure(data=[
        for i in range(agg.clusters.size)
    if layout is not None:
    return fig
Exemple #3
 def at_scale(self, scale):
     Returns the aggregation corresponding to the coarsest
     partition made from clusters not exceeding the given scale.
     C_tree = self._children
     C_less_than = {
         k: C_tree[k]
         for k in C_tree.keys() if self._scales[k] <= scale
     C_top = {
         k: C_less_than[k]
         for k in C_less_than.keys() if is_canopy(k, C_less_than)
     C_groups = self.cluster_groups()
     C_top_lists = {
         k: C_groups[self.clusters[k]].in_superset
         for k in C_top.keys()
     C_top_group = _Group(self.clusters.elements[_np.array(
     return _Aggregation(
         self.items, C_top_group, {
             C_top_group.ind[self.clusters.elements[k]]: C_top_lists[k]
             for k in C_top_lists.keys()
def split_by_gaps(vec,num_gaps = 1,group = None):
    Aggregates the indices of a vector based on gaps between index values.
    The number of gaps is specified by num_gaps,
    and the largest num_gaps gaps in the sorted array
    are used to cluster values.

    vec :       A one-dimensional array of values.

    Keyword Arguments
    num_gaps :  The number of gaps to use to break vec into clusters.

    group :     The group which labels the indices of vec, and which will be the item set of the returned Aggregation.

    Aggregation of the indices of vec
    if group is None:
        group = _Group(_np.arange(len(vec)))

    sort_inds = _np.argsort(vec)

    gap_locs = _np.flip(

    ordered_gaps = _np.sort(gap_locs)

    agg_dict = {
        for k in range(0,num_gaps-1)
    return _Aggregation(group,_Group(_np.arange(num_gaps+1)),agg_dict)
Exemple #5
 def cluster_children(self):
     Returns a dictionary where keys are cluster labels and
     the values are the immediate child clusters.
     return {
         self.clusters[k]: _Group(self.clusters.elements[self._children[k]],
         for k in self._children.keys()
 def as_dict(self):
     Returns a dictionary whose keys are from self.clusters
     and whose values are Groups corresponding to said clusters.
     return {
         for k in self._aggregations.keys()
Exemple #7
 def clusters_containing(self, items_list):
     """
     Returns a Group containing all cluster labels for clusters
     that contain the given items.

     items_list : A re-iterable collection (e.g. a list) of items.
                  A cluster qualifies only if it contains every one
                  of them; an empty collection matches every cluster.

     Returns a Group of the matching cluster labels, with
     self.clusters as its superset.
     """
     C_groups = self.cluster_groups()
     # NOTE(review): the body of the membership test was missing in the
     # original text; collecting the cluster label is the reading implied
     # by the docstring and by `superset=self.clusters` on the result.
     containing_groups = [
         key for key, grp in C_groups.items()
         if all(i in grp for i in items_list)
     ]
     return _Group(_np.array(containing_groups), superset=self.clusters)
Exemple #8
 def join(self, cluster_list):
     Returns the smallest cluster that is a supercluster of all given clusters.
     C_groups = self.cluster_groups()
     items = list(
         _reduce(lambda x, y: x + y, [C_groups[c] for c in cluster_list],
     rivals = self.clusters_containing(items)
     lens = _np.array(
         [len(self.cluster_groups()[c]) for c in rivals.elements])
     return rivals[_np.argmin(lens)]
def split_by_quantiles(vec,quantiles=0.95,group = None):
    Like split_by_vals, but cuts the vector at specific quantiles
    rather than rigid values. Assumes right-continuity of the 
    cumulative distribution function.

    vec :       A one-dimensional array of values.

    Keyword Arguments
    quantiles : A single value or list/array of quantiles which will be used to divide the vector components.

    group :     The group which labels the indices of vec, and which will be the item set of the returned Aggregation.

    Aggregation of the indices of vec
    num = len(vec)
    if group is None:
        group = _Group(_np.arange(len(vec)))

    if not(isinstance(quantiles,_np.ndarray)):
        if isinstance(quantiles,list):
            quantiles = _np.array(quantiles)
            quantiles = _np.array([quantiles])

    cdf = _np.sum(vec[:,None] <= vec[None,:],axis=0)/len(vec)
    where = quantiles[:,None]<= cdf[None,:]
    cuts = _np.amin(

    return split_by_vals(vec,cuts=cuts,group=group)
 def __getitem__(self, key):
     return _Group(
def hier_from_blocks(block_mats,scales=None,group=None):
    Given a parameterized ensemble of block matrices, each more coarse-grained than the last,
    constructs a corresponding Hierarchy object.

    block_mats :    A three-dimensional array. The first dimension is the ensemble dimension, and the remaining two dimensions are equal.

    Keyword Arguments
    scales :        A one-dimensional monotonically increasing array, giving a scale parameter for each ensemble

    group :         The group which labels the indices of block_mats.shape[1], and which will be the item set of the returned Aggregation.

    Hierarchy of the indices of block_mats.shape[1]
    if group is None:
        group = _Group(_np.arange(block_mats.shape[1]))
    if scales is None:
        scales = _np.arange(block_mats.shape[0])

    current_agg = _Aggregation(
        {j:_np.array([j]) for j in range(group.size)}
    new_block_mats = block_mats.copy()
    cluster_children = {}
    proper_clusters = 0
    for j in range(block_mats.shape[0]):
        st_mat = _utils.stoch(new_block_mats[j])
        agg = shi_malik(
        new_agg_dict = {}
        new_agg_names = []
        num_clusters = 0
        for k,c in agg:
            if c.size>1:
                cluster_children[proper_clusters+group.size] = (
                new_agg_dict[num_clusters] = c.in_superset
                proper_clusters += 1
                num_clusters += 1
                new_agg_dict[num_clusters] = c.in_superset
                num_clusters += 1
        new_agg = _Aggregation(

        blocks = [
            for k in range(new_agg.clusters.size)
        new_block_mats = (
            ) > 0
        current_agg = new_agg

    labels = _Group(_np.array(

    return _Hierarchy(
def fushing_mcassey(st_mat,max_visits=5,time_quantile_cutoff=0.95,group=None):
    Given a square stochastic matrix describing the strength
    of the relationship between pairs of items,
    determines an aggregation of the items using
    the regulated random walk approach of Fushing and McAssey.
    The algorithm is inherently random
    and highly unstable as a single-shot approach,
    but may be used in an ensemble to determine a 
    useful similarity matrix.

    Suppose st_mat is given by the Markov matrix T.
    A regulated random walk is taken using T as the initial
    transition probabilities, and modifying these probabilities
    to remove from circulation any node which has been visited
    at least max_visits times (this prevents the walk from
    being stuck in a cluster for too long). The time between removals
    is recorded; the highest values (determined by time_quantile_cutoff)
    determine the number of clusters (it is interpreted that a sudden long
    removal time after many short removal times indicates 
    one has left a highly-explored cluster and entered an unexplored one).

    A node which was removed and for which >50% of its visits
    prior to removal were in a particular time interval is placed in the cluster
    associated with that time interval; all other nodes remain unclustered.

    This algorithm will not return useful results after a single run,
    but if an ensemble of runs is collected it may be used to
    derive a similarity matrix, based on how often two nodes are in
    a cluster together over the many runs.

    st_mat :                A square stochastic matrix describing a Markov dynamic.

    Keyword Arguments
    max_visits :            The maximum number of visits to a node before it is removed in the regulated random walk.

    time_quantile_cutoff :  The quantile of the length of time between node removals, which is used to determine the number of clusters.

    group :                 The group which labels the indices of st_mat, and which will be the item set of the returned Aggregation.

    Aggregation of the indices of st_mat
    if group is None:
        group = _Group(_np.arange(st_mat.shape[0]))

    reg = lambda t,ps,an,nd: _regulators.node_removal_regulator(

    hlt = lambda t,an,nd: _regulators.halt_when_explored(

    reports, path = _simulation.markov_random_walk(

    if len(reports)==0:
        return _Aggregation(
                for j in _np.arange(group.size)
        times = _np.concatenate([

        clust_times = reports[

        if clust_times[0]>0:
            clust_times = _np.concatenate([_np.array([0]),clust_times])
        clusters = {
            for j in range(len(clust_times))
        for k in group:
            if k in _np.unique(reports[:,0]):
                block_counts = _np.add.reduceat((path==k),clust_times)
                props = block_counts/_np.sum(block_counts)
                if _np.any(props>0.5):
                    clusters[group.ind[k]] = [group.ind[k]]
                clusters[group.ind[k]] = [group.ind[k]]

        cluster_names = _np.array(
            [k for k in clusters.keys() if len(clusters[k])>0]

        agg_dict = {
            for j in range(len(cluster_names))

        return _Aggregation(
def shi_malik(st_mat,eig_thresh=0.95,cut=0,group=None):
    Given a stochastic matrix describing the strength
    of the relationship between pairs of items,
    determines an aggregation of the items using
    the spectral approach of Shi and Malik.

    A column-stochastic matrix T will always have a leading
    eigenvalue of 1 and a leading uniform right-eigenvector, 
    u=(1,...,1), which is a fixed point of the map:

    T u = u

    If T has no disconnected components then u is the
    unique fixed point (up to a constant scaling) 
    and the sub-leading eigenvalue
    is strictly less than one; otherwise, the eigenvalue
    1 is degenerate. In the first case, if the sub-leading
    eigenvalue is close to 1, then the sub-leading
    right-eigenvector y may be used to partition the indices into
    two slowly-decaying communities.

    The Shi-Malik algorithm is recursive, taking
    the sub-leading eigenvector of T (as long as the
    corresponding eigenvalue is above a threshold),
    using it to bipartition the indices, and then
    repeating these steps on the partitions with a reweighted
    matrix. This implementation cuts the vector y by value,
    by default into components y>0 and y<=0, because of the
    orthogonality relationship

    <y>_pi = y . pi = 0

    which indicates that the mean value of y
    under the stationary distribution pi 
    (left-eigenvector of T)
    must always be zero, making this a value of significance.

    The algorithm halts when no community has a sub-leading
    eigenvector above the threshold, and the final partitioning
    is returned as an Aggregation.

    st_mat :        A square stochastic matrix describing a Markov dynamic.

    Keyword Arguments
    eig_thresh :    The smallest value the subleading eigenvalue may have to continue the recursion.

    cut :           The value used to "cut" the subleading eigenvector into two clusters.

    group :         The group which labels the indices of st_mat, and which will be the item set of the returned Aggregation.

    Aggregation of the indices of st_mat
    if group is None:
        group = _Group(_np.arange(st_mat.shape[0]))

    num_items = group.size
    clusts = _Aggregation(
    change = True
    while change:
        new_clusts = []
        change = False
        for k,c in clusts:
            if len(c)>1:
                T = _utils.stoch(st_mat[
                eigs,evecs = _la.eig(T)
                einds = _np.flip(_np.argsort(_np.abs(eigs)))
                if eigs[einds[1]]>eig_thresh:
                    y = _np.real(evecs[:,einds[1]])
                    ind_agg = split_by_vals(y/_np.sum(y),group=c,cuts=cut)
                    if ind_agg.clusters.size>1:
                        ind_agg = split_by_gaps(y,group=c)
                    change = True

        new_agg = {j:new_clusts[j] for j in range(len(new_clusts))}
        clusts = _Aggregation(
    return clusts
def meyer_wessell(st_mat,min_times_same = 5,vector_clustering = None,group = None):
    Given a column-stochastic matrix describing the strength
    of the relationship between pairs of items,
    determines an aggregation of the items using the dynamical
    approach of Meyer and Wessell. The algorithm
    is inherently random, though fairly stable, and so may
    be used as a one-shot measure but will be more reliable
    in an ensemble.

    A column-stochastic matrix T will, by the Perron-Frobenius theorem,
    have a uniform vector u = (1,...,1) as a fixed point, that is:

    T u = u
    This fixed point is unique as long as 
    the stochastic matrix is not reducible
    into disconnected components. If it is almost reducible
    (that is, if there are strongly connected communities with
    weak connections between them), the vector T^t u will achieve
    uniformity among the connected components before achieving global
    uniformity.

    The Meyer-Wessell approach relies on applying the
    column-stochastic matrix T to a random initial vector x and
    detecting communities by identifying clusters of components which
    achieve uniformity long before global uniformity is reached.
    This is done by iteratively applying T to x and, at each iteration,
    performing some kind of vector clustering on T^t x. When
    the resulting Aggregation ceases to change over a long
    enough number of iterations, it is returned as the final Aggregation.

    st_mat :            A square bistochastic matrix.

    Keyword Arguments
    min_times_same :    The number of iterations after which, if the clustering has not changed, the algorithm halts.

    vector_clustering : The particular method of vector clustering which should be used in the algorithm.
                        Should receive a vector as the sole input and return an Aggregation.

    group :             The group which labels the indices of st_mat, and which will be the item set of the returned Aggregation.

    Aggregation of the indices of st_mat
    method = vector_clustering
    if group is None:
        group = _Group(_np.arange(st_mat.shape[0]))

    if method is None:
        eigs,vecs = _la.eig(st_mat)
        eig_agg = split_by_gaps(eigs)
        k = len(eig_agg[1])
        if k>1:
            method = lambda v:split_by_gaps(
                v,num_gaps = k-1,group = group
            method = lambda v:_Aggregation(

    x = _np.random.rand(st_mat.shape[0])
    times_same = 1
    new_agg = method(x)
    by_cluster = method(x).by_cluster()
    while times_same < min_times_same:
        x = st_mat@x
        new_agg = method(x)
        new_by_cluster = new_agg.by_cluster()
        if _np.all(new_by_cluster == by_cluster):
            times_same += 1
            times_same = 1
        by_cluster = new_by_cluster

    return new_agg
def split_by_vals(vec,cuts=0,group = None,tol=0):
    Aggregates the indices of a vector based on specified
    values at which to cut the sorted array. Assumes the
    right-continuity of the cumulative distribution function.

    vec :   A one-dimensional array of values.

    Keyword Arguments
    cuts :  A single value or list/array of values which will be used to divide the vector components.

    group : The group which labels the indices of vec, and which will be the item set of the returned Aggregation.

    Aggregation of the indices of vec
    if group is None:
        group = _Group(_np.arange(len(vec)))

    if not(isinstance(cuts,_np.ndarray)):
        if isinstance(cuts,list):
            cuts = _np.array(cuts)
            cuts = _np.array([cuts])
    cuts = cuts[

    if len(cuts)==0:
        return _Aggregation(
        agg_dict = {
                    vec > cuts[k],
                    vec <= cuts[k+1]
            for k in range(0,len(cuts)-1)
            0:_np.where(vec <= cuts[0])[0],
            len(cuts):_np.where(vec > cuts[len(cuts)-1])[0]

        return _Aggregation(
def markov_random_walk(probs,
    Given a set of transition probabilities, generates a random walk
    through the available nodes. The initial node, if not specified,
    is randomly selected. The method returns two items.
    The first is the report array R,
    dimensions Mx2, where the reports are indexed by the axis M.
    R[i,0] is the content of the report and R[i,1] is the time of the report.
    The second is the path P, an N-dimensional vector, where N is the number of steps
    in the random walk and P[j] is the node at time j.

    The walk may be regulated. This involves passing a regulator,
    which is a function that takes the simulation time, 
    the transition probabilities, the current node, 
    and an array of node data. At each time, 
    the regulator returns a Boolean indicating
    whether a report is to be made, the content of the report,
    and a new set of transition probabilities determined by
    the available information. The regulator also
    updates node_data in-place.

    Lastly, one can either specify a maximum length
    of time for the walk (using the keyword max_time) 
    or a more general halting condition (using the keyword halt).
    A halting condition takes the time, current node, and node data.
    If neither is specified, the maximum length will
    be set to 100 steps.


    probs :         A square Markov matrix indicating the 
                    transition probabilities for the walk.

    initial :       The initial node. If group is not None, 
                    then the type of initial should be the 
                    type of the elements of group. Otherwise, 
                    initial should be the index of the initial node. 
                    If not specified, a random node will be chosen.

    group :         A Group whose elements label the indices of probs. 
                    If specified, inputs like initial and outputs 
                    like the path refer to nodes by their labels. 
                    If not specified, nodes will be referred to 
                    in inputs and outputs by their index.

    regulator :     The regulator determines how the probability matrix 
                    will be modified over the course of the walk, 
                    and what events will be noted in reports. 
                    See stoclust.regulators for more details. 
                    If not specified, a trivial regulator will be used 
                    which never modifies the transition matrix 
                    and returns no reports.

    halt :          The halt condition determines under what conditions 
                    the walk should stop. See the stoclust.regulators 
                    for more details. If not specified, a trivial halt 
                    condition to stop after a specified number of steps 
                    will be used; the number of steps can be changed 
                    using the max_time argument.

    max_time :      If a halt condition is not specified, then the walk 
                    is halted automatically after max_time steps. 
                    The default is set to 100.
    if group is None:
        group = _Group(_np.arange(probs.shape[0]))

    if regulator is None:
        regulator = lambda t, ps, an, nd: (False, None, ps)

    if halt is None:
        halt = lambda t, an, nd: _regulators.halt_after_time(
            t, an, nd, max_time=max_time)

    if initial is None:
        initial_ind = _np.random.choice(_np.arange(probs.shape[0]))
        initial_ind = group.ind[initial]

    reports = []
    locations = []
    node_data = _np.zeros([probs.shape[0]])

    t = 0
    current = initial_ind
    will_report, report, new_probs = regulator(t, probs, current, node_data)
    if will_report:
        reports.append([report, t])
    t += 1

    while not (halt(t, current, node_data)):
        if _np.sum(new_probs[current, :]) < tol:
            remaining = _np.where(_np.sum(new_probs, axis=0) > tol)[0]
            sequel = _np.random.choice(remaining)
            sequel = _np.random.choice(_np.arange(probs.shape[0]),
                                       p=new_probs[current, :])

        will_report, report, new_probs = regulator(t, probs, sequel, node_data)
        if will_report:
            reports.append([report, t])

        current = sequel
        t += 1

    return _np.array(reports), _np.array(locations)