Beispiel #1
0
 def fast_min(state, beta, n_sweep, fast_tol, seed=None):
     if seed:
         gt.seed_rng(seed)
     dS = 1
     while np.abs(dS) > fast_tol:
         dS, _, _ = state.multiflip_mcmc_sweep(beta=beta, niter=n_sweep)
     return state
Beispiel #2
0
 def __init__(self, n, seed_number=None, directed=False):
     self.n = n
     self.seed_number = seed_number
     self.directed = directed
     if seed_number:
         gt.seed_rng(self.seed_number)
         seed(self.seed_number)
Beispiel #3
0
	def _generate_graph(self):
		np.random.seed(self.seed)
		gt.seed_rng(self.seed)
		if 'block_membership' in self.params and self.params['block_membership'] != None:
			self.graph, self.bm = self.generator(**self.params) 
		else:
			self.graph = self.generator(**self.params) 
		self.n_nodes = self.graph.num_vertices()
Beispiel #4
0
def draw_gviz(node_dict, size_multiple=50, random_seed=42, **kwargs):
    """ Draw clonal network using graph-tool

    More information: graphtool edge / vertex parameters and examples:
        https://graph-tool.skewed.de/static/doc/draw.html#graph_tool.draw.graph_draw
        http://ryancompton.net/2014/10/05/graph-tools-visualization-is-pretty-good/

    Args:
        node_dict (dict): nested dictionary of node properties
            Generate this using df_generate_node_dict()
        size_multiple (int): scaling factor for node size (for convenience)
        **kwargs: keyword arguments passed to gt.graph-draw()
            e.g. output='file.pdf', layout='neato', output_size=(300,300)
    """
    import graph_tool.all as gt

    g = gt.Graph()
    vsizes = g.new_vertex_property("int")
    vcolors = g.new_vertex_property('string')
    vshapes = g.new_vertex_property('string')
    vpenwidth = g.new_vertex_property("float")  # stroke

    for node_id, node_props in node_dict.items():
        g.add_vertex()

        vshapes[g.vertex(node_id)] = node_props['shape']
        vcolors[g.vertex(node_id)] = node_props['color']
        vsizes[g.vertex(node_id)] = node_props['size'] * size_multiple
        vpenwidth[g.vertex(node_id)] = node_props['stroke']

        # add edge to ancestor
        if node_props['ancestor'] is not None:
            g.add_edge(node_props['ancestor'], node_id)

    # seeds enable graph reproduction
    seed(random_seed)
    gt.seed_rng(random_seed)

    gt.graph_draw(
        g,
        vertex_size=vsizes,
        vertex_fill_color=vcolors,
        vertex_shape=vshapes,
        vertex_pen_width=vpenwidth,
        vertex_color='k',  # stroke color
        bg_color=[1, 1, 1, 1],  # white
        edge_end_marker='none',
        **kwargs)
Beispiel #5
0
    def fit(self):
        """
        Fits the hSBM to the undirected, layered multigraph, where the graph in the doc-word layer is bipartite.
        This uses the independent layer multilayer network where we have a degree-corrected SBM.
        """
        # We need to impose constraints on vertices and edges to keep track which layer are they in.
        state_args = {}
        # Vertices with different label values will not be clustered in the same group
        state_args["pclabel"] = self.g.vp["kind"]
        # Split the network in discrete layers based on edgetype. 0 is for word-doc graph and 1 is for hyperlink graph.
        state_args["ec"] = self.g.ep["edgeType"]
        # Independent layers version of the model (instead of 'edge covariates')
        state_args["layers"] = True
        # Edge multiplicities based on occurrences.
        state_args["eweight"] = self.g.ep.edgeCount

        self.g.save("foo.gt.gz")
        # Specify parameters for community detection inference
        gt.seed_rng(self.random_seed)
        mdl = np.inf
        # Fit n_init random initializations to avoid local optimum of MDL.
        for _ in range(self.n_init):
            # Enables the use of LayeredBlockState. Use a degree-corrected layered SBM.
            state_temp = gt.minimize_nested_blockmodel_dl(self.g, state_args=dict(base_type=gt.LayeredBlockState,
                                                                                  **state_args))
            mdl_temp = state_temp.entropy()
            if mdl_temp < mdl:
                # We have found a new optimum
                mdl = mdl_temp
                state = state_temp.copy()

        self.state = state
        self.mdl = state.entropy()

        n_levels  = len(self.state.levels)
        # Figure out group levels
        if n_levels == 2:
            # Bipartite network
            self.groups = { 0: self.get_groupStats(l=0) }
            self.n_levels = len(self.groups)
        # Omit trivial levels: l=L-1 (single group), l=L-2 (bipartite)
        else:
            self.groups = { level: self.get_groupStats(l=level) for level in range(n_levels - 2) }
            self.n_levels = len(self.groups)
Beispiel #6
0
def set_seed(seed: int) -> None:
    """Set random seed for reproducibility.

    Take care of python random library and numpy.

    Args:
        seed (int): the value choosen as seed for the random generators
    """
    global SEED
    SEED = seed
    random.seed(SEED)
    np.random.seed(SEED)
    da.random.seed(SEED)
    try:
        import graph_tool.all as gt

        gt.seed_rng(SEED)
    except ImportError:
        pass
def test1():
    gt.seed_rng(42)
    seed(42)
    NUMBER_OF_NODES = 20
    points = random((NUMBER_OF_NODES, 2))
    points[0] = [0, 0]
    points[1] = [1, 1]
    g, pos = gt.triangulation(points, type="delaunay")
    g.set_directed(True)
    edges = list(g.edges())
    # reciprocate edges
    for e in edges:
       g.add_edge(e.target(), e.source())
    # The capacity will be defined as the inverse euclidean distance
    cap = g.new_edge_property("double")
    for e in g.edges():
        cap[e] = min(1.0 / norm(pos[e.target()].a - pos[e.source()].a), 10)
    g.edge_properties["cap"] = cap
    g.vertex_properties["pos"] = pos
    g.save("flow-example.xml.gz")
    gt.graph_draw(g, pos=pos, edge_pen_width=gt.prop_to_size(cap, mi=0, ma=3, power=1),
                  output="flow-example.pdf")
def generate_graph():
    """
    brew tap homebrew/science
    brew install graph-tool
    """

    from graph_tool.all import price_network, sfdp_layout, graph_draw
    from graph_tool.all import dfs_search, DFSVisitor, seed_rng
    from numpy.random import seed

    class AnnotationVisitor(DFSVisitor):
        def __init__(self, pred, dist):
            self.pred = pred
            self.dist = dist
            self.roots = {}

        def tree_edge(self, e):
            depth = self.dist[e.source()]
            if depth == 1:
                genre = int(e.source())
                if genre not in self.roots:
                    self.roots[genre] = len(self.roots)
            else:
                genre = self.pred[e.source()]
            self.pred[e.target()] = genre
            self.dist[e.target()] = depth + 1

    # For run-to-run stability, provide a constant seed:
    seed(SEED)
    seed_rng(SEED)

    print 'Generating graph...'
    g = price_network(2000)

    print 'Performing layout...'
    pos = sfdp_layout(g)

    print 'Adding depths...'
    dist = g.new_vertex_property("int")
    pred = g.new_vertex_property("int64_t")
    g.set_directed(False)
    visitor = AnnotationVisitor(pred, dist)
    dfs_search(g, g.vertex(0), visitor)

    print 'Iterating over verts...'
    flattened = []
    maxp = [-9999, -9999]
    minp = [+9999, +9999]
    maxd = 0
    for v in g.vertices():
        root_id = pred.a[v]
        if root_id not in visitor.roots:
            continue
        x, y, z = pos[v].a[0], pos[v].a[1], visitor.roots[root_id]
        minp[0] = min(minp[0], x)
        minp[1] = min(minp[1], y)
        maxp[0] = max(maxp[0], x)
        maxp[1] = max(maxp[1], y)
        maxd = max(maxd, dist.a[v])
        flattened += [x, y, z]

    print 'max depth is', maxd
    print 'nroots is', len(visitor.roots)
    print 'ncolors is', len(COLORS)

    extent = (maxp[0] - minp[0], maxp[1] - minp[1])
    padding = extent[0] * PADDING_FRACTION
    minp[0] -= padding
    minp[1] -= padding
    maxp[0] += padding
    maxp[1] += padding
    scale = [
        1.0 / (maxp[0] - minp[0]),
        1.0 / (maxp[1] - minp[1])]
    scale = min(scale[0], scale[1])
    midp = [
        0.5 * (maxp[0] + minp[0]),
        0.5 * (maxp[1] + minp[1])]
    flatarray = []
    for v in g.vertices():
        root_id = pred.a[v]
        if root_id not in visitor.roots:
            continue
        x, y, root = pos[v].a[0], pos[v].a[1], visitor.roots[root_id]
        x = (0.5 + (x - midp[0]) * scale)
        y = (0.5 + (y - midp[1]) * scale)
        prom = int(dist.a[v])
        flatarray += [x, y, root, prom]
    return flatarray
Beispiel #9
0
# * node (gene) degree in the eADAGE graph
# * edge weight in the eADAGE graph
# * [betweenness centrality](https://en.wikipedia.org/wiki/Betweenness_centrality) of generic vs. non-generic genes
# * [PageRank](https://en.wikipedia.org/wiki/PageRank) (sometimes called PageRank centrality) of generic vs. non-generic genes, specifically the [undirected version](https://en.wikipedia.org/wiki/PageRank#PageRank_of_an_undirected_graph).

# In[1]:

import os

import numpy as np
import pandas as pd
import graph_tool.all as gt
import matplotlib.pyplot as plt
import seaborn as sns

gt.seed_rng(1)
np.random.seed(1)

# In[2]:

# relevant file paths
data_dir = './data'
processed_graph = os.path.join(data_dir, 'eadage_generic_graph_unsigned.gt')

# In[3]:

G = gt.load_graph(processed_graph)
# make sure vertex/edge properties exist
print(G)
print(list(G.vp.keys()))
print(list(G.ep.keys()))
def generate_graph():
    """
    brew tap homebrew/science
    brew install graph-tool
    """

    from graph_tool.all import price_network, sfdp_layout, graph_draw
    from graph_tool.all import dfs_search, DFSVisitor, seed_rng
    from numpy.random import seed

    class AnnotationVisitor(DFSVisitor):
        def __init__(self, pred, dist):
            self.pred = pred
            self.dist = dist
            self.roots = {}

        def tree_edge(self, e):
            depth = self.dist[e.source()]
            if depth == 1:
                genre = int(e.source())
                if genre not in self.roots:
                    self.roots[genre] = len(self.roots)
            else:
                genre = self.pred[e.source()]
            self.pred[e.target()] = genre
            self.dist[e.target()] = depth + 1

    # For run-to-run stability, provide a constant seed:
    seed(SEED)
    seed_rng(SEED)

    print 'Generating graph...'
    g = price_network(2000)

    print 'Performing layout...'
    pos = sfdp_layout(g)

    print 'Adding depths...'
    dist = g.new_vertex_property("int")
    pred = g.new_vertex_property("int64_t")
    g.set_directed(False)
    visitor = AnnotationVisitor(pred, dist)
    dfs_search(g, g.vertex(0), visitor)

    print 'Iterating over verts...'
    flattened = []
    maxp = [-9999, -9999]
    minp = [+9999, +9999]
    maxd = 0
    for v in g.vertices():
        root_id = pred.a[v]
        if root_id not in visitor.roots:
            continue
        x, y, z = pos[v].a[0], pos[v].a[1], visitor.roots[root_id]
        minp[0] = min(minp[0], x)
        minp[1] = min(minp[1], y)
        maxp[0] = max(maxp[0], x)
        maxp[1] = max(maxp[1], y)
        maxd = max(maxd, dist.a[v])
        flattened += [x, y, z]

    print 'max depth is', maxd
    print 'nroots is', len(visitor.roots)
    print 'ncolors is', len(COLORS)

    extent = (maxp[0] - minp[0], maxp[1] - minp[1])
    padding = extent[0] * PADDING_FRACTION
    minp[0] -= padding
    minp[1] -= padding
    maxp[0] += padding
    maxp[1] += padding
    scale = [1.0 / (maxp[0] - minp[0]), 1.0 / (maxp[1] - minp[1])]
    scale = min(scale[0], scale[1])
    midp = [0.5 * (maxp[0] + minp[0]), 0.5 * (maxp[1] + minp[1])]
    flatarray = []
    for v in g.vertices():
        root_id = pred.a[v]
        if root_id not in visitor.roots:
            continue
        x, y, root = pos[v].a[0], pos[v].a[1], visitor.roots[root_id]
        x = (0.5 + (x - midp[0]) * scale)
        y = (0.5 + (y - midp[1]) * scale)
        prom = int(dist.a[v])
        flatarray += [x, y, root, prom]
    return flatarray
Beispiel #11
0
rcParams["ps.usedistiller"] = "xpdf"
rcParams["pdf.compression"] = 9
rcParams["ps.useafm"] = True
rcParams["path.simplify"] = True
rcParams["text.latex.preamble"] = [  #r"\usepackage{times}",
    #r"\usepackage{euler}",
    r"\usepackage{amssymb}",
    r"\usepackage{amsmath}"
]

import scipy
import scipy.stats
import numpy as np
from pylab import *
from numpy import *
import graph_tool.all as gt
import graph_tool.draw
import random as prandom

figure()

try:
    gt.openmp_set_num_threads(1)
except RuntimeError:
    pass

prandom.seed(42)
np.random.seed(42)
gt.seed_rng(42)
Beispiel #12
0
def planted_model(
    adata: AnnData,
    n_sweep: int = 10,
    beta: float = np.inf,
    tolerance=1e-6,
    max_iterations: int = 1000000,
    epsilon: float = 0,
    equilibrate: bool = False,
    wait: int = 1000,
    nbreaks: int = 2,
    collect_marginals: bool = False,
    niter_collect: int = 10000,
    deg_corr: bool = True,
    n_init: int = 1,
    beta_range: Tuple[float] = (1., 100.),
    steps_anneal: int = 5,
    resume: bool = False,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'ppbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    minimize_args: Optional[Dict] = {},
    equilibrate_args: Optional[Dict] = {},
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the  Stochastic Block Model [Peixoto14]_, performing
    Bayesian inference on node groups. This function, in particular, uses
    the Planted Block Model, which is particularly suitable in case of
    assortative graphs and it returns the optimal number of communities

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    n_sweep
        Number of MCMC sweeps to get the initial guess
    beta
        Inverse temperature for the initial MCMC sweep        
    tolerance
        Difference in description length to stop MCMC sweep iterations        
    max_iterations
        Maximal number of iterations to be performed by the equilibrate step.
    epsilon
        Relative changes in entropy smaller than epsilon will
        not be considered as record-breaking.
    equilibrate
        Whether or not perform the mcmc_equilibrate step.
        Equilibration should always be performed. Note, also, that without
        equilibration it won't be possible to collect marginals.
    collect_marginals
        Whether or not collect node probability of belonging
        to a specific partition.
    niter_collect
        Number of iterations to force when collecting marginals. This will
        increase the precision when calculating probabilites
    wait
        Number of iterations to wait for a record-breaking event.
        Higher values result in longer computations. Set it to small values
        when performing quick tests.
    nbreaks
        Number of iteration intervals (of size `wait`) without
        record-breaking events necessary to stop the algorithm.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.
    n_init
        Number of initial minimizations to be performed. The one with smaller
        entropy is chosen
    beta_range
        Inverse temperature at the beginning and the end of the equilibration
    steps_anneal
        Number of steps in which the simulated annealing is performed
    resume
        Start from a previously created model, if any, without initializing a novel
        model    
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['sbm']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    `adata.uns['sbm']['stats']`
        A dict with the values returned by mcmc_sweep
    `adata.uns['sbm']['cell_affinity']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['sbm']['state']`
        The BlockModel state object
    """

    # first things first
    check_gt_version()

    if resume:
        equilibrate = True

    if resume and (key_added not in adata.uns
                   or 'state' not in adata.uns[key_added]):
        # let the model proceed as default
        logg.warning('Resuming has been specified but a state was not found\n'
                     'Will continue with default minimization step')

        resume = False

    if random_seed:
        np.random.seed(random_seed)
        gt.seed_rng(random_seed)

    if collect_marginals:
        logg.warning('Collecting marginals has a large impact on running time')
        if not equilibrate:
            raise ValueError(
                "You can't collect marginals without MCMC equilibrate "
                "step. Either set `equlibrate` to `True` or "
                "`collect_marginals` to `False`")

    start = logg.info('minimizing the Planted Partition Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = get_graph_tool_from_adjacency(adjacency, directed=directed)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if resume:
        # create the state and make sure sampling is performed
        state = adata.uns[key_added]['state'].copy()
        g = state.g
    else:
        if n_init < 1:
            n_init = 1

        # initialize  the block states
        states = [gt.PPBlockState(g) for x in range(n_init)]

        # perform a mcmc sweep on each
        # no list comprehension as I need to collect stats

        _dS = np.zeros(n_init)
        _nattempts = np.zeros(n_init)
        _nmoves = np.zeros(n_init)
        for x in range(n_init):
            t_ds = 1
            while np.abs(t_ds) > tolerance:
                # perform sweep until a tolerance is reached
                t_ds, t_natt, t_nm = states[x].multiflip_mcmc_sweep(
                    beta=beta, niter=n_sweep)
                _dS[x] += t_ds
                _nattempts[x] += t_natt
                _nmoves[x] += t_nm

        _amin = np.argmin([s.entropy() for s in states])
        state = states[_amin]
        dS = _dS[_amin]
        nattempts = _nattempts[_amin]
        nmoves = _nmoves[_amin]

        logg.info('    done', time=start)

    # equilibrate the Markov chain
    if equilibrate:
        logg.info('running MCMC equilibration step')
        equilibrate_args['wait'] = wait
        equilibrate_args['nbreaks'] = nbreaks
        equilibrate_args['max_niter'] = max_iterations
        equilibrate_args['mcmc_args'] = {'niter': 10}

        dS, nattempts, nmoves = gt.mcmc_anneal(
            state,
            mcmc_equilibrate_args=equilibrate_args,
            niter=steps_anneal,
            beta_range=beta_range)

    if collect_marginals and equilibrate:
        # we here only retain level_0 counts, until I can't figure out
        # how to propagate correctly counts to higher levels
        # I wonder if this should be placed after group definition or not
        logg.info('    collecting marginals')
        group_marginals = np.zeros(g.num_vertices() + 1)

        def _collect_marginals(s):
            group_marginals[s.get_B()] += 1

        gt.mcmc_equilibrate(state,
                            wait=wait,
                            nbreaks=nbreaks,
                            epsilon=epsilon,
                            max_niter=max_iterations,
                            multiflip=True,
                            force_niter=niter_collect,
                            mcmc_args=dict(niter=10),
                            callback=_collect_marginals)
        logg.info('    done', time=start)

    # everything is in place, we need to fill all slots
    # first build an array with
    groups = pd.Series(state.get_blocks().get_array()).astype('category')
    new_cat_names = dict([(cx, u'%s' % cn)
                          for cn, cx in enumerate(groups.cat.categories)])
    groups.cat.rename_categories(new_cat_names, inplace=True)

    if restrict_to is not None:
        groups.index = adata.obs[restrict_key].index
    else:
        groups.index = adata.obs_names

    # add column names
    adata.obs.loc[:, key_added] = groups

    # add some unstructured info

    adata.uns[key_added] = {}
    adata.uns[key_added]['stats'] = dict(dS=dS,
                                         nattempts=nattempts,
                                         nmoves=nmoves,
                                         modularity=gt.modularity(
                                             g, state.get_blocks()))
    adata.uns[key_added]['state'] = state

    # now add marginal probabilities.

    if collect_marginals:
        # cell marginals will be a list of arrays with probabilities
        # of belonging to a specific group
        adata.uns[key_added]['group_marginals'] = group_marginals

    # calculate log-likelihood of cell moves over the remaining levels

    # adata.uns[key_added]['cell_affinity'] = {'1':get_cell_loglikelihood(state, as_prob=True, rescale=True)}

    # last step is recording some parameters used in this analysis
    adata.uns[key_added]['params'] = dict(epsilon=epsilon,
                                          wait=wait,
                                          nbreaks=nbreaks,
                                          equilibrate=equilibrate,
                                          collect_marginals=collect_marginals,
                                          random_seed=random_seed)

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {state.get_B()} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
Beispiel #13
0
def nested_model(
    adata: AnnData,
    max_iterations: int = 1000000,
    epsilon: float = 0,
    equilibrate: bool = False,
    wait: int = 1000,
    nbreaks: int = 2,
    collect_marginals: bool = False,
    niter_collect: int = 10000,
    hierarchy_length: int = 10,
    deg_corr: bool = True,
    multiflip: bool = True,
    fast_model: bool = False,
    fast_tol: float = 1e-6,
    n_sweep: int = 10,
    beta: float = np.inf,
    n_init: int = 1,
    beta_range: Tuple[float] = (1., 1000.),
    steps_anneal: int = 3,
    resume: bool = False,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'nsbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    prune: bool = False,
    return_low: bool = False,
    copy: bool = False,
    minimize_args: Optional[Dict] = {},
    equilibrate_args: Optional[Dict] = {},
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the nested Stochastic Block Model [Peixoto14]_,
    a hierarchical version of Stochastic Block Model [Holland83]_, performing
    Bayesian inference on node groups. NSBM should circumvent classical
    limitations of SBM in detecting small groups in large graphs
    replacing the noninformative priors used by a hierarchy of priors
    and hyperpriors.

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    max_iterations
        Maximal number of iterations to be performed by the equilibrate step.
    epsilon
        Relative changes in entropy smaller than epsilon will
        not be considered as record-breaking.
    equilibrate
        Whether or not perform the mcmc_equilibrate step.
        Equilibration should always be performed. Note, also, that without
        equilibration it won't be possible to collect marginals.
    collect_marginals
        Whether or not collect node probability of belonging
        to a specific partition.
    niter_collect
        Number of iterations to force when collecting marginals. This will
        increase the precision when calculating probabilites
    wait
        Number of iterations to wait for a record-breaking event.
        Higher values result in longer computations. Set it to small values
        when performing quick tests.
    nbreaks
        Number of iteration intervals (of size `wait`) without
        record-breaking events necessary to stop the algorithm.
    hierarchy_length
        Initial length of the hierarchy. When large values are
        passed, the top-most levels will be uninformative as they
        will likely contain the very same groups. Increase this valus
        if a very large number of cells is analyzed (>100.000).
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.
    multiflip
        Whether to perform MCMC sweep with multiple simultaneous moves to sample
        network partitions. It may result in slightly longer runtimes, but under
        the hood it allows for a more efficient space exploration.
    fast_model
        Whether to skip initial minization step and let the MCMC find a solution. 
        This approach tend to be faster and consume less memory, but may be
        less accurate.
    fast_tol
        Tolerance for fast model convergence.
    n_sweep 
        Number of iterations to be performed in the fast model MCMC greedy approach
    beta
        Inverse temperature for MCMC greedy approach    
    n_init
        Number of initial minimizations to be performed. The one with smaller
        entropy is chosen
    beta_range
        Inverse temperature at the beginning and the end of the equilibration
    steps_anneal
        Number of steps in which the simulated annealing is performed
    resume
        Start from a previously created model, if any, without initializing a novel
        model    
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    prune
        Some high levels in hierarchy may contain the same information in terms of 
        cell assignments, even if they apparently have different group names. When this
        option is set to `True`, the function only returns informative levels.
        Note, however, that cell affinities are still reported for all levels. Pruning
        does not rename group levels
    return_low
        Whether or not return nsbm_level_0 in adata.obs. This level usually contains
        so many groups that it cannot be plot anyway, but it may be useful for particular
        analysis. By default it is not returned
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell. 
    `adata.uns['nsbm']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    `adata.uns['nsbm']['stats']`
        A dict with the values returned by mcmc_sweep
    `adata.uns['nsbm']['cell_affinity']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['nsbm']['state']`
        The NestedBlockModel state object
    """

    if resume:
        # if the fast_model is chosen perform equilibration anyway
        # also if a model has previously created
        equilibrate = True

    if resume and ('nsbm' not in adata.uns
                   or 'state' not in adata.uns['nsbm']):
        # let the model proceed as default
        logg.warning('Resuming has been specified but a state was not found\n'
                     'Will continue with default minimization step')

        resume = False

    if random_seed:
        np.random.seed(random_seed)
        gt.seed_rng(random_seed)

    if collect_marginals:
        logg.warning('Collecting marginals has a large impact on running time')
        if not equilibrate:
            raise ValueError(
                "You can't collect marginals without MCMC equilibrate "
                "step. Either set `equlibrate` to `True` or "
                "`collect_marginals` to `False`")

    start = logg.info('minimizing the nested Stochastic Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = get_graph_tool_from_adjacency(adjacency, directed=directed)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if n_init < 1:
        n_init = 1

    if fast_model:
        # do not minimize, start with a dummy state and perform only equilibrate

        states = [
            gt.NestedBlockState(g=g,
                                state_args=dict(deg_corr=deg_corr,
                                                recs=recs,
                                                rec_types=rec_types))
            for n in range(n_init)
        ]
        for x in range(n_init):
            dS = 1
            while np.abs(dS) > fast_tol:
                # perform sweep until a tolerance is reached
                dS, _, _ = states[x].multiflip_mcmc_sweep(beta=beta,
                                                          niter=n_sweep)

        _amin = np.argmin([s.entropy() for s in states])
        state = states[_amin]

        #        dS = 1
        #        while np.abs(dS) > fast_tol:
        #            dS, nattempts, nmoves = state.multiflip_mcmc_sweep(niter=10, beta=np.inf)
        bs = state.get_bs()
        logg.info('    done', time=start)

    elif resume:
        # create the state and make sure sampling is performed
        state = adata.uns['nsbm']['state'].copy(sampling=True)
        bs = state.get_bs()
        # get the graph from state
        g = state.g
    else:

        states = [
            gt.minimize_nested_blockmodel_dl(
                g,
                deg_corr=deg_corr,
                state_args=dict(recs=recs, rec_types=rec_types),
                **minimize_args) for n in range(n_init)
        ]

        state = states[np.argmin([s.entropy() for s in states])]
        #        state = gt.minimize_nested_blockmodel_dl(g, deg_corr=deg_corr,
        #                                                 state_args=dict(recs=recs,
        #                                                 rec_types=rec_types),
        #                                                 **minimize_args)
        logg.info('    done', time=start)
        bs = state.get_bs()
        if len(bs) <= hierarchy_length:
            # increase hierarchy length up to the specified value
            # according to Tiago Peixoto 10 is reasonably large as number of
            # groups decays exponentially
            bs += [np.zeros(1)] * (hierarchy_length - len(bs))
        else:
            logg.warning(
                f'A hierarchy length of {hierarchy_length} has been specified\n'
                f'but the minimized model contains {len(bs)} levels')
            pass
        # create a new state with inferred blocks
        state = gt.NestedBlockState(g,
                                    bs,
                                    state_args=dict(recs=recs,
                                                    rec_types=rec_types),
                                    sampling=True)

    # equilibrate the Markov chain
    if equilibrate:
        logg.info('running MCMC equilibration step')
        # equlibration done by simulated annealing

        equilibrate_args['wait'] = wait
        equilibrate_args['nbreaks'] = nbreaks
        equilibrate_args['max_niter'] = max_iterations
        equilibrate_args['multiflip'] = multiflip
        equilibrate_args['mcmc_args'] = {'niter': 10}

        dS, nattempts, nmoves = gt.mcmc_anneal(
            state,
            mcmc_equilibrate_args=equilibrate_args,
            niter=steps_anneal,
            beta_range=beta_range)
    if collect_marginals and equilibrate:
        # we here only retain level_0 counts, until I can't figure out
        # how to propagate correctly counts to higher levels
        # I wonder if this should be placed after group definition or not
        logg.info('    collecting marginals')
        group_marginals = [
            np.zeros(g.num_vertices() + 1) for s in state.get_levels()
        ]

        def _collect_marginals(s):
            levels = s.get_levels()
            for l, sl in enumerate(levels):
                group_marginals[l][sl.get_nonempty_B()] += 1

        gt.mcmc_equilibrate(state,
                            wait=wait,
                            nbreaks=nbreaks,
                            epsilon=epsilon,
                            max_niter=max_iterations,
                            multiflip=True,
                            force_niter=niter_collect,
                            mcmc_args=dict(niter=10),
                            callback=_collect_marginals)
        logg.info('    done', time=start)

    # everything is in place, we need to fill all slots
    # first build an array with
    groups = np.zeros((g.num_vertices(), len(bs)), dtype=int)

    for x in range(len(bs)):
        # for each level, project labels to the vertex level
        # so that every cell has a name. Note that at this level
        # the labels are not necessarily consecutive
        groups[:, x] = state.project_partition(x, 0).get_array()

    groups = pd.DataFrame(groups).astype('category')

    # rename categories from 0 to n
    for c in groups.columns:
        new_cat_names = dict([
            (cx, u'%s' % cn)
            for cn, cx in enumerate(groups.loc[:, c].cat.categories)
        ])
        groups.loc[:, c].cat.rename_categories(new_cat_names, inplace=True)

    if restrict_to is not None:
        groups.index = adata.obs[restrict_key].index
    else:
        groups.index = adata.obs_names

    # add column names
    groups.columns = [
        "%s_level_%d" % (key_added, level) for level in range(len(bs))
    ]

    # remove any column with the same key
    keep_columns = [
        x for x in adata.obs.columns
        if not x.startswith('%s_level_' % key_added)
    ]
    adata.obs = adata.obs.loc[:, keep_columns]
    # concatenate obs with new data, skipping level_0 which is usually
    # crap. In the future it may be useful to reintegrate it
    # we need it in this function anyway, to match groups with node marginals
    if return_low:
        adata.obs = pd.concat([adata.obs, groups], axis=1)
    else:
        adata.obs = pd.concat([adata.obs, groups.iloc[:, 1:]], axis=1)

    # add some unstructured info

    adata.uns['nsbm'] = {}
    adata.uns['nsbm']['stats'] = dict(level_entropy=np.array(
        [state.level_entropy(x) for x in range(len(state.levels))]),
                                      modularity=np.array([
                                          gt.modularity(
                                              g, state.project_partition(x, 0))
                                          for x in range(len((state.levels)))
                                      ]))
    if equilibrate:
        adata.uns['nsbm']['stats']['dS'] = dS
        adata.uns['nsbm']['stats']['nattempts'] = nattempts
        adata.uns['nsbm']['stats']['nmoves'] = nmoves

    adata.uns['nsbm']['state'] = state

    # now add marginal probabilities.

    if collect_marginals:
        # refrain group marginals. We collected data in vector as long as
        # the number of cells, cut them into appropriate length data
        adata.uns['nsbm']['group_marginals'] = {}
        for nl, level_marginals in enumerate(group_marginals):
            idx = np.where(level_marginals > 0)[0] + 1
            adata.uns['nsbm']['group_marginals'][nl] = np.array(
                level_marginals[:np.max(idx)])

    # prune uninformative levels, if any
    if prune:
        to_remove = prune_groups(groups)
        logg.info(f'    Removing levels f{to_remove}')
        adata.obs.drop(to_remove, axis='columns', inplace=True)

    # calculate log-likelihood of cell moves over the remaining levels
    # we have to calculate events at level 0 and propagate to upper levels
    logg.info('    calculating cell affinity to groups')
    levels = [
        int(x.split('_')[-1]) for x in adata.obs.columns
        if x.startswith(f'{key_added}_level')
    ]
    adata.uns['nsbm']['cell_affinity'] = dict.fromkeys(
        [str(x) for x in levels])
    p0 = get_cell_loglikelihood(state, level=0, as_prob=True)

    adata.uns['nsbm']['cell_affinity'][0] = p0
    l0 = "%s_level_0" % key_added
    for nl, level in enumerate(groups.columns[1:]):
        cross_tab = pd.crosstab(groups.loc[:, l0], groups.loc[:, level])
        cl = np.zeros((p0.shape[0], cross_tab.shape[1]), dtype=p0.dtype)
        for x in range(cl.shape[1]):
            # sum counts of level_0 groups corresponding to
            # this group at current level
            cl[:, x] = p0[:, np.where(cross_tab.iloc[:, x] > 0)[0]].sum(axis=1)
        adata.uns['nsbm']['cell_affinity'][str(nl + 1)] = cl / np.sum(
            cl, axis=1)[:, None]

    # last step is recording some parameters used in this analysis
    adata.uns['nsbm']['params'] = dict(
        epsilon=epsilon,
        wait=wait,
        nbreaks=nbreaks,
        equilibrate=equilibrate,
        fast_model=fast_model,
        collect_marginals=collect_marginals,
        hierarchy_length=hierarchy_length,
        random_seed=random_seed,
        prune=prune,
    )

    logg.info(
        '    finished',
        time=start,
        deep=
        (f'found {state.get_levels()[1].get_nonempty_B()} clusters at level_1, and added\n'
         f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
Beispiel #14
0
#!/usr/bin/env python
import sys,os
import time
import pylab as plt
from sbmtm import sbmtm
import graph_tool.all as gt
import numpy as np
from matplotlib import pyplot as plt

gt.openmp_set_num_threads(int(sys.argv[1])) #set num threads
gt.seed_rng(42) #same results

print("Welcome to Topic Modelling")
print("using ",gt.openmp_get_num_threads(), " threads")

if __name__ == '__main__':
	start = time.time()
	print("initialised")
	gt.seed_rng(42)
	print("seed set")
	model = sbmtm()
	print("model created")
	model.load_graph(filename = '/home/filippo/files/graph.xml.gz')
	print("graph loaded")
	print(model.g)
	model.fit(n_init=1, parallel=True, verbose=True)
	#model.fit_overlap(n_init=1, verbose=True, parallel=True)
	#model.plot()
	model.save_data()
	model.dump_model()
	os.system("mv *.csv *.png *.txt *.pkl /home/filippo/files/.")
Beispiel #15
0
    # --------------------
    print(datetime.now())
    # Visually separate analyses
    print('-'*40)
    
    
if __name__ == '__main__':
    # Networks for analysis
    netfiles = ['citenet0']
    #netfiles = ['autnet0']
    #netfiles = ['autnet1']
    #netfiles = ['autnet1', 'autnet0', 'citenet0']

    # Comparison networks
    #compnet_files = ['phnet.graphml']
    compnet_files = ['phnet.graphml', 'ptnet.graphml']
    
    # Set up logging
    logging.basicConfig(level=logging.INFO, format = '%(message)s')
    logger = logging.getLogger()
    logger.addHandler(logging.FileHandler('output/' + str(date.today()) + '.log', 'w'))
    print = logger.info
    
    print('-'*40)
    for netfile in netfiles:
        seed(24680)
        gt.seed_rng(24680)

        run_analysis(netfile, compnet_files)

    print(datetime.now())
Beispiel #16
0
def draw_graph(
    adata: AnnData,
    layout: _Layout = 'sfdp',
    #    init_pos: Union[str, bool, None] = None,
    #    root: Optional[int] = None,
    use_tree: bool = False,
    random_seed: Optional[int] = None,
    adjacency: Optional[spmatrix] = None,
    key_added_ext: Optional[str] = None,
    key: Optional[str] = 'schist',
    copy: bool = False,
    **kwds,
):
    """\
    Extends scanpy.tools.draw_graph function using some layouts available in 
    graph-tool library. Three layouts are available here:
    
    - SFDP spring-block layout.
    - ARF spring-block layout.
    - Fruchterman-Reingold spring-block layout.
    
    Fruchterman-Reingold is already available in scanpy, but here can be used
    to render the nested model tree. 
    
    In order to use these plotting function, the NestedBlockState needs to be
    saved when building the model, so `save_state=True` needs to be set.
    
    Parameters
    ----------
    adata
        Annotated data matrix. A NestedBlockState object needs to be saved
    layout
        A layout among 'sfdp', 'fr' or 'arf'. Other graph-tool layouts haven't been
        implemented.
    use_tree
        When this is set, the tree of the nested model is used to generate layout, 
        otherwise the layout only accounts for the neighborhood graph.    
    random_seed
        Random number to be used as seed for graph-tool
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']`.
    key_added_ext
        By default, append `layout`.
    key
        The slot in `AnnData.uns` containing the state. Default is 'nsbm'
    copy
        Return a copy instead of writing to adata.
    **kwds
        Parameters of chosen igraph layout. See e.g. `fruchterman-reingold`_
        [Fruchterman91]_. One of the most important ones is `maxiter`.

        .. _fruchterman-reingold: http://igraph.org/python/doc/igraph.Graph-class.html#layout_fruchterman_reingold

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following field.

    **X_draw_graph_layout** : `adata.obsm`
        Coordinates of graph layout. E.g. for layout='fa' (the default),
        the field is called 'X_draw_graph_fa'
    """
    if random_seed:
        np.random.seed(random_seed)
        gt.seed_rng(random_seed)

    n_cells = adata.shape[0]
    start = logg.info(f'drawing single-cell graph using layout {layout!r}')
    if layout not in _LAYOUTS:
        raise ValueError(f'Provide a valid layout, one of {_LAYOUTS}.')
    adata = adata.copy() if copy else adata
    if adjacency is None and 'neighbors' not in adata.uns:
        raise ValueError('You need to run `pp.neighbors` first '
                         'to compute a neighborhood graph.')
    if not key in adata.uns:
        raise ValueError(
            'You need to run `nested_model` before trying to run this function '
        )

    if use_tree and 'state' not in adata.uns[key]:
        raise ValueError(
            'When `use_tree` is set to `True`, a state should be saved'
            'running  `nested_model(adata, save_state=True)`.')
    if adjacency is None:
        adjacency = adata.uns['neighbors']['connectivities']

    g = get_graph_tool_from_adjacency(adjacency)
    weights = g.ep['weight']
    if use_tree:
        state = state_from_blocks(adata)
        g, _, _ = gt.get_hierarchy_tree(state, empty_branches=False)
        weights = None

    # actual drawing
    positions = np.zeros((n_cells, 2))
    if layout == 'fr':
        positions = gt.fruchterman_reingold_layout(g, weight=weights)
        positions = np.array([x for x in positions][:n_cells])
    elif layout == 'sfdp':
        positions = gt.sfdp_layout(g)
        positions = np.array([x for x in positions][:n_cells])
    elif layout == 'arf':
        positions = gt.arf_layout(g)
        positions = np.array([x for x in positions][:n_cells])

    adata.uns['draw_graph'] = {}
    adata.uns['draw_graph']['params'] = dict(layout=layout,
                                             random_seed=random_seed)
    key_added = f'X_draw_graph_{layout}'
    adata.obsm[key_added] = positions
    logg.info(
        '    finished',
        time=start,
        deep=('added\n'
              f'    {key_added!r}, graph_drawing coordinates (adata.obsm)'),
    )
    return adata if copy else None
Beispiel #17
0
def run_analysis(netfile, compnet_files):
    '''
    Run the analysis.  
    :param netfile: Filename of the network to analyze
    :param compnet_files: List of filenames of the comparison networks, viz.,
        the high-energy physics networks.  
    '''
    
    # Timestamp
    # --------------------
    print(datetime.now())
    
    # Load the network
    # --------------------
    net, outfile_pre, core_pmap, core_vertices = load_net(netfile + '.graphml', 
                                                             core = True,
                                                             filter = True)
    output_folder = 'output/'
    outfile_pre = output_folder + outfile_pre
     
    # Plotting
    print('Plotting')
    layout = layout_and_plot(net, core_pmap, outfile_pre)
    # Store the layout in the net
    net.vp['layout'] = layout
    # Show only the core vertices    
    net.set_vertex_filter(core_pmap)
    layout_and_plot(net, core_pmap, outfile_pre, filename_mod = '.core.net', 
    				reverse_colors = True)
    net.set_vertex_filter(None)
    
    # Vertex statistics
    # --------------------
    # ECDF for out-degree distribution
    degree_dist(net, core_vertices, outfile = outfile_pre, 
                show_plot = False, save_plot = True)
    # ECDF for eigenvector centrality
    ## Currently this is causing a segmentation fault
#     ev_centrality_dist(net, core_vertices, outfile = outfile_pre, 
#                 show_plot = False, save_plot = True)
    
    # Modularity
    # --------------------
    # Calculate modularity, using the core vertices as the partition
    modularity = gtcomm.modularity(net, core_pmap)
    print('Observed modularity: ' + str(modularity))
    obs_ins = insularity(net, core_pmap)
    print('Observed insularity: ' + str(obs_ins))
   
    # Calculate the number of core vertices
    n_core = len(core_vertices)
    # Construct a sampling distribution for the modularity statistic
    #  And use it to calculate a p-value for the modularity
    print('Random sample modularity')
    modularity_sample_dist(net, n_core, modularity,
                                outfile = outfile_pre + '.mod', 
                                show_plot = False, save_plot = True)
    print('Random sample insularities')
    modularity_sample_dist(net, n_core, obs_ins, 
                                mod_func = insularity, 
                                outfile = outfile_pre + '.ins',
                                show_plot = False, save_plot = True)
    
    # Information-theoretic partitioning
    print('Information-theoretic partitioning')
    # Calculate the partition
    gt.seed_rng(5678)
    np.random.seed(5678)
    part_block = gt.minimize_blockmodel_dl(net, B_min = 2, B_max = 2, 
    										verbose = True, 
    										overlap = False)
    # Extract the block memberships as a pmap
    net.vp['partition'] = part_block.get_blocks()
    # Calculate the modularity
    block_modularity = gtcomm.modularity(net, net.vp['partition'])
    print('Partion modularity: ' + str(block_modularity))
    print('Partition insularities')
    block_insularities = partition_insularity(net, net.vp['partition'])
    for community in block_insularities:
        print('Community ' + str(community) + ': ' + 
                str(block_insularities[community]))
    
    print('Plotting')
    size_pmap = gt.prop_to_size(core_pmap, mi = 10, ma = 20)
    layout_and_plot(net, net.vp['partition'], outfile_pre,
                        size_pmap = size_pmap, filename_mod = '.partition')
    
    # Modularity optimization
    optimal_sample_dist(net, modularity, obs_ins,
                                outfile = outfile_pre, 
                                show_plot = False, save_plot = True)
    

    # Save results
    # --------------------
    # The above covers all of the analysis to be written into the output files,
    #  so we'll go ahead and save things now.  
    print('Saving')
    # Save in graph-tool's binary format
    net.save(outfile_pre + '.out' + '.gt')
    # Replace vector-type properties with strings
    #net.list_properties()
    properties = net.vertex_properties
    for property_key in properties.keys():
        property = properties[property_key]
        if 'vector' in property.value_type():
            properties[property_key] = property.copy(value_type = 'string')
    # Save as graphml
    net.save(outfile_pre + '.out' + '.graphml')


    # Comparison networks
    # --------------------
    for compnet_file in compnet_files:
        # Load the comparison network
        compnet, compnet_outfile = load_net(compnet_file)
        # Set it to the same directedness as the network of interest
        compnet.set_directed(net.is_directed())
        # Size of compnet
        n_compnet = compnet.num_vertices()
        # Num vertices in compnet to use in each random partition
        k_compnet = round(n_core / net.num_vertices() * n_compnet)
        # Sample distribution based on random partition
        print('Random sample modularities')
        print('Observed modularity: ' + str(modularity))
        modularity_sample_dist(compnet, k_compnet, modularity, 
                                outfile = outfile_pre + '.mod.' + compnet_outfile, 
                                show_plot = False, save_plot = True)
        print('Random sample insularities')
        print('Observed insularity: ' + str(obs_ins))
        modularity_sample_dist(compnet, k_compnet, obs_ins, 
                                mod_func = insularity, 
                                outfile = outfile_pre + '.ins.' + compnet_outfile,
                                show_plot = False, save_plot = True)
        # Sample distribution based on optimizing modularity
#         optimal_sample_dist(compnet, modularity, n_samples = 300, 
#                                 outfile = outfile_pre + '.mod.' + compnet_outfile,  
#                                 show_plot = False)


    # Timestamp
    # --------------------
    print(datetime.now())
    # Visually separate analyses
    print('-'*40)
Beispiel #18
0
def find_communities(nnodes, edges, alg, params=None):
    def membership2cs(membership):
        cs = {}
        for i, m in enumerate(membership):
            cs.setdefault(m, []).append(i)
        return cs.values()

    def connected_subgraphs(G: nx.Graph):
        for comp in nx.connected_components(G):
            sub = nx.induced_subgraph(G, comp)
            sub = nx.convert_node_labels_to_integers(sub,
                                                     label_attribute='old')
            yield sub

    def apply_subgraphs(algorithm, **params):
        cs = []
        for sub in connected_subgraphs(G):
            if len(sub.nodes) <= 3:
                coms = [sub.nodes]  # let it be a cluster
            else:
                coms = algorithm(sub, **params)
                if hasattr(coms, 'communities'):
                    coms = coms.communities

            for com in coms:
                cs.append([sub.nodes[i]['old'] for i in set(com)])
        return cs

    def karate_apply(algorithm, graph, **params):
        model = algorithm(**params)
        model.fit(graph)
        return membership2cs(model.get_memberships().values())

    if alg == 'big_clam':
        c = -1 if params['c'] == 'auto' else int(params['c'])
        cs = BigClam('../../snap').run(edges, c=c, xc=int(params['xc']))
    elif alg in ('gmm', 'kclique', 'lprop', 'lprop_async', 'fluid',
                 'girvan_newman', 'angel', 'congo', 'danmf', 'egonet_splitter',
                 'lfm', 'multicom', 'nmnf', 'nnsed', 'node_perception', 'slpa',
                 'GEMSEC', 'EdMot', 'demon'):
        G = nx.Graph()
        G.add_edges_from(edges)

        if alg == 'gmm':
            cs = community.greedy_modularity_communities(G)
        elif alg == 'kclique':
            params = {k: float(v) for k, v in params.items()}
            cs = community.k_clique_communities(G, **params)
        elif alg == 'lprop':
            cs = community.label_propagation_communities(G)
        elif alg == 'lprop_async':
            cs = community.asyn_lpa_communities(G, seed=0)
        elif alg == 'fluid':
            params = {k: int(v) for k, v in params.items()}
            params['seed'] = 0
            cs = apply_subgraphs(community.asyn_fluidc, **params)
        elif alg == 'girvan_newman':
            comp = community.girvan_newman(G)
            for cs in itertools.islice(comp, int(params['k'])):
                pass
        elif alg == 'angel':
            params = {k: float(v) for k, v in params.items()}
            cs = cdlib.angel(G, **params).communities
        elif alg == 'congo':  # too slow
            ncoms = int(params['number_communities'])
            cs = []
            for sub in connected_subgraphs(G):
                if len(sub.nodes) <= max(3, ncoms):
                    cs.append(sub.nodes)  # let it be a cluster
                else:
                    coms = cdlib.congo(sub,
                                       number_communities=ncoms,
                                       height=int(params['height']))
                    for com in coms.communities:
                        cs.append([sub.nodes[i]['old'] for i in set(com)])
        elif alg == 'danmf':  # no overlapping
            cs = apply_subgraphs(cdlib.danmf)
        elif alg == 'egonet_splitter':
            params['resolution'] = float(params['resolution'])
            cs = apply_subgraphs(cdlib.egonet_splitter, **params)
        elif alg == 'lfm':
            coms = cdlib.lfm(G, float(params['alpha']))
            cs = coms.communities
        elif alg == 'multicom':
            cs = cdlib.multicom(G, seed_node=0).communities
        elif alg == 'nmnf':
            params = {k: int(v) for k, v in params.items()}
            cs = apply_subgraphs(cdlib.nmnf, **params)
        elif alg == 'nnsed':
            cs = apply_subgraphs(cdlib.nnsed)
        elif alg == 'node_perception':  # not usable
            params = {k: float(v) for k, v in params.items()}
            cs = cdlib.node_perception(G, **params).communities
        elif alg == 'slpa':
            params["t"] = int(params["t"])
            params["r"] = float(params["r"])
            cs = cdlib.slpa(G, **params).communities
        elif alg == 'demon':
            params = {k: float(v) for k, v in params.items()}
            cs = cdlib.demon(G, **params).communities
        elif alg == 'GEMSEC':
            # gamma = float(params.pop('gamma'))
            params = {k: int(v) for k, v in params.items()}
            # params['gamma'] = gamma
            params['seed'] = 0
            _wrap = partial(karate_apply, karateclub.GEMSEC)
            cs = apply_subgraphs(_wrap, **params)
        elif alg == 'EdMot':
            params = {k: int(v) for k, v in params.items()}
            _wrap = partial(karate_apply, karateclub.EdMot)
            cs = apply_subgraphs(_wrap, **params)

    elif alg in ('infomap', 'community_leading_eigenvector', 'leig',
                 'multilevel', 'optmod', 'edge_betweenness', 'spinglass',
                 'walktrap', 'leiden', 'hlc'):
        G = igraph.Graph()
        G.add_vertices(nnodes)
        G.add_edges(edges)

        if alg == 'infomap':
            vcl = G.community_infomap(trials=int(params['trials']))
            cs = membership2cs(vcl.membership)
        elif alg == 'leig':
            clusters = None if params['clusters'] == 'auto' else int(
                params['clusters'])
            vcl = G.community_leading_eigenvector(clusters=clusters)
            cs = membership2cs(vcl.membership)
        elif alg == 'multilevel':
            vcl = G.community_multilevel()
            cs = membership2cs(vcl.membership)
        elif alg == 'optmod':  # too long
            membership, modularity = G.community_optimal_modularity()
            cs = membership2cs(vcl.membership)
        elif alg == 'edge_betweenness':
            clusters = None if params['clusters'] == 'auto' else int(
                params['clusters'])
            dendrogram = G.community_edge_betweenness(clusters, directed=False)
            try:
                clusters = dendrogram.as_clustering()
            except:
                return []
            cs = membership2cs(clusters.membership)
        elif alg == 'spinglass':  # only for connected graph
            vcl = G.community_spinglass(parupdate=True,
                                        update_rule=params['update_rule'],
                                        start_temp=float(params['start_temp']),
                                        stop_temp=float(params['stop_temp']))
            cs = membership2cs(vcl.membership)
        elif alg == 'walktrap':
            dendrogram = G.community_walktrap(steps=int(params['steps']))
            try:
                clusters = dendrogram.as_clustering()
            except:
                return []
            cs = membership2cs(clusters.membership)
        elif alg == 'leiden':
            vcl = G.community_leiden(
                objective_function=params['objective_function'],
                resolution_parameter=float(params['resolution_parameter']),
                n_iterations=int(params['n_iterations']))
            cs = membership2cs(vcl.membership)
        elif alg == 'hlc':
            algorithm = HLC(G, min_size=int(params['min_size']))
            cs = algorithm.run(None)

    elif alg in ("sbm", "sbm_nested"):
        np.random.seed(42)
        gt.seed_rng(42)

        G = gt.Graph(directed=False)
        G.add_edge_list(edges)

        deg_corr = bool(params['deg_corr'])
        B_min = None if params['B_min'] == 'auto' else int(params['B_min'])
        B_max = None if params['B_max'] == 'auto' else int(params['B_max'])

        if alg == "sbm":
            state = gt.minimize_blockmodel_dl(G,
                                              deg_corr=deg_corr,
                                              B_min=B_min,
                                              B_max=B_max)

            membership = state.get_blocks()
            cs = membership2cs(membership)
        if alg == "sbm_nested":
            state = gt.minimize_nested_blockmodel_dl(G,
                                                     deg_corr=deg_corr,
                                                     B_min=B_min,
                                                     B_max=B_max)
            levels = state.get_bs()
            level_max = int(params['level'])

            membership = {}
            for nid in range(nnodes):
                cid = nid
                level_i = len(levels)
                for level in levels:
                    cid = level[cid]
                    if level_i == level_max:
                        membership.setdefault(cid, []).append(nid)
                        break
                    level_i -= 1

            cs = membership.values()

    else:
        return None

    return list(cs)
Beispiel #19
0
rcParams["figure.subplot.bottom"] = 0.2

rcParams["image.cmap"] = "hot"

rcParams["text.usetex"] = True

rcParams["ps.usedistiller"] = "xpdf"
rcParams["pdf.compression"] = 9
rcParams["ps.useafm"] = True
rcParams["path.simplify"] = True
rcParams["text.latex.preamble"] = [#"\usepackage{times}",
                                   #"\usepackage{euler}",
                                   r"\usepackage{amssymb}",
                                   r"\usepackage{amsmath}"]

import scipy
import scipy.stats
import numpy as np
from pylab import *
from numpy import *
import graph_tool.all as gt

figure()

try:
    gt.openmp_set_num_threads(1)
except RuntimeError:
    pass
np.random.seed(42)
gt.seed_rng(42)