Code Example #1
File: main.py  Project: karlek/avant-graph
def main():
    """
    Visualizes the research network of KTH as a graph.
    """
    start_time = time()

    # Create our undirected graph to return.
    g = Graph(directed=False)
    # The edge property measuring collaboration.
    e_times = g.new_edge_property("float")
    # Grouping value for the vertices; vertices are in the same group if they
    # have the same value.
    v_groups = g.new_vertex_property("int")
    # Color the vertices based on their faculties' colors.
    v_colors = g.new_vertex_property("vector<double>")

    db_path = '/home/_/kth/kexet/db/kex.db'
    query = """SELECT *
               FROM final
               WHERE (
                 name LIKE '%kth%' and
                 name LIKE '%;%' and
                 keywords is not null and
                 year >= 2013 and
                 ContentType = 'Refereegranskat' and
                 PublicationType = 'Artikel i tidskrift'
               );"""
    rows = load.rows(db_path, query)
    for row in rows:
        nobjs = parse.names(row['name'].split(';'))
        graph.add_relation(g, nobjs, e_times, v_colors, v_groups)

    g.edge_properties["times"] = e_times
    g.vertex_properties["colors"] = v_colors
    g.vertex_properties["groups"] = v_groups

    log.info(g.num_vertices())
    log.info(g.num_edges())
    g.save('a.gt')
    log.info('graph saved: a.gt')
    log.info("db & parse %ss" % round(time() - start_time, 2))

    # start_time = time()
    # g = load_graph('a.gt')
    # log.info("loading %ss" % round(time() - start_time, 2))

    draw.largest(g.copy())
    draw.radial_highest(g.copy())
    draw.sfdp(g.copy())
    draw.grouped_sfdp(g.copy())
    draw.min_tree(g.copy())
    draw.radial_random(g.copy())
    draw.hierarchy(g.copy())
    draw.minimize_blockmodel(g.copy())
    draw.netscience(g.copy())
    draw.fruchterman(g.copy())
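A minimal follow-up sketch (not part of the original file): assuming graph-tool is installed and a.gt was written by the run above, the saved graph and its property maps can be reloaded like this, mirroring the commented-out lines in main().

from graph_tool.all import load_graph

g = load_graph('a.gt')                       # reload the graph saved by main()
e_times = g.edge_properties["times"]         # collaborations per edge
v_colors = g.vertex_properties["colors"]     # faculty colour per vertex
v_groups = g.vertex_properties["groups"]     # faculty group per vertex
print(g.num_vertices(), g.num_edges())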
Code Example #2
class GraphClass:

	#------------#
	# Initialize #
	#------------#

	def __init__ (self, dicProp={"Name": "Graph", "Type": "None", "Weighted": False}, graph=None):
		''' init from properties '''
		self.dicProperties = deepcopy(dicProp)
		self.dicGetProp = { "Reciprocity": get_reciprocity, "Clustering": get_clustering, "Assortativity": get_assortativity,
							"Diameter": get_diameter, "SCC": get_num_scc, #"Spectral radius": get_spectral_radius, 
							"WCC": get_num_wcc, "InhibFrac": get_inhib_frac }
		self.dicGenGraph = { "Erdos-Renyi": gen_er, "Free-scale": gen_fs, "EDR": gen_edr }
		# create a graph
		if graph is not None:
			# use the one furnished
			self.__graph = graph
			self.update_prop()
			self.bPropToDate = True
		elif dicProp["Type"] == "None":
			# create an empty graph
			self.__graph = Graph()
			self.bPropToDate = False
		else:
			# generate a graph of the requested type
			self.__graph = self.dicGenGraph[dicProp["Type"]](self.dicProperties)
			self.update_prop()
			self.set_name()
			self.bPropToDate = True

	@classmethod
	def from_graph_class(cls, graphToCopy):
		''' create new GraphClass instance as a deepcopy of another '''
		dicProperties = deepcopy(graphToCopy.get_dict_properties())
		gtGraph = graphToCopy.get_graph().copy()
		# create
		graphClass = cls(dicProperties, gtGraph)
		# set state of properties
		bPropToDate = deepcopy(graphToCopy.bPropToDate)
		bBetwToDate = deepcopy(graphToCopy.bBetwToDate)
		graphClass.bPropToDate = bPropToDate
		graphClass.bBetwToDate = bBetwToDate
		return graphClass

	def copy(self):
		''' returns a deepcopy of the graphClass instance '''
		graphCopy = GraphClass()
		graphCopy.set_graph(self.__graph.copy())
		graphCopy.update_prop()
		graphCopy.set_name(self.dicProperties["Name"]+'_copy')
		return graphCopy

	#---------------------------#
	# Manipulating the gt graph #
	#---------------------------#

	def set_graph(self, gtGraph):
		''' acquire a graph_tool graph as its own '''
		if isinstance(gtGraph, Graph):
			self.__graph = gtGraph
		else:
			raise TypeError("The object passed to 'set_graph' is not a < class 'graph_tool.Graph' > but a {}".format(gtGraph.__class__))

	def inhibitory_subgraph(self):
		''' create a GraphClass instance whose graph contains only
		the inhibitory connections of the current instance's graph '''
		graph = self.__graph.copy()
		epropType = graph.new_edge_property("bool", -graph.edge_properties["type"].a + 1)
		graph.set_edge_filter(epropType)
		inhibGraph = GraphClass()
		inhibGraph.set_graph(Graph(graph, prune=True))
		inhibGraph.set_prop("Weighted", True)
		return inhibGraph

	def excitatory_subgraph(self):
		''' create a GraphClass instance whose graph contains only
		the excitatory connections of the current instance's graph '''
		graph = self.__graph.copy()
		epropType = graph.new_edge_property("bool", graph.edge_properties["type"].a + 1)
		graph.set_edge_filter(epropType)
		excGraph = GraphClass()
		excGraph.set_graph(Graph(graph, prune=True))
		excGraph.set_prop("Weighted", True)
		return excGraph

	#-------------------------#
	# Set or update functions #
	#-------------------------#
		
	def set_name(self,name=""):
		''' set graph name '''
		if name != "":
			self.dicProperties["Name"] = name
		else:
			strName = self.dicProperties["Type"]
			tplUse = ("Nodes", "Edges", "Distribution")
			for key,value in self.dicProperties.items():
				if key in tplUse and (value.__class__ != dict):
					strName += '_' + key[0] + str(value)
				if key == "Clustering":
					strName += '_' + key[0] + str(around(value,4))
			self.dicProperties["Name"] = strName
		print(self.dicProperties["Name"])

	def update_prop(self, lstProp=[]):
		''' update part or all of the graph properties '''
		if lstProp:
			for strPropName in lstProp:
				if strPropName in self.dicGetProp.keys():
					self.dicProperties[strPropName] = self.dicGetProp[strPropName](self.__graph)
				else:
					print("Ignoring unknown property '{}'".format(strPropName))
		else:
			self.dicProperties.update({ strPropName: self.dicGetProp[strPropName](self.__graph) for strPropName in self.dicGetProp.keys() })
			self.bPropToDate = True

	#---------------#
	# Get functions #
	#---------------#

	## basic properties

	def get_name(self):
		return self.dicProperties["Name"]
	
	def num_vertices(self):
		return self.__graph.num_vertices()

	def num_edges(self):
		return self.__graph.num_edges()

	def get_density(self):
		return self.__graph.num_edges()/float(self.__graph.num_vertices()**2)

	def is_weighted(self):
		return self.dicProperties["Weighted"]

	## graph and adjacency matrix
	
	def get_graph(self):
		self.bPropToDate = False
		self.bBetwToDate = False
		self.bWBetwToDate = False
		return self.__graph

	def get_mat_adjacency(self):
		return adjacency(self.__graph, self.get_weights())

	## complex properties
	
	def get_prop(self, strPropName):
		if strPropName in self.dicProperties.keys():
			if not self.bPropToDate:
				self.dicProperties[strPropName] = self.dicGetProp[strPropName](self.__graph)
			return self.dicProperties[strPropName]
		else:
			print("Ignoring request for unknown property '{}'".format(strPropName))

	def get_dict_properties(self):
		return self.dicProperties

	def get_degrees(self, strType="total", bWeights=True):
		lstValidTypes = ["in", "out", "total"]
		if strType in lstValidTypes:
			return degree_list(self.__graph, strType, bWeights)
		else:
			print("Ignoring invalid degree type '{}'".format(strType))
			return None

	def get_betweenness(self, bWeights=True):
		if bWeights:
			if not self.bWBetwToDate:
				self.wBetweeness = betweenness_list(self.__graph, bWeights)
				self.bWBetwToDate = True
			return self.wBetweeness
		if not self.bBetwToDate:
			self.betweenness = betweenness_list(self.__graph, bWeights)
			self.bBetwToDate = True
		return self.betweenness

	def get_types(self):
		if "type" in self.__graph.edge_properties.keys():
			return self.__graph.edge_properties["type"].a
		else:
			return repeat(1, self.__graph.num_edges())
	
	def get_weights(self):
		if self.dicProperties["Weighted"]:
			epropW = self.__graph.edge_properties["weight"].copy()
			epropW.a = multiply(epropW.a, self.__graph.edge_properties["type"].a)
			return epropW
		else:
			return self.__graph.edge_properties["type"].copy()
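A hypothetical usage sketch for GraphClass (gen_er, the get_* helpers and numpy's around are assumed to be imported in the module that defines the class; the "Nodes" and "Edges" keys passed to the generator are guesses based on set_name):

props = {"Name": "demo", "Type": "Erdos-Renyi", "Weighted": False,
         "Nodes": 100, "Edges": 400}
gc = GraphClass(props)                # generates an Erdos-Renyi graph via gen_er
print(gc.get_name(), gc.num_vertices(), gc.num_edges())
print(gc.get_prop("Reciprocity"))     # cached property, recomputed when stale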
Code Example #3
    def __init__(self,
                 nodes=0,
                 copy_graph=None,
                 weighted=True,
                 directed=True,
                 **kwargs):
        '''
        @todo: document that
        see :class:`gt.Graph`'s constructor '''
        self._nattr = _GtNProperty(self)
        self._eattr = _GtEProperty(self)

        self._edges_deleted = False

        g = copy_graph.graph if copy_graph is not None else None

        if g is not None:
            from graph_tool import Graph as GtGraph
            from graph_tool.stats import remove_parallel_edges

            num_edges = copy_graph.edge_nb()

            if copy_graph._edges_deleted:
                # set edge filter for non-deleted edges
                eprop = g.new_edge_property("bool",
                                            vals=np.ones(num_edges,
                                                         dtype=bool))

                g.set_edge_filter(eprop)
                g = GtGraph(g, directed=g.is_directed(), prune=True)

            if not directed and g.is_directed():
                g = g.copy()
                g.set_directed(False)
                remove_parallel_edges(g)
            elif directed and not g.is_directed():
                g = g.copy()
                g.set_directed(True)

            self._from_library_graph(g, copy=True)

            # make edge id property map
            if "eid" in g.edge_properties:
                g.edge_properties["eid"].a = list(range(num_edges))
            else:
                eids = self._graph.new_edge_property("int",
                                                     vals=list(
                                                         range(self._max_eid)))

                g.edge_properties["eid"] = eids

            self._max_eid = num_edges
        else:
            self._graph = nngt._config["graph"](directed=directed)

            if nodes:
                self._graph.add_vertex(nodes)

            # make edge id property map
            self._max_eid = 0

            eids = self._graph.new_edge_property("int")

            self._graph.edge_properties["eid"] = eids
Code Example #4
class Interactome:
    r'''
    Attributes:
        interactome_path (str):
            the path to the tsv file containing the interactome per se
        namecode (str):
            the name used to recover the (sub)interactome later
        G (:class:`graph_tool.Graph`):
            the internal representation of the interactome as a graph
        genes2vertices (dict):
            mapping Entrez gene :math:`\rightarrow` vertex index in ``self.G``
        genes (set):
            set of Entrez names of genes present in ``self.G``
        lcc_cache (dict):
            mapping a number of genes to the LCC size of the uniformly sampled subgraphs of this size
        density_cache (dict):
            mapping a number of genes to the density of the uniformly sampled subgraphs of this size
        clustering_cache (dict):
            mapping a number of genes to the clustering coefficient of the uniformly sampled subgraphs of this size
        distances (2D :class:`np.ndarray`):
            matrix of shortest paths from gene :math:`i` to gene :math:`j`
    '''
    def __init__(self, path, namecode=None):
        self.interactome_path = path
        self.namecode = namecode
        self.distances = None
        log('Loading interactome')
        if path is not None:
            self.load_network(path)
        log('interactome loaded')
        self.lcc_cache = self.density_cache = self.clustering_cache = None

    def get_gene_degree(self, gene):
        r'''
        Get the degree of a given gene within the interactome.

        Args:
            gene (int): Entrez ID of the gene

        Returns:
            int:
                `None` if the gene is not in :math:`\mathscr I`, else the number of genes interacting with it within the interactome
        '''
        if gene not in self.genes:
            return None
        vert_id = self.vert_id(gene)
        return self.G.vertex(vert_id).out_degree()

    def set_namecode(self, namecode):
        assert isinstance(namecode, str)
        self.namecode = namecode

    def get_lcc_cache(self):
        '''
        Return the cache of LCC sizes. WARNING: no copy is made.
        Modifying the returned cache can result in undefined behaviour.
        '''
        self.load_lcc_cache()
        return self.lcc_cache

    def load_lcc_cache(self):
        '''Load the cache of LCC size simulations if it exists, else create an empty one.'''
        if self.lcc_cache is None:
            self.lcc_cache = IO.load_lcc_cache(self)

    def get_density_cache(self):
        '''
        Return the cache of density. WARNING: no copy is made.
        Modifying the returned cache can result in undefined behaviour.
        '''
        self.load_density_cache()
        return self.density_cache

    def load_density_cache(self):
        '''Load the cache of density simulations if it exists, else create an empty one.'''
        if self.density_cache is None:
            self.density_cache = IO.load_density_cache(self)

    def get_clustering_cache(self):
        '''
        Return the cache of clustering coefficients. WARNING: no copy is made.
        Modifying the returned cache can result in undefined behaviour.
        '''
        self.load_clustering_cache()
        return self.clustering_cache

    def load_clustering_cache(self):
        '''Load the cache of clustering coefficient simulations if it exists, else create an empty one.'''
        if self.clustering_cache is None:
            self.clustering_cache = IO.load_clustering_cache(self)

    def load_network(self, path):
        '''
        Load the interactome stored in a tsv file

        Args:
            path: the path of the interactome file
        '''
        self.G = Graph(directed=False)
        self.genes2vertices = dict()
        with open(path) as f:
            reader = csv.reader(f, delimiter='\t')
            for genes in reader:
                gene1, gene2 = map(int, genes)
                self.add_vertex(gene1)
                self.add_vertex(gene2)
                self.G.add_edge(self.vert_id(gene1), self.vert_id(gene2))
        self.genes = set(self.genes2vertices.keys())
        self.vertices2genes = {v: g for g, v in self.genes2vertices.items()}
        self.compute_spls()
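    # Added note on the expected input: load_network reads a headerless,
    # tab-separated file with one interaction per line, each line holding two
    # integer Entrez gene IDs, e.g. a line of the form "5290<TAB>7157"
    # (IDs purely illustrative).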

    def add_vertex(self, gene):
        '''
        Create new vertex for `gene` in the graph if not yet present

        Args:
            gene: the name of the gene to add to the interactome
        '''
        if gene not in self.genes2vertices:
            self.genes2vertices[gene] = len(self.genes2vertices)
            self.G.add_vertex()

    def vert_id(self, gene):
        '''
        Return the id of the desired gene

        Args:
            gene: the gene to retrieve

        Returns:
            the id of the desired gene

        Raises:
            KeyError: if no such gene is in the interactome
        '''
        return self.genes2vertices[gene]

    def verts_id(self, genes, gene_to_ignore=None):
        '''
        Return an array of vertex ids for the desired genes

        Args:
            genes: an iterable of desired genes
            gene_to_ignore: gene in `genes` that is not desired

        Returns:
            an array of vertex ids for the desired genes

        Raises:
            KeyError: if any of the genes is not in the interactome
        '''
        return np.array(
            [self.vert_id(gene) for gene in genes if gene != gene_to_ignore])

    def compute_spls(self):
        '''Compute the shortest path between each pair of genes.'''
        if self.distances is not None:
            return
        dists = shortest_distance(self.G)
        self.distances = np.empty(
            (self.G.num_vertices(), self.G.num_vertices()), dtype=int)
        for idx, array in enumerate(dists):
            self.distances[idx, :] = array.a[:]

    def get_all_dists(self, A, B):
        '''
        Get a list containing all the distances from a gene in A to the gene set B

        Args:
            A: a source gene set
            B: a destination gene set

        Returns:
            a list of distances [d(a, B) s.t. a in A]
        '''
        insert_self = A is B
        all_dists = list()
        for gene1 in A:
            if insert_self:
                for idx, el in enumerate(B):
                    if el == gene1:
                        indices = np.delete(B, idx)
                        break
            else:
                indices = B
            if len(indices) == 0:
                continue
            indices = np.asarray(indices)
            self.compute_spls()
            dists = self.distances[gene1, indices]
            min_dist = np.min(dists)
            if min_dist > self.G.num_vertices():  # if gene is isolated
                continue  # go to next gene
            all_dists.append(min_dist)
        return all_dists

    def get_d_A(self, A):
        '''
        Return the inner distance of the disease module A as defined in [1].

        Args:
            A: a gene set

        Returns:
            :math:`d_A`

        References
        ----------

        [1] J. Menche et al., Science 347 , 1257601 (2015). DOI: 10.1126/science.1257601 http://science.sciencemag.org/content/347/6224/1257601
        '''
        return np.mean(self.get_all_dists(A, A))

    def get_d_AB(self, A, B):
        '''
        Return the graph-based distance between A and B as defined in [1].

        Args:
            A: a gene set
            B: a gene set

        Returns:
            :math:`d_{AB}`

        References
        ----------

        [1] J. Menche et al., Science 347 , 1257601 (2015). DOI: 10.1126/science.1257601 http://science.sciencemag.org/content/347/6224/1257601
        '''
        values = self.get_all_dists(A, B)
        values.extend(self.get_all_dists(B, A))
        return np.mean(values, dtype=np.float32)
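    # Illustrative use of the two distance measures above (a sketch; the gene
    # sets are hypothetical and, as in get_lcc_score, Entrez IDs are first
    # mapped to vertex ids with verts_id):
    #
    #   inter = Interactome('interactome.tsv')
    #   A = inter.verts_id({5290, 7157} & inter.genes)
    #   B = inter.verts_id({672, 675} & inter.genes)
    #   d_A  = inter.get_d_A(A)        # mean within-set distance
    #   d_AB = inter.get_d_AB(A, B)    # mean cross-set distance (Menche et al. [1])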

    def get_random_subgraph(self, size):
        '''
        Uniformly sample a subgraph of given size.

        Args:
            size: number of genes to sample

        Returns:
            A subgraph of self of given size
        '''
        seeds = np.random.choice(len(self.genes), size=size, replace=False)
        return self.get_subgraph(seeds)

    def get_subgraph(self, vertices, genes=False):
        r'''
        Return the subgraph of self induced by the given vertices.

        Args:
            vertices: a set of vertex IDs (or a set of genes)
            genes: a boolean with value `True` if `vertices` is a set of genes
                and `False` if it is a set of vertex IDs.

        Returns:
            :math:`\Delta_{\text{vertices}}(G)`
        '''
        if genes:
            vertices = self.verts_id(vertices)
        filt = self.G.new_vertex_property('bool')
        filt.a[vertices] = True
        return GraphView(self.G, vfilt=filt)

    def get_genes_lcc_size(self, genes):
        r'''
        Return the LCC size of the graph induced by given genes.

        Args:
            genes: an iterable containing genes

        Returns:
            :math:`|LCC(\Delta_{\text{genes}}(G))|`
        '''
        return _get_lcc_size(self.get_subgraph(np.asarray(genes)))

    def get_random_genes_lcc(self, size):
        r'''
        Return the LCC size of a random subgraph of given size.

        Args:
            size (int): number of genes to sample

        Returns:
            :math:`|LCC(\mathcal G(\text{size}, G))|`
        '''
        return _get_lcc_size(self.get_random_subgraph(size))

    def get_random_genes_density(self, size):
        r'''
        Return the density of a random subgraph of given size.

        Args:
            size (int): number of genes to sample

        Returns:
            :math:`d(\mathcal G(\text{size}, G))`
        '''
        return _get_density(self.get_random_subgraph(size))

    def get_genes_density(self, genes):
        r'''
        Return the density of the subgraph induced by given genes.

        Args:
            genes: an iterable of genes

        Returns:
            :math:`d(\Delta_{\text{genes}}(G))`
        '''
        return _get_density(self.get_subgraph(np.asarray(genes)))

    def get_random_genes_clustering(self, size):
        r'''
        Return the clustering coefficient of a random subgraph of given size.

        Args:
            size (int): number of genes to sample

        Returns:
            :math:`C(\mathcal G(\text{size}, G))`
        '''
        G = self.get_random_subgraph(size)
        ret = _get_clustering_coefficient(G)
        return ret

    def get_genes_clustering(self, genes, entrez=False):
        r'''
        Return the clustering coefficient of the subgraph induced by given genes.

        Args:
            genes: an iterable of genes
            entrez (bool): True if `genes` are Entrez IDs that must first be mapped
                to vertex ids, False if they already are vertex ids (default False)

        Returns:
            :math:`C(\Delta_{\text{genes}}(G))`
        '''
        if entrez:
            genes = self.verts_id(genes)
        return _get_clustering_coefficient(self.get_subgraph(
            np.asarray(genes)))

    def get_lcc_score(self,
                      genes,
                      nb_sims,
                      shapiro=False,
                      shapiro_threshold=.05):
        r'''
        Get the z-score and the empirical p-value of the LCC size of given genes.

        Args:
            genes (set): gene set
            nb_sims (int): minimum number of simulations for probability distribution estimation
            shapiro (bool): True if normality test is needed, False otherwise (default False)
            shapiro_threshold (float): statistical threshold for normality test

        Returns:
            tuple:
                :math:`(z, p_e, N)` if shapiro is True and :math:`(z, p_e)` otherwise;
                where z is the z-score of the LCC size, :math:`p_e` is the associated
                empirical p-value and N is True if Shapiro-Wilk normality test
                p-value >= shapiro_threshold and False otherwise

        Raises:
            ValueError: if not enough simulations have been performed
        '''
        genes = genes & self.genes
        genes = self.verts_id(genes)
        nb_seeds = len(genes)
        if nb_seeds == 0:
            print('\n\t[Warning: get_lcc_score found no matching gene]')
            return None
        genes_lcc = self.get_genes_lcc_size(genes)
        try:
            lccs = self.get_lcc_cache()[nb_seeds]
            assert len(lccs) >= nb_sims
        except AssertionError:
            raise ValueError(('Only {} simulations found. Expected >= {}. ' + \
                              'fill_lcc_cache has not been called properly') \
                             .format(len(lccs), nb_sims))
        std = lccs.std()
        mean = lccs.mean()
        z = None if std == 0 else float((genes_lcc - mean) / std)
        empirical_p = (lccs >= genes_lcc).sum() / len(lccs)
        if shapiro:
            is_normal = stats.shapiro(np.random.choice(
                lccs, size=5000))[1] >= shapiro_threshold
            return z, empirical_p, is_normal
        return z, empirical_p
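    # Typical scoring workflow (a sketch; `seeds` stands for a hypothetical set
    # of Entrez IDs): fill the LCC cache for the relevant size, then score.
    #
    #   size = len(seeds & inter.genes)
    #   inter.fill_lcc_cache(1000, {size})
    #   z, p_emp = inter.get_lcc_score(seeds, nb_sims=1000)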

    def where_density_cache_nb_sims_lower_than(self, sizes, nb_sims):
        r'''
        Get the sizes whose density hasn't been simulated enough.

        Args:
            sizes (iterable): iterable of int values corresponding to sizes to test
            nb_sims (int): minimal number of simulations

        Returns:
            set:
                set of int values corresponding to sizes that haven't been simulated enough:

                .. math::
                    \{s \in \text{sizes} : |\text{density_cache}[s]| < \text{nb_sims}\}
        '''
        self.load_density_cache()
        return {size for size in sizes \
                     if size not in self.density_cache.keys() \
                     or len(self.density_cache[size]) < nb_sims}

    def where_lcc_cache_nb_sims_lower_than(self, sizes, nb_sims):
        r'''
        Get the sizes whose LCC hasn't been simulated enough.

        Args:
            sizes (iterable): iterable of int values corresponding to sizes to test
            nb_sims (int): minimal number of simulations

        Returns:
            set:
                set of int values corresponding to sizes that haven't been simulated enough:

                .. math::
                    \{s \in \text{sizes} : |\text{lcc_cache}[s]| < \text{nb_sims}\}
        '''
        self.load_lcc_cache()
        return {size for size in sizes \
                     if size not in self.lcc_cache.keys() \
                     or len(self.lcc_cache[size]) < nb_sims}

    def where_clustering_cache_nb_sims_lower_than(self, sizes, nb_sims):
        r'''
        Get the sizes whose clustering coefficient hasn't been simulated enough.

        Args:
            sizes (iterable): iterable of int values corresponding to sizes to test
            nb_sims (int): minimal number of simulations

        Returns:
            set:
                set of int values corresponding to sizes that haven't been simulated enough:

                .. math::
                    \{s \in \text{sizes} : |\text{clustering_cache}[s]| < \text{nb_sims}\}
        '''
        self.load_clustering_cache()
        return {size for size in sizes \
                     if size not in self.clustering_cache.keys() \
                     or len(self.clustering_cache[size]) < nb_sims}

    def fill_lcc_cache(self, nb_sims, sizes):
        r'''
        Fill the lcc_cache such that:

        .. math::
            \forall s \in \text{sizes} : |\text{lcc_cache}[s]| \geq \text{nb_sims}

        Args:
            nb_sims (int): minimal number of simulations to be performed
            sizes (set): set of number of genes for which LCC size shall be tested
        '''
        self.load_lcc_cache()
        a = time()
        for idx, size in enumerate(sizes):
            self._compute_lcc_distribution(nb_sims, size)
            prop = (idx + 1) / len(sizes)
            log('{} out of {}  ({:3.2f}%)    eta: {}' \
                .format(idx+1, len(sizes), 100*prop,
                        sec2date((time()-a)/prop*(1-prop))),
                end='\r')
        print('')
        self._write_lcc_cache()

    def fill_density_cache(self, nb_sims, sizes):
        r'''
        Fill the density cache such that:

        .. math::
            \forall s \in \text{sizes} : |\text{density_cache}[s]| \geq \text{nb_sims}

        Args:
            nb_sims (int): minimal number of simulations to be performed
            sizes (set): set of number of genes for which density shall be tested
        '''
        self.load_density_cache()
        a = time()
        for idx, size in enumerate(sizes):
            self._compute_disease_module_density(nb_sims, size)
            prop = (idx + 1) / len(sizes)
            log('{} out of {}  ({:3.2f}%)    eta: {}' \
                .format(idx+1, len(sizes), 100*prop,
                        sec2date((time()-a)/prop*(1-prop))),
                end='\r')
        print('')
        self._write_density_cache()

    def fill_clustering_cache(self, nb_sims, sizes):
        r'''
        Fill the clustering cache such that:

        .. math::
            \forall s \in \text{sizes} : |\text{clustering_cache}[s]| \geq \text{nb_sims}

        Args:
            nb_sims (int): minimal number of simulations to be performed
            sizes (set): set of number of genes for which clustering coefficient shall be tested
        '''
        self.load_clustering_cache()
        a = time()
        for idx, size in enumerate(sizes):
            self._compute_disease_modules_clustering(nb_sims, size)
            prop = (idx + 1) / len(sizes)
            log('{} out of {}  ({:3.2f}%)    eta: {}' \
                .format(idx+1, len(sizes), 100*prop,
                        sec2date((time()-a)/prop*(1-prop))),
                end='\r')
        print('')
        self._write_clustering_cache()

    def get_subinteractome(self,
                           genes,
                           neighbourhood='none',
                           namecode=None,
                           neighb_count=1):
        r'''
        Extract a subinteractome and return it as an :class:`Interactome`
        object which is then usable for analyses.

        For :math:`H` a subgraph of :math:`G`, the first neighbourhood of :math:`H`
        within :math:`G` is defined by the graph:

        .. math::
            \mathcal N_G(H) = \Delta_{\mathcal N_G(V(H))}(G),

        where for every :math:`W \subset V(G)`:

        .. math::
            \mathcal N_G(W) = W \cup \left\{v \in V(G) : \exists w \in W \text{ s.t. } \{v, w\} \in E(G)\right\} \subset V(G).

        Args:
            genes (set): the gene set inducing the subinteractome
            neighbourhood (str):
                one of the following: `'none'`, `'first'`, `'first-joined'` where:

                * `'none'` for no neighbouring gene
                * `'first'` for the first neighbourhood :math:`\mathcal N_G(H)` with :math:`G` being `self` and :math:`H` being `genes`
                * `'first-joined'` for the first neighbourhood with restriction that every neighbourhood gene must be associated to at least `neighb_count` genes.
            namecode (str): the namecode to be given to the subinteractome
            neighb_count (int): (only if `neighbourhood == 'first-joined'`) determines the minimum number of adjacent genes to be extracted:

                .. math::
                    \mathcal N_G^{(k)}(H) := \Delta_{\mathcal N_G^{(k)}(V(H))}(G),

                with:

                .. math::
                    \mathcal N_G^{(k)}(W) := W \cup \left\{v \in V(G) : \exists \{w_1, \ldots, w_k\} \in \binom {W}{k} \text{ s.t. } \{v, w_i\} \in E(G)
                        \quad (i=1, \ldots, k)\right\} \subset V(G).

        Return:
            :class:`Interactome`:
                the subinteractome
        '''
        #TODO: implement neighbourhood extraction
        genes &= self.genes
        genes_hash = md5(''.join(sorted(map(
            str, genes))).encode('utf-8')).hexdigest()
        path = self.interactome_path + genes_hash
        ret = IO.load_interactome(path, False, namecode)
        if ret is not None:
            return ret
        ret = deepcopy(self)
        ret.namecode = namecode
        ret.interactome_path = path

        ret.genes, ret.G = self._get_subinteractome_graph(
            genes, neighbourhood, neighb_count)
        print('So {} vertices, {} edges (density == {})' \
              .format(
                ret.G.num_vertices(),
                ret.G.num_edges(),
                2*ret.G.num_edges()/(ret.G.num_vertices()*(ret.G.num_vertices() - 1))
              )
        )
        genes_l = np.array(list(ret.genes))
        # Compute the mappings gene -> idx
        vp = ret.G.vp['genes']
        ret.genes2vertices = {
            vp[vertex]: int(vertex)
            for vertex in ret.G.vertices()
        }
        print('...  {}'.format(len(ret.genes2vertices)))
        del ret.G.vertex_properties['genes']
        del self.G.vertex_properties['genes']
        ret.genes = set(ret.genes2vertices.keys())
        ret.lcc_cache = ret.density_cache = None
        ret.distances = None
        ret.compute_spls()
        IO.save_interactome(ret)
        return ret

    def _get_subinteractome_graph(self, genes, neighbourhood, neighb_count):
        print('Initially: {} genes'.format(len(genes)))
        if neighbourhood is not None and neighbourhood != 'none':
            genes = self._get_genes_neighbourhood(genes, neighbourhood,
                                                  neighb_count)
        vp = self.G.new_vp('int')
        for gene, vertex in self.genes2vertices.items():
            vp[self.G.vertex(vertex)] = gene
        self.G.vertex_properties['genes'] = vp
        genes_l = np.array(list(genes))
        # Extract subgraph with ``genes``
        G = self.get_subgraph(genes, True)
        # Ignore genes of degree 0
        genes_idx = np.where(
            G.get_out_degrees(np.arange(G.num_vertices())) > 0)[0]
        genes = {self.vertices2genes[idx] for idx in genes_idx}
        print('After removing isolated vertices: {} genes'.format(len(genes)))
        return genes, Graph(self.get_subgraph(genes, True), prune=True)

    def _get_genes_neighbourhood(self, genes, neighbourhood, neighb_count):
        raise NotImplementedError()
        # First neighbourhood
        vert2genes = dict()
        for k, v in self.genes2vertices.items():
            vert2genes[v] = k
        closure_genes = set()
        for gene in genes:
            gene_idx = self.genes2vertices[gene]
            for neighbour in self.G.get_out_neighbours(gene_idx):
                closure_genes.add(vert2genes[neighbour])
        return closure_genes | genes

    def copy(self):
        '''
        Return a copy of the interactome
        '''
        ret = deepcopy(self)
        ret.G = self.G.copy()  # watch out: deepcopy(self.G) returns None...
        return ret

    ##### Private methods

    def _compute_lcc_distribution(self, nb_sims, size):
        N = nb_sims
        if size in self.lcc_cache:
            nb_sims -= len(self.lcc_cache[size])
        if nb_sims < 0:
            print('[Warning]: {} sims required but {} already performed' \
                  .format(N, len(self.lcc_cache[size])))
            return
        lccs = np.empty(nb_sims, dtype=float)
        for i in range(nb_sims):
            lccs[i] = self.get_random_genes_lcc(size)
        if size in self.lcc_cache:
            self.lcc_cache[size] = np.concatenate((self.lcc_cache[size], lccs))
        else:
            self.lcc_cache[size] = lccs

    def _compute_disease_module_density(self, nb_sims, size):
        N = nb_sims
        if size in self.density_cache:
            nb_sims -= len(self.density_cache[size])
        if size <= 0 or nb_sims <= 0:
            return
        densities = np.empty(nb_sims, dtype=float)
        for i in range(nb_sims):
            densities[i] = self.get_random_genes_density(size)
        try:
            densities = np.concatenate((self.density_cache[size], densities))
        except (KeyError, ValueError):
            pass
        self.density_cache[size] = densities

    def _compute_disease_modules_clustering(self, nb_sims, size):
        N = nb_sims
        if size in self.clustering_cache:
            nb_sims -= len(self.clustering_cache[size])
        if size < 3 or nb_sims <= 0:
            return
        clustering_coeffs = np.empty(nb_sims, dtype=float)
        for i in range(nb_sims):
            clustering_coeffs[i] = self.get_random_genes_clustering(size)
        try:
            clustering_coeffs = np.concatenate(
                (self.clustering_cache[size], clustering_coeffs))
        except (KeyError, ValueError):
            pass
        self.clustering_cache[size] = clustering_coeffs

    def _write_lcc_cache(self):
        IO.save_lcc_cache(self, self.lcc_cache)

    def _write_density_cache(self):
        IO.save_density_cache(self, self.density_cache)

    def _write_clustering_cache(self):
        IO.save_clustering_cache(self, self.clustering_cache)

    def save(self):
        IO.save_interactome(self)