Example #1
def parse_input(graph_file):
    graph = gt.Graph()
    graph.set_directed(is_directed=False)

    with open(graph_file, 'r') as in_file:
        lines = in_file.readlines()
        color_target = int(lines[0])
        for line in lines[1:]:
            source = int(line.split(' ')[0])
            target = int(line.split(' ')[1])
            if source < target:
                graph.add_edge(source=source, target=target, add_missing=True)
            else:
                graph.add_edge(source=target, target=source, add_missing=True)
            if source == target:
                print('error: tried to add self-loop')
            # print("added edge " + source + ' ' + target)
            # print(i)
    gt.remove_parallel_edges(graph)
    list_to_remove = []
    for vertex in graph.vertices():
        if vertex.out_degree() == 0:
            list_to_remove.append(vertex)
    graph.remove_vertex(list_to_remove)

    return color_target, graph
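
# A minimal usage sketch for parse_input; the file name and contents below are
# hypothetical. The first line holds the color target, each following line a
# space-separated 'source target' pair.
with open('toy.graph', 'w') as f:
    f.write('3\n0 1\n1 2\n2 0\n')
color_target, graph = parse_input('toy.graph')
print(color_target, graph.num_vertices(), graph.num_edges())  # expected: 3 3 3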
def load_vna(in_file):
    with open(in_file) as f:
        all_lines = f.read().splitlines()

        it = iter(all_lines)

        # Ignore preamble
        line = next(it)
        while not (line.lower().startswith('*node properties')
                   or line.lower().startswith('*node data')):
            line = next(it)

        node_properties = next(it).split(' ')
        node_properties = [word.lower() for word in node_properties]
        assert ('id' in node_properties)

        vertices = dict()
        line = next(it)
        gt_idx = 0  # Index for gt
        while not line.startswith('*'):
            entries = line.split(' ')
            vna_id = entries[0]
            vertex = dict()
            for i, prop in enumerate(node_properties):
                vertex[prop] = entries[i]
            vertex['id'] = gt_idx  # Replace VNA ID by numerical gt index
            vertices[
                vna_id] = vertex  # Retain VNA ID as key of the vertices dict

            gt_idx += 1
            line = next(it)

        # Skip node properties, if any
        while not (line.lower().startswith('*tie data')):
            line = next(it)

        edge_properties = next(it).split(' ')
        assert (edge_properties[0] == 'from' and edge_properties[1] == 'to')

        edges = []
        try:
            while True:
                line = next(it)
                entries = line.split(' ')
                v_i = vertices[entries[0]]['id']
                v_j = vertices[entries[1]]['id']
                edges.append((v_i, v_j))
        except StopIteration:
            pass

        g = gt.Graph(directed=False)
        g.add_vertex(len(vertices))
        for v_i, v_j in edges:
            g.add_edge(v_i, v_j)

        gt.remove_parallel_edges(g)

        return g
    return None
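
# A rough sketch of the .vna layout load_vna accepts; the tiny file below is
# hypothetical (the node section header is case-insensitive, the tie header
# must read 'from to', and fields are single-space separated).
vna_text = ('*Node data\n'
            'ID label\n'
            'a first\n'
            'b second\n'
            '*Tie data\n'
            'from to\n'
            'a b\n')
with open('toy.vna', 'w') as f:
    f.write(vna_text)
g = load_vna('toy.vna')
print(g.num_vertices(), g.num_edges())  # expected: 2 1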
Example #3
def load_tulip_layout(in_file):
    g = gt.load_graph(in_file)
    g.set_directed(False)
    gt.remove_parallel_edges(g)
    graphics = g.vertex_properties['graphics']
    Y = np.zeros((g.num_vertices(), 2))
    for i in range(g.num_vertices()):
        Y[i, :] = [graphics[i]['x'], graphics[i]['y']]
    return g, Y
def create_graph(adjacency):
    G = gt.Graph(directed=False)
    G.add_vertex(len(adjacency))
    for idx, v in enumerate(adjacency):
        if idx % 1000 == 0 and idx > 0:
            print(idx)
        for u in adjacency[v]:
            if not G.edge(v, u):
                G.add_edge(v, u)
    # NOTE: Parallel edges will be removed, if any.
    gt.remove_parallel_edges(G)
    return G
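
# A small usage sketch: create_graph expects a mapping from vertex index to an
# iterable of neighbor indices (the dict below is made up).
adjacency = {0: [1, 2], 1: [0, 2], 2: [0, 1]}
G = create_graph(adjacency)
print(G.num_vertices(), G.num_edges())  # expected: 3 3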
Example #5
def load_graph(path, algorithms, format='graphml', component=False):
    sys.stdout.write('Loading network ...')
    sys.stdout.flush()
    t0 = time.time()
    g = gt.load_graph(path, fmt=format)
    if 'kores' in algorithms:
        gt.remove_parallel_edges(g)
    gt.remove_self_loops(g)
    if component:
        largest_component = gt.label_largest_component(g, directed=False)
        g.set_vertex_filter(largest_component)
        g.purge_vertices()
    t = time.time()
    sys.stdout.write('Ok! ({0} s.)\n'.format(t - t0))

    return g
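
# A hypothetical call (file name and algorithm list are illustrative); passing
# 'kores' only controls whether parallel edges are removed, while
# component=True restricts the result to the largest connected component.
g = load_graph('network.graphml', algorithms=['kores'], component=True)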
Example #6
def save_largest_component():
    global Graph

    l = gt.label_largest_component(Graph)

    print(l.a)
    remove = []
    for x in range(len(l.a)):
        if l.a[x] == 0:
            remove.append(x)

    Graph.remove_vertex(remove)
    #u = gt.GraphView(Graph, vfilt=l)

    gt.remove_parallel_edges(Graph)

    Graph.save(base_path + graph_tool_file)
Example #8
def load_graph(file):
    if os.path.splitext(file)[1] == '.mtx':
        g = load_mm(file)
    elif os.path.splitext(file)[1] == '.csv':
        g = load_csv(file)
    elif os.path.splitext(file)[1] == '.graph':
        g = load_chaco(file)
    elif os.path.splitext(file)[1] == '.vna':
        g = load_vna(file)
    else:
        # Give the file to graph_tool and hope for the best.
        g = gt.load_graph(file)

        g.set_directed(False)

    gt.remove_parallel_edges(g)

    return g
Example #9
def sndlib(f):
  from xml.etree import ElementTree as ET

  root = ET.parse(f)
  ns = {'s': 'http://sndlib.zib.de/network'}
  
  g = gt.Graph(directed=False)
  g.add_vertex(len(root.findall('*/*/s:node', ns)))
  
  index = {e.get('id'):i for i,e in enumerate(root.findall('*/*/s:node', ns))}
  for e in root.findall('*/*/s:link', ns):
    g.add_edge(index[e.find('s:source', ns).text], index[e.find('s:target', ns).text])
  pos = g.new_vertex_property('vector<float>', scale(np.array([(float(e.find('*/s:x', ns).text), -float(e.find('*/s:y', ns).text)) for e in root.findall('*/*/s:node', ns)])))
  
  gt.remove_parallel_edges(g)
  gt.remove_self_loops(g)
  
  return g, pos
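
# A rough sketch of the SNDlib XML structure the parser above assumes (default
# namespace plus nodes with coordinates and links with source/target); this
# fragment is illustrative, not a complete SNDlib file:
#   <network xmlns="http://sndlib.zib.de/network">
#     <networkStructure>
#       <nodes>
#         <node id="A"><coordinates><x>1.0</x><y>2.0</y></coordinates></node>
#         <node id="B"><coordinates><x>3.0</x><y>4.0</y></coordinates></node>
#       </nodes>
#       <links>
#         <link id="L1"><source>A</source><target>B</target></link>
#       </links>
#     </networkStructure>
#   </network>
# g, pos = sndlib('toy_sndlib.xml')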
def remove_equiv_nodes(qgraph, S_Q):
    '''
    Remove equivalent nodes from qgraph.
    u is equivalent to v iff v\in S_Q[u] and u\in S_Q[v]
    '''
    mark = [0] * qgraph.num_vertices()
    for u in qgraph.vertices():
        for v in qgraph.vertices():
            if mark[int(u)] == 0 and mark[int(v)] == 0 and int(u) != int(
                    v) and u in S_Q[v] and v in S_Q[u]:
                for v_p in v.in_neighbors():
                    eg_in = qgraph.edge(v_p, u)
                    if not eg_in:
                        qgraph.add_edge(v_p, u)
                for v_s in v.out_neighbors():
                    eg_out = qgraph.edge(u, v_s)
                    if not eg_out:
                        qgraph.add_edge(u, v_s)
                mark[int(v)] = 1
    qgraph_view = gt.GraphView(qgraph, vfilt=lambda v: mark[int(v)] == 0)
    gt.remove_parallel_edges(qgraph_view)
    qgraph_view.purge_vertices()
    return qgraph_view
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#INPUT
element = int(sys.argv[1])
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
print(element)

#-------------------------------------
#Variables
SCREENING_NETWORK = ""
#------------------------------------

#loading of a second graph used in previous studies. It is larger and shows more than 90% of the 4HAP graph. It potentially includes more feedstock and pharmaceutical regions.
g = gt.load_graph(SCREENING_NETWORK)
gt.remove_parallel_edges(g)

#a timer is started for the CPU comparison of the two methods
start = time.perf_counter()

in_network = pickle.load(open("pharma_in_network.p", "rb"))
in_network_xrn = pickle.load(open("pharma_in_network_xrn.p", "rb"))
HUBS_ID = pickle.load(open("hubs_Int.p", "rb"))
HUBS_XRN = pickle.load(open("hubs_xrn.p", "rb"))

# double-check if the loaded files are correct
print("testing:There are:", len(HUBS_ID), len(HUBS_XRN), " hubs")

#hardcoded biofeed input! change as soon as more data is available
Biofeed = [9896190, 345687, 2239190]
Example #12
def useGraphTool(pd, space):
    # Extract the graphml representation of the planner data
    graphml = pd.printGraphML()
    f = open("graph.xml", 'w')
    f.write(graphml)
    f.close()

    # Load the graphml data using graph-tool
    graph = gt.load_graph("graph.xml")
    edgeweights = graph.edge_properties["weight"]

    # Write some interesting statistics
    avgdeg, stddevdeg = gt.vertex_average(graph, "total")
    avgwt, stddevwt = gt.edge_average(graph, edgeweights)

    print "---- PLANNER DATA STATISTICS ----"
    print str(graph.num_vertices()) + " vertices and " + str(graph.num_edges()) + " edges"
    print "Average vertex degree (in+out) = " + str(avgdeg) + "  St. Dev = " + str(stddevdeg)
    print "Average edge weight = " + str(avgwt)  + "  St. Dev = " + str(stddevwt)

    comps, hist = gt.label_components(graph)
    print "Strongly connected components: " + str(len(hist))

    graph.set_directed(False)  # Make the graph undirected (for weak components, and a simpler drawing)
    comps, hist = gt.label_components(graph)
    print "Weakly connected components: " + str(len(hist))

    # Plotting the graph
    gt.remove_parallel_edges(graph) # Removing any superfluous edges

    edgeweights = graph.edge_properties["weight"]
    colorprops = graph.new_vertex_property("string")
    vertexsize = graph.new_vertex_property("double")

    start = -1
    goal = -1

    for v in range(graph.num_vertices()):

        # Color and size vertices by type: start, goal, other
        if (pd.isStartVertex(v)):
            start = v
            colorprops[graph.vertex(v)] = "cyan"
            vertexsize[graph.vertex(v)] = 10
        elif (pd.isGoalVertex(v)):
            goal = v
            colorprops[graph.vertex(v)] = "green"
            vertexsize[graph.vertex(v)] = 10
        else:
            colorprops[graph.vertex(v)] = "yellow"
            vertexsize[graph.vertex(v)] = 5

    # default edge color is black with size 0.5:
    edgecolor = graph.new_edge_property("string")
    edgesize = graph.new_edge_property("double")
    for e in graph.edges():
        edgecolor[e] = "black"
        edgesize[e]  = 0.5

    # using A* to find shortest path in planner data
    if start != -1 and goal != -1:
        dist, pred = gt.astar_search(graph, graph.vertex(start), edgeweights)

        # Color edges along shortest path red with size 2.0
        v = graph.vertex(goal)
        while v != graph.vertex(start):
            p = graph.vertex(pred[v])
            for e in p.out_edges():
                if e.target() == v:
                    edgecolor[e] = "red"
                    edgesize[e]  = 2.0
            v = p

    # Writing graph to file:
    # pos indicates the desired vertex positions, and pin=True says that we
    # really REALLY want the vertices at those positions
    gt.graph_draw (graph, vertex_size=vertexsize, vertex_fill_color=colorprops,
                   edge_pen_width=edgesize, edge_color=edgecolor,
                   output="graph.png")
    print()
    print('Graph written to graph.png')
Example #13
import graph_tool.all as gt
from sys import argv
from re import findall

if __name__ == '__main__':
  for f in argv[1:]:
    g = gt.GraphView(gt.load_graph(f), directed=False, skip_properties=True)
    gt.remove_parallel_edges(g)
    gt.remove_self_loops(g)
    name = findall('[^/.]+', f)[-2].split('--')[0]
    g.save('output/{}.xml'.format(name))
    gt.graph_draw(g, output='output/{}.png'.format(name))
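
    # For instance (hypothetical path): f = 'graphs/karate--run1.graphml'
    #   findall('[^/.]+', f)      -> ['graphs', 'karate--run1', 'graphml']
    #   [-2].split('--')[0]       -> 'karate'
    # so the script writes output/karate.xml and output/karate.png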

Example #14
def trimGraph(grafo, ske, ske2):
    g, pos, weight, clase, nodetype, age = grafo

    edges_to_delete = []
    to_delete = []

    gt.remove_parallel_edges(g)

    ## DELETE WEIGHT 0 ED
    for edge in g.get_edges():
        e = g.edge(edge[0], edge[1])
        w = weight[e]
        if w == 0:
            edges_to_delete.append(edge)
            v1 = g.get_all_neighbors(edge[0])
            v2 = g.get_all_neighbors(edge[1])

            if (len(v1)) == 1:
                to_delete.append(edge[0])
            if (len(v2)) == 1:
                to_delete.append(edge[1])

    for i in reversed(sorted(to_delete)):
        g.clear_vertex(i)
        g.remove_vertex(i)

    to_delete = []
    vertices = g.get_vertices()

    for v in vertices:
        vecinos = g.get_out_neighbors(v)
        if len(vecinos) == 2:
            edge = g.edge(vecinos[0], vecinos[1])

            if edge is None:
                edge = g.add_edge(vecinos[0], vecinos[1])
                ed1 = g.edge(v, vecinos[0])
                w1 = weight[ed1]
                ed2 = g.edge(v, vecinos[1])
                w2 = weight[ed2]

                weight[edge] = w1 + w2

                if w1 == 0:
                    clase[edge] = clase[ed2]
                    ske2[np.where(ske2 == clase[ed1][0])] = clase[ed2][0]
                elif w2 == 0:
                    clase[edge] = clase[ed1]
                    ske2[np.where(ske2 == clase[ed2][0])] = clase[ed1][0]
                else:
                    clase[edge] = clase[ed2]
                    ske2[np.where(ske2 == clase[ed1][0])] = clase[ed2][0]

                g.remove_edge(ed1)
                g.remove_edge(ed2)
                to_delete.append(v)

    for i in reversed(sorted(to_delete)):
        g.clear_vertex(i)
        g.remove_vertex(i)

    vertices = g.get_vertices()

    pos_vertex = []
    for i in vertices:
        pos_vertex.append(pos[i])
    pos_vertex = np.array(pos_vertex)

    pares = []

    for i in vertices:
        d = find_dists(i, pos, pos_vertex)
        mask = np.ones(pos_vertex.shape[0], bool)
        mask[i] = False

        pair = np.zeros(pos_vertex.shape[0], bool)
        pair[mask] = d[mask] < 3
        c = np.count_nonzero(pair)

        if c == 1:
            k = np.where(pair == True)[0][0]
            if [k, i] not in pares:
                pares.append([i, k])

    to_delete = []

    for par in pares:
        v1 = par[0]
        v2 = par[1]
        if g.edge(v1, v2):
            if weight[g.edge(v1, v2)] == 0:
                vecinos2 = g.get_all_neighbors(v2)

                for k in vecinos2:
                    if k != v1:
                        edge = g.edge(v2, k)
                        w_e = weight[edge]
                        c_e = clase[edge]

                        n_edge = g.add_edge(v1, k)
                        weight[n_edge] = w_e
                        clase[n_edge] = c_e

                g.clear_vertex(v2)
                to_delete.append(v2)

    for i in reversed(sorted(to_delete)):
        g.clear_vertex(i)
        g.remove_vertex(i)

    return [g, pos, weight, clase, nodetype, age], ske, ske2


# import graph_tool.all as gt
# import cv2

# from rsmlFunc import createTree
# from imageFunc import getCleanSke
# from graphFunc import createGraph
# from trackFunc import matchGraphs

# conf = {}

# exec(open('/home/ncaggion/Escritorio/pRAnalyzer/confs/config.conf').read(),conf)

# for image in range(1214,1250):
#     g = gt.load_graph('/home/ncaggion/Escritorio/aux/graph_%s.xml.gz' %image)
#     pos = g.vertex_properties["pos"]
#     nodetype = g.vertex_properties["nodetype"]
#     age = g.vertex_properties["age"]
#     weight = g.edge_properties["weight"]
#     clase = g.edge_properties["clase"]

#     grafo1 = [g, pos, weight, clase, nodetype, age ]

#     seg = cv2.imread("/home/ncaggion/Escritorio/Test/Results 4/Imagenes/out_%s_2.png" %image, 0)
#     ske, bnodes, enodes, _ = getCleanSke(seg)
#     grafo2, seed, ske2 = createGraph(ske.copy(), pos[0], enodes, bnodes)

#     grafo2, ske, ske2 = trimGraph(grafo2, ske, ske2)
#     grafo2 = matchGraphs(grafo1, grafo2)

#     rsmlTree, numberLR = createTree(conf, 0, ["/home/ncaggion/Escritorio/Paper/Figura1/2020-03-31_12-15-18_2.png"], grafo2, ske, ske2)
#     rsmlTree.write(open('/home/ncaggion/Escritorio/aux/rsml_%s.rsml' %image, 'w'), encoding='unicode')

# gt.graph_draw(g, pos = pos)
Example #15
import glob

import numpy as np
import networkx as nx
import graph_tool.all as gt

filenames = sorted(glob.glob("./Simulatednetworks/SimulatedGraphWeek*.graphml"))
FakeGraphs = [gt.load_graph(File) for File in filenames]
filenames = sorted(glob.glob("./Realnetworks/tags*_2015.gml"))
Graphs = [gt.load_graph(File) for File in filenames]
print(len(Graphs))
#"""
xGraphs = [nx.Graph() for i in range(len(Graphs))]
xFakeGraphs = [nx.Graph() for i in range(len(FakeGraphs))]
for i in range(len(Graphs)):
	gt.remove_self_loops(Graphs[i])
	for e in Graphs[i].edges():
		xGraphs[i].add_edge(*e)
for i in range(len(FakeGraphs)):
	gt.remove_parallel_edges(FakeGraphs[i])
	for e in FakeGraphs[i].edges():
		xFakeGraphs[i].add_edge(*e)
#print nx.rich_club_coefficient(xGraphs[0], normalized = False)
RealClubs = []
print(len(xGraphs))
for Graph in xGraphs:
	Coefficients = nx.rich_club_coefficient(Graph, normalized = False)
	print(len(Coefficients))
	Dummy = np.zeros((len(Coefficients),))
	for i in range(len(Dummy)):
		Dummy[i] = Coefficients[i]
	RealClubs.append(Dummy)
FakeClubs = []
for Graph in xFakeGraphs:
	Coefficients = nx.rich_club_coefficient(Graph, normalized = False)
Example #17
        print(str(graph.num_vertices()) + " vertices and " +
              str(graph.num_edges()) + " edges")
        print("Average vertex degree (in+out) = " + str(avgdeg) +
              "  St. Dev = " + str(stddevdeg))
        print("Average edge weight = " + str(avgwt) + "  St. Dev = " +
              str(stddevwt))

        _, hist = gt.label_components(graph)
        print("Strongly connected components: " + str(len(hist)))

        # Make the graph undirected (for weak components, and a simpler drawing)
        graph.set_directed(False)
        _, hist = gt.label_components(graph)
        print("Weakly connected components: " + str(len(hist)))

        # Plotting the graph
        gt.remove_parallel_edges(graph)  # Removing any superfluous edges

        edgeweights = graph.edge_properties["weight"]
        colorprops = graph.new_vertex_property("string")
        colorprops2 = graph.new_vertex_property("vector<float>")
        shapeprops = graph.new_vertex_property("string")
        vertexsize = graph.new_vertex_property("double")

        start = -1
        goal = -1

        for v in range(graph.num_vertices()):
            # Color and size vertices by type: start, goal, other
            if pd.isStartVertex(v):
                start = v
                colorprops[graph.vertex(v)] = "cyan"
Example #18
def load_ply_layout(file):
    g = gt.Graph(directed=False)

    with open(file) as f:
        all_lines = f.read().splitlines()
        it = iter(all_lines)

        line = next(it)
        assert (line == 'ply')

        line = next(it)
        assert (line.startswith('format ascii'))

        line = next(it)
        while not line.startswith('element'):
            line = next(it)

        words = line.split(' ')
        assert (words[0] == 'element')
        assert (words[1] == 'vertex')
        assert (words[2].isdigit())
        n_vertices = int(words[2])
        g.add_vertex(n_vertices)
        assert (g.num_vertices() == n_vertices)

        line = next(it)
        v_props = OrderedDict()
        while line.startswith('property'):
            words = line.split(' ')
            the_type = words[1]
            if the_type == 'list':
                name = words[4]
                v_props[name] = dict()
                count_type = words[2]
                entry_type = words[3]
                v_props[name]['count_type'] = count_type
                v_props[name]['entry_type'] = entry_type
            else:
                name = words[2]
                v_props[name] = dict()
            v_props[name]['type'] = the_type
            line = next(it)
        print(v_props)

        vps = dict()
        for i, v_prop in enumerate(v_props):
            name = list(v_props.keys())[i]
            the_type = v_props[name]['type']
            if the_type == 'float':
                vp = g.new_vp(the_type)
                vps[name] = vp
            else:
                raise NotImplementedError()

        print(vps)
        assert ('x' in vps.keys())
        assert ('y' in vps.keys())
        assert ('z' in vps.keys())

        # Scan to next element
        while not line.startswith('element'):
            line = next(it)

        words = line.split(' ')
        assert (words[0] == 'element')
        assert (words[1] == 'face')
        assert (words[2].isdigit())
        n_faces = int(words[2])
        print(n_faces)

        line = next(it)
        f_props = OrderedDict()
        while line.startswith('property'):
            words = line.split(' ')
            the_type = words[1]
            if the_type == 'list':
                name = words[4]
                f_props[name] = dict()
                count_type = words[2]
                entry_type = words[3]
                f_props[name]['count_type'] = count_type
                f_props[name]['entry_type'] = entry_type
            else:
                name = words[2]
                f_props[name] = dict()
            f_props[name]['type'] = the_type
            line = next(it)
        print(f_props)

        while not line.startswith('end_header'):
            line = next(it)

        for i in range(n_vertices):
            line = next(it)
            words = line.split(' ')
            words = [word for word in words if word != '']
            assert (len(words) == len(v_props.keys()))
            for j, word in enumerate(words):
                name = list(v_props.keys())[j]
                the_type = v_props[name]['type']
                if the_type == 'float':
                    vps[name][i] = float(word)
                else:
                    raise NotImplementedError

        for _ in range(n_faces):
            line = next(it)
            words = line.split(' ')
            words = [word for word in words if word != '']
            i = 0
            for name in f_props.keys():
                the_type = f_props[name]['type']
                if the_type == 'list':
                    if f_props[name]['count_type'] == 'uchar':
                        n_items = int(words[i])
                    else:
                        raise NotImplementedError
                    the_list = [
                        int(word) for word in words[i + 1:i + 1 + n_items]
                    ]
                    i += 1 + n_items

                    if name == 'vertex_indices':
                        for j, idx1 in enumerate(the_list):
                            idx2 = the_list[(j + 1) % len(the_list)]
                            g.add_edge(idx1, idx2)
            assert (i == len(words))

    gt.remove_parallel_edges(g)

    largest_connected_component = gt.label_largest_component(g)
    unreferenced = sum([1 for i in largest_connected_component.a if i == 0])
    if unreferenced > 0:
        g.set_vertex_filter(largest_connected_component)
        g.purge_vertices()
        print('Filtered {0} unreferenced vertices.'.format(unreferenced))

    if 'x' in vps.keys() and 'y' in vps.keys():
        if 'z' in vps.keys():
            Y = np.zeros((n_vertices, 3))
            for v in g.vertices():
                print(type(v))
                Y[v, 0] = vps['x'][v]
                Y[v, 1] = vps['y'][v]
                Y[v, 2] = vps['z'][v]
        else:
            Y = np.zeros((n_vertices, 2))
            for v in g.vertices():
                Y[v, 0] = vps['x'][v]
                Y[v, 1] = vps['y'][v]

    return g, Y
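
# A minimal ASCII PLY file of the kind load_ply_layout parses (float x/y/z
# vertex properties and 'uchar'-counted vertex_indices face lists); the
# contents and file name are illustrative.
#   ply
#   format ascii 1.0
#   element vertex 3
#   property float x
#   property float y
#   property float z
#   element face 1
#   property list uchar int vertex_indices
#   end_header
#   0.0 0.0 0.0
#   1.0 0.0 0.0
#   0.0 1.0 0.0
#   3 0 1 2
# g, Y = load_ply_layout('mesh.ply')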
Example #19
def load_vna_layout(in_file):
    with open(in_file) as f:
        all_lines = f.read().splitlines()

        it = iter(all_lines)

        # Ignore preamble
        line = next(it)
        while not line.startswith('*Node properties'):
            line = next(it)

        node_properties = next(it).split(' ')
        assert ('ID' in node_properties and 'x' in node_properties
                and 'y' in node_properties)

        vertices = dict()
        line = next(it)
        gt_idx = 0  # Index for gt
        while not line.startswith('*Tie data'):
            entries = line.split(' ')
            vna_id = entries[0]
            vertex = dict()
            for i, prop in enumerate(node_properties):
                vertex[prop] = entries[i]
            vertex['ID'] = gt_idx  # Replace VNA ID by numerical gt index
            vertices[
                vna_id] = vertex  # Retain VNA ID as key of the vertices dict

            gt_idx += 1
            line = next(it)

        edge_properties = next(it).split(' ')
        assert (edge_properties[0] == 'from' and edge_properties[1] == 'to')

        edges = []
        try:
            while True:
                line = next(it)
                entries = line.split(' ')
                v_i = vertices[entries[0]]['ID']
                v_j = vertices[entries[1]]['ID']
                edges.append((v_i, v_j))
        except StopIteration:
            pass

        g = gt.Graph(directed=False)
        g.add_vertex(len(vertices))
        for v_i, v_j in edges:
            g.add_edge(v_i, v_j)

        gt.remove_parallel_edges(g)

        Y = np.zeros((g.num_vertices(), 2))
        for v in vertices.keys():
            Y[vertices[v]['ID'], 0] = float(vertices[v]['x'])
            Y[vertices[v]['ID'], 1] = float(vertices[v]['y'])
        pos = g.new_vertex_property('vector<double>')
        pos.set_2d_array(Y.T)

        return g, Y
    return None
Example #20
def nested_model_multi(
    adatas: List[AnnData],
    deg_corr: bool = True,
    tolerance: float = 1e-6,
    n_sweep: int = 10,
    beta: float = np.inf,
    samples: int = 100,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    *,
    random_seed: Optional[int] = None,
    key_added: str = 'multi_nsbm',
    adjacency: Optional[List[sparse.spmatrix]] = None,
    neighbors_key: Optional[List[str]] = ['neighbors'],
    directed: bool = False,
    use_weights: bool = False,
    save_model: Union[str, None] = None,
    copy: bool = False,
    #    minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
    #    equilibrate_args: Optional[Dict] = {},
) -> Optional[List[AnnData]]:
    """\
    Cluster cells into subgroups using multiple modalities.

    Cluster cells using the nested Stochastic Block Model [Peixoto14]_,
    performing Bayesian inference on node groups. This function takes multiple
    experiments, possibly across different modalities, and performs joint
    clustering.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first. It also requires cells to have the same
    names if they come from paired experiments.

    Parameters
    ----------
    adatas
        A list of processed AnnData. Neighbors must have been already
        calculated.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real-world networks this is the case, although this doesn't seem to be
        the case for KNN graphs used in scanpy.
    tolerance
        Tolerance for fast model convergence.
    n_sweep
        Number of iterations to be performed in the fast model MCMC greedy approach
    beta
        Inverse temperature for the MCMC greedy approach
    samples
        Number of initial minimizations to be performed. The one with the
        smallest entropy is chosen
    n_jobs
        Number of parallel computations used during model initialization
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`. If all AnnData share the same key, only
        one has to be specified; otherwise the full tuple of all keys must
        be provided
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    save_model
        If provided, this will be the filename for the PartitionModeState to 
        be saved    
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell. 
    `adata.uns['schist']['multi_level_params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    `adata.uns['schist']['multi_level_stats']`
        A dict with the values returned by mcmc_sweep
    `adata.obsm['CA_multi_nsbm_level_{n}']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['schist']['multi_level_state']`
        The NestedBlockModel state object
    """

    if random_seed:
        np.random.seed(random_seed)

    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires sufficient number of samples\n'
            f'It is now set to {samples} and should be at least 100')

    start = logg.info('minimizing the nested Stochastic Block Model')

    if copy:
        adatas = [x.copy() for x in adatas]

    n_keys = len(neighbors_key)
    n_data = len(adatas)
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = []
        if n_keys > 1 and n_keys < n_data:
            raise ValueError(
                'The number of neighbors keys does not match '
                'the number of data matrices. Either fix this '
                'or pass a neighbors key that is shared across all modalities')
        if n_keys == 1:
            neighbors_key = [neighbors_key[0] for x in range(n_data)]
        for x in range(n_data):
            logg.info(f'getting adjacency for data {x}', time=start)
            if neighbors_key[x] not in adatas[x].uns:
                raise ValueError('You need to run `pp.neighbors` first '
                                 'to compute a neighborhood graph for '
                                 f'data entry {x}')
            elif 'connectivities_key' in adatas[x].uns[neighbors_key[x]]:
                # scanpy>1.4.6 has matrix in another slot
                conn_key = adatas[x].uns[
                    neighbors_key[x]]['connectivities_key']
                adjacency.append(adatas[x].obsp[conn_key])
            else:
                # scanpy<=1.4.6 has sparse matrix here
                adjacency.append(
                    adatas[x].uns[neighbors_key[x]]['connectivities'])

    # convert it to igraph and graph-tool

    graph_list = []
    for x in range(n_data):
        g = get_igraph_from_adjacency(adjacency[x], directed=directed)
        g = g.to_graph_tool()
        gt.remove_parallel_edges(g)
        # add cell names to graph, this will be used to create
        # layered graph
        g_names = g.new_vertex_property('string')
        d_names = adatas[x].obs_names
        for xn in range(len(d_names)):
            g_names[xn] = d_names[xn]
        g.vp['cell'] = g_names
        graph_list.append(g)

    # skip weights for now
    #    recs=[]
    #    rec_types=[]
    #    if use_weights:
    #        # this is not ideal to me, possibly we may need to transform
    #        # weights. More tests needed.
    #        recs=[g.ep.weight]
    #        rec_types=['real-normal']

    # get a non-redundant list of all cell names across all modalities
    all_names = set(adatas[0].obs_names)
    [all_names.update(adatas[x].obs_names) for x in range(1, n_data)]
    all_names = list(all_names)
    # create the shared graph
    union_g = gt.Graph(directed=False)
    union_g.add_vertex(len(all_names))
    u_names = union_g.new_vertex_property('string')
    for xn in range(len(all_names)):
        u_names[xn] = all_names[xn]
    union_g.vp['cell'] = u_names

    # now handle, in a not particularly elegant way, the index mapping across
    # all modalities and the unified Graph

    u_cell_index = dict([(union_g.vp['cell'][x], x)
                         for x in range(union_g.num_vertices())])
    # now create layers
    layer = union_g.new_edge_property('int')
    for ng in range(n_data):
        for e in graph_list[ng].edges():
            S, T = e.source(), e.target()
            Sn = graph_list[ng].vp['cell'][S]
            Tn = graph_list[ng].vp['cell'][T]
            Sidx = u_cell_index[Sn]
            Tidx = u_cell_index[Tn]
            ne = union_g.add_edge(Sidx, Tidx)
            layer[ne] = ng + 1  # this is the layer label

    union_g.ep['layer'] = layer
    # DONE! now proceed with standard minimization, ish

    if samples < 1:
        samples = 1

    states = [
        gt.NestedBlockState(g=union_g,
                            base_type=gt.LayeredBlockState,
                            state_args=dict(deg_corr=deg_corr,
                                            ec=union_g.ep.layer,
                                            layers=True))
        for n in range(samples)
    ]

    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        if seed:
            gt.seed_rng(seed)
        dS = 1
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta,
                                                  niter=n_sweep,
                                                  c=0.5)
        return state

    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples))
    logg.info('        minimization step done', time=start)
    pmode = gt.PartitionModeState([x.get_bs() for x in states],
                                  converge=True,
                                  nested=True)
    bs = pmode.get_max_nested()
    logg.info('        consensus step done', time=start)

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    # prune redundant levels at the top
    bs = [x for x in bs if len(np.unique(x)) > 1]
    bs.append(np.array([0],
                       dtype=np.int32))  #in case of type changes, check this
    state = gt.NestedBlockState(union_g,
                                bs=bs,
                                base_type=gt.LayeredBlockState,
                                state_args=dict(deg_corr=deg_corr,
                                                ec=union_g.ep.layer,
                                                layers=True))

    logg.info('    done', time=start)
    u_groups = np.unique(bs[0])
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1

    if collect_marginals:
        # note that the size of this will be equal to the number of the groups in Mode
        # but some entries won't sum to 1 as in the collection there may be differently
        # sized partitions
        pv_array = pmode.get_marginal(union_g).get_2d_array(
            range(last_group)).T[:, u_groups] / samples

    groups = np.zeros((union_g.num_vertices(), len(bs)), dtype=int)

    for x in range(len(bs)):
        # for each level, project labels to the vertex level
        # so that every cell has a name. Note that at this level
        # the labels are not necessarily consecutive
        groups[:, x] = state.project_partition(x, 0).get_array()

    groups = pd.DataFrame(groups).astype('category')

    # rename categories from 0 to n
    for c in groups.columns:
        ncat = len(groups[c].cat.categories)
        new_cat = [u'%s' % x for x in range(ncat)]
        groups[c].cat.rename_categories(new_cat, inplace=True)

    levels = groups.columns

    # recode block names to have consistency with group names
    i_groups = groups.astype(int)
    bs = [i_groups.iloc[:, 0].values]
    for x in range(1, groups.shape[1]):
        bs.append(
            np.where(
                pd.crosstab(i_groups.iloc[:, x - 1], i_groups.iloc[:,
                                                                   x]) > 0)[1])
    state = gt.NestedBlockState(union_g, bs)
    del (i_groups)

    groups.index = all_names

    # add column names
    groups.columns = [f"{key_added}_level_{level}" for level in range(len(bs))]

    # remove any column with the same key
    for xn in range(n_data):
        drop_columns = groups.columns.intersection(adatas[xn].obs.columns)
        adatas[xn].obs.drop(drop_columns, 'columns', inplace=True)
        adatas[xn].obs = pd.concat(
            [adatas[xn].obs, groups.loc[adatas[xn].obs_names]], axis=1)

        # now add marginal probabilities.

        if collect_marginals:
            # add marginals for level 0, the sum up according to the hierarchy
            _groups = groups.loc[adatas[xn].obs_names]
            _pv_array = pd.DataFrame(
                pv_array, index=all_names).loc[adatas[xn].obs_names].values
            adatas[xn].obsm[f"CM_{key_added}_level_0"] = _pv_array
            for group in groups.columns[1:]:
                ct = pd.crosstab(_groups[_groups.columns[0]],
                                 _groups[group],
                                 normalize='index',
                                 dropna=False)
                adatas[xn].obsm[f'CM_{group}'] = _pv_array @ ct.values

        # add some unstructured info
        if not 'schist' in adatas[xn].uns:
            adatas[xn].uns['schist'] = {}

        adatas[xn].uns['schist'][f'{key_added}'] = {}
        adatas[xn].uns['schist'][f'{key_added}']['stats'] = dict(
            level_entropy=np.array(
                [state.level_entropy(x) for x in range(len(state.levels))]),
            modularity=np.array([
                gt.modularity(union_g, state.project_partition(x, 0))
                for x in range(len((state.levels)))
            ]))

        bl_d = {}
        levels = state.get_levels()
        for nl in range(len(levels)):
            bl_d[str(nl)] = np.array(levels[nl].get_blocks().a)
        adatas[xn].uns['schist'][f'{key_added}']['blocks'] = bl_d

        # last step is recording some parameters used in this analysis
        adatas[xn].uns['schist'][f'{key_added}']['params'] = dict(
            model='multiome_nested',
            use_weights=use_weights,
            neighbors_key=neighbors_key[xn],
            key_added=key_added,
            samples=samples,
            collect_marginals=collect_marginals,
            random_seed=random_seed,
            deg_corr=deg_corr,
            #            recs=recs,
            #            rec_types=rec_types
        )

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adatas if copy else None
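
# A hypothetical call on two paired modalities (object names are made up); each
# AnnData needs a precomputed neighbors graph, and cells from the same sample
# must share names across objects. Cluster labels end up in
# obs['multi_nsbm_level_0'], obs['multi_nsbm_level_1'], ...
# nested_model_multi([adata_rna, adata_atac], samples=100, random_seed=42)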
Example #21
def state_from_blocks(
    adata: AnnData,
    state_key: Optional[str] = 'nsbm',
    neighbors_key: Optional[str] = 'neighbors',
    adjacency: Optional[spmatrix] = None,
    directed: bool = False,
    use_weights: bool = False,
    deg_corr: bool = True,
):
    """
    Returns a gt state object given an AnnData

    Parameters
    ----------
    adata
        The annotated data matrix.
    state_key
        The key under which the state has been saved
    neighbors_key
        The key passed to `sc.pp.neighbors`
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real-world networks this is the case, although this doesn't seem to be
        the case for KNN graphs used in scanpy.
        
    Returns
    -------
    
    The graph-tool block state object (flat, PP or nested, depending on the
    stored model), reconstructed from the blocks saved in `adata.uns['schist']`
        
    """
    bl_d = adata.uns['schist'][f'{state_key}']['blocks']
    params = adata.uns['schist'][f'{state_key}']['params']
    if params['model'] == 'nested' or params['model'] == 'multiome_nested':
        blocks = []
        for nl in range(len(bl_d)):
            blocks.append(bl_d[str(nl)])
    else:
        blocks = bl_d['0']
    
    if 'deg_corr' in params:
        deg_corr=params['deg_corr']

    recs=[]
    rec_types=[]

    if 'recs' in params:
        recs=params['recs']
    if 'rec_types' in params:
        rec_types=params['rec_types']
            
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']

    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    if use_weights and not recs:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed. The graph `g` only exists from this
        # point on, so its edge weights can only be collected here.
        recs=[g.ep.weight]
        rec_types=['real-normal']

    if params['model'] == 'flat':
        state = gt.BlockState(g, b=blocks, 
            state_args=dict(deg_corr=deg_corr,
            recs=recs,
            rec_types=rec_types)
            )
    elif params['model'] == 'ppbm':
        state = gt.PPBlockState(g, b=blocks, 
            state_args=dict(deg_corr=deg_corr,
            recs=recs,
            rec_types=rec_types)
            )
    else:
        state = gt.NestedBlockState(g, bs=blocks, 
            state_args=dict(deg_corr=deg_corr,
            recs=recs,
            rec_types=rec_types)
            )
    return state            
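
# A hypothetical usage sketch, assuming a clustering was previously stored
# under adata.uns['schist']['nsbm'] by one of the model functions:
# state = state_from_blocks(adata, state_key='nsbm')
# state.entropy()  # description length of the reconstructed partition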
    
Example #22
def leiden(
    adata: AnnData,
    resolution: float = 1,
    samples: int = 100,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_state: _utils.AnyRandom = 0,
    key_added: str = 'leiden',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = True,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    neighbors_key: Optional[str] = None,
    obsp: Optional[str] = None,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    copy: bool = False,
    save_model: Union[str, None] = None,
    dispatch_backend: Optional[str] = 'processes',
    **partition_kwargs,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Traag18]_.

    Cluster cells using the Leiden algorithm [Traag18]_,
    an improved version of the Louvain algorithm [Blondel08]_.
    It has been proposed for single-cell analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.


    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        A parameter value controlling the coarseness of the clustering.
        Higher values lead to more clusters.
        Set to `None` if overriding `partition_type`
        to one that doesn’t accept a `resolution_parameter`.
    samples
        The number of random samples to take for consensus
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain `(obs_key, list_of_categories)`.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    n_iterations
        How many iterations of the Leiden clustering algorithm to perform.
        Positive values above 2 define the total number of iterations to perform,
        -1 has the algorithm run until it reaches its optimal clustering.
    partition_type
        Type of partition to use.
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`.
        For the available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, leiden looks at .obsp['connectivities'] for connectivities
        (default storage place for pp.neighbors).
        If specified, leiden looks at
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    obsp
        Use .obsp[obsp] as adjacency. You can't specify both
        `obsp` and `neighbors_key` at the same time.
    collect_marginals
        Whether to retrieve the marginal probability of belonging to a group
    n_jobs
        Number of parallel jobs to calculate partitions
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to 
        be saved    
    **partition_kwargs
        Any further arguments to pass to `~leidenalg.find_partition`
        (which in turn passes arguments to the `partition_type`).


    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['leiden']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    """
    try:
        import leidenalg
    except ImportError:
        raise ImportError(
            'Please install the leiden algorithm: `conda install -c conda-forge leidenalg` or `pip3 install leidenalg`.'
        )
    partition_kwargs = dict(partition_kwargs)

    start = logg.info('running Leiden clustering')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = _choose_graph(adata, obsp, neighbors_key)
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g_gt = g.to_graph_tool()
    gt.remove_parallel_edges(g_gt)
    # flip to the default partition type if not overridden by the user
    if partition_type is None:
        partition_type = leidenalg.RBConfigurationVertexPartition
    # Prepare find_partition arguments as a dictionary,
    # appending to whatever the user provided. It needs to be this way
    # as this allows for the accounting of a None resolution
    # (in the case of a partition variant that doesn't take it on input)
    if use_weights:
        partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
    partition_kwargs['n_iterations'] = n_iterations
    np.random.seed(random_state)
    seeds = np.random.choice(range(0, samples**2), size=samples, replace=False)
    

    if resolution is not None:
        partition_kwargs['resolution_parameter'] = resolution
    # clustering proper
    def membership(g, partition_type, seed, **partition_kwargs):
        return leidenalg.find_partition(g, partition_type, 
                                        seed=seed, **partition_kwargs).membership
    
    parts = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
                    delayed(membership)(g, partition_type, 
                                        seeds[i], **partition_kwargs) 
                                        for i in range(samples))

    pmode = gt.PartitionModeState(parts, converge=True) 

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')    
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    groups = np.array(pmode.get_max(g_gt).get_array())     
    u_groups = np.unique(groups)
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1
    if collect_marginals:
        pv_array = pmode.get_marginal(g_gt).get_2d_array(range(last_group)).T[:, u_groups] / samples
    # rename groups to ensure they are a continuous range
    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    # store output into adata.obs
        
    if restrict_to is not None:
        if key_added == 'leiden':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )
    if collect_marginals:
        adata.obsm[f"CM_{key_added}"] = pv_array
    # store information on the clustering parameters
    adata.uns['leiden'] = {}
    adata.uns['leiden']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
        samples=samples,
        collect_marginals=collect_marginals
    )
    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
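
# A hypothetical call mirroring the scanpy interface; adata is assumed to have
# a neighbors graph already computed.
# leiden(adata, resolution=1.0, samples=100, key_added='leiden')
# adata.obs['leiden']      -> consensus cluster labels
# adata.obsm['CM_leiden']  -> per-cell marginals (when collect_marginals=True)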
Example #23
def planted_model(
    adata: AnnData,
    n_sweep: int = 10,
    beta: float = np.inf,
    tolerance=1e-6,
    collect_marginals: bool = True,
    deg_corr: bool = True,
    samples: int = 100,
    n_jobs: int = -1,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'ppbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    save_model: Union[str, None] = None,
    #    minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the Planted Partition Block Model [Peixoto14]_, performing
    Bayesian inference on node groups. This function, in particular, uses
    the Planted Partition Block Model, which is particularly suitable for
    assortative graphs, and returns the optimal number of communities.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    n_sweep
        Number of MCMC sweeps to get the initial guess
    beta
        Inverse temperature for the initial MCMC sweep        
    tolerance
        Difference in description length to stop MCMC sweep iterations        
    collect_marginals
        Whether or not collect node probability of belonging
        to a specific partition.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real-world networks this is the case, although this doesn't seem to be
        the case for KNN graphs used in scanpy.
    samples
        Number of initial minimizations to be performed. This also influences the
        precision of the marginals
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to 
        be saved    
    random_seed
        Random number to be used as seed for graph-tool
    n_jobs
        Number of parallel computations used during model initialization

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of cells) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['schist'][key_added]['params']`
        A dict with the parameters used for this model (e.g. `samples`,
        `use_weights`, `collect_marginals`).
    `adata.uns['schist'][key_added]['stats']`
        A dict with the entropy and modularity of the final state.
    `adata.obsm['CM_ppbm']`
        A `np.ndarray` with the cell probability of belonging to a specific group.
    `adata.uns['schist'][key_added]['blocks']`
        The block assignment of each cell in the final state.
    """

    if random_seed:
        np.random.seed(random_seed)

    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires a sufficient number of samples\n'
            f'it is currently set to {samples} and should be at least 100')

    start = logg.info('minimizing the Planted Partition Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph and graph-tool
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal; we may need to transform the
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if samples < 1:
        samples = 1

    # initialize the block states
    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        if seed:
            gt.seed_rng(seed)
        dS = 1
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta, niter=n_sweep)
        return state

    states = [gt.PPBlockState(g) for x in range(samples)]

    # perform an MCMC sweep on each state
    # no list comprehension as I need to collect stats

    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples))
    logg.info('        minimization step done', time=start)
    pmode = gt.PartitionModeState([x.get_blocks().a for x in states],
                                  converge=True)

    bs = pmode.get_max(g)
    logg.info('        consensus step done', time=start)

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    state = gt.PPBlockState(g, b=bs)
    logg.info('    done', time=start)

    groups = np.array(bs.get_array())
    u_groups = np.unique(groups)
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1
    if collect_marginals:
        pv_array = pmode.get_marginal(g).get_2d_array(
            range(last_group)).T[:, u_groups] / samples

    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    if restrict_to is not None:
        if key_added == 'ppbm':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )

    # add column names
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )

    # now add marginal probabilities.

    if collect_marginals:
        # cell marginals will be a list of arrays with probabilities
        # of belonging to a specific group
        adata.obsm[f"CM_{key_added}"] = pv_array

    # add some unstructured info
    if 'schist' not in adata.uns:
        adata.uns['schist'] = {}

    adata.uns['schist'][f'{key_added}'] = {}
    adata.uns['schist'][f'{key_added}']['stats'] = dict(
        entropy=state.entropy(),
        modularity=gt.modularity(g, state.get_blocks()))

    # record state as list of blocks
    # for compatibility with nested model, use a dictionary with a single key here
    # although a np.array would be ok
    adata.uns['schist'][f'{key_added}']['blocks'] = {
        '0': np.array(state.get_blocks().a)
    }

    # last step is recording some parameters used in this analysis
    adata.uns['schist'][f'{key_added}']['params'] = dict(
        model='planted',
        use_weights=use_weights,
        neighbors_key=neighbors_key,
        key_added=key_added,
        samples=samples,
        collect_marginals=collect_marginals,
        random_seed=random_seed,
        deg_corr=deg_corr,
        recs=recs,
        rec_types=rec_types)

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {state.get_B()} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
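A minimal usage sketch for the function above, assuming its schist/scanpy dependencies are importable; the example dataset and parameter values are illustrative only.

# Usage sketch (assumption: the helpers imported by the snippet above are in
# scope, and scanpy is installed).
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()          # small example AnnData shipped with scanpy
sc.pp.neighbors(adata, n_neighbors=15)         # KNN graph required by planted_model

planted_model(adata, samples=100, key_added='ppbm')

print(adata.obs['ppbm'].value_counts())        # cluster label per cell
print(adata.obsm['CM_ppbm'].shape)             # cell-by-group marginal probabilities
print(adata.uns['schist']['ppbm']['params'])   # parameters recorded for this run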
Example #24
0
import graph_tool.all as gt
import matplotlib.pyplot as plt
import numpy as np
import glob
import scipy.stats as sp
import scipy.optimize as optimize
import NodeProbs as NoP

filenames = sorted(glob.glob("./Realnetworks/tags*_2015.gml"))
Graphs = [gt.load_graph(File) for File in filenames]
filenames = sorted(glob.glob("./Simulatednetworks/SimulatedGraphWeek*.graphml"))
FakeGraphs = [gt.load_graph(File) for File in filenames]
#"""
#Degree-Distributions
for Graph in FakeGraphs:
	gt.remove_parallel_edges(Graph)
bins = np.geomspace(0.01,1, num = 30)
print(bins)
FakeDegrees = []
for i in range(len(FakeGraphs)):
	Dummy = FakeGraphs[i].get_out_degrees(FakeGraphs[i].get_vertices()[25+i:]).astype("float")
	#Dummy /= Dummy.max()
	FakeDegrees.extend(Dummy)
	# slicing to deal with the dead nodes
	#FakeDegrees[i]/=FakeDegrees[i].max()
RealDegrees = []
for i in range(len(Graphs)):
	dummy = Graphs[i].get_out_degrees(Graphs[i].get_vertices()).astype("float")
	#dummy /= dummy.max()
	RealDegrees.extend(dummy)
	#RealDegrees[i]/=RealDegrees[i].max()
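A minimal sketch of how the collected degree lists might be compared on the log-spaced bins defined above; normalizing each list by its maximum mirrors the commented-out lines and is an assumption about the intended plot.

# Sketch (assumption): normalize degrees to their maximum so they fall in (0, 1]
# and can be histogrammed on the geometric bins defined above.
FakeDegrees = np.asarray(FakeDegrees, dtype=float)
RealDegrees = np.asarray(RealDegrees, dtype=float)
FakeDegrees /= FakeDegrees.max()
RealDegrees /= RealDegrees.max()

plt.hist(FakeDegrees, bins=bins, density=True, histtype='step', label='simulated')
plt.hist(RealDegrees, bins=bins, density=True, histtype='step', label='real')
plt.xscale('log')
plt.xlabel('normalized out-degree')
plt.ylabel('density')
plt.legend()
plt.show()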
Example #25
0
def useGraphTool(pd):
    # Extract the graphml representation of the planner data
    graphml = pd.printGraphML()
    f = open("graph.graphml", 'w')
    f.write(graphml)
    f.close()

    # Load the graphml data using graph-tool
    graph = gt.load_graph("graph.graphml", fmt="xml")
    edgeweights = graph.edge_properties["weight"]

    # Write some interesting statistics
    avgdeg, stddevdeg = gt.vertex_average(graph, "total")
    avgwt, stddevwt = gt.edge_average(graph, edgeweights)

    print("---- PLANNER DATA STATISTICS ----")
    print(
        str(graph.num_vertices()) + " vertices and " + str(graph.num_edges()) +
        " edges")
    print("Average vertex degree (in+out) = " + str(avgdeg) + "  St. Dev = " +
          str(stddevdeg))
    print("Average edge weight = " + str(avgwt) + "  St. Dev = " +
          str(stddevwt))

    _, hist = gt.label_components(graph)
    print("Strongly connected components: " + str(len(hist)))

    # Make the graph undirected (for weak components, and a simpler drawing)
    graph.set_directed(False)
    _, hist = gt.label_components(graph)
    print("Weakly connected components: " + str(len(hist)))

    # Plotting the graph
    gt.remove_parallel_edges(graph)  # Removing any superfluous edges

    edgeweights = graph.edge_properties["weight"]
    colorprops = graph.new_vertex_property("string")
    vertexsize = graph.new_vertex_property("double")

    start = -1
    goal = -1

    for v in range(graph.num_vertices()):
        # Color and size vertices by type: start, goal, other
        if pd.isStartVertex(v):
            start = v
            colorprops[graph.vertex(v)] = "cyan"
            vertexsize[graph.vertex(v)] = 10
        elif pd.isGoalVertex(v):
            goal = v
            colorprops[graph.vertex(v)] = "green"
            vertexsize[graph.vertex(v)] = 10
        else:
            colorprops[graph.vertex(v)] = "yellow"
            vertexsize[graph.vertex(v)] = 5

    # default edge color is black with size 0.5:
    edgecolor = graph.new_edge_property("string")
    edgesize = graph.new_edge_property("double")
    for e in graph.edges():
        edgecolor[e] = "black"
        edgesize[e] = 0.5

    # using A* to find shortest path in planner data
    if start != -1 and goal != -1:
        _, pred = gt.astar_search(graph, graph.vertex(start), edgeweights)

        # Color edges along shortest path red with size 2.0
        v = graph.vertex(goal)
        while v != graph.vertex(start):
            p = graph.vertex(pred[v])
            for e in p.out_edges():
                if e.target() == v:
                    edgecolor[e] = "red"
                    edgesize[e] = 2.0
            v = p

    pos = graph.new_vertex_property("vector<double>")
    for v in range(graph.num_vertices()):
        vtx = pd.getVertex(v)
        st = vtx.getState()
        pos[graph.vertex(v)] = [st[0], st[1]]

    # Writing graph to file:
    # pos indicates the desired vertex positions, and pin=True says that we
    # really REALLY want the vertices at those positions
    # gt.graph_draw(graph, pos=pos, vertex_size=vertexsize, vertex_fill_color=colorprops,
    #               edge_pen_width=edgesize, edge_color=edgecolor,
    #               output="graph.pdf")
    gt.graph_draw(graph, pos=pos, output="graph.pdf")
    print('\nGraph written to graph.pdf')

    graph.vertex_properties["pos"] = pos
    graph.vertex_properties["vsize"] = vertexsize
    graph.vertex_properties["vcolor"] = colorprops
    graph.edge_properties["esize"] = edgesize
    graph.edge_properties["ecolor"] = edgecolor

    graph.save("mgraph.graphml")
    print('\nGraph saved to mgraph.graphml')
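A short follow-up sketch showing how the graph saved above could be reloaded and redrawn from the stored vertex and edge properties; the file name and property keys are taken from the snippet above.

# Reload the annotated graph written by useGraphTool() and redraw it
# using the properties saved alongside it.
g2 = gt.load_graph("mgraph.graphml")
gt.graph_draw(
    g2,
    pos=g2.vertex_properties["pos"],
    vertex_size=g2.vertex_properties["vsize"],
    vertex_fill_color=g2.vertex_properties["vcolor"],
    edge_pen_width=g2.edge_properties["esize"],
    edge_color=g2.edge_properties["ecolor"],
    output="mgraph.pdf",
)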
Example #26
0
def calculate_affinity(adata: AnnData,
                       level: int = 1,
                       block_key: Optional[str] = 'nsbm',
                       group_by: Optional[str] = None,
                       state: Optional = None,
                       neighbors_key: Optional[str] = 'neighbors',
                       adjacency: Optional[sparse.spmatrix] = None,
                       directed: bool = False,
                       use_weights: bool = False,
                       obsp: Optional[str] = None,
                       back_prob: bool = False,
                       copy: bool = False) -> Optional[AnnData]:
    """\
    Calculate cell affinity given a partition scheme. It can be used for 
    partitions calculated using schist or for any partition scheme, given
    for example by cell annotations.
    
    Parameters
    ----------
    adata:
        The AnnData object. It should already have been processed with schist.
    level:
        The level at which to calculate affinity. This parameter is effective
        only for nested partitions.
    block_key:
        The prefix for partitions. This parameter is ignored if the state
        is not a gt.NestedBlockState.
    group_by:
        The key for group names used for calculations. Setting this will override
        level and block_key. This is effective only for NestedBlockState partitions.
        Optionally calculate affinities on this state.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, the function looks at `.obsp['connectivities']`
        for connectivities (default storage place for `pp.neighbors`).
        If specified, it looks at
        `.obsp[.uns[neighbors_key]['connectivities_key']]` for connectivities.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    copy:
        Return a new object or do everything in place.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with affinity values
    in `adata.obsm[f'CA_{block_key}_level_{level}']`.
    """

    matrix_key = f'CA_{block_key}_level_{level}'  # the default name of the matrix
    if group_by:
        logg.info(f'Calculating cell affinity to {group_by}')
    else:
        logg.info(f'Calculating cell affinity to level {level}')

    if not state:
        # if no state is provided, use the default to retrieve graph
        if 'schist' in adata.uns and 'blocks' in adata.uns['schist'][
                f'{block_key}']:
            params = adata.uns['schist'][f'{block_key}']['params']
            if 'neighbors_key' in params:
                neighbors_key = params['neighbors_key']
            if 'use_weights' in params:
                use_weights = params['use_weights']
            if 'deg_corr' in params:
                deg_corr = params['deg_corr']
            state = state_from_blocks(adata,
                                      state_key=block_key,
                                      neighbors_key=neighbors_key,
                                      adjacency=adjacency,
                                      directed=directed,
                                      use_weights=use_weights,
                                      deg_corr=deg_corr)
            g = state.g
        elif not neighbors_key:
            # no state and no adjacency provided, raise an error
            raise ValueError("A state or an adjacency matrix should be given"
                             "Otherwise a graph cannot be computed")
        else:
            # get the graph from the adjacency
            adjacency = _choose_graph(adata, obsp, neighbors_key)
            g = get_igraph_from_adjacency(adjacency, directed=directed)
            g = g.to_graph_tool()
            gt.remove_parallel_edges(g)
            state = gt.BlockState(g)
    else:
        g = state.g

    if group_by:
        matrix_key = f'CA_{group_by}'
        # if groups are given, we generate a new BlockState and work on that
        if group_by in adata.obs.columns and adata.obs[
                group_by].dtype.name == 'category':
            partitions = adata.obs[group_by].cat.codes.values
            state = gt.BlockState(g, b=partitions)
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
        else:
            raise ValueError(
                f"{group_by} should be a categorical entry in adata.obs")
    else:
        # use precomputed blocks and states
        if type(state) == gt.NestedBlockState:
            if back_prob:
                p0 = get_cell_back_p(state, level=0)
            else:
                p0 = get_cell_loglikelihood(state, level=0, as_prob=True)
            group_col = None
            if group_by and group_by in adata.obs.columns:
                group_col = group_by
            else:
                g_name = f'{block_key}_level_{level}'
                if g_name in adata.obs.columns:
                    group_col = g_name
            if not group_col:
                raise ValueError(
                    "The provided groups or level/blocks do not exist")

            g0 = pd.Categorical(state.project_partition(0, 0).a)
            cross_tab = pd.crosstab(g0,
                                    adata.obs[group_col],
                                    normalize='index')
            ca_matrix = (p0 @ cross_tab).values

        elif type(state) == gt.PPBlockState:
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
            matrix_key = 'CA_ppbm'

    adata.obsm[matrix_key] = ca_matrix

    return adata if copy else None
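A minimal usage sketch for calculate_affinity, assuming `adata` already carries a schist nested-model clustering (so `adata.uns['schist']['nsbm']` exists); the `cell_type` annotation used in the second call is a hypothetical example.

# Hypothetical usage on an AnnData object already clustered with the nested model.
calculate_affinity(adata, level=1, block_key='nsbm')
affinity = adata.obsm['CA_nsbm_level_1']         # cells x groups affinity matrix
print(affinity.shape, affinity.sum(axis=1)[:5])

# Affinities can also be computed for an arbitrary categorical annotation:
calculate_affinity(adata, group_by='cell_type')  # assumes adata.obs['cell_type'] exists
print(adata.obsm['CA_cell_type'].shape)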