Example No. 1
def top():
    g = gt.load_graph(filename)
    print('Graph loaded, now calculating top nodes')
    vblocks = g.vp['blocks']
    # `filename` and `blocks` are module-level globals in this project
    largest_block = max(range(blocks), key=lambda b: len(gt.find_vertex(g, vblocks, b)))
    print('Largest block is %d with %d nodes'
          % (largest_block, len(gt.find_vertex(g, vblocks, largest_block))))

    for tup in top_ids(g, largest_block):
        print(tup)
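A quick aside on the call pattern above (the demo graph below is invented for illustration, not part of this project): gt.find_vertex(g, prop, value) returns a list of every vertex whose property equals value, which is why len(...) counts block members. For an integer property, one numpy pass over prop.a gives all the counts at once:

import numpy as np
import graph_tool.all as gt

demo = gt.Graph()
demo.add_vertex(5)
demo.vp['blocks'] = demo.new_vertex_property("int", vals=[0, 1, 1, 2, 1])

# find_vertex returns the list of matching vertices
assert len(gt.find_vertex(demo, demo.vp['blocks'], 1)) == 3

# a single pass over the property array yields every block size
counts = np.bincount(demo.vp['blocks'].a)  # array([1, 3, 1])
largest_block = int(counts.argmax())       # 1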
Example No. 2
def main():
    input_fasta = sys.argv[3]
    K = int(sys.argv[1])
    x = float(sys.argv[2])

    ht = khmer.Nodegraph(K, x, 4)

    sparse_graph = gt.Graph()
    hashes = sparse_graph.new_vertex_property("long long")

    for n, record in enumerate(screed.open(input_fasta)):
        if n % 1000 == 0:
            print('...loaded and tagged {} sequences'.format(n),
                  file=sys.stderr)
        name = record.name
        sequence = record.sequence

        ht.consume_sequence_and_tag_with_labels(sequence, n)
        tags = ht.sweep_tag_neighborhood(sequence, 0)
        for i in range(len(tags) - 1):
            src = tags[i]
            dst = tags[i + 1]

            new = False

            srcv = gt.find_vertex(sparse_graph, hashes, src)
            if not srcv:
                srcv = sparse_graph.add_vertex()
                hashes[srcv] = src
                new = True
            else:
                srcv = srcv[0]

            dstv = gt.find_vertex(sparse_graph, hashes, dst)
            if not dstv:
                dstv = sparse_graph.add_vertex()
                hashes[dstv] = dst
                new = True
            else:
                dstv = dstv[0]

            if new:  # add an edge only when at least one endpoint was just created
                e = sparse_graph.add_edge(srcv, dstv)

    print('Sparse graph has {} nodes, {} edges'.format(
        sparse_graph.num_vertices(), sparse_graph.num_edges()))
    comp = gt.label_largest_component(sparse_graph, directed=False)
    #pos = gt.radial_tree_layout(sparse_graph, sparse_graph.vertex(0))
    gt.graph_draw(sparse_graph,
                  output_size=(5000, 5000),
                  output=input_fasta + '_sparse.png')
    sparse_graph.set_vertex_filter(comp)
    gt.graph_draw(sparse_graph,
                  output_size=(5000, 5000),
                  output=input_fasta + '_sparse_comp.png')
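A performance note on the loop above (the helper below is a hedged sketch with invented names): each find_vertex call scans every vertex, so graph construction is roughly O(tags × V). A plain dict from hash value to vertex makes each lookup O(1) while the property map stays as the persistent record:

def get_or_add(graph, hashes, index, value):
    """Return (vertex, created) for value, using index as a hash -> vertex dict."""
    v = index.get(value)
    if v is not None:
        return v, False
    v = graph.add_vertex()
    hashes[v] = value
    index[value] = v
    return v, True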
Example No. 3
def closest_vert(self, sigma, zed):
    '''Return the vertex closest to the position `sigma`, `zed`.'''
    dist = np.hypot(self.sigmas.fa - sigma, self.zeds.fa - zed)
    idx = np.argmin(dist)
    sigma, zed = self.sigmas.fa[idx], self.zeds.fa[idx]
    s_matches = gt.find_vertex(self.graph, self.sigmas, sigma)
    z_matches = gt.find_vertex(self.graph, self.zeds, zed)
    log.debug('Number of closest vertices found: %i, %i' %
              (len(s_matches), len(z_matches)))
    return [v for v in s_matches if v in z_matches][0]
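The float matches here only succeed because sigma and zed are read back verbatim from the same property arrays. A hedged shortcut (assuming the .fa arrays are unfiltered, so positions align with vertex indices) recovers the vertex straight from argmin:

import numpy as np

def closest_vert_fast(graph, sigmas, zeds, sigma, zed):
    """Sketch: nearest vertex to (sigma, zed) without the two rescans."""
    dist = np.hypot(sigmas.fa - sigma, zeds.fa - zed)
    return graph.vertex(int(np.argmin(dist)))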
Example No. 4
def main():
    input_fasta = sys.argv[3]
    K = int(sys.argv[1])
    x = float(sys.argv[2])
    
    ht = khmer.new_hashbits(K, x, 4)

    sparse_graph = gt.Graph()
    hashes = sparse_graph.new_vertex_property("long long")


    for n, record in enumerate(screed.open(input_fasta)):
        if n % 1000 == 0:
            print('...loaded and tagged {} sequences'.format(n), file=sys.stderr)
        name = record.name
        sequence = record.sequence

        ht.consume_sequence_and_tag_with_labels(sequence, n)
        tags = ht.sweep_tag_neighborhood(sequence, 0)
        for i in range(len(tags) - 1):
            src = tags[i]
            dst = tags[i + 1]

            new = False

            srcv = gt.find_vertex(sparse_graph, hashes, src)
            if not srcv:
                srcv = sparse_graph.add_vertex()
                hashes[srcv] = src
                new = True
            else:
                srcv = srcv[0]

            dstv = gt.find_vertex(sparse_graph, hashes, dst)
            if not dstv:
                dstv = sparse_graph.add_vertex()
                hashes[dstv] = dst
                new = True
            else:
                dstv = dstv[0]

            if new:
                e = sparse_graph.add_edge(srcv, dstv)

    print('Sparse graph has {} nodes, {} edges'.format(
        sparse_graph.num_vertices(), sparse_graph.num_edges()))
    comp = gt.label_largest_component(sparse_graph, directed=False)
    #pos = gt.radial_tree_layout(sparse_graph, sparse_graph.vertex(0))
    gt.graph_draw(sparse_graph,
                  output_size=(5000, 5000),
                  output=input_fasta + '_sparse.png')
    sparse_graph.set_vertex_filter(comp)
    gt.graph_draw(sparse_graph,
                  output_size=(5000, 5000),
                  output=input_fasta + '_sparse_comp.png')
Example No. 5
def top():
    g = gt.load_graph(filename)
    print('Graph loaded, now calculating top nodes')
    vblocks = g.vp['blocks']
    # sort block labels by size, largest first; `nblocks`, `n`, `filename` are globals
    blocks = sorted(range(nblocks), key=lambda b: len(gt.find_vertex(g, vblocks, b)), reverse=True)

    for block in blocks:
        print('Block %d with %d nodes' % (block, len(gt.find_vertex(g, vblocks, block))))
        tups = top_ids(g, block, n)
        ids = [t[0] for t in tups]
        names = get_names(ids)

        for tup, name in zip(tups, names):
            print(name, tup[0], tup[1])
Example No. 6
def closest_vert(self, sigma, zed):
    '''Return the vertex closest to the position `sigma`, `zed`.'''
    dist = np.hypot(self.sigmas.fa - sigma,
                    self.zeds.fa - zed)
    idx = np.argmin(dist)
    sigma, zed = self.sigmas.fa[idx], self.zeds.fa[idx]
    s_matches = gt.find_vertex(self.graph,
                               self.sigmas, sigma)
    z_matches = gt.find_vertex(self.graph,
                               self.zeds, zed)
    log.debug('Number of closest vertices found: %i, %i'
              % (len(s_matches), len(z_matches)))
    return [v for v in s_matches if v in z_matches][0]
Example No. 7
def continue_graph(g, pos, weight, clase, img, actual, padre, enodes, bnodes):
    # padre is the parent node's index in the graph_tool graph
    # actual is the node's position in the image
    global c
    
    hijos = vecinos(img, actual)
    
    for i in hijos:
        img, nodo, largo_arista = get_next_node(img, i, actual, hijos, 0)
               
        s = gt.find_vertex(g, pos, nodo)
        if not s:
            s = g.add_vertex()
            pos[s] = nodo
        else:
            s = s[0]
            
        arista = g.add_edge(padre, s)

        weight[arista] = largo_arista
        clase[arista] = [c, 0]
        
        c = c+1
        
        if img[nodo[1],nodo[0]] == 1:
            img[nodo[1], nodo[0]] = c
            
        if nodo not in enodes.tolist():
            g, pos, weight = continue_graph(g, pos, weight, clase, img, nodo, s, enodes, bnodes)           

    return g, pos, weight
Example No. 8
    def _map_vertexs(self, graph, seeds):
        components = [[
            gt.find_vertex(self._network, self._network.vp['hash'],
                           graph.vp['hash'][v])[0] for v in seed
        ] for seed in seeds]

        return components
Example No. 9
def community():
    g = gt.load_graph(filename)
    print('Graph loaded, now finding community')
    # state = gt.BlockState(g, B=blocks)
    # for i in range(iterations):
    #     if i < iterations / 2:
    #         gt.mcmc_sweep(state)
    #     else:
    #         gt.mcmc_sweep(state, beta=float('inf'))

    # g.vp['blocks'] = state.get_blocks()

    spins = {}
    if 'blocks' in g.vp:
        spins = {'spins': g.vp['blocks']}

    g.vp['blocks'] = gt.community_structure(g, n_iter=iterations, n_spins=blocks, **spins)

    if 'pos' in g.vp:
        gt.sfdp_layout(g, groups=g.vp['blocks'], pos=g.vp['pos'])

    for i in range(blocks):
        print('%d nodes in block %d' % (len(gt.find_vertex(g, g.vp['blocks'], i)), i))

    g.save(filename)
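gt.community_structure is an old spin-glass API that later graph-tool releases dropped; the commented-out BlockState code above hints at the replacement. A hedged sketch of the current route:

import graph_tool.all as gt

def community_sbm(g):
    """Sketch: stochastic block model fit with the current inference API."""
    state = gt.minimize_blockmodel_dl(g)
    g.vp['blocks'] = state.get_blocks()
    return g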
Example No. 10
def generate_graph(arcos_fuertes, arcos_debiles):
    '''
    Strong arcs are assigned a weight 100 times greater than weak arcs
    '''
    g = gt.Graph(directed=True)
    # etiqueta_nodo=g.new_vertex_property('string')
    g.vp['etiqueta_nodo'] = g.new_vertex_property('string')

    # zip() returns an iterator in Python 3, so materialise before indexing
    fuertes = list(zip(*arcos_fuertes))
    debiles = list(zip(*arcos_debiles))
    num_vertices = len(
        set(fuertes[0]) | set(fuertes[1]) | set(debiles[0]) | set(debiles[1]))

    for i in range(num_vertices):
        u = g.add_vertex()
        g.vp['etiqueta_nodo'][u] = str(i)

    # etiqueta_arco=g.new_edge_property('string')
    g.ep["etiqueta_arco"] = g.new_edge_property('string')

    # peso_arco=g.new_edge_property('float')
    g.ep["peso_arco"] = g.new_edge_property('float')

    for j in arcos_fuertes:
        u = gt.find_vertex(g, g.vp['etiqueta_nodo'], str(j[0]))
        v = gt.find_vertex(g, g.vp['etiqueta_nodo'], str(j[1]))
        e = g.add_edge(u[0], v[0])

        a = 100 * np.random.rand()
        if (a < 10):
            a = 10 + (10 * np.random.rand())
        g.ep['etiqueta_arco'][e] = str(round(a, 1))
        g.ep['peso_arco'][e] = round(a, 1)

    for j in arcos_debiles:
        u = gt.find_vertex(g, g.vp['etiqueta_nodo'], str(j[0]))
        v = gt.find_vertex(g, g.vp['etiqueta_nodo'], str(j[1]))
        e = g.add_edge(u[0], v[0])

        a = np.random.rand()
        g.ep['etiqueta_arco'][e] = str(round(a, 1))
        g.ep['peso_arco'][e] = round(a, 1)

    # gt.graph_draw(g,vertex_text=etiqueta_nodo,edge_text=etiqueta_arco,vertex_size=8)
    return g
Example No. 11
def top_ids(g, block, n=10):
    '''List the most central node ids in a given block'''
    vid = g.vp['id']
    vrank = g.vp['rank']
    vertices = gt.find_vertex(g, g.vp['blocks'], block)
    sorted_vertices = sorted(vertices, key=lambda v: vrank[v], reverse=True)
    mapped_vertices = map(lambda v: (vid[v], vrank[v]), sorted_vertices)  # lazy, like py2 imap

    return take(n, mapped_vertices)
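take is a project or functional-library helper (toolz and funcy both ship one); a minimal stand-in, if you want the snippet self-contained:

from itertools import islice

def take(n, iterable):
    """First n items of the iterable, as a list."""
    return list(islice(iterable, n))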
Example No. 12
    def find_k_shortest_paths(self, k, verbose=True):
        """
        Finds k shortest paths to current target using Yen's Algorithm.

        Args:
            k (int): desired number of shortest pathways (ranked by cost)
            verbose (bool): whether to print all identified pathways to the console.

        Returns:
            [RxnPathway]: list of RxnPathway objects containing reactions traversed on
                each path.
        """
        g = self._g
        paths = []

        precursors_v = gt.find_vertex(g, g.vp["type"], 0)[0]
        target_v = gt.find_vertex(g, g.vp["type"], 3)[0]

        for num, path in enumerate(self._yens_ksp(g, k, precursors_v,
                                                  target_v)):
            rxns = []
            weights = []

            for step, v in enumerate(path):
                g.vp["path"][v] = True

                if g.vp["type"][v] == 2:
                    # add rxn step if current node in path is a product
                    e = g.edge(path[step - 1], v)
                    g.ep["path"][e] = True  # mark this edge as occurring on a path
                    rxns.append(g.ep["rxn"][e])
                    weights.append(g.ep["weight"][e])

            rxn_pathway = RxnPathway(rxns, weights)
            paths.append(rxn_pathway)

        if verbose:
            for path in paths:
                print(path, "\n")

        return paths
Example No. 13
    def set_target(self, target):
        """
        Replaces network's current target phase with new target phase.

        Args:
            target (ComputedEntry): ComputedEntry-like object for new target phase.

        Returns:
            None
        """
        g = self._g

        if target in self._current_target:
            return
        else:
            self._current_target = {target}

        g.remove_vertex(gt.find_vertex(g, g.vp["type"], 3))
        new_target_entry = RxnEntries(self._current_target, "t")
        new_target_v = g.add_vertex()
        self._update_vertex_properties(
            g,
            new_target_v,
            {
                "entries": new_target_entry,
                "type": 3,
                "bool": True,
                "path": True,
                "chemsys": new_target_entry.chemsys,
            },
        )

        new_edges = []

        for v in gt.find_vertex(g, g.vp["type"], 2):  # search for all products
            if self._current_target.issubset(g.vp["entries"][v].entries):
                new_edges.append([v, new_target_v, 0, None, True,
                                  False])  # link all products to new target

        g.add_edge_list(
            new_edges,
            eprops=[g.ep["weight"], g.ep["rxn"], g.ep["bool"], g.ep["path"]])
Example No. 14
    def get_vertex_and_create_if_not_exist(group_name: str, username: str):
        # `graph` and `GROUPS` are captured from the enclosing scope
        v = gt.find_vertex(graph, graph.vp.username, username)

        if not v:
            v = graph.add_vertex()
            graph.vp.group[v] = GROUPS.index(group_name)
            graph.vp.group_name[v] = group_name
            graph.vp.username[v] = username
            return v
        else:
            return v[0]
Example No. 15
def v_id(name, g, v_name):
    """Return integer index of vertex with given `name`

    Args:
        name - name of vertex (str)
        g - graph tool graph
        v_name - vertex property map giving names for each vertex,
                 as returned when calling g.add_edge_list with hashed=True
    """
    # obtain vertex object
    v = gt.find_vertex(g, v_name, name)[0]
    # get int index
    return g.vertex_index[v]
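Hedged usage sketch (the toy graph is invented here): with hashed=True, add_edge_list creates vertices in order of first appearance and returns the name property map that v_id consumes; recent graph-tool infers the value type from the data.

import graph_tool.all as gt

g = gt.Graph()
v_name = g.add_edge_list([("a", "b"), ("b", "c")], hashed=True)
print(v_id("b", g, v_name))  # 1: "b" is the second name encountered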
Example No. 16
def make_filter(self, concepts: List[NODE_T], fltr):
    fltr.a = False
    for n in concepts:
        v = gtall.find_vertex(
            self.graph,
            prop=self.graph.properties[('v', '_graphml_vertex_id')],
            match=str(n))[0]
        # v = qtqu.get_nodes_by_node_prop(self.graph, '_graphml_vertex_id', n)[0]
        fltr[v] = True
        for vn in v.out_neighbors():
            fltr[vn] = True
    return fltr
Example No. 17
    def citation_success(self, yr, yd, perc):
        #create property map
        citation_success=self.citation.new_vertex_property("double")
        citation_success_perc=self.citation.new_vertex_property("bool")    
        perc_cuts=[]
            
        for y in yr:
            print(y, '...')
            #find vertices
            y1_vertices = gt.find_vertex(self.citation,self.citation.vertex_properties['year'],y)
            y1yd_vertices = gt.find_vertex_range(self.citation,self.citation.vertex_properties['year'],[y,y+yd])
    
            #set vertex filter property
            print('Set filter prop...')
            y1yd_filter_prop=self.citation.new_vertex_property("bool")
            y1_filter_prop=self.citation.new_vertex_property("bool")
            y1yd_filter_prop.a=False
            y1_filter_prop.a=False
            for v in y1yd_vertices:
                y1yd_filter_prop[v]=True
            for v in y1_vertices:
                y1_filter_prop[v]=True
    
            #calculate graph_view of the subgraph of y,y+yd
            print('Calc graph view ...')
            sub_cite_degree = self.citation.new_vertex_property("double")
            self.citation.set_vertex_filter(y1yd_filter_prop)
            sub_cite_degree.fa = self.citation.degree_property_map('out').fa
            #there are a lot of zeros ... so the percentile percentage has to be quite high
            self.citation.set_vertex_filter(None)
            self.citation.set_vertex_filter(y1_filter_prop)
            tmp = sub_cite_degree.fa
            percentile_cut = numpy.percentile(tmp,perc)
            perc_cuts.append(percentile_cut)
            print('Percentile cut is', percentile_cut)
            self.citation.set_vertex_filter(None)

        
            print('Write success ...')

            #write number of citations and success bool after yd years
            self.citation.set_vertex_filter(y1_filter_prop)
            citation_success.fa = tmp.copy()
            print('There are', numpy.count_nonzero(tmp > percentile_cut), 'nodes exceeding the percentile cut.')
            citation_success_perc.fa = (tmp > percentile_cut).copy()
            self.citation.set_vertex_filter(None)
        
            
        return citation_success, citation_success_perc, perc_cuts
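The set_vertex_filter push/pop sequence above also works as a GraphView, which applies the filter to a lightweight view and leaves the underlying graph untouched; a minimal sketch:

import graph_tool.all as gt

def window_outdegrees(g, vfilt):
    """Sketch: out-degrees of the vertices selected by vfilt, via a view."""
    sub = gt.GraphView(g, vfilt=vfilt)
    return sub.degree_property_map('out').fa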
Example No. 18
    def build_result_graph(self, reactionlist, product, filename):
        g = gt.Graph()

        cofactors = [s.id for s in self.cofactor_list]
        generic_compound = [s.id for s in self.generic_compound_list]

        edges = []
        for reaction in self.model.reactions:
            for c, val in reaction.reactants.items():
                if c not in cofactors and c not in generic_compound:
                    if val < 0:
                        edges.append((c, reaction.id))
                    else:
                        edges.append((reaction.id, c))

        ids = g.add_edge_list(edges, hashed=True, string_vals=True)

        g.vertex_properties["ids"] = ids
        g.vertex_properties["names"] = g.new_vertex_property("string")
        g.vertex_properties["color"] = g.new_vertex_property("string")
        g.vertex_properties["size"] = g.new_vertex_property("int")
        g.edge_properties["arrows"] = g.new_edge_property("string")

        for v in g.vertices():
            v_id = g.vp.ids[v]
            g.vp.names[v] = v_id
            g.vp.color[v] = "red" if v_id in reactionlist else "white"
            g.vp.size[v] = 10 if v_id in reactionlist else 3

        for e in g.edges():
            g.ep.arrows[e] = "none"

        root = gt.find_vertex(g, g.vertex_properties["names"], product)[0]
        pos = gt.radial_tree_layout(g, g.vertex_index[root])

        gt.graph_draw(g,
                      pos,
                      vertex_size=g.vp.size,
                      vertex_text=g.vp.names,
                      vertex_fill_color=g.vp.color,
                      edge_pen_width=0.5,
                      edge_end_marker=g.ep.arrows,
                      output=filename,
                      fit_view=True,
                      output_size=(10000, 10000))
Example No. 19
for n, record in enumerate(screed.open(input_fasta)):
    if n % 1000 == 0:
        print('...loaded and tagged {} sequences'.format(n), file=sys.stderr)
    name = record.name
    sequence = record.sequence

    ht.consume_sequence_and_tag_with_labels(sequence, n)
    tags = ht.sweep_tag_neighborhood(sequence, 0)
    for i in range(len(tags) - 1):
        src = tags[i]
        dst = tags[i + 1]

        new = False

        srcv = gt.find_vertex(sparse_graph, hashes, src)
        if not srcv:
            srcv = sparse_graph.add_vertex()
            hashes[srcv] = src
            new = True
        else:
            srcv = srcv[0]

        dstv = gt.find_vertex(sparse_graph, hashes, dst)
        if not dstv:
            dstv = sparse_graph.add_vertex()
            hashes[dstv] = dst
            new = True
        else:
            dstv = dstv[0]
Example No. 20
def find_system_by_name(name):
    systems = gt.find_vertex(g, g.vertex_properties['system_name'], name)
    if len(systems) < 1:
        bottle.abort(404, "No such system.")

    return vtx_to_json(systems[0])
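Hedged usage sketch (the route path is invented; g and vtx_to_json come from the surrounding module): the function is written to sit behind a bottle route, with abort short-circuiting to a 404 response.

import bottle

@bottle.get('/systems/<name>')
def system_endpoint(name):
    return find_system_by_name(name)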
Example No. 21
def procuraVerticeNoGrafo(g, nome):
    # "find vertex in graph": returns the match, or the empty list if absent
    r = gt.find_vertex(g, g.vp.vertex_name, nome)
    if not r:
        return r
    return r[0]
Example No. 22
        gprop_vcolour[v] = "blue"
    else:
        gprop_vcolour[v] = "white"

g.vertex_properties["vcolour"] = gprop_vcolour

# create numLinks edge property for g edges

eprop_numLinks = g.new_edge_property("int")

# creates the edges between nodes

for i in linkDict:
    for n in linkDict[i]:
        #print(i)
        vertex_i = gt.find_vertex(g, gprop_label, i)[0]
        #print(n)
        try:
            vertex_n = gt.find_vertex(g, gprop_label, n)[0]
            e = g.add_edge(vertex_i, vertex_n)
            eprop_numLinks[e] = linkDict[i][n]
        except IndexError:
            pass

###### EXPERIMENTAL SIZE THINGS ######

# gvprop_size = g.new_vertex_property('float')

deleteList = []

for v in g.vertices():
Example No. 23
File: paths.py  Project: yyht/kgtk
def run(input_file: KGTKFiles, path_file, output_stats, directed, max_hops):
    def infer_index(h, options=[]):
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=[]):
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from kgtk.exceptions import KGTKException
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        from graph_tool.all import find_vertex
        from graph_tool.topology import all_paths
        import sys
        import csv
        from collections import defaultdict
        csv.field_size_limit(sys.maxsize)
        id_col = 'name'

        pairs = []
        with open(path_file, 'r') as f:
            header = next(f)
            for line in f:
                src, tgt = line.strip().split('\t')
                pairs.append((src, tgt))
        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        with open(filename, 'r') as f:
            header = next(f).strip().split('\t')
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header, options=['property', 'predicate', 'label'])

            p = []
            for i, header_col in enumerate(header):
                if i in [subj_index, obj_index]: continue
                p.append(header_col)

        if 'id' not in p:
            raise KGTKException('Error: no id column found')
        G = load_graph_from_csv(str(filename),
                                skip_first=True,
                                directed=directed,
                                hashed=True,
                                ecols=[subj_index, obj_index],
                                eprop_names=p,
                                csv_options={'delimiter': '\t'})

        sys.stdout.write('node1\tlabel\tnode2\tid\n')
        id_count = 0
        if not output_stats:
            for e in G.edges():
                sid, oid = e
                lbl = G.ep[predicate][e]
                sys.stdout.write(
                    '%s\t%s\t%s\t%s\n' %
                    (G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                     '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)))
                id_count += 1

        id_count = 0
        path_id = 0
        paths = defaultdict(set)
        for pair in pairs:
            source_node, target_node = pair
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            target_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=target_node)
            if len(source_ids) == 1 and len(target_ids) == 1:
                source_id = source_ids[0]
                target_id = target_ids[0]
                for path in all_paths(G,
                                      source_id,
                                      target_id,
                                      cutoff=max_hops,
                                      edges=True):
                    for edge_num, an_edge in enumerate(path):
                        edge_id = G.properties[('e', 'id')][an_edge]
                        node1 = 'p%d' % path_id
                        sys.stdout.write(
                            '%s\t%d\t%s\t%s\n' %
                            (node1, edge_num, edge_id, '{}-{}-{}'.format(
                                node1, edge_num, id_count)))
                        id_count += 1
                    path_id += 1

    except Exception as e:
        raise KGTKException('Error: ' + str(e))
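A compact illustration of the all_paths call this command depends on (toy graph invented): with edges=True each yielded path is a sequence of edge descriptors, which is what makes the per-edge id lookup above work:

import graph_tool.all as gt
from graph_tool.topology import all_paths

g = gt.Graph()
g.add_edge_list([(0, 1), (1, 2), (0, 2)])
lengths = sorted(len(p) for p in all_paths(g, 0, 2, cutoff=2, edges=True))
print(lengths)  # [1, 2]: the direct edge, then the two-hop path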
Example No. 24
def matchGraphs(graph1, graph2):
    g1, pos1, weight1, clase1, nodetype1, age1 = graph1
    g2, pos2, weight2, clase2, nodetype2, age2 = graph2

    vertices1 = g1.get_vertices()
    vertices2 = g2.get_vertices()

    age = np.zeros_like(vertices1)

    for i in range(0, len(vertices1)):
        age[i] = age1[vertices1[i]]

    vertices1 = np.argsort(age)[::-1]

    pos_vertex2 = []
    for i in vertices2:
        pos_vertex2.append(pos2[i])
    pos_vertex2 = np.array(pos_vertex2)

    for i in vertices1:
        p1 = pos1[i]
        v, d = find_nearest_nodes(p1, pos_vertex2, 15)

        v = v[np.argsort(d)]
        if len(v) == 1:
            age2[v[0]] = age1[i] + 1
            if nodetype2[v[0]] == "null":
                nodetype2[v[0]] = nodetype1[i]
        elif len(v) == 2:
            n_0 = len(g1.get_out_neighbours(i))
            n_1 = len(g2.get_out_neighbours(v[0]))
            n_2 = len(g2.get_out_neighbours(v[1]))

            dif_a = np.abs(n_1 - n_0)
            dif_b = np.abs(n_2 - n_0)
            if dif_a <= dif_b:
                age2[v[0]] = age1[i] + 1
                if nodetype2[v[0]] == "null":
                    nodetype2[v[0]] = nodetype1[i]
            else:
                age2[v[1]] = age1[i] + 1
                if nodetype2[v[1]] == "null":
                    nodetype2[v[1]] = nodetype1[i]

    available = np.ones(pos_vertex2.shape[0])

    ## If seed is not found => debug
    seed = gt.find_vertex(g2, nodetype2, "Ini")
    if len(seed) == 1:
        seed = seed[0]
    else:
        seed_prev = gt.find_vertex(g1, nodetype1, "Ini")
        p1 = pos1[seed_prev[0]]
        v, d = find_nearest_b(p1, pos_vertex2)
        if d < 20:
            age2[v] = age1[seed_prev[0]] + 1
            nodetype2[v] = "Ini"
            seed = g2.vertex(v)
            available[v] = 0
        else:
            print('No SEED')
            raise Exception("BAD TRACKING")

    pos_vertex2[:, 0] = pos_vertex2[:, 0] * available
    pos_vertex2[:, 1] = pos_vertex2[:, 1] * available

    ## If main root tip is not found => debug
    end = gt.find_vertex(g2, nodetype2, "FTip")
    if len(end) == 1:
        end = end[0]
    else:
        end_prev = gt.find_vertex(g1, nodetype1, "FTip")
        p1 = pos1[end_prev[0]]
        v, d = find_nearest_b(p1, pos_vertex2)
        if d < 20:
            age2[v] = age1[end_prev[0]] + 1
            nodetype2[v] = "FTip"
            end = g2.vertex(v)
        else:
            v = np.argmax(pos_vertex2[:, 1])
            nodetype2[v] = "FTip"
            end = g2.vertex(v)
            # print('No TIP')
            # raise Exception ("BAD TRACKING")

    vertices2 = g2.get_vertices()
    for i in vertices2:
        if age2[i] == 0:
            age2[i] = 1  # was `==`, a no-op comparison
        if nodetype2[i] == "null":
            vecinos = g2.get_out_neighbours(i)
            if len(vecinos) > 1:
                nodetype2[i] = "Bif"
            else:
                if len(vecinos) == 1:
                    nodetype2[i] = "LTip"

    # edge tracking
    camino, _ = gt.shortest_path(g2, seed, end, weights=weight2)

    n_steps = len(camino)
    for k in range(0, n_steps - 1):
        arista = g2.edge(camino[k], camino[k + 1])
        clase2[arista][1] = 10

    return [g2, pos2, weight2, clase2, nodetype2, age2]
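find_nearest_nodes and find_nearest_b are project helpers this listing doesn't include; the stand-ins below are hedged reconstructions (behaviour inferred from how the return values are used above, names and radius semantics assumed):

import numpy as np

def find_nearest_nodes(p1, positions, radius):
    """Assumed: indices of candidates within radius of p1, plus their distances."""
    d = np.linalg.norm(positions - np.asarray(p1), axis=1)
    idx = np.nonzero(d < radius)[0]
    return idx, d[idx]

def find_nearest_b(p1, positions):
    """Assumed: index of the single nearest candidate and its distance."""
    d = np.linalg.norm(positions - np.asarray(p1), axis=1)
    i = int(np.argmin(d))
    return i, d[i]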
Example No. 25
def main():
    # params
    IEXCLOUD_TOKEN = os.getenv("IEXCLOUD_TOKEN")

    # Files that must be provided in advance

    # Files generated automatically at runtime
    output_folder = './outputs'
    downloads_folder = './downloads'
    entities_json = f"{output_folder}/wd_entities.json"
    tk_csv = './downloads/bats_symbols_traded_byx.csv'
    tk_info_json = "./downloads/iex_ticker_info.json"
    urls_json = f"{output_folder}/wiki_urls.json"
    mentions_json = f"{output_folder}/wiki_mentions.json"
    sent_cooccurs_json = f"{output_folder}/corpus_mentions_sent_cooccurs.json"
    atk_cooccurs_json = f"{output_folder}/corpus_mentions_atk_cooccurs.json"
    atk_bags_json = f"{output_folder}/corpus_mentions_atk_bags.json"
    freqs_json = f"{output_folder}/corpus_mentions_freqs.json"

    # Wiki processor requires:
    explore_n_wk_depth: int = 2  # how many wiki levels to explore
    adpot_n_wk_depth: int = 1  # wk-titles within n levels are adopted (the rest only feed the graph computation)
    wkd_dump_json = "./downloads/latest-all.json.bz2"
    seeded_wk_titles = []
    sp500_csv = f"{downloads_folder}/s_and_p_500.csv"

    # Wiki processor outputs:
    wk_titles_graphml = f"{output_folder}/wk_titles.graphml.bz2"
    wk_pagerank_json = f"{output_folder}/wk_pagerank.json"
    wk_cat_tags_json = f"{output_folder}/wk_cat_tags.json"
    # wk_tags_json = f"{output_folder}/wk_tags.json"
    wk_tags_pagerank_csv = f"{output_folder}/wk_tags_pagerank.csv"

    wkd_filtered_entities_json = f"{output_folder}/wkd_filtered_entities.json"
    wk_ranked_titles_json = f"{output_folder}/wk_ranked_titles.json"
    wkd_entites_by_ranked_titles_json = f"{output_folder}/wkd_entites_by_ranked_titles.json"

    pathlib.Path(output_folder).mkdir(exist_ok=True)

    # print(get_matched_wkd_entities(titles, wkd_dump_path=wkd_dump_json))
    # entities = load_or_run(wkd_entites_by_ranked_titles_json,
    #                     lambda: get_matched_wkd_entities(titles, wkd_dump_path=wkd_dump_json),
    #                     forcerun=True)

    # print("Fetch the entities that have a symbol property from wikidata")
    # results = load_or_run(
    #     entities_json, lambda: query_wikidata_by_property())
    # comp_wdids = [e['item']['value'].split('/')[-1]
    #               for e in results['results']['bindings']]

    # print("Find the wikipage corresponding to each wikidata entity")
    # comp_titles = load_or_run(
    #     comp_titles_json, lambda: query_wikipage_title(comp_wdids))
    # return

    # print("Load tickers")
    # df = pd.read_csv(tk_csv)
    # tickers = list(df['Symbols'])
    # # tickers = ['ADBE', 'BA', 'RXT', 'TTOO']
    # print(f"Number of tickers loaded: {len(tickers)}")

    # print("Fetch ticker info from iexcloud")
    # infos = load_or_run(
    #     tk_info_json, lambda: download_ticker_info_from_iexcloud(tickers, IEXCLOUD_TOKEN))
    # names = [v['companyName'] for k, v in infos.items()]

    # print("Search for the wikipage matching each company name from the ticker info")
    # urls = load_or_run(
    #     urls_json, lambda: search_wikipage(names))

    #  Scan the wikipedia dump, collecting mentions starting from each company's wiki-page
    #  Newly found mentions become next_entities; repeat n times (= crawl n levels)
    # print(f"Collect the mentions related to companies - {depth} levels")
    # titles = [v.split('/')[-1].replace("_", " ")
    #           for _, v in urls.items() if v is not None]

    print("# Connect to elasticsearch (stores wiki-pages and the news corpus)")
    es.connect(["es:9200"])

    print(f"# Starting from the S&P 500 wikipage, crawl titles {explore_n_wk_depth} levels deep and build the graph")
    # seedtitles = ["List of S&P 500 companies"]
    seedtitles = ["Wilson (company)"]
    try:
        # raise FileNotFoundError
        g = gt.load_graph(wk_titles_graphml)
        print(f"File loaded: {wk_titles_graphml}")
    except FileNotFoundError:
        print("File not found, creating a new one")
        g = get_wktitles_graph(seedtitles, n_depth=explore_n_wk_depth)
        for n in g:
            g.nodes[n]['mentions'] = json.dumps(g.nodes[n]['mentions'],
                                                ensure_ascii=False,
                                                default=serialize_sets)
        nx.write_graphml_lxml(g, wk_titles_graphml)
        g = gt.load_graph(wk_titles_graphml)

    print("# Run pagerank on the full graph (using graph-tool to avoid running out of memory)")
    ranks = load_or_run(wk_pagerank_json,
                        lambda: calc_pagerank(g),
                        forcerun=True)

    print("# Pick the wiki-categories out of the graph, then find the wiki-title that chiefly describes each category")

    def _cat_tags() -> Iterable[str]:
        _, wk_title, _ = zip(*ranks)
        cats = filter(lambda e: "Category:" in e, wk_title)
        # print(list(cats))
        # print([c for c in cats])
        tags = [es.get_corresponded_wktitles(cat_title=c) for c in cats]
        tags = set(itertools.chain(*tags))
        # tags &= set(tags)
        return tags

    cat_tags = load_or_run(wk_cat_tags_json,
                           lambda: _cat_tags(),
                           forcerun=True)

    print("# Look up the wkd-entity for each wk-title")

    # tags = ["Technology", "Internet", "Metal"]
    cattag_entity = get_matched_wkd_entities(cat_tags)
    ranks_by_tags = []
    for _, wk_title, pagerank in ranks:
        try:
            e = cattag_entity[wk_title]
            ranks_by_tags.append((e.entity_id, e.get_enwiki_title(),
                                  e.get_label("zh"), pagerank))
        except KeyError:
            pass

    print("# Save the ranks as csv")
    wkd_id, wk_title, zh_label, pagerank = zip(*ranks_by_tags)
    tags = wk_title
    df = pd.DataFrame({
        'wkd_id': wkd_id,
        'wk_title': wk_title,
        'zh_label': zh_label,
        'pagerank': pagerank
    })
    df.to_csv(wk_tags_pagerank_csv, index=False)

    return

    print("# Find the tags for a single ticker")

    def get_neighbors(v: gt.Vertex, n_expands: int = 2):
        seeds = set([v])
        traveled = set()
        for i in range(n_expands):
            nextseeds = set()
            for v in seeds:
                nextseeds |= set(v.out_neighbors())
            nextseeds -= seeds
            traveled |= seeds
            seeds = nextseeds
        return traveled

    # tags = set(["joint venture"])
    tickers = ["Wilson (company)"]
    tags_by_tickers = []
    for tk in tickers:
        v = gt.find_vertex(g, g.vp['_graphml_vertex_id'], tk)[0]
        neighbors = get_neighbors(v, n_expands=2)
        neighbors = set([g.vp['_graphml_vertex_id'][v] for v in neighbors])
        tags_by_tickers.append((tk, tags & neighbors))
    print(tags_by_tickers)

    return

    print("Rank tags by importance and overlap (via max_flow, n_path, or similar)")
    # for tk in tickers:
    #     neighbors = get_neighbors(tk)

    print("TODO: scan all news and compute mention term frequencies")

    # print("Scan all news and compute mention term frequencies")

    # TODO: expand synonyms (for flashtext)
    # print("Load S&P 500 as the seed-wk-titles")
    # df = pd.read_csv(sp500_csv)
    # seedtitles = list(df['Name'])

    # print(f"Starting from the seed-wk-titles, crawl wk-titles {explore_n_wk_depth} levels deep and build the graph")
    # try:
    #     # raise FileNotFoundError
    #     g = gt.load_graph(wk_titles_graphml)
    #     print(f"File loaded: {wk_titles_graphml}")
    # except FileNotFoundError:
    #     print(f"File not found, create new one")
    #     g = get_wktitles_graph(seedtitles, n_depth=explore_n_wk_depth)
    #     for n in g:
    #         g.nodes[n]['mentions'] = json.dumps(
    #             g.nodes[n]['mentions'], ensure_ascii=False, default=serialize_sets)
    #     nx.write_graphml_lxml(g, wk_titles_graphml)
    #     g = gt.load_graph(wk_titles_graphml)

    # print(f"Only adopt wk-titles up to depth {adpot_n_wk_depth}")
    # vp_label = g.vp['_graphml_vertex_id']
    # vp_depth = g.vp['depth']
    # wktitles = [vp_label[v]
    #             for v in g.vertices() if vp_depth[v] <= adpot_n_wk_depth]

    # print("Scan the wkd dump and drop wk-titles that lack a Chinese label, have a location claim (likely places), or are people")
    # try:
    #     raise FileNotFoundError
    #     entities = WikidataJsonDump(wkd_filtered_entities_json)
    #     filtered_wktitles = set([e.get_enwiki_title() for e in entities])
    #     print(f"File loaded: {wkd_filtered_entities_json}")
    # except FileNotFoundError:
    #     print(f"File not found, create new one")
    #     entities = get_matched_wkd_entities(
    #         wktitles, wkd_dump_path=wkd_dump_json)
    #     dump_entities_to_json(entities, wkd_filtered_entities_json)
    #     filtered_wktitles = set([e.get_enwiki_title() for e in entities])

    # print("Run pagerank on the full graph (using graph-tool to avoid running out of memory)")
    # load_or_run(wk_filtered_pagerank_json,
    #             lambda: calc_pagerank(g, included_wktitles=filtered_wktitles), forcerun=True)

    return
Example No. 26
    '~/project/KB_dump/conceptnet/conceptnet-en.csv')

g = load_graph_from_csv(conceptnet_path,
                        directed=False,
                        eprop_types=['string', 'string'],
                        string_vals=True)

prefix = '/c/en/'
entities = [
    ['capoeira', 'hand', 'cartwheel', 'shirt', 'handstand'],
    ['sunscreen', 'skateboarding', 'soccer', 'tan', 'rubbing'],
    ['cream', 'mascara', 'writing', 'lifting', 'dictaphone'],
]

blackListVertex = set([
    find_vertex(g, prop=g.properties[('v', 'name')], match=prefix + b)[0]
    for b in ['object', 'thing']
])

blackListEdge = set(['/r/DerivedFrom', '/r/RelatedTo'])
print('#' * 20)
for elist in entities:
    print(elist)
    qid = find_vertex(g,
                      prop=g.properties[('v', 'name')],
                      match=prefix + elist[0])[0]
    for a in elist[1:]:
        aid = find_vertex(g,
                          prop=g.properties[('v', 'name')],
                          match=prefix + a)[0]
        for vp, ep in zip(
Example No. 27
def run(input_file: KGTKFiles, directed, max_hops, source_nodes, target_nodes):
    def infer_index(h, options=[]):
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=[]):
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from kgtk.exceptions import KGTKException
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        from graph_tool.all import find_vertex
        from graph_tool.topology import all_paths
        import sys
        from collections import defaultdict

        id_col = 'name'
        graph_edge = 'graph'

        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        filename = str(filename)
        with open(filename, 'r') as f:
            header = next(f).split('\t')
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header, options=['property', 'predicate', 'label'])

            p = []
            for i, header_col in enumerate(header):
                if i in [subj_index, obj_index]: continue
                p.append(header_col)

        if 'id' not in p:
            raise KGTKException('Error: no id column found')

        G = load_graph_from_csv(filename,
                                skip_first=True,
                                directed=directed,
                                hashed=True,
                                ecols=[subj_index, obj_index],
                                eprop_names=p,
                                csv_options={'delimiter': '\t'})

        graph_id = 1
        paths = defaultdict(set)
        for source_node in source_nodes:
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            if len(source_ids) == 1:
                source_id = source_ids[0]
                for target_node in target_nodes:
                    target_ids = find_vertex(G,
                                             prop=G.properties[('v', id_col)],
                                             match=target_node)
                    if len(target_ids) == 1:
                        target_id = target_ids[0]
                        for path in all_paths(G,
                                              source_id,
                                              target_id,
                                              cutoff=max_hops,
                                              edges=True):
                            for an_edge in path:
                                edge_id = G.properties[('e', 'id')][an_edge]
                                paths[edge_id].add(str(graph_id))
                            graph_id += 1

        sys.stdout.write('node1\tlabel\tnode2\tid\t%s\n' % graph_edge)
        for e in G.edges():
            sid, oid = e
            edge_id = G.properties[('e', 'id')][e]
            lbl = G.ep[predicate][e]
            graph_id = '|'.join(list(paths[edge_id]))
            sys.stdout.write(
                '%s\t%s\t%s\t%s\t%s\n' %
                (G.vp[id_col][sid], lbl, G.vp[id_col][oid], edge_id, graph_id))

    except Exception as e:
        raise KGTKException('Error: ' + str(e))
Example No. 28
    def neighbor_graphs(self):
        for o in self.organisms:

            builder = self.builder[o]
            cofactors = set([c.id for c in builder.cofactor_list])

            g = self.builder[o].arcgraph
            model_reactions = {
                r.id: r
                for r in self.builder[o].model.reactions
            }

            has_start_metabolite_in_graph = list()

            for nf in self.not_found[o]:
                g.set_reversed(True)
                root = gt.find_vertex(g, g.vertex_properties["compound_ids"],
                                      nf)[0]

                helper.add_properties(g)
                g.vp.size[root] = 100
                old_color = g.vp.color[root]
                g.vp.color[root] = "red"
                g.vp.besucht[root] = True
                visitor = helper.NeighborVisitor(g)
                gt.bfs_search(g, root, visitor)

                g.set_reversed(False)
                g.set_vertex_filter(g.vp.besucht)
                g.vp.size.a = 90 * 0.9**g.vp.dist.a + 10
                if any([v for v in g.vertices() if g.vp.color[v] == "yellow"]):
                    has_start_metabolite_in_graph.append(nf)

                # reset graph
                g.clear_filters()
                helper.remove_properties(g)
                g.vp.color[root] = old_color

            feasible_reactions = self.builder[o].feasible_reaction_list

            self.really_not_found[o] = has_start_metabolite_in_graph

            with open(join(const.MECAT_BASE, 'Targets '+o,'has_start_metabolite_in_graph.txt'),'w') as f1, \
                    open(join(const.MECAT_BASE, 'Targets ' + o, 'reactions_for_really_not_found.txt'), 'w') as f2:
                rrnf = list()

                substrates_not_found = set(has_start_metabolite_in_graph +
                                           self.not_found[o] +
                                           self.really_not_found[o])

                for c in has_start_metabolite_in_graph:
                    f1.write('{}\t{}\n'.format(c, self.compounds[c].names[0]))

                    reactions_for_target = self.__reactions_for_target(
                        c, model_reactions)

                    fr = set(reactions_for_target) & feasible_reactions
                    if len(fr) > 0:
                        rrnf.append(c)

                    f2.write('{}\t{}:\n'.format(c, self.compounds[c].names[0]))
                    for reac_id in fr:
                        t = ''
                        # test if the substrates of the reaction are all cofactors
                        substrates = set(
                            next(
                                filter(
                                    lambda x: x.id == reac_id,
                                    builder.model.reactions)).substrates())
                        if substrates <= cofactors:
                            t += ' keine Kante (Cofactors)'

                        # test if the reaction has arcs that end with the target
                        rpairs = builder.reaction_rpair[reac_id]
                        if not any((x[1] == c) for x in rpairs):
                            t += ' keine Kante zu Target'

                        #  test if any substrate is also in list of not found
                        if substrates & substrates_not_found:
                            t += ' Substrat auch nicht gefunden'

                        fx = re.findall(r'R\d{5}', reac_id)[0]
                        f2.write('\t{}\t{}\t{}\n'.format(
                            t, reac_id, self.reactions[fx].definition))

            self.really_really_not_found[o] = rrnf
        self.__dump_statistics(
            self.organisms, join(const.MECAT_BASE, 'targets_in_organisms.csv'))
Example No. 30
def local_cells(self):
    for vertex in gt.find_vertex(self.eptm.graph, self.eptm.is_local_vert, 1):
        if self.eptm.is_cell_vert[vertex]:
            yield vertex
Example No. 31
    def set_precursors(self, precursors=None, complex_loopback=True):
        """
        Replaces network's previous precursor node with provided new precursors.
        Finds new edges that link products back to reactants as dependent on the
        complex_loopback parameter.

        Args:
            precursors ([ComputedEntry]): list of new precursor entries
            complex_loopback (bool): if True, adds zero-weight edges which "loop back"
                to allow for multi-step or autocatalytic-like reactions, i.e. original
                precursors can reappear many times and in different steps.

        Returns:
            None
        """
        g = self._g
        self._precursors = set(precursors) if precursors else None

        if not self._precursors:
            precursors_entries = RxnEntries(None,
                                            "d")  # use dummy precursors node
            if complex_loopback:
                raise ValueError(
                    "Complex loopback can't be enabled when using a dummy precursors "
                    "node!")
        else:
            precursors_entries = RxnEntries(precursors, "s")

        g.remove_vertex(gt.find_vertex(g, g.vp["type"], 0))
        new_precursors_v = g.add_vertex()

        self._update_vertex_properties(
            g,
            new_precursors_v,
            {
                "entries": precursors_entries,
                "type": 0,
                "bool": True,
                "path": True,
                "chemsys": precursors_entries.chemsys,
            },
        )

        new_edges = []
        remove_edges = []

        for v in gt.find_vertex(g, g.vp["type"],
                                1):  # iterate over all reactants
            phases = g.vp["entries"][v].entries

            remove_edges.extend(list(v.in_edges()))

            if precursors_entries.description == "D" or phases.issubset(
                    self._precursors):
                new_edges.append([new_precursors_v, v, 0, None, True, False])

        for v in gt.find_vertex(g, g.vp["type"],
                                2):  # iterate over all products
            phases = g.vp["entries"][v].entries

            if complex_loopback:
                combos = generate_all_combos(phases.union(self._precursors),
                                             self._max_num_phases)
            else:
                combos = generate_all_combos(phases, self._max_num_phases)

            for c in combos:
                combo_phases = set(c)
                if complex_loopback and combo_phases.issubset(
                        self._precursors):
                    continue
                combo_entry = RxnEntries(combo_phases, "R")
                loopback_v = gt.find_vertex(g, g.vp["entries"], combo_entry)[0]
                new_edges.append([v, loopback_v, 0, None, True, False])

        for e in remove_edges:
            g.remove_edge(e)

        g.add_edge_list(
            new_edges,
            eprops=[g.ep["weight"], g.ep["rxn"], g.ep["bool"], g.ep["path"]])
Example No. 32
def run(
        input_file: KGTKFiles,
        path_file: KGTKFiles,
        output_file: KGTKFiles,
        statistics_only: bool,
        undirected: bool,
        max_hops: int,
        source_column_name: typing.Optional[str],
        target_column_name: typing.Optional[str],
        shortest_path: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    # import modules locally
    from pathlib import Path
    import sys

    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths

    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    from kgtk.exceptions import KGTKException
    try:

        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)

        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file),
                  file=error_file,
                  flush=True)
        pairs = []
        pkr: KgtkReader = KgtkReader.open(
            path_kgtk_file,
            error_file=error_file,
            options=path_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        path_source_idx: int = pkr.get_node1_column_index(source_column_name)
        if path_source_idx < 0:
            print("Missing node1 (source) column name in the path file.",
                  file=error_file,
                  flush=True)

        path_target_idx: int = pkr.get_node2_column_index(target_column_name)
        if path_target_idx < 0:
            print("Missing node2 (target) column name in the path file.",
                  file=error_file,
                  flush=True)
        if path_source_idx < 0 or path_target_idx < 0:
            pkr.close()
            raise KGTKException("Exiting due to missing columns.")

        paths_read: int = 0
        path_row: typing.List[str]
        for path_row in pkr:
            paths_read += 1
            if len(path_row) != pkr.column_count:
                raise KGTKException(
                    "Exiting because line %d in the path file (%s) is the wrong length: %d columns expected, %d were read."
                    % (paths_read, str(path_kgtk_file), pkr.column_count,
                       len(path_row)))
            src: str = path_row[path_source_idx]
            tgt: str = path_row[path_target_idx]
            pairs.append((src, tgt))
        pkr.close()
        if verbose:
            print("%d path rows read" % paths_read,
                  file=error_file,
                  flush=True)
        if len(pairs) == 0:
            print("No path pairs found, the output will be empty.",
                  file=error_file,
                  flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=input_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sub_index: int = kr.get_node1_column_index()
        if sub_index < 0:
            print("Missing node1 (subject) column.",
                  file=error_file,
                  flush=True)
        pred_index: int = kr.get_label_column_index()
        if pred_index < 0:
            print("Missing label (predicate) column.",
                  file=error_file,
                  flush=True)
        obj_index: int = kr.get_node2_column_index()
        if obj_index < 0:
            print("Missing node2 (object) column", file=error_file, flush=True)
        id_index: int = kr.get_id_column_index()
        if id_index < 0:
            print("Missing id column", file=error_file, flush=True)
        if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        predicate: str = kr.column_names[pred_index]
        id_col_name: str = kr.column_names[id_index]

        G = load_graph_from_kgtk(kr,
                                 directed=not undirected,
                                 ecols=(sub_index, obj_index),
                                 verbose=verbose,
                                 out=error_file)

        output_columns: typing.List[str] = ['node1', 'label', 'node2', 'id']
        kw: KgtkWriter = KgtkWriter.open(output_columns,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        id_count = 0
        if not statistics_only:
            for e in G.edges():
                sid, oid = e
                lbl = G.ep[predicate][e]
                kw.write([
                    G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                    '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)
                ])
                id_count += 1
            if verbose:
                print("%d edges found." % id_count,
                      file=error_file,
                      flush=True)

        id_count = 0
        path_id = 0
        for pair in pairs:
            source_node, target_node = pair
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            target_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=target_node)
            if len(source_ids) == 1 and len(target_ids) == 1:
                source_id = source_ids[0]
                target_id = target_ids[0]
                if shortest_path:
                    _all_paths = all_shortest_paths(G,
                                                    source_id,
                                                    target_id,
                                                    edges=True)
                else:
                    _all_paths = all_paths(G,
                                           source_id,
                                           target_id,
                                           cutoff=max_hops,
                                           edges=True)

                for path in _all_paths:
                    for edge_num, an_edge in enumerate(path):
                        edge_id = G.properties[('e', 'id')][an_edge]
                        node1: str = 'p%d' % path_id
                        kw.write([
                            node1,
                            str(edge_num), edge_id,
                            '{}-{}-{}'.format(node1, edge_num, id_count)
                        ])
                        id_count += 1
                    path_id += 1

        if verbose:
            print("%d paths containing %d edges found." % (path_id, id_count),
                  file=error_file,
                  flush=True)

        kw.close()
        kr.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e))
Example No. 33
def local_cells(self):
    for vertex in gt.find_vertex(self.eptm.graph, self.eptm.is_local_vert, 1):
        if self.eptm.is_cell_vert[vertex]:
            yield vertex
Example No. 34
def Stochastic():

    import pandas as pd
    import numpy as np
    import pprint as pp
    import locale
    import matplotlib.pyplot as plt
    import matplotlib.ticker as tkr
    import graph_tool.all as gt
    import math

    # Need to drag this out into the real world
    from GAC_Graph_Builder import findEdges

    t = gt.Graph(directed=True)

    tprop_label = t.new_vertex_property("string")
    tprop_instType = t.new_vertex_property("string")

    linkDict, instSet = findEdges()

    # ingest our university checking lists [this is sloppy, TBI]

    foreignUniTxt = open('Workaround txts/Foreign Unis.txt', 'r')
    UKUniTxt = open('Workaround txts/UK Unis.txt', 'r')

    foreignUniVals = foreignUniTxt.read().splitlines()
    UKUniVals = UKUniTxt.read().splitlines()

    # add vertices and label them based on their names.

    ######## FILTERING BASED ON CORDIS RESIDENCY ##########

    dfCordisNames = pd.read_pickle('Pickles/CORDIS_Countries.pickle')

    eligiblenames = dfCordisNames.name.values.tolist()

    veryDirtyWorkaround = ['FOCUS', 'FLUOR', 'GE', 'NI', 'OTE', 'ROKE']

    for inst in instSet:

        nameCheck = inst.upper()
        # substring scan: keep inst only if its upper-cased name occurs inside
        # some CORDIS-registered participant name
        firstFound = next((x for x in eligiblenames if nameCheck in x), None)
        if inst in foreignUniVals:
            del linkDict[inst]
        elif nameCheck in veryDirtyWorkaround:
            del linkDict[inst]
        elif firstFound is None:
            del linkDict[inst]
        else:
            vert = t.add_vertex()
            tprop_label[vert] = str(inst)

    # drop the empty-name entry if present
    linkDict.pop('', None)

    # internalise property map
    t.vertex_properties["label"] = tprop_label

    # explicitly declare the hierarchy defining vertices and edges, the sequencing here matters.
    for_uni = t.add_vertex()
    UK_uni = t.add_vertex()
    other = t.add_vertex()
    root = t.add_vertex()

    edgeList = [(root, for_uni), (root, UK_uni), (root, other)]
    t.add_edge_list(edgeList)
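    # because these four vertices are added last, they occupy the four highest
    # indices, so t.vertex(t.num_vertices() - 1) used for the layouts is root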

    # use label name to add edges to hierarchy; skip the last four vertices,
    # which are the hierarchy-defining ones added above
    for i in range(t.num_vertices() - 4):
        if tprop_label[i] in foreignUniVals:
            t.add_edge(for_uni, t.vertex(i))
            tprop_instType[i] = "Foreign Uni"
        elif tprop_label[i] in UKUniVals:
            t.add_edge(UK_uni, t.vertex(i))
            tprop_instType[i] = "UK Uni"
        else:
            t.add_edge(other, t.vertex(i))
            tprop_instType[i] = "Other Institution"

    t.vertex_properties["instType"] = tprop_instType
    tpos = gt.radial_tree_layout(t,
                                 t.vertex(t.num_vertices() - 1),
                                 rel_order_leaf=True)

    ######### MAIN GRAPH DRAWING ################

    g = gt.Graph(directed=False)
    # creates graph g, using the same nodes (with the same index!)

    for v in t.vertices():
        gv = g.add_vertex()

    # we remove the four hierarchy vertices (root, for_uni, UK_uni and other)
    # from g, since they only exist to shape the tree layout

    lower = g.num_vertices() - 5
    current = g.num_vertices() - 1

    while current > lower:
        g.remove_vertex(current)
        current -= 1
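    # removing from the highest index downwards means each removal only
    # disturbs indices above it, so the g/t index correspondence survives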

    # Pull vertex properties from t

    labelDict = t.vertex_properties["label"]
    instTypeDict = t.vertex_properties["instType"]

    # create properties for g vertices

    gprop_label = g.new_vertex_property("string")
    gprop_instType = g.new_vertex_property("string")

    # match labels between g and t

    for v in g.vertices():
        gprop_label[v] = labelDict[v]
        gprop_instType[v] = instTypeDict[v]

    # make property map internal to graph g
    g.vertex_properties["label"] = gprop_label
    g.vertex_properties["instType"] = gprop_instType

    ###### COLOUR VERTICES #########

    # Reclaim variable names because lazy

    gprop_vcolour = g.new_vertex_property("string")

    for v in g.vertices():

        if gprop_instType[v] == "Foreign Uni":
            gprop_vcolour[v] = "red"
        elif gprop_instType[v] == "UK Uni":
            gprop_vcolour[v] = "blue"
        else:
            gprop_vcolour[v] = "white"

    g.vertex_properties["vcolour"] = gprop_vcolour

    # create numLinks edge property for g edges

    eprop_numLinks = g.new_edge_property("int")

    # creates the edges between nodes

    for i in linkDict:
        for n in linkDict[i]:
            vertex_i = gt.find_vertex(g, gprop_label, i)[0]
            try:
                vertex_n = gt.find_vertex(g, gprop_label, n)[0]
                e = g.add_edge(vertex_i, vertex_n)
                eprop_numLinks[e] = linkDict[i][n]
            except IndexError:
                # no vertex in g carries label n (it was filtered out above)
                pass

    ##### EXPERIMENTAL SIZE THINGS ######

    #gvprop_size = g.new_vertex_property('float')

    deleteList = []

    for v in g.vertices():

        # sum the num edges and the number of links they correspond to
        # use this to find a ratio and scale size off of this.

        numEdges = sum(1 for _ in v.all_edges())
        numLinks = 0

        for e in v.all_edges():

            numLinks += eprop_numLinks[e]

        #print(gprop_label[v])
        print("NumEdges = " + str(numEdges) + " NumLinks = " + str(numLinks))
        # create a delete list

        try:
            ratio = (numLinks / numEdges) * 5 * 2
        except ZeroDivisionError:
            # vertex has no edges at all; flag it for deletion
            deleteList.append(v)

        #gvprop_size[v] = ratio

    #g.vertex_properties['size'] = gvprop_size

    #### Delete linkless vertices #######

    # capture the indices first, then delete from highest to lowest so the
    # remaining indices stay valid; mirror the removals in t, whose lower
    # indices line up with g's
    delete_idx = sorted((int(v) for v in deleteList), reverse=True)

    for idx in delete_idx:
        g.remove_vertex(g.vertex(idx))

    for idx in delete_idx:
        t.remove_vertex(t.vertex(idx))

    tpos = gt.radial_tree_layout(t,
                                 t.vertex(t.num_vertices() - 1),
                                 rel_order_leaf=True)

    #######

    ############ stochastic BLOCK MODEL ####################
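    # fit a degree-corrected nested stochastic block model; its inferred
    # hierarchy tree replaces the manually built t and tpos from above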

    state = gt.minimize_nested_blockmodel_dl(g, deg_corr=True, verbose=True)
    t = gt.get_hierarchy_tree(state)[0]
    tpos = gt.radial_tree_layout(t,
                                 t.vertex(t.num_vertices() - 1),
                                 weighted=True)

    # in order to make sure labels fit in the image we have to manually adjust the
    # co-ordinates of each vertex.

    x, y = gt.ungroup_vector_property(tpos, [0, 1])
    x.a = (x.a - x.a.min()) / (x.a.max() - x.a.min()) * 1400 + 400
    y.a = (y.a - y.a.min()) / (y.a.max() - y.a.min()) * 1400 + 400
    tpos = gt.group_vector_property([x, y])
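    # the rescale maps every coordinate into [400, 1800], keeping labels well
    # inside the 2000 x 2000 px canvas passed to graph_draw below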

    # This draws the 'Bezier spline control points' for edges
    # it draws the edges directed in graph g, but uses the hierarchy / positioning of graph t.
    cts = gt.get_hierarchy_control_points(g, t, tpos)

    pos = g.own_property(tpos)

    gt.graph_draw(
        g,
        vertex_text_position="centered",
        vertex_text=g.vertex_properties["label"],
        vertex_font_size=14,
        vertex_anchor=0,
        vertex_aspect=1,
        vertex_shape="square",
        vertex_fill_color=g.vertex_properties["vcolour"],
        vertex_size=10,
        fit_view=False,
        # edge_color=g.edge_properties["colour"],
        # edge_pen_width=g.edge_properties["thickness"],
        edge_end_marker="none",
        edge_pen_width=0.2,
        edge_color="white",
        bg_color=[0, 0, 0, 1],
        output_size=[2000, 2000],
        output='UK_ONLY_RELATIONSHIPS_stochastic.png',
        pos=pos,
        edge_control_points=cts)

if __name__ == '__main__':
    Stochastic()
Ejemplo n.º 35
0
def find_nodes(self, prop, match):
    '''Thin wrapper around gt.find_vertex for this object's graph.'''
    return gt.find_vertex(self.graph, prop, match)
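A hedged usage sketch (the enclosing class and its self.graph attribute are
implied by the snippet but not shown; the "label" property name is an
assumption):

# assuming obj is an instance of the class above, whose graph carries an
# internal string vertex property "label"
matches = obj.find_nodes(obj.graph.vp["label"], "some-name")
first = matches[0] if matches else None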
Ejemplo n.º 36
0
    def topology(self):
        g = self.arcgraph.copy()
        components, chist = gt.label_components(
            g, directed=False
        )  # directed = False because True would look for strongly connected components
        self.__plot_component_hist(chist, 'componenthist')
        start_components = set()
        number_compounds_in_start_components = 0
        for c in self.start_compounds:
            for v in gt.find_vertex(g, g.vp.compound_ids, c):
                start_components.add(components[v])

        cg = gt.Graph()
        cg.vertex_properties["size"] = cg.new_vertex_property("int", val=10)
        for c in start_components:
            v = cg.add_vertex()
            cg.vp.size[v] = chist[c]
            number_compounds_in_start_components += chist[c]

        satellites = set()

        clustering_coefficient = gt.global_clustering(g)
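        # gt.global_clustering returns the pair (coefficient, standard error),
        # recorded below as two tab-separated columns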
        with open(join(self.statistics_path, "clustering_coefficient.txt"),
                  'w') as f:
            f.write(
                str(clustering_coefficient[0]) + '\t' +
                str(clustering_coefficient[1]) + '\n')

        with open(join(self.statistics_path, "compounds_components.txt"), 'w') as f, \
                open(join(self.statistics_path, "component_hist.txt"), 'w') as f2:

            for componentid, elem in enumerate(chist):
                u = gt.GraphView(g, vfilt=components.a == componentid)
                u = gt.Graph(u, prune=True)

                f2.write(str(componentid + 1) + '\t' + str(elem) + '\n')

                for v in u.vertices():
                    f.write(
                        str(componentid + 1) + '\t' + u.vp.compound_ids[v] +
                        '\t' + u.vp.name[v] + '\n')

                    if componentid not in start_components:
                        satellites.add(u.vp.compound_ids[v])

                # gt.graph_draw(u, output=join(self.statistics_path, "component{i}.pdf".format(i=componentid)))

        targets_in_main_component = self.targets - satellites
        targets_in_satellites = self.targets & satellites

        with open(join(self.statistics_path, "targets_in_main_component.txt"),
                  'w') as f:
            for c in targets_in_main_component:
                compound = self.builder.compounds[c]
                f.write(c + '\t' + compound.names[0] + '\n')

        with open(join(self.statistics_path, "targets_in_satellites.txt"),
                  'w') as f:
            for c in targets_in_satellites:
                compound = self.builder.compounds[c]
                f.write(c + '\t' + compound.names[0] + '\n')

        with open(
                join(self.statistics_path,
                     "components_with_start_metabolites.txt"), 'w') as f:
            for cid in start_components:
                f.write(str(cid) + '\n')

        p = number_compounds_in_start_components / g.num_vertices() * 100

        with open(join(const.MECAT_BASE, "component_table.txt"), 'a') as f:
            f.write(self.name + ' & ' + str(len(chist)) + ' & ' +
                    str(np.amax(chist)) + ' & ' + str(len(start_components)) +
                    ' & ' + str(int(number_compounds_in_start_components)) +
                    ' & ' + str(int(round(p, 0))) + r'\%' + '\\\\ \n')

        #largest = gt.label_largest_component(g, directed=False)
        #gt.graph_draw(g, vertex_fill_color=largest, output=join(self.statistics_path,"largest_component.pdf"))

        g.vertex_properties["start_components"] = g.new_vertex_property(
            "string", val='white')

        for v in g.vertices():
            if components[v] in start_components:
                g.vp.start_components[v] = 'red'
            else:
                g.vp.start_components[v] = 'blue'

        gt.graph_draw(g,
                      vertex_fill_color=g.vp.start_components,
                      output=join('/mnt', 'g', 'LisaDaten', 'Paper2',
                                  'figures', 'arcgraph' + self.name + '.pdf'))
Ejemplo n.º 37
0
#add username property map
name_prop = g.new_vertex_property("string")
g.vertex_properties['name'] = name_prop

#add names to each vertex
for v in g.vertices():
    g.vp.name[v] = cursor[g.vertex_index[v]]['user']
    #print(g.vertex_properties['name'][v])

cursor.rewind()
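# rewind() resets the exhausted pymongo cursor to its unevaluated state so the
# documents can be iterated a second time to build the edges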

#create all edges
for user in cursor:
    #v1 is the vertex where name = cursor['user']
    v1 = gt.find_vertex(g, g.vp.name, user['user'])[0]
    for mention in user['user_mentions']:
        try:
            # v2 is the vertex where name = mention
            v2 = gt.find_vertex(g, g.vp.name, mention)[0]
        except IndexError:
            print("Error: " + mention + " is not in the collection")
            continue

        if g.vp.name[v1] != g.vp.name[v2]:
            print("adding edge between " + g.vp.name[v1] + " and " +
                  g.vp.name[v2])
            edge = g.add_edge(v1, v2)

# attempt at weighting the graph: the call below only computes an unweighted
# SFDP layout; a weighted variant is sketched after it
pos = gt.sfdp_layout(g)
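
# A hedged sketch of what the weighting could look like (an assumption about
# the intent, not part of the original): fold parallel mention edges into a
# single weighted edge, then pass the weights to sfdp_layout via eweight.
from collections import Counter

counts = Counter((int(e.source()), int(e.target())) for e in g.edges())
gt.remove_parallel_edges(g)
weight = g.new_edge_property("double")
for e in g.edges():
    weight[e] = counts[(int(e.source()), int(e.target()))]
g.edge_properties["weight"] = weight
pos = gt.sfdp_layout(g, eweight=weight)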