def top():
    g = gt.load_graph(filename)
    print('Graph loaded, now calculating top nodes')
    vblocks = g.vp['blocks']
    # `blocks` is the module-level block count; the largest block is the one
    # that find_vertex() matches against the most vertices.
    largest_block = max(range(blocks),
                        key=lambda b: len(gt.find_vertex(g, vblocks, b)))
    print('Largest block is %d with %d nodes'
          % (largest_block, len(gt.find_vertex(g, vblocks, largest_block))))
    for tup in top_ids(g, largest_block):
        print(tup)

import sys

import khmer
import screed
import graph_tool.all as gt


def main():
    input_fasta = sys.argv[3]
    K = int(sys.argv[1])
    x = float(sys.argv[2])
    ht = khmer.Nodegraph(K, x, 4)
    sparse_graph = gt.Graph()
    hashes = sparse_graph.new_vertex_property("long long")
    for n, record in enumerate(screed.open(input_fasta)):
        if n % 1000 == 0:
            print('...loaded and tagged {} sequences'.format(n), file=sys.stderr)
        name = record.name
        sequence = record.sequence
        ht.consume_sequence_and_tag_with_labels(sequence, n)
        tags = ht.sweep_tag_neighborhood(sequence, 0)
        for i in range(len(tags) - 1):
            src = tags[i]
            dst = tags[i + 1]
            new = False
            srcv = gt.find_vertex(sparse_graph, hashes, src)
            if not srcv:
                srcv = sparse_graph.add_vertex()
                hashes[srcv] = src
                new = True
            else:
                srcv = srcv[0]
            dstv = gt.find_vertex(sparse_graph, hashes, dst)
            if not dstv:
                dstv = sparse_graph.add_vertex()
                hashes[dstv] = dst
                new = True
            else:
                dstv = dstv[0]
            # An edge is only added when at least one endpoint is new, so
            # repeated tag pairs do not create parallel edges.
            if new:
                e = sparse_graph.add_edge(srcv, dstv)

    print('Sparse graph has {} nodes, {} edges'.format(
        sparse_graph.num_vertices(), sparse_graph.num_edges()))
    comp = gt.label_largest_component(sparse_graph, directed=False)
    # pos = gt.radial_tree_layout(sparse_graph, sparse_graph.vertex(0))
    gt.graph_draw(sparse_graph, output_size=(5000, 5000),
                  output=input_fasta + '_sparse.png')
    sparse_graph.set_vertex_filter(comp)
    gt.graph_draw(sparse_graph, output_size=(5000, 5000),
                  output=input_fasta + '_sparse_comp.png')

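# Not part of the original snippet: a minimal sketch of the same insert-or-link
# step using a plain dict as a hash -> vertex index. gt.find_vertex() scans every
# vertex on each call, so the loop above costs O(V) per tag pair, while a dict
# lookup is O(1). The names (sparse_graph, hashes) mirror the function above;
# `index` is a new, hypothetical dict maintained alongside the graph.
def add_tag_edge(sparse_graph, hashes, index, src, dst):
    """Insert src/dst vertices if unseen and connect them once."""
    new = False
    for tag in (src, dst):
        if tag not in index:
            v = sparse_graph.add_vertex()
            hashes[v] = tag
            index[tag] = v
            new = True
    if new:
        sparse_graph.add_edge(index[src], index[dst])
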
def closest_vert(self, sigma, zed):
    '''Return the vertex closest to the position (`sigma`, `zed`).'''
    dist = np.hypot(self.sigmas.fa - sigma,
                    self.zeds.fa - zed)
    idx = np.argmin(dist)
    sigma, zed = self.sigmas.fa[idx], self.zeds.fa[idx]
    s_matches = gt.find_vertex(self.graph, self.sigmas, sigma)
    z_matches = gt.find_vertex(self.graph, self.zeds, zed)
    log.debug('Number of closest vertices found: %i, %i'
              % (len(s_matches), len(z_matches)))
    # The closest vertex is the one matching on both coordinates.
    return [v for v in s_matches if v in z_matches][0]

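# Not from the original source: a shorter variant that skips the two
# find_vertex() scans. It assumes no vertex filter is active, so positions in
# the property arrays coincide with vertex indices.
def closest_vert_direct(self, sigma, zed):
    dist = np.hypot(self.sigmas.fa - sigma, self.zeds.fa - zed)
    return self.graph.vertex(np.argmin(dist))
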
# Legacy Python 2 variant of the same pipeline, using the older khmer
# new_hashbits() constructor.
def main():
    input_fasta = sys.argv[3]
    K = int(sys.argv[1])
    x = float(sys.argv[2])
    ht = khmer.new_hashbits(K, x, 4)
    sparse_graph = gt.Graph()
    hashes = sparse_graph.new_vertex_property("long long")
    for n, record in enumerate(screed.open(input_fasta)):
        if n % 1000 == 0:
            print >>sys.stderr, '...loaded and tagged {} sequences'.format(n)
        name = record.name
        sequence = record.sequence
        ht.consume_sequence_and_tag_with_labels(sequence, n)
        tags = ht.sweep_tag_neighborhood(sequence, 0)
        for i in xrange(len(tags) - 1):
            src = tags[i]
            dst = tags[i + 1]
            new = False
            srcv = gt.find_vertex(sparse_graph, hashes, src)
            if not srcv:
                srcv = sparse_graph.add_vertex()
                hashes[srcv] = src
                new = True
            else:
                srcv = srcv[0]
            dstv = gt.find_vertex(sparse_graph, hashes, dst)
            if not dstv:
                dstv = sparse_graph.add_vertex()
                hashes[dstv] = dst
                new = True
            else:
                dstv = dstv[0]
            if new:
                e = sparse_graph.add_edge(srcv, dstv)

    print 'Sparse graph has {} nodes, {} edges'.format(
        sparse_graph.num_vertices(), sparse_graph.num_edges())
    comp = gt.label_largest_component(sparse_graph, directed=False)
    # pos = gt.radial_tree_layout(sparse_graph, sparse_graph.vertex(0))
    gt.graph_draw(sparse_graph, output_size=(5000, 5000),
                  output=input_fasta + '_sparse.png')
    sparse_graph.set_vertex_filter(comp)
    gt.graph_draw(sparse_graph, output_size=(5000, 5000),
                  output=input_fasta + '_sparse_comp.png')

def top():
    g = gt.load_graph(filename)
    print('Graph loaded, now calculating top nodes')
    vblocks = g.vp['blocks']
    # `nblocks` and `n` are module-level settings: the number of blocks and
    # how many top ids to list per block.
    blocks = sorted(range(nblocks),
                    key=lambda b: len(gt.find_vertex(g, vblocks, b)),
                    reverse=True)
    for block in blocks:
        print('Block %d with %d nodes'
              % (block, len(gt.find_vertex(g, vblocks, block))))
        tups = top_ids(g, block, n)
        ids = [t[0] for t in tups]
        names = get_names(ids)
        for tup, name in zip(tups, names):
            print(name, tup[0], tup[1])

def continue_graph(g, pos, weight, clase, img, actual, padre, enodes, bnodes):
    # `padre` is the parent's vertex index in the graph_tool graph;
    # `actual` is the node's position in the image.
    global c
    hijos = vecinos(img, actual)
    for i in hijos:
        img, nodo, largo_arista = get_next_node(img, i, actual, hijos, 0)
        s = gt.find_vertex(g, pos, nodo)
        if s == []:
            s = g.add_vertex()
            pos[s] = nodo
        else:
            s = s[0]
        arista = g.add_edge(padre, s)
        weight[arista] = largo_arista
        clase[arista] = [c, 0]
        c = c + 1
        # Only recurse into pixels that have not been visited yet (value 1);
        # relabelling with `c` marks them as visited.
        if img[nodo[1], nodo[0]] == 1:
            img[nodo[1], nodo[0]] = c
            if nodo not in enodes.tolist():
                g, pos, weight = continue_graph(g, pos, weight, clase, img,
                                                nodo, s, enodes, bnodes)
    return g, pos, weight

def _map_vertexs(self, graph, seeds):
    components = [[gt.find_vertex(self._network,
                                  self._network.vp['hash'],
                                  graph.vp['hash'][v])[0]
                   for v in seed]
                  for seed in seeds]
    return components

def community():
    g = gt.load_graph(filename)
    print('Graph loaded, now finding community')
    # state = gt.BlockState(g, B=blocks)
    # for i in range(iterations):
    #     if i < iterations / 2:
    #         gt.mcmc_sweep(state)
    #     else:
    #         gt.mcmc_sweep(state, beta=float('inf'))
    # g.vp['blocks'] = state.get_blocks()
    spins = {}
    if 'blocks' in g.vp:
        # Reuse the previous partition as the starting point.
        spins = {'spins': g.vp['blocks']}
    g.vp['blocks'] = gt.community_structure(g, n_iter=iterations,
                                            n_spins=blocks, **spins)
    if 'pos' in g.vp:
        gt.sfdp_layout(g, groups=g.vp['blocks'], pos=g.vp['pos'])
    for i in range(blocks):
        print('%d nodes in block %d'
              % (len(gt.find_vertex(g, g.vp['blocks'], i)), i))
    g.save(filename)

def generate_graph(arcos_fuertes, arcos_debiles):
    '''Strong arcs are assigned a weight 100 times greater than weak arcs.'''
    g = gt.Graph(directed=True)
    # etiqueta_nodo = g.new_vertex_property('string')
    g.vp['etiqueta_nodo'] = g.new_vertex_property('string')
    # zip(*...) is unpacked first because zip() results are not subscriptable
    # in Python 3.
    fuertes_src, fuertes_dst = zip(*arcos_fuertes)
    debiles_src, debiles_dst = zip(*arcos_debiles)
    num_vertices = len(set(fuertes_src) | set(fuertes_dst)
                       | set(debiles_src) | set(debiles_dst))
    for i in range(num_vertices):
        u = g.add_vertex()
        g.vp['etiqueta_nodo'][u] = str(i)
    # etiqueta_arco = g.new_edge_property('string')
    g.ep["etiqueta_arco"] = g.new_edge_property('string')
    # peso_arco = g.new_edge_property('float')
    g.ep["peso_arco"] = g.new_edge_property('float')
    for j in arcos_fuertes:
        u = gt.find_vertex(g, g.vp['etiqueta_nodo'], str(j[0]))
        v = gt.find_vertex(g, g.vp['etiqueta_nodo'], str(j[1]))
        e = g.add_edge(u[0], v[0])
        a = 100 * np.random.rand()
        if a < 10:
            a = 10 + (10 * np.random.rand())
        g.ep['etiqueta_arco'][e] = str(round(a, 1))
        g.ep['peso_arco'][e] = round(a, 1)
    for j in arcos_debiles:
        u = gt.find_vertex(g, g.vp['etiqueta_nodo'], str(j[0]))
        v = gt.find_vertex(g, g.vp['etiqueta_nodo'], str(j[1]))
        e = g.add_edge(u[0], v[0])
        a = np.random.rand()
        g.ep['etiqueta_arco'][e] = str(round(a, 1))
        g.ep['peso_arco'][e] = round(a, 1)
    # gt.graph_draw(g, vertex_text=etiqueta_nodo, edge_text=etiqueta_arco,
    #               vertex_size=8)
    return g

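# Hypothetical call for the function above, not from the original source.
# The arcs are (source, target) pairs, and the node ids must cover
# 0..num_vertices-1 because vertices are labelled by their creation order.
arcos_fuertes = [(0, 1), (1, 2)]
arcos_debiles = [(2, 3), (3, 0)]
g = generate_graph(arcos_fuertes, arcos_debiles)
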
def top_ids(g, block, n=10):
    '''List the most central node ids in a given block.'''
    vid = g.vp['id']
    vrank = g.vp['rank']
    vertices = gt.find_vertex(g, g.vp['blocks'], block)
    sorted_vertices = sorted(vertices, key=lambda v: vrank[v], reverse=True)
    # map() is lazy in Python 3, replacing itertools.imap.
    mapped_vertices = map(lambda v: (vid[v], vrank[v]), sorted_vertices)
    return take(n, mapped_vertices)

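# `take` is assumed by top_ids() but not defined in the snippet; a minimal
# version matching the classic itertools recipe (the original project may
# import it from a utility module instead):
from itertools import islice

def take(n, iterable):
    """Return the first n items of the iterable as a list."""
    return list(islice(iterable, n))
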
def find_k_shortest_paths(self, k, verbose=True):
    """
    Finds k shortest paths to current target using Yen's Algorithm.

    Args:
        k (int): desired number of shortest pathways (ranked by cost)
        verbose (bool): whether to print all identified pathways to the console.

    Returns:
        [RxnPathway]: list of RxnPathway objects containing reactions
            traversed on each path.
    """
    g = self._g
    paths = []

    precursors_v = gt.find_vertex(g, g.vp["type"], 0)[0]
    target_v = gt.find_vertex(g, g.vp["type"], 3)[0]

    for num, path in enumerate(self._yens_ksp(g, k, precursors_v, target_v)):
        rxns = []
        weights = []

        for step, v in enumerate(path):
            g.vp["path"][v] = True
            if g.vp["type"][v] == 2:
                # add rxn step if current node in path is a product
                e = g.edge(path[step - 1], v)
                g.ep["path"][e] = True  # mark this edge as occurring on a path
                rxns.append(g.ep["rxn"][e])
                weights.append(g.ep["weight"][e])

        rxn_pathway = RxnPathway(rxns, weights)
        paths.append(rxn_pathway)

    if verbose:
        for path in paths:
            print(path, "\n")

    return paths

def set_target(self, target):
    """
    Replaces network's current target phase with new target phase.

    Args:
        target (ComputedEntry): ComputedEntry-like object for new target phase.

    Returns:
        None
    """
    g = self._g

    if target in self._current_target:
        return
    else:
        self._current_target = {target}

    g.remove_vertex(gt.find_vertex(g, g.vp["type"], 3))
    new_target_entry = RxnEntries(self._current_target, "t")
    new_target_v = g.add_vertex()
    self._update_vertex_properties(
        g,
        new_target_v,
        {
            "entries": new_target_entry,
            "type": 3,
            "bool": True,
            "path": True,
            "chemsys": new_target_entry.chemsys,
        },
    )

    new_edges = []
    for v in gt.find_vertex(g, g.vp["type"], 2):  # search for all products
        if self._current_target.issubset(g.vp["entries"][v].entries):
            # link all products to new target
            new_edges.append([v, new_target_v, 0, None, True, False])

    g.add_edge_list(
        new_edges,
        eprops=[g.ep["weight"], g.ep["rxn"], g.ep["bool"], g.ep["path"]])

def get_vertex_and_create_if_not_exist(group_name: str, username: str):
    v = gt.find_vertex(graph, graph.vp.username, username)
    if not v:
        v = graph.add_vertex()
        graph.vp.group[v] = GROUPS.index(group_name)
        graph.vp.group_name[v] = group_name
        graph.vp.username[v] = username
        return v
    else:
        return v[0]

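# A minimal, hypothetical setup for the helper above: it assumes a
# module-level `graph` with username/group/group_name vertex properties and a
# GROUPS list, none of which are shown in the snippet itself.
import graph_tool.all as gt

GROUPS = ["admins", "users"]
graph = gt.Graph(directed=False)
graph.vp["username"] = graph.new_vertex_property("string")
graph.vp["group"] = graph.new_vertex_property("int")
graph.vp["group_name"] = graph.new_vertex_property("string")

v1 = get_vertex_and_create_if_not_exist("users", "alice")
v2 = get_vertex_and_create_if_not_exist("users", "alice")  # same vertex again
assert v1 == v2
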
def v_id(name, g, v_name):
    """Return integer index of vertex with given `name`

    Args:
        name - name of vertex (str)
        g - graph tool graph
        v_name - vertex property map giving names for each vertex, as
            returned when calling g.add_edge_list with hashed=True
    """
    # obtain vertex object
    v = gt.find_vertex(g, v_name, name)[0]
    # get int index
    return g.vertex_index[v]

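# Hypothetical usage of v_id(): add_edge_list(..., hashed=True) returns the
# vertex property map that v_id() expects as `v_name`. The assert assumes
# vertices are created in order of first appearance ("a" -> 0, "b" -> 1).
import graph_tool.all as gt

g = gt.Graph(directed=False)
v_name = g.add_edge_list([("a", "b"), ("b", "c")], hashed=True)
assert v_id("b", g, v_name) == 1
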
def make_filter(self, concepts: List[NODE_T], fltr):
    fltr.a = False
    for n in concepts:
        v = gtall.find_vertex(
            self.graph,
            prop=self.graph.properties[('v', '_graphml_vertex_id')],
            match=str(n))[0]
        # v = qtqu.get_nodes_by_node_prop(self.graph, '_graphml_vertex_id', n)[0]
        fltr[v] = True
        for vn in v.out_neighbors():
            fltr[vn] = True
    return fltr

def citation_success(self, yr, yd, perc):
    # create property maps
    citation_success = self.citation.new_vertex_property("double")
    citation_success_perc = self.citation.new_vertex_property("bool")
    perc_cuts = []
    for y in yr:
        print(y, '...')
        # find vertices
        y1_vertices = gt.find_vertex(
            self.citation, self.citation.vertex_properties['year'], y)
        y1yd_vertices = gt.find_vertex_range(
            self.citation, self.citation.vertex_properties['year'],
            [y, y + yd])
        # set vertex filter property
        print('Set filter prop...')
        y1yd_filter_prop = self.citation.new_vertex_property("bool")
        y1_filter_prop = self.citation.new_vertex_property("bool")
        y1yd_filter_prop.a = False
        y1_filter_prop.a = False
        for v in y1yd_vertices:
            y1yd_filter_prop[v] = True
        for v in y1_vertices:
            y1_filter_prop[v] = True
        # calculate graph_view of the subgraph of y, y+yd
        print('Calc graph view ...')
        sub_cite_degree = self.citation.new_vertex_property("double")
        self.citation.set_vertex_filter(y1yd_filter_prop)
        sub_cite_degree.fa = self.citation.degree_property_map('out').fa
        # there are a lot of zeros, so the percentile percentage has to be
        # quite high
        self.citation.set_vertex_filter(None)
        self.citation.set_vertex_filter(y1_filter_prop)
        tmp = sub_cite_degree.fa
        percentile_cut = numpy.percentile(tmp, perc)
        perc_cuts.append(percentile_cut)
        print('Percentile cut is ', percentile_cut)
        self.citation.set_vertex_filter(None)
        print('Write success ...')
        # write number of citations and success bool after yd years
        self.citation.set_vertex_filter(y1_filter_prop)
        citation_success.fa = tmp.copy()
        print('There are ', numpy.count_nonzero(tmp > percentile_cut),
              ' nodes exceeding the percentile cut.')
        citation_success_perc.fa = (tmp > percentile_cut).copy()
        self.citation.set_vertex_filter(None)
    return citation_success, citation_success_perc, perc_cuts

def build_result_graph(self, reactionlist, product, filename):
    g = gt.Graph()
    cofactors = [s.id for s in self.cofactor_list]
    generic_compound = [s.id for s in self.generic_compound_list]
    edges = []
    for reaction in self.model.reactions:
        # Python 3: dict.iteritems() replaced by dict.items()
        for c, val in reaction.reactants.items():
            if c not in cofactors and c not in generic_compound:
                if val < 0:
                    edges.append((c, reaction.id))
                else:
                    edges.append((reaction.id, c))
    ids = g.add_edge_list(edges, hashed=True, string_vals=True)
    g.vertex_properties["ids"] = ids
    g.vertex_properties["names"] = g.new_vertex_property("string")
    g.vertex_properties["color"] = g.new_vertex_property("string")
    g.vertex_properties["size"] = g.new_vertex_property("int")
    g.edge_properties["arrows"] = g.new_edge_property("string")
    for v in g.vertices():
        v_id = g.vp.ids[v]
        g.vp.names[v] = v_id
        g.vp.color[v] = "red" if v_id in reactionlist else "white"
        g.vp.size[v] = 10 if v_id in reactionlist else 3
    for e in g.edges():
        g.ep.arrows[e] = "none"
    root = gt.find_vertex(g, g.vertex_properties["names"], product)[0]
    pos = gt.radial_tree_layout(g, g.vertex_index[root])
    gt.graph_draw(g, pos,
                  vertex_size=g.vp.size,
                  vertex_text=g.vp.names,
                  vertex_fill_color=g.vp.color,
                  edge_pen_width=0.5,
                  edge_end_marker=g.ep.arrows,
                  output=filename,
                  fit_view=True,
                  output_size=(10000, 10000))

def find_system_by_name(name):
    systems = gt.find_vertex(g, g.vertex_properties['system_name'], name)
    if len(systems) < 1:
        bottle.abort(404, "No such system.")
    return vtx_to_json(systems[0])

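# Hypothetical route wiring for the handler above; it assumes the snippet's
# module-level graph `g` and a vtx_to_json() serializer already exist.
import bottle

@bottle.route('/systems/<name>')
def systems_endpoint(name):
    return find_system_by_name(name)
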
def procuraVerticeNoGrafo(g, nome):
    # Searches the graph for a vertex by name; returns the (empty) match list
    # when nothing is found, otherwise the first matching vertex.
    r = gt.find_vertex(g, g.vp.vertex_name, nome)
    if not r:
        return r
    else:
        return r[0]

gprop_vcolour[v] = "blue" else: gprop_vcolour[v] = "white" g.vertex_properties["vcolour"] = gprop_vcolour # create numLinks edge property for g edges eprop_numLinks = g.new_edge_property("int") # creates the edges between nodes for i in linkDict: for n in linkDict[i]: #print(i) vertex_i = gt.find_vertex(g, gprop_label, i)[0] #print(n) try: vertex_n = gt.find_vertex(g, gprop_label, n)[0] e = g.add_edge(vertex_i,vertex_n) eprop_numLinks[e] = linkDict[i][n] except: IndexError ###### EXPERIMENTAL SIZE THINGS ###### # gvprop_size = g.new_vertex_property('float') deleteList = [] for v in g.vertices():
def run(input_file: KGTKFiles, path_file, output_stats, directed, max_hops):
    def infer_index(h, options=[]):
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=[]):
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from kgtk.exceptions import KGTKException
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        from graph_tool.all import find_vertex
        from graph_tool.topology import all_paths
        import sys
        import csv
        from collections import defaultdict
        csv.field_size_limit(sys.maxsize)

        id_col = 'name'

        pairs = []
        with open(path_file, 'r') as f:
            header = next(f)
            for line in f:
                src, tgt = line.strip().split('\t')
                pairs.append((src, tgt))

        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        with open(filename, 'r') as f:
            header = next(f).strip().split('\t')

        subj_index = infer_index(header, options=['node1', 'subject'])
        obj_index = infer_index(header, options=['node2', 'object', 'value'])
        predicate = infer_predicate(
            header, options=['property', 'predicate', 'label'])

        p = []
        for i, header_col in enumerate(header):
            if i in [subj_index, obj_index]:
                continue
            p.append(header_col)

        if 'id' not in p:
            raise KGTKException('Error: no id column found')

        G = load_graph_from_csv(str(filename),
                                skip_first=True,
                                directed=directed,
                                hashed=True,
                                ecols=[subj_index, obj_index],
                                eprop_names=p,
                                csv_options={'delimiter': '\t'})

        sys.stdout.write('node1\tlabel\tnode2\tid\n')
        id_count = 0
        if not output_stats:
            for e in G.edges():
                sid, oid = e
                lbl = G.ep[predicate][e]
                sys.stdout.write(
                    '%s\t%s\t%s\t%s\n'
                    % (G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                       '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)))
                id_count += 1

        id_count = 0
        path_id = 0
        paths = defaultdict(set)
        for pair in pairs:
            source_node, target_node = pair
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            target_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=target_node)
            if len(source_ids) == 1 and len(target_ids) == 1:
                source_id = source_ids[0]
                target_id = target_ids[0]
                for path in all_paths(G, source_id, target_id,
                                      cutoff=max_hops, edges=True):
                    for edge_num, an_edge in enumerate(path):
                        edge_id = G.properties[('e', 'id')][an_edge]
                        node1 = 'p%d' % path_id
                        sys.stdout.write(
                            '%s\t%d\t%s\t%s\n'
                            % (node1, edge_num, edge_id,
                               '{}-{}-{}'.format(node1, edge_num, id_count)))
                        id_count += 1
                    path_id += 1
    except Exception as e:
        raise KGTKException('Error: ' + str(e))

def matchGraphs(graph1, graph2):
    g1, pos1, weight1, clase1, nodetype1, age1 = graph1
    g2, pos2, weight2, clase2, nodetype2, age2 = graph2

    vertices1 = g1.get_vertices()
    vertices2 = g2.get_vertices()

    age = np.zeros_like(vertices1)
    for i in range(0, len(vertices1)):
        age[i] = age1[vertices1[i]]
    vertices1 = np.argsort(age)[::-1]

    pos_vertex2 = []
    for i in vertices2:
        pos_vertex2.append(pos2[i])
    pos_vertex2 = np.array(pos_vertex2)

    for i in vertices1:
        p1 = pos1[i]
        v, d = find_nearest_nodes(p1, pos_vertex2, 15)
        v = v[np.argsort(d)]
        if len(v) == 1:
            age2[v[0]] = age1[i] + 1
            if nodetype2[v[0]] == "null":
                nodetype2[v[0]] = nodetype1[i]
        elif len(v) == 2:
            n_0 = len(g1.get_out_neighbours(i))
            n_1 = len(g2.get_out_neighbours(v[0]))
            n_2 = len(g2.get_out_neighbours(v[1]))
            dif_a = np.abs(n_1 - n_0)
            dif_b = np.abs(n_2 - n_0)
            if dif_a <= dif_b:
                age2[v[0]] = age1[i] + 1
                if nodetype2[v[0]] == "null":
                    nodetype2[v[0]] = nodetype1[i]
            else:
                age2[v[1]] = age1[i] + 1
                if nodetype2[v[1]] == "null":
                    nodetype2[v[1]] = nodetype1[i]

    available = np.ones(pos_vertex2.shape[0])

    # If seed is not found => debug
    seed = gt.find_vertex(g2, nodetype2, "Ini")
    if len(seed) == 1:
        seed = seed[0]
    else:
        seed_prev = gt.find_vertex(g1, nodetype1, "Ini")
        p1 = pos1[seed_prev[0]]
        v, d = find_nearest_b(p1, pos_vertex2)
        if d < 20:
            age2[v] = age1[seed_prev[0]] + 1
            nodetype2[v] = "Ini"
            seed = g2.vertex(v)
            available[v] = 0
        else:
            print('No SEED')
            raise Exception("BAD TRACKING")

    pos_vertex2[:, 0] = pos_vertex2[:, 0] * available
    pos_vertex2[:, 1] = pos_vertex2[:, 1] * available

    # If main root tip is not found => debug
    end = gt.find_vertex(g2, nodetype2, "FTip")
    if len(end) == 1:
        end = end[0]
    else:
        end_prev = gt.find_vertex(g1, nodetype1, "FTip")
        p1 = pos1[end_prev[0]]
        v, d = find_nearest_b(p1, pos_vertex2)
        if d < 20:
            age2[v] = age1[end_prev[0]] + 1
            nodetype2[v] = "FTip"
            end = g2.vertex(v)
        else:
            v = np.argmax(pos_vertex2[:, 1])
            nodetype2[v] = "FTip"
            end = g2.vertex(v)
            # print('No TIP')
            # raise Exception("BAD TRACKING")

    vertices2 = g2.get_vertices()
    for i in vertices2:
        if age2[i] == 0:
            age2[i] = 1  # fixed: was a no-op comparison (==) instead of assignment
        if nodetype2[i] == "null":
            vecinos = g2.get_out_neighbours(i)
            if len(vecinos) > 1:
                nodetype2[i] = "Bif"
            elif len(vecinos) == 1:
                nodetype2[i] = "LTip"

    # edge tracking
    camino, _ = gt.shortest_path(g2, seed, end, weights=weight2)
    l = len(camino)
    for k in range(0, l - 1):
        arista = g2.edge(camino[k], camino[k + 1])
        clase2[arista][1] = 10

    return [g2, pos2, weight2, clase2, nodetype2, age2]

def main():
    # params
    IEXCLOUD_TOKEN = os.getenv("IEXCLOUD_TOKEN")

    # Files that must be provided beforehand
    # Files generated automatically at runtime
    output_folder = './outputs'
    downlaods_folder = './downloads'
    entities_json = f"{output_folder}/wd_entities.json"
    tk_csv = './downloads/bats_symbols_traded_byx.csv'
    tk_info_json = "./downloads/iex_ticker_info.json"
    urls_json = f"{output_folder}/wiki_urls.json"
    mentions_json = f"{output_folder}/wiki_mentions.json"
    sent_cooccurs_json = f"{output_folder}/corpus_mentions_sent_cooccurs.json"
    atk_cooccurs_json = f"{output_folder}/corpus_mentions_atk_cooccurs.json"
    atk_bags_json = f"{output_folder}/corpus_mentions_atk_bags.json"
    freqs_json = f"{output_folder}/corpus_mentions_freqs.json"

    # Wiki processor requires:
    explore_n_wk_depth: int = 2  # number of wiki levels to explore
    adpot_n_wk_depth: int = 1  # wk-titles within n levels are actually adopted
                               # (the rest are only used for graph computation)
    wkd_dump_json = "./downloads/latest-all.json.bz2"
    seeded_wk_titles = []
    sp500_csv = f"{downlaods_folder}/s_and_p_500.csv"

    # Wiki processor outputs:
    wk_titles_graphml = f"{output_folder}/wk_titles.graphml.bz2"
    wk_pagerank_json = f"{output_folder}/wk_pagerank.json"
    wk_cat_tags_json = f"{output_folder}/wk_cat_tags.json"
    # wk_tags_json = f"{output_folder}/wk_tags.json"
    wk_tags_pagerank_csv = f"{output_folder}/wk_tags_pagerank.csv"
    wkd_filtered_entities_json = f"{output_folder}/wkd_filtered_entities.json"
    wk_ranked_titles_json = f"{output_folder}/wk_ranked_titles.json"
    wkd_entites_by_ranked_titles_json = f"{output_folder}/wkd_entites_by_ranked_titles.json"

    pathlib.Path(output_folder).mkdir(exist_ok=True)

    # print(get_matched_wkd_entities(titles, wkd_dump_path=wkd_dump_json))
    # entities = load_or_run(wkd_entites_by_ranked_titles_json,
    #                        lambda: get_matched_wkd_entities(titles, wkd_dump_path=wkd_dump_json),
    #                        forcerun=True)

    # print("Fetch entities that have a symbol property from wikidata")
    # results = load_or_run(
    #     entities_json, lambda: query_wikidata_by_property())
    # comp_wdids = [e['item']['value'].split('/')[-1]
    #               for e in results['results']['bindings']]

    # print("Find the wikipage matching each wikidata entity")
    # comp_titles = load_or_run(
    #     comp_titles_json, lambda: query_wikipage_title(comp_wdids))
    # return

    # print("Read tickers")
    # df = pd.read_csv(tk_csv)
    # tickers = list(df['Symbols'])
    # # tickers = ['ADBE', 'BA', 'RXT', 'TTOO']
    # print(f"Number of tickers loaded: {len(tickers)}")

    # print("Fetch ticker info from iexcloud")
    # infos = load_or_run(
    #     tk_info_json, lambda: download_ticker_info_from_iexcloud(tickers, IEXCLOUD_TOKEN))
    # names = [v['companyName'] for k, v in infos.items()]

    # print("Search the wikipage matching each company name from the ticker info")
    # urls = load_or_run(
    #     urls_json, lambda: search_wikipage(names))

    # Scan the wikipedia dump, collecting mentions starting from each company's
    # wiki page. Newly found mentions become next_entities; repeat n times
    # (= crawl n levels).
    # print(f"Get the mentions related to each company - {depth} levels")
    # titles = [v.split('/')[-1].replace("_", " ")
    #           for _, v in urls.items() if v is not None]

    print("# Connect to elasticsearch (stores wiki pages and the news corpus)")
    es.connect(["es:9200"])

    print(f"# Starting from the S&P 500 wikipage, crawl titles "
          f"'{explore_n_wk_depth}' levels deep and build the graph")
    # seedtitles = ["List of S&P 500 companies"]
    seedtitles = ["Wilson (company)"]
    try:
        # raise FileNotFoundError
        g = gt.load_graph(wk_titles_graphml)
        print(f"File loaded: {wk_titles_graphml}")
    except FileNotFoundError:
        print("File not found, create new one")
        g = get_wktitles_graph(seedtitles, n_depth=explore_n_wk_depth)
        for n in g:
            g.nodes[n]['mentions'] = json.dumps(
                g.nodes[n]['mentions'], ensure_ascii=False,
                default=serialize_sets)
        nx.write_graphml_lxml(g, wk_titles_graphml)
        g = gt.load_graph(wk_titles_graphml)

    print("# Run pagerank on the full graph "
          "(switched to graph-tool to avoid running out of memory)")
    ranks = load_or_run(wk_pagerank_json,
                        lambda: calc_pagerank(g), forcerun=True)

    print("# Pick out the wiki categories in the graph, then find the "
          "wiki-title that mainly describes each category")

    def _cat_tags() -> Iterable[str]:
        _, wk_title, _ = zip(*ranks)
        cats = filter(lambda e: "Category:" in e, wk_title)
        # print(list(cats))
        # print([c for c in cats])
        tags = [es.get_corresponded_wktitles(cat_title=c) for c in cats]
        tags = set(itertools.chain(*tags))
        # tags &= set(tags)
        return tags

    cat_tags = load_or_run(wk_cat_tags_json, lambda: _cat_tags(),
                           forcerun=True)

    print("# Look up the wkd-entity for each wk-title")
    # tags = ["Technology", "Internet", "Metal"]
    cattag_entity = get_matched_wkd_entities(cat_tags)
    ranks_by_tags = []
    for _, wk_title, pagerank in ranks:
        try:
            e = cattag_entity[wk_title]
            ranks_by_tags.append((e.entity_id, e.get_enwiki_title(),
                                  e.get_label("zh"), pagerank))
        except KeyError:
            pass

    print("# Save the ranks as CSV")
    wkd_id, wk_title, zh_label, pagerank = zip(*ranks_by_tags)
    tags = wk_title
    df = pd.DataFrame({
        'wkd_id': wkd_id,
        'wk_title': wk_title,
        'zh_label': zh_label,
        'pagerank': pagerank
    })
    df.to_csv(wk_tags_pagerank_csv, index=False)
    return

    print("# Find the tags for one ticker")

    def get_neighbors(v: gt.Vertex, n_expands: int = 2):
        seeds = set([v])
        traveled = set()
        for i in range(n_expands):
            nextseeds = set()
            for v in seeds:
                nextseeds |= set(v.out_neighbors())
            nextseeds -= seeds
            traveled |= seeds
            seeds = nextseeds
        return traveled

    # tags = set(["joint venture"])
    tickers = ["Wilson (company)"]
    tags_by_tickers = []
    for tk in tickers:
        v = gt.find_vertex(g, g.vp['_graphml_vertex_id'], tk)[0]
        neighbors = get_neighbors(v, n_expands=2)
        neighbors = set([g.vp['_graphml_vertex_id'][v] for v in neighbors])
        tags_by_tickers.append((tk, tags & neighbors))
    print(tags_by_tickers)
    return

    print("Tag ordering, importance, redundancy "
          "(via max_flow, n_path, and the like)")
    # for tk in tickers:
    #     neighbors = get_neighbors(tk)

    print("TODO: walk all news and compute mention term frequencies")
    # print("Walk all news and compute mention term frequencies")
    # TODO: expand synonyms (for flashtext)

    # print("Load the S&P 500 as seed wk-titles")
    # df = pd.read_csv(sp500_csv)
    # seedtitles = list(df['Name'])

    # print(f"Starting from the seed wk-titles, crawl wk-titles "
    #       f"'{explore_n_wk_depth}' levels deep and build the graph")
    # try:
    #     # raise FileNotFoundError
    #     g = gt.load_graph(wk_titles_graphml)
    #     print(f"File loaded: {wk_titles_graphml}")
    # except FileNotFoundError:
    #     print("File not found, create new one")
    #     g = get_wktitles_graph(seedtitles, n_depth=explore_n_wk_depth)
    #     for n in g:
    #         g.nodes[n]['mentions'] = json.dumps(
    #             g.nodes[n]['mentions'], ensure_ascii=False, default=serialize_sets)
    #     nx.write_graphml_lxml(g, wk_titles_graphml)
    #     g = gt.load_graph(wk_titles_graphml)

    # print(f"Adopt only wk-titles at depth <= {adpot_n_wk_depth}")
    # vp_label = g.vp['_graphml_vertex_id']
    # vp_depth = g.vp['depth']
    # wktitles = [vp_label[v]
    #             for v in g.vertices() if vp_depth[v] <= adpot_n_wk_depth]

    # print("Scan the wkd dump and drop wk-titles that lack a Chinese label, "
    #       "have a location claim (very likely places), or are people")
    # try:
    #     raise FileNotFoundError
    #     entities = WikidataJsonDump(wkd_filtered_entities_json)
    #     filtered_wktitles = set([e.get_enwiki_title() for e in entities])
    #     print(f"File loaded: {wkd_filtered_entities_json}")
    # except FileNotFoundError:
    #     print("File not found, create new one")
    #     entities = get_matched_wkd_entities(
    #         wktitles, wkd_dump_path=wkd_dump_json)
    #     dump_entities_to_json(entities, wkd_filtered_entities_json)
    #     filtered_wktitles = set([e.get_enwiki_title() for e in entities])

    # print("Run pagerank on the full graph "
    #       "(switched to graph-tool to avoid running out of memory)")
    # load_or_run(wk_filtered_pagerank_json,
    #             lambda: calc_pagerank(g, included_wktitles=filtered_wktitles),
    #             forcerun=True)
    return

        '~/project/KB_dump/conceptnet/conceptnet-en.csv')
g = load_graph_from_csv(conceptnet_path,
                        directed=False,
                        eprop_types=['string', 'string'],
                        string_vals=True)

prefix = '/c/en/'
entities = [
    ['capoeira', 'hand', 'cartwheel', 'shirt', 'handstand'],
    ['sunscreen', 'skateboarding', 'soccer', 'tan', 'rubbing'],
    ['cream', 'mascara', 'writing', 'lifting', 'dictaphone'],
]
blackListVertex = set([
    find_vertex(g, prop=g.properties[('v', 'name')], match=prefix + b)[0]
    for b in ['object', 'thing']
])
blackListEdge = set(['/r/DerivedFrom', '/r/RelatedTo'])

print('#' * 20)
for elist in entities:
    print(elist)
    qid = find_vertex(g, prop=g.properties[('v', 'name')],
                      match=prefix + elist[0])[0]
    for a in elist[1:]:
        aid = find_vertex(g, prop=g.properties[('v', 'name')],
                          match=prefix + a)[0]
        for vp, ep in zip(
def run(input_file: KGTKFiles, directed, max_hops, source_nodes, target_nodes):
    def infer_index(h, options=[]):
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=[]):
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from kgtk.exceptions import KGTKException
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        from graph_tool.all import find_vertex
        from graph_tool.topology import all_paths
        import sys
        from collections import defaultdict

        id_col = 'name'
        graph_edge = 'graph'

        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        filename = str(filename)
        with open(filename, 'r') as f:
            header = next(f).split('\t')

        subj_index = infer_index(header, options=['node1', 'subject'])
        obj_index = infer_index(header, options=['node2', 'object', 'value'])
        predicate = infer_predicate(
            header, options=['property', 'predicate', 'label'])

        p = []
        for i, header_col in enumerate(header):
            if i in [subj_index, obj_index]:
                continue
            p.append(header_col)

        if 'id' not in p:
            raise KGTKException('Error: no id column found')

        G = load_graph_from_csv(filename,
                                skip_first=True,
                                directed=directed,
                                hashed=True,
                                ecols=[subj_index, obj_index],
                                eprop_names=p,
                                csv_options={'delimiter': '\t'})

        graph_id = 1
        paths = defaultdict(set)
        for source_node in source_nodes:
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            if len(source_ids) == 1:
                source_id = source_ids[0]
                for target_node in target_nodes:
                    target_ids = find_vertex(G,
                                             prop=G.properties[('v', id_col)],
                                             match=target_node)
                    if len(target_ids) == 1:
                        target_id = target_ids[0]
                        for path in all_paths(G, source_id, target_id,
                                              cutoff=max_hops, edges=True):
                            for an_edge in path:
                                edge_id = G.properties[('e', 'id')][an_edge]
                                paths[edge_id].add(str(graph_id))
                            graph_id += 1

        sys.stdout.write('node1\tlabel\tnode2\tid\t%s\n' % graph_edge)
        for e in G.edges():
            sid, oid = e
            edge_id = G.properties[('e', 'id')][e]
            lbl = G.ep[predicate][e]
            graph_id = '|'.join(list(paths[edge_id]))
            sys.stdout.write(
                '%s\t%s\t%s\t%s\t%s\n'
                % (G.vp[id_col][sid], lbl, G.vp[id_col][oid], edge_id,
                   graph_id))
    except Exception as e:
        raise KGTKException('Error: ' + str(e))

def neighbor_graphs(self):
    for o in self.organisms:
        builder = self.builder[o]
        cofactors = set([c.id for c in builder.cofactor_list])
        g = self.builder[o].arcgraph
        model_reactions = {r.id: r for r in self.builder[o].model.reactions}
        has_start_metabolite_in_graph = list()
        for nf in self.not_found[o]:
            g.set_reversed(True)
            root = gt.find_vertex(g, g.vertex_properties["compound_ids"], nf)[0]
            helper.add_properties(g)
            g.vp.size[root] = 100
            old_color = g.vp.color[root]
            g.vp.color[root] = "red"
            g.vp.besucht[root] = True
            visitor = helper.NeighborVisitor(g)
            gt.bfs_search(g, root, visitor)
            g.set_reversed(False)
            g.set_vertex_filter(g.vp.besucht)
            g.vp.size.a = 90 * 0.9 ** g.vp.dist.a + 10
            if any([v for v in g.vertices() if g.vp.color[v] == "yellow"]):
                has_start_metabolite_in_graph.append(nf)
            # reset graph
            g.clear_filters()
            helper.remove_properties(g)
            g.vp.color[root] = old_color

        feasible_reactions = self.builder[o].feasible_reaction_list
        self.really_not_found[o] = has_start_metabolite_in_graph

        with open(join(const.MECAT_BASE, 'Targets ' + o, 'has_start_metabolite_in_graph.txt'), 'w') as f1, \
                open(join(const.MECAT_BASE, 'Targets ' + o, 'reactions_for_really_not_found.txt'), 'w') as f2:
            rrnf = list()
            substrates_not_found = set(has_start_metabolite_in_graph +
                                       self.not_found[o] +
                                       self.really_not_found[o])
            for c in has_start_metabolite_in_graph:
                f1.write('{}\t{}\n'.format(c, self.compounds[c].names[0]))
                reactions_for_target = self.__reactions_for_target(
                    c, model_reactions)
                fr = set(reactions_for_target) & feasible_reactions
                if len(fr) > 0:
                    rrnf.append(c)
                    f2.write('{}\t{}:\n'.format(c, self.compounds[c].names[0]))
                    for reac_id in fr:
                        t = ''
                        # test if the substrates of the reaction are all
                        # cofactors (Python 3: itertools.ifilter replaced by
                        # the builtin filter)
                        substrates = set(
                            next(filter(lambda x: x.id == reac_id,
                                        builder.model.reactions)).substrates())
                        if substrates <= cofactors:
                            t += ' keine Kante (Cofactors)'  # "no edge (cofactors)"
                        # test if the reaction has arcs that end with the target
                        rpairs = builder.reaction_rpair[reac_id]
                        if not any((x[1] == c) for x in rpairs):
                            t += ' keine Kante zu Target'  # "no edge to target"
                        # test if any substrate is also in list of not found
                        if substrates & substrates_not_found:
                            t += ' Substrat auch nicht gefunden'  # "substrate not found either"
                        fx = re.findall(r'R\d{5}', reac_id)[0]
                        f2.write('\t{}\t{}\t{}\n'.format(
                            t, reac_id, self.reactions[fx].definition))
            self.really_really_not_found[o] = rrnf

    self.__dump_statistics(self.organisms,
                           join(const.MECAT_BASE, 'targets_in_organisms.csv'))

def local_cells(self):
    for vertex in gt.find_vertex(self.eptm.graph,
                                 self.eptm.is_local_vert, 1):
        if self.eptm.is_cell_vert[vertex]:
            yield vertex

def set_precursors(self, precursors=None, complex_loopback=True):
    """
    Replaces network's previous precursor node with provided new precursors.
    Finds new edges that link products back to reactants as dependent on the
    complex_loopback parameter.

    Args:
        precursors ([ComputedEntry]): list of new precursor entries
        complex_loopback (bool): if True, adds zero-weight edges which
            "loop back" to allow for multi-step or autocatalytic-like
            reactions, i.e. original precursors can reappear many times and
            in different steps.

    Returns:
        None
    """
    g = self._g

    self._precursors = set(precursors) if precursors else None
    if not self._precursors:
        precursors_entries = RxnEntries(None, "d")  # use dummy precursors node
        if complex_loopback:
            raise ValueError(
                "Complex loopback can't be enabled when using a dummy "
                "precursors node!")
    else:
        precursors_entries = RxnEntries(precursors, "s")

    g.remove_vertex(gt.find_vertex(g, g.vp["type"], 0))
    new_precursors_v = g.add_vertex()

    self._update_vertex_properties(
        g,
        new_precursors_v,
        {
            "entries": precursors_entries,
            "type": 0,
            "bool": True,
            "path": True,
            "chemsys": precursors_entries.chemsys,
        },
    )

    new_edges = []
    remove_edges = []

    for v in gt.find_vertex(g, g.vp["type"], 1):  # iterate over all reactants
        phases = g.vp["entries"][v].entries
        remove_edges.extend(list(v.in_edges()))
        if precursors_entries.description == "D" or phases.issubset(
                self._precursors):
            new_edges.append([new_precursors_v, v, 0, None, True, False])

    for v in gt.find_vertex(g, g.vp["type"], 2):  # iterate over all products
        phases = g.vp["entries"][v].entries
        if complex_loopback:
            combos = generate_all_combos(phases.union(self._precursors),
                                         self._max_num_phases)
        else:
            combos = generate_all_combos(phases, self._max_num_phases)
        for c in combos:
            combo_phases = set(c)
            if complex_loopback and combo_phases.issubset(self._precursors):
                continue
            combo_entry = RxnEntries(combo_phases, "R")
            loopback_v = gt.find_vertex(g, g.vp["entries"], combo_entry)[0]
            new_edges.append([v, loopback_v, 0, None, True, False])

    for e in remove_edges:
        g.remove_edge(e)

    g.add_edge_list(
        new_edges,
        eprops=[g.ep["weight"], g.ep["rxn"], g.ep["bool"], g.ep["path"]])

def run(
    input_file: KGTKFiles,
    path_file: KGTKFiles,
    output_file: KGTKFiles,
    statistics_only: bool,
    undirected: bool,
    max_hops: int,
    source_column_name: typing.Optional[str],
    target_column_name: typing.Optional[str],
    shortest_path: bool,
    errors_to_stdout: bool,
    errors_to_stderr: bool,
    show_options: bool,
    verbose: bool,
    very_verbose: bool,
    **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    # import modules locally
    from pathlib import Path
    import sys

    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions
    from kgtk.exceptions import KGTKException

    try:
        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file),
                  file=error_file, flush=True)
        pairs = []
        pkr: KgtkReader = KgtkReader.open(
            path_kgtk_file,
            error_file=error_file,
            options=path_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        path_source_idx: int = pkr.get_node1_column_index(source_column_name)
        if path_source_idx < 0:
            print("Missing node1 (source) column name in the path file.",
                  file=error_file, flush=True)

        path_target_idx: int = pkr.get_node2_column_index(target_column_name)
        if path_target_idx < 0:
            print("Missing node2 (target) column name in the path file.",
                  file=error_file, flush=True)

        if path_source_idx < 0 or path_target_idx < 0:
            pkr.close()
            raise KGTKException("Exiting due to missing columns.")

        paths_read: int = 0
        path_row: typing.List[str]
        for path_row in pkr:
            paths_read += 1
            if len(path_row) != pkr.column_count:
                raise KGTKException(
                    "Exiting because line %d in the path file (%s) is the "
                    "wrong length: %d columns expected, %d were read."
                    % (paths_read, str(path_kgtk_file), pkr.column_count,
                       len(path_row)))
            src: str = path_row[path_source_idx]
            tgt: str = path_row[path_target_idx]
            pairs.append((src, tgt))
        pkr.close()
        if verbose:
            print("%d path rows read" % paths_read, file=error_file,
                  flush=True)

        if len(pairs) == 0:
            print("No path pairs found, the output will be empty.",
                  file=error_file, flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs), file=error_file,
                  flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file),
                  file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=input_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sub_index: int = kr.get_node1_column_index()
        if sub_index < 0:
            print("Missing node1 (subject) column.", file=error_file,
                  flush=True)
        pred_index: int = kr.get_label_column_index()
        if pred_index < 0:
            print("Missing label (predicate) column.", file=error_file,
                  flush=True)
        obj_index: int = kr.get_node2_column_index()
        if obj_index < 0:
            print("Missing node2 (object) column", file=error_file, flush=True)
        id_index: int = kr.get_id_column_index()
        if id_index < 0:
            print("Missing id column", file=error_file, flush=True)
        if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        predicate: str = kr.column_names[pred_index]
        id_col_name: str = kr.column_names[id_index]

        G = load_graph_from_kgtk(kr,
                                 directed=not undirected,
                                 ecols=(sub_index, obj_index),
                                 verbose=verbose,
                                 out=error_file)

        output_columns: typing.List[str] = ['node1', 'label', 'node2', 'id']
        kw: KgtkWriter = KgtkWriter.open(output_columns,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        id_count = 0
        if not statistics_only:
            for e in G.edges():
                sid, oid = e
                lbl = G.ep[predicate][e]
                kw.write([
                    G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                    '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)
                ])
                id_count += 1
            if verbose:
                print("%d edges found." % id_count, file=error_file,
                      flush=True)

        id_count = 0
        path_id = 0
        for pair in pairs:
            source_node, target_node = pair
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            target_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=target_node)
            if len(source_ids) == 1 and len(target_ids) == 1:
                source_id = source_ids[0]
                target_id = target_ids[0]
                if shortest_path:
                    _all_paths = all_shortest_paths(G, source_id, target_id,
                                                    edges=True)
                else:
                    _all_paths = all_paths(G, source_id, target_id,
                                           cutoff=max_hops, edges=True)

                for path in _all_paths:
                    for edge_num, an_edge in enumerate(path):
                        edge_id = G.properties[('e', 'id')][an_edge]
                        node1: str = 'p%d' % path_id
                        kw.write([
                            node1,
                            str(edge_num), edge_id,
                            '{}-{}-{}'.format(node1, edge_num, id_count)
                        ])
                        id_count += 1
                    path_id += 1

        if verbose:
            print("%d paths containing %d edges found." % (path_id, id_count),
                  file=error_file, flush=True)

        kw.close()
        kr.close()
    except Exception as e:
        raise KGTKException('Error: ' + str(e))

def Stochastic():
    import pandas as pd
    import numpy as np
    import pprint as pp
    import locale
    import matplotlib.pyplot as plt
    import matplotlib.ticker as tkr
    import graph_tool.all as gt
    import math

    # Need to drag this out into the real world
    from GAC_Graph_Builder import findEdges

    t = gt.Graph(directed=True)
    tprop_label = t.new_vertex_property("string")
    tprop_instType = t.new_vertex_property("string")

    linkDict, instSet = findEdges()

    # ingest our university checking lists [this is sloppy, TBI]
    foreignUniTxt = open('Workaround txts/Foreign Unis.txt', 'r')
    UKUniTxt = open('Workaround txts/UK Unis.txt', 'r')

    forerignUniVals = foreignUniTxt.read().splitlines()
    UKUniVals = UKUniTxt.read().splitlines()

    # add vertices and label them based on their names.

    ######## FILTERING BASED ON CORDIS RESIDENCY ##########
    dfCordisNames = pd.read_pickle('Pickles/CORDIS_Countries.pickle')
    eligiblenames = dfCordisNames.name.values.tolist()
    veryDirtyWorkaround = ['FOCUS', 'FLUOR', 'GE', 'NI', 'OTE', 'ROKE']
    for inst in instSet:
        nameCheck = inst.upper()
        firstFound = next((x for x in eligiblenames if nameCheck in x), None)
        if inst in forerignUniVals:
            del (linkDict[inst])
        elif nameCheck in veryDirtyWorkaround:
            del (linkDict[inst])
        elif firstFound is None:
            del (linkDict[inst])
        else:
            vert = t.add_vertex()
            tprop_label[vert] = str(inst)

    del (linkDict[''])

    # internalise property map
    t.vertex_properties["label"] = tprop_label

    # explicitly declare the hierarchy defining vertices and edges;
    # the sequencing here matters.
    for_uni = t.add_vertex()
    UK_uni = t.add_vertex()
    other = t.add_vertex()
    root = t.add_vertex()

    edgeList = [(root, for_uni), (root, UK_uni), (root, other)]
    t.add_edge_list(edgeList)

    # use label name to add edges to hierarchy
    for i in range(t.num_vertices())[:-4]:
        if tprop_label[i] in forerignUniVals:
            t.add_edge(for_uni, t.vertex(i))
            tprop_instType[i] = "Foreign Uni"
        elif tprop_label[i] in UKUniVals:
            t.add_edge(UK_uni, t.vertex(i))
            tprop_instType[i] = "UK Uni"
        else:
            t.add_edge(other, t.vertex(i))
            tprop_instType[i] = "Other Institution"

    t.vertex_properties["instType"] = tprop_instType

    tpos = gt.radial_tree_layout(t, t.vertex(t.num_vertices() - 1),
                                 rel_order_leaf=True)

    ######### MAIN GRAPH DRAWING ################
    g = gt.Graph(directed=False)
    # creates graph g, using the same nodes (with the same index!)
    for v in t.vertices():
        gv = g.add_vertex()

    # we remove: root, for_uni, uk_uni or 'other' vertices
    lower = g.num_vertices() - 5
    current = g.num_vertices() - 1
    while current > lower:
        g.remove_vertex(current)
        current -= 1

    # Pull vertex properties from t
    labelDict = t.vertex_properties["label"]
    instTypeDict = t.vertex_properties["instType"]

    # create properties for g vertices
    gprop_label = g.new_vertex_property("string")
    gprop_instType = g.new_vertex_property("string")

    # match labels between g and t
    for v in g.vertices():
        gprop_label[v] = labelDict[v]
        gprop_instType[v] = instTypeDict[v]

    # make property map internal to graph g
    g.vertex_properties["label"] = gprop_label
    g.vertex_properties["instType"] = gprop_instType

    ###### COLOUR VERTICES #########
    # Reclaim variable names because lazy
    gprop_vcolour = g.new_vertex_property("string")
    for v in g.vertices():
        if gprop_instType[v] == "Foreign Uni":
            gprop_vcolour[v] = "red"
        elif gprop_instType[v] == "UK Uni":
            gprop_vcolour[v] = "blue"
        else:
            gprop_vcolour[v] = "white"

    g.vertex_properties["vcolour"] = gprop_vcolour

    # create numLinks edge property for g edges
    eprop_numLinks = g.new_edge_property("int")

    # creates the edges between nodes
    for i in linkDict:
        for n in linkDict[i]:
            # print(i)
            vertex_i = gt.find_vertex(g, gprop_label, i)[0]
            # print(n)
            try:
                vertex_n = gt.find_vertex(g, gprop_label, n)[0]
                e = g.add_edge(vertex_i, vertex_n)
                eprop_numLinks[e] = linkDict[i][n]
            except IndexError:  # fixed: was a bare `except:` followed by a no-op expression
                pass

    ##### EXPERIMENTAL SIZE THINGS ######
    # gvprop_size = g.new_vertex_property('float')

    deleteList = []
    for v in g.vertices():
        # sum the num edges and the number of links they correspond to;
        # use this to find a ratio and scale size off of this.
        numEdges = sum(1 for _ in v.all_edges())
        numLinks = 0
        for e in v.all_edges():
            numLinks += eprop_numLinks[e]
        # print(gprop_label[v])
        print("NumEdges = " + str(numEdges) + " NumLinks = " + str(numLinks))
        # create a delete list
        try:
            ratio = (numLinks / numEdges) * 5 * 2
        except ZeroDivisionError:  # fixed: was a bare `except:` followed by a no-op expression
            deleteList.append(v)
        # gvprop_size[v] = ratio

    # g.vertex_properties['size'] = gvprop_size

    #### Delete linkless vertices #######
    for v in reversed(sorted(deleteList)):
        g.remove_vertex(v)
    for v in reversed(sorted(deleteList)):
        t.remove_vertex(v)

    tpos = gt.radial_tree_layout(t, t.vertex(t.num_vertices() - 1),
                                 rel_order_leaf=True)
    #######

    ############ stochastic BLOCK MODEL ####################
    state = gt.minimize_nested_blockmodel_dl(g, deg_corr=True, verbose=True)
    t = gt.get_hierarchy_tree(state)[0]
    tpos = pos = gt.radial_tree_layout(t, t.vertex(t.num_vertices() - 1),
                                       weighted=True)

    # in order to make sure labels fit in the image we have to manually adjust
    # the co-ordinates of each vertex.
    x, y = gt.ungroup_vector_property(tpos, [0, 1])
    x.a = (x.a - x.a.min()) / (x.a.max() - x.a.min()) * 1400 + 400
    y.a = (y.a - y.a.min()) / (y.a.max() - y.a.min()) * 1400 + 400
    tpos = gt.group_vector_property([x, y])

    # This draws the 'Bezier spline control points' for edges: it draws the
    # edges directed in graph g, but uses the hierarchy / positioning of
    # graph t.
    cts = gt.get_hierarchy_control_points(g, t, tpos)
    pos = g.own_property(tpos)

    gt.graph_draw(
        g,
        vertex_text_position="centered",
        vertex_text=g.vertex_properties["label"],
        vertex_font_size=14,
        vertex_anchor=0,
        vertex_aspect=1,
        vertex_shape="square",
        vertex_fill_color=g.vertex_properties["vcolour"],
        vertex_size=10,
        fit_view=False,
        # edge_color=g.edge_properties["colour"],
        # edge_pen_width=g.edge_properties["thickness"],
        edge_end_marker="none",
        edge_pen_width=0.2,
        edge_color="white",
        bg_color=[0, 0, 0, 1],
        output_size=[2000, 2000],
        output='UK_ONLY_RELATIONSHIPS_stochastic.png',
        pos=pos,
        edge_control_points=cts)


if __name__ == '__main__':
    pyjd.setup("Hello.html")

def find_nodes(self, prop, match):
    return gt.find_vertex(self.graph, prop, match)

def topology(self):
    g = self.arcgraph.copy()
    # directed=False because True would look for strongly connected components
    components, chist = gt.label_components(g, directed=False)
    self.__plot_component_hist(chist, 'componenthist')

    start_components = set()
    number_compounds_in_start_components = 0
    for c in self.start_compounds:
        for v in gt.find_vertex(g, g.vp.compound_ids, c):
            start_components.add(components[v])

    cg = gt.Graph()
    cg.vertex_properties["size"] = cg.new_vertex_property("int", val=10)
    for c in start_components:
        v = cg.add_vertex()
        cg.vp.size[v] = chist[c]
        number_compounds_in_start_components += chist[c]

    satellites = set()
    clustering_coefficient = gt.global_clustering(g)
    with open(join(self.statistics_path, "clustering_coefficient.txt"), 'w') as f:
        f.write(str(clustering_coefficient[0]) + '\t' +
                str(clustering_coefficient[1]) + '\n')

    with open(join(self.statistics_path, "compounds_components.txt"), 'w') as f, \
            open(join(self.statistics_path, "component_hist.txt"), 'w') as f2:
        for componentid, elem in enumerate(chist):
            u = gt.GraphView(g, vfilt=components.a == componentid)
            u = gt.Graph(u, prune=True)
            f2.write(str(componentid + 1) + '\t' + str(elem) + '\n')
            for v in u.vertices():
                f.write(str(componentid + 1) + '\t' + u.vp.compound_ids[v] +
                        '\t' + u.vp.name[v] + '\n')
                if componentid not in start_components:
                    satellites.add(u.vp.compound_ids[v])
            # gt.graph_draw(u, output=join(self.statistics_path,
            #                              "component{i}.pdf".format(i=componentid)))

    targets_in_main_component = self.targets - satellites
    targets_in_satellites = self.targets & satellites
    with open(join(self.statistics_path, "targets_in_main_component.txt"), 'w') as f:
        for c in targets_in_main_component:
            compound = self.builder.compounds[c]
            f.write(c + '\t' + compound.names[0] + '\n')
    with open(join(self.statistics_path, "targets_in_satellites.txt"), 'w') as f:
        for c in targets_in_satellites:
            compound = self.builder.compounds[c]
            f.write(c + '\t' + compound.names[0] + '\n')
    with open(join(self.statistics_path, "components_with_start_metabolites.txt"), 'w') as f:
        for cid in start_components:
            f.write(str(cid) + '\n')

    p = number_compounds_in_start_components / g.num_vertices() * 100
    with open(join(const.MECAT_BASE, "component_table.txt"), 'a') as f:
        f.write(self.name + ' & ' + str(len(chist)) + ' & ' +
                str(np.amax(chist)) + ' & ' + str(len(start_components)) +
                ' & ' + str(int(number_compounds_in_start_components)) +
                ' & ' + str(int(round(p, 0))) + '\\%' + '\\\\ \n')

    # largest = gt.label_largest_component(g, directed=False)
    # gt.graph_draw(g, vertex_fill_color=largest,
    #               output=join(self.statistics_path, "largest_component.pdf"))

    g.vertex_properties["start_components"] = g.new_vertex_property(
        "string", val='white')
    for v in g.vertices():
        if components[v] in start_components:
            g.vp.start_components[v] = 'red'
        else:
            g.vp.start_components[v] = 'blue'
    gt.graph_draw(g,
                  vertex_fill_color=g.vp.start_components,
                  output=join('/mnt', 'g', 'LisaDaten', 'Paper2', 'figures',
                              'arcgraph' + self.name + '.pdf'))

# add username property map
name_prop = g.new_vertex_property("string")
g.vertex_properties['name'] = name_prop

# add names to each vertex
for v in g.vertices():
    g.vp.name[v] = cursor[g.vertex_index[v]]['user']
    # print(g.vertex_properties['name'][v])
cursor.rewind()

# create all edges
for user in cursor:
    # v1 is the vertex where name = cursor['user']
    v1 = gt.find_vertex(g, g.vp.name, user['user'])[0]
    for mention in user['user_mentions']:
        try:
            # v2 is the vertex where name = mention
            v2 = gt.find_vertex(g, g.vp.name, mention)[0]
        except IndexError:
            print("Error: " + mention + " is not in the collection")
            continue
        if g.vp.name[v1] != g.vp.name[v2]:
            print("adding edge between " + g.vp.name[v1] + " and " +
                  g.vp.name[v2])
            edge = g.add_edge(v1, v2)

# attempt at weighting the graph
pos = gt.sfdp_layout(g)