def work(num=None):
    if num == None:
        g = gt.load_graph("../data/graphAll.xml.gz")
    else:
        g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")
    res = gt.stats.distance_histogram(g)  # May take a long time.
    fig = plt.figure()
    plt.plot(res[0], label="Distance distribution")
    plt.legend(loc="upper right")
    plt.xlabel("Distance")
    plt.ylabel("Count")
    if num == None:
        fig.savefig("../pic/distance.png")
    else:
        fig.savefig("../pic/distance" + str(num) + ".png")
    fig = plt.figure()
    res[0][0] = 1
    plt.plot(np.log10(res[0]), label="Log-distance distribution")
    plt.legend(loc="upper right")
    plt.xlabel("Distance")
    plt.ylabel("Log-count")
    if num == None:
        fig.savefig("../pic/log-distance.png")
    else:
        fig.savefig("../pic/log-distance" + str(num) + ".png")
    max_distance = max(res[1]) - 1
    avg_distance = np.sum(res[0] * res[1][:-1]) / np.sum(res[0])
    print 'max_distance: ' + str(max_distance)
    print 'avg_distance: ' + str(avg_distance)
def prepare_input_graph(graphName, metric, verbose=True):
    storedFolder = roles.graph_folder(graphName)
    try:
        inGraph = IO.load_data("../Data/Graphs/" + storedFolder + "/" + graphName + ".GT.graph").next()
    except:
        inGraph = gt.load_graph("../Data/Graphs/" + storedFolder + "/" + graphName + ".graph.xml")
    if verbose:
        print "Loaded Input Graph.\nName = %s. \nMetric = %s. \n#Nodes = %d. #Edges = %d." % (
            graphName, metric, inGraph.num_vertices(), inGraph.num_edges())
    groupTaxa, blackList = roles.graph_node_clusters(graphName, inGraph, metric)
    xTickMarks = roles.taxa_names(graphName)
    if verbose:
        if blackList != None:
            print "True Number of Clusters = " + str(len(set(groupTaxa)) - len(blackList)) + "\n"
        else:
            print "True Number of Clusters = " + str(len(set(groupTaxa))) + "\n"
    return inGraph, groupTaxa, blackList, xTickMarks
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")
    vp, ep = gt.centrality.betweenness(g)
    fig = plt.figure()
    res = plt.hist(vp.a, label="Betweenness of vertices", bins=100)
    plt.xlim(0, 0.04)
    plt.ylim(0, 250)
    plt.legend(loc="upper right")
    plt.xlabel("Betweenness")
    plt.ylabel("Count")
    fig.savefig('../pic/betweenness_vertices' + str(num) + '.png')
    print 'max of betweenness_vertices: ' + str(np.max(res[1]))
    print 'mean of betweenness_vertices: ' + str(np.mean(res[1]))
    print 'var of betweenness_vertices: ' + str(np.var(res[1]))
    fig = plt.figure()
    res = plt.hist(ep.a, label="Betweenness of edges", bins=100)
    plt.xlim(0, 0.015)
    plt.ylim(0, 300)
    plt.legend(loc="upper right")
    plt.xlabel("Betweenness")
    plt.ylabel("Count")
    fig.savefig('../pic/betweenness_edges' + str(num) + '.png')
    print 'max of betweenness_edges: ' + str(np.max(res[1]))
    print 'mean of betweenness_edges: ' + str(np.mean(res[1]))
    print 'var of betweenness_edges: ' + str(np.var(res[1]))
def metrics(file, use_cache=True):
    # use cache or recompute
    cache = os.path.splitext(file)[0] + ".json"
    if use_cache and os.path.isfile(cache):
        print('using cached metrics for', os.path.basename(file))
        with open(cache, "r") as fp:
            return json.load(fp)

    print('computing metrics for', os.path.basename(file))

    # read file
    g = load_graph(file)
    degrees = list(g.degree_property_map("out"))
    with open(file) as f:
        metalines = [next(f) for x in range(13)]

    # gather data
    metrics = {}
    metrics['file'] = os.path.basename(file)
    metrics['edges'] = int(metalines[5].split()[-1])
    metrics['rounds'] = int(metalines[1].split()[-1])
    metrics['max_degree'] = max(degrees)
    metrics['avg_degree'] = mean(degrees)
    metrics['min_degree'] = min(degrees)
    metrics['local_clustering'] = mean(local_clustering(g).get_array())
    metrics['global_clustering'] = global_clustering(g)[0]
    metrics['pseudo_diameter'] = int(pseudo_diameter(g)[0])

    fit = powerlaw.Fit(degrees, discrete=True, verbose=False)
    metrics['exponent'] = fit.alpha
    metrics['KS'] = fit.power_law.KS()
    metrics['x_min'] = fit.xmin

    with open(cache, "w") as fp:
        json.dump(metrics, fp)
    return metrics
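# Hedged usage sketch for the cached `metrics` helper above; not part of the original.
# The "output/*.graphml" pattern is an assumption, and the files must carry the
# 13-line text header that the function parses for 'edges' and 'rounds'.
import glob

for path in sorted(glob.glob("output/*.graphml")):
    m = metrics(path, use_cache=True)
    print(m["file"], m["edges"], m["global_clustering"])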
def load_graph(self, value):
    stream = StringIO(value)
    try:
        g = load_graph(stream, fmt="graphml")
    except OSError as err:
        raise ValidationError("data is not correctly formatted as graphml (xml)")
    return g
def extract_distances(fold, base_filename, name, exclude_borders=1):
    """
    Extracts distances information from a .gt file into a .csv file. By default,
    values within 1 (in units of the graph) to surface borders are excluded.

    Args:
        fold (str): path where the input is and where the output will be written
        base_filename (str): base file name for input and output files
        name (str): name of the property to extract (e.g., 'PMdistance' or
            'cERthickness')
        exclude_borders (int, optional): if > 0, triangles within this distance
            from borders and corresponding values will be excluded from the
            output files (graph .gt, surface .vtp file and .csv)

    Returns:
        None
    """
    # input graph and surface files
    gt_infile = '{}{}.gt'.format(fold, base_filename)
    # output csv, gt and vtp files
    csv_outfile = '{}{}.csv'.format(fold, base_filename)
    gt_outfile = None
    vtp_outfile = None
    if exclude_borders > 0:
        eb = "_excluding{}borders".format(exclude_borders)
        gt_outfile = '{}{}{}.gt'.format(fold, base_filename, eb)
        csv_outfile = '{}{}{}.csv'.format(fold, base_filename, eb)
        vtp_outfile = '{}{}{}.vtp'.format(fold, base_filename, eb)

    # Create TriangleGraph object and load the graph file
    tg = TriangleGraph()
    tg.graph = load_graph(gt_infile)

    _extract_distances_from_graph(tg, csv_outfile, exclude_borders, name,
                                  gt_outfile, vtp_outfile)
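# Illustrative call of extract_distances(); the folder, base name and property
# name below are placeholders, not values from the original pipeline.
extract_distances(fold='results/', base_filename='membrane1',
                  name='PMdistance', exclude_borders=1)
# With exclude_borders=1 this writes results/membrane1_excluding1borders.csv
# (plus the filtered .gt and .vtp files) next to the input results/membrane1.gt.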
def findCommunities(filename):
    trials = 1
    fullFile = f'{filename}.graphml'
    print(fullFile)
    graph = gt.load_graph(fullFile)
    lowest_entropy = np.inf
    best_community = None
    for i in range(trials):
        state = inference.minimize_blockmodel_dl(graph, deg_corr=True, verbose=True)
        b = state.get_blocks()
        print(state.entropy())
        if state.entropy() < lowest_entropy:
            best_community = b
            lowest_entropy = state.entropy()
    communityMapping = dict()
    nodeList = list()
    communityID = list()
    for v in graph.vertices():
        nodeList.append(str(graph.vertex_properties["_graphml_vertex_id"][v]))
        communityID.append(str(best_community[v]))
    communityMapping['NODE_ID'] = nodeList
    communityMapping['COMMUNITY_ID'] = communityID
    return communityMapping
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")
    state = gt.inference.minimize_blockmodel_dl(g)
    deg = g.degree_property_map("in")
    deg.a = 4 * (np.sqrt(deg.a) * 0.5 + 0.4)
    ebet = gt.centrality.betweenness(g)[1]
    ebet.a /= ebet.a.max() / 10.
    eorder = ebet.copy()
    eorder.a *= -1
    state.draw(vertex_shape=state.get_blocks(),
               output="../pic/blockmodel" + str(num) + ".pdf",
               vertex_size=deg, vertex_fill_color=deg, vorder=deg,
               edge_color=ebet, eorder=eorder, edge_pen_width=ebet)
    state.draw(vertex_shape=state.get_blocks(),
               output="../pic/blockmodel" + str(num) + ".png",
               vertex_size=deg, vertex_fill_color=deg, vorder=deg,
               edge_color=ebet, eorder=eorder, edge_pen_width=ebet)
def _read_graph(self):
    from graph_tool import load_graph, Graph

    logger.info(f'import graphml file from {self.config.graph_path}')
    self.graph: Graph = load_graph(self.config.graph_path)
    if not self.config.directed:
        logger.info('Converting to undirected graph')
        self.graph.set_directed(False)
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")
    pos = gt.draw.sfdp_layout(g)
    gt.draw.graph_draw(g, pos=pos, output="../pic/sfdp_layout" + str(num) + ".pdf")
    gt.draw.graph_draw(g, pos=pos, output="../pic/sfdp_layout" + str(num) + ".png")
    pos = gt.draw.arf_layout(g, max_iter=0)
    gt.draw.graph_draw(g, pos=pos, output="../pic/arf_layout" + str(num) + ".pdf")
    gt.draw.graph_draw(g, pos=pos, output="../pic/arf_layout" + str(num) + ".png")
    pos = gt.draw.radial_tree_layout(g, g.vertex(0))
    gt.draw.graph_draw(g, pos=pos, output="../pic/radial_tree_layout" + str(num) + ".pdf")
    gt.draw.graph_draw(g, pos=pos, output="../pic/radial_tree_layout" + str(num) + ".png")
def load(cls, path=config.BUS_ROAD_GRAPH_PATH):
    """ Loads a graphtool graph """
    gtG = load_graph(path)
    inst = cls()
    inst.gtG = gtG
    return inst
def load_data():
    import graph_tool as gt

    graphs = {}
    LOG.info("Loading graph data from %s", DIR)
    for fname in DIR.glob("*.gt"):
        graphs[fname.stem] = gt.load_graph(str(fname))
    return graphs
def get_graph(arg_n=1):
    file = sys.argv[arg_n] if len(sys.argv) > arg_n else None
    if file is None:
        return False
    elif file.split('.')[-1] == 'txt':
        directed = sys.argv[arg_n + 1].lower() == "d"
        return load_graph_from_raw(file, directed)
    else:
        return GT.load_graph(file)
def country_network(name, year, check_fields={}, n_neigh=6):
    graph_file_name = name + '_' + str(year) + '_k' + str(n_neigh) + '.xml'
    if os.path.exists(graph_file_name):
        return gt.load_graph(graph_file_name)
    if name == 'US' and check_fields == {}:
        states = ['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
                  'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
                  'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ',
                  'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD',
                  'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']
        check_fields = {'STATE': states}
    station_codes = []
    pos = []
    with open(os.path.join(folder_name, 'isd-history.csv'), 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            functional = int(row['BEGIN'][:4]) <= year <= int(row['END'][:4])
            lon = row['LON']
            lat = row['LAT']
            if row['CTRY'] == name and functional and lon and lat:
                if not all([row[field] in check_fields[field]
                            for field in check_fields]):
                    continue
                station_codes.append((row['USAF'], row['WBAN']))
                pos.append([float(lon), -float(lat)])
    station_values, missing = read_stations(station_codes, year)
    pos = np.delete(np.array(pos), missing, axis=0)
    weights = squareform(pdist(pos))
    weights = np.exp(-weights / np.median(weights))
    idx_sorted = np.argsort(weights)
    idx_sorted = idx_sorted[:, -n_neigh-1:-1]
    print idx_sorted.shape
    graph = gt.Graph(directed=False)
    graph.add_vertex(n=pos.shape[0])
    e_weights = graph.new_edge_property('double', vals=1)
    for i in range(weights.shape[0]):
        w_i = weights[i, idx_sorted[i, :]]
        for j in range(n_neigh):
            e = graph.edge(i, idx_sorted[i, j], new=True)
            e_weights[e] = w_i[j]
    v_pos = graph.new_vertex_property('vector<double>', vals=pos)
    v_values = graph.new_vertex_property('double', vals=station_values)
    graph.vertex_properties['pos'] = v_pos
    graph.vertex_properties['station_values'] = v_values
    graph.edge_properties['weights'] = e_weights
    graph.save(graph_file_name)
    return graph
def test_load_graph_from_file(self, here):
    """Should load the graph from the test.graphml file in tests/data/test.graphml"""
    filepath = os.path.abspath(os.path.join(here, 'data', 'test.graphml'))
    fmt = filepath.split('.')[-1].strip()
    with open(filepath, 'rb') as f:
        g = gt.load_graph(f, fmt=fmt)
    # make sure there are vertices
    vertices = list(g.vertices())
    assert len(vertices) > 200
def from_expe(cls, expe, corpus=None, load=True, save=True):
    if '_force_load_data' in expe:
        load = expe._force_load_data
    if '_force_save_data' in expe:
        save = expe._force_save_data

    input_path = cls.get_input_path(expe)
    data = None
    fn = cls._resolve_filename(expe)
    target_file_exists = os.path.exists(fn)

    if load is False or not target_file_exists:
        # Data loading Strategy
        if not data:
            try:
                # Load from graph-tool collection repo
                from graph_tool import collection
                data = gt.load_graph(collection.get_data_path(expe.corpus))
                os.makedirs(os.path.join(input_path), exist_ok=True)
            except FileNotFoundError as e:
                pass
            except Exception as e:
                cls.log.error("Error in loading corpus `%s': %s" % (expe.corpus, e))
                raise e
        if not data:
            try:
                from urllib.error import HTTPError
                from tarfile import ReadError
                # Load from the graph-tool Konect interface
                data = gt.collection.konect_data[expe.corpus]
                data = cls._clean_data_konect(expe, data)
                os.makedirs(os.path.join(input_path), exist_ok=True)
            except (HTTPError, OSError, ReadError) as e:
                pass
            except Exception as e:
                cls.log.error("Error in loading corpus `%s': %s" % (expe.corpus, e))
                raise e
        if not data:
            # Load manually from file
            data = cls._extract_data_file(expe, corpus=corpus)
        if save:
            # ===== save ====
            cls._save_data(fn, data)
    else:
        # ===== load ====
        data = cls._load_data(fn)

    return cls(expe, data, corpus=corpus)
def draw_community(gml_fn, output, layout_name=None,
                   layout_kwargs=dict(), **draw_kwargs):
    g = load_graph(gml_fn)
    # Sample of graph g
    # g = GraphView(g, vfilt=lambda v: g.vertex_index[v]%2==0)
    g.vp['wdeg'] = g.degree_property_map('total', weight=g.ep['weight'])
    # g = GraphView(g, vfilt=lambda v: g.vp['wdeg'][v]>0)
    # label for hub account only in each community
    g.vp['clabel'] = g.new_vertex_property("string", val="")
    for c in np.nditer(np.unique(g.vp['community'].a)):
        cg = GraphView(g, vfilt=(g.vp['community'].a == c))
        v_hub = find_vertex(cg, cg.vp['wdeg'], cg.vp['wdeg'].fa.max())[0]
        cg.vp['clabel'][v_hub] = cg.vp['screenname'][v_hub]
    v_size = prop_to_size(
        g.vp['wdeg'], mi=MIN_V_SIZE, ma=MAX_V_SIZE,
        log=V_SIZE_LOG, power=V_SIZE_POWER)
    e_width = prop_to_size(
        g.ep['weight'], mi=MIN_E_WIDTH, ma=MAX_E_WIDTH,
        log=E_WIDTH_LOG, power=E_WIDTH_POWER)
    if layout_name is not None:
        try:
            pos = globals()[layout_name](g, **layout_kwargs)
        except KeyError as e:
            logger.critical('No such layout function found!')
            raise
    graph_draw(
        g,
        pos,
        output=output,
        vprops=dict(
            fill_color=g.vp['community'],
            # color='grey',
            size=v_size,
            pen_width=0.01,
            text=g.vp['clabel'],
            text_position='centered',
            font_size=8,),
        eprops=dict(
            pen_width=e_width,
            end_marker="arrow",),
        **draw_kwargs)
def __init__(self, infile=None, fmt='dot', outfile=None):
    if infile is not None:
        super().__init__(graph_tool.load_graph(infile, fmt))
        self.root = self.vertex(0)
    else:
        super().__init__()
        self.root = self.add_vertex()
        self.ep['label'] = self.new_edge_property('string')
    if outfile is None:
        self.outfile = sys.stdout
    else:
        self.outfile = open(outfile, 'w')
def load_graph(seed=None):
    if BASENAME.startswith('soc'):
        rw.read_original_graph(BASENAME, seed=seed, balanced=BALANCED)
        redensify.G = deepcopy(rw.G)
        redensify.EDGES_SIGN = deepcopy(rw.EDGE_SIGN)
    elif DATA == 'LP':
        _ = persistent.load_var(BASENAME+'.my')
        redensify.G, redensify.EDGES_SIGN = _
        return
    else:
        G = gt.load_graph(BASENAME+'.gt')
        cexp.to_python_graph(G)
def work(num=None):
    if num == None:
        g = gt.load_graph("../data/graphAll.xml.gz")
    else:
        g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")
    pr = gt.centrality.pagerank(g)
    fig = plt.figure()
    plt.hist(np.log10(pr.a), label="PageRank", bins=100)
    plt.legend(loc="upper right")
    plt.xlabel("PageRank")
    plt.ylabel("Count")
    if num == None:
        fig.savefig('../pic/pagerank.png')
    else:
        fig.savefig('../pic/pagerank' + str(num) + '.png')
    print 'max of pagerank: ' + str(np.max(pr.a.tolist()))
    print 'min of pagerank: ' + str(np.min(pr.a.tolist()))
    print 'mean of pagerank: ' + str(np.mean(pr.a.tolist()))
    print 'variance of pagerank: ' + str(np.var(pr.a.tolist()))
def add_metrics(graph_path, metrics):
    def get_metric(ggt, metric, n_nodes, n_edges):
        if "d" == metric:  # Density
            if n_nodes <= 1:
                value = 0.0
            else:
                value = (2.0 * n_edges) / (n_nodes * (n_nodes - 1.0))
            ggt.gp[metric] = ggt.new_gp("float", val=value)
        elif "dg" == metric:  # Degree
            if n_nodes <= 1:
                value = np.zeros(n_nodes, dtype=np.float32)
            else:
                value = ggt.degree_property_map('total').get_array()
            ggt.vp[metric] = ggt.new_vp("double", vals=value)
        elif "dgc" == metric:  # Degree centrality
            if n_nodes <= 1:
                value = np.zeros(n_nodes, dtype=np.float32)
            else:
                value = ggt.degree_property_map('total').get_array() / (n_nodes - 1.0)
            ggt.vp[metric] = ggt.new_vp("double", vals=value)
        elif "cnw" == metric:  # Clustering coefficient (non-weighted)
            value = local_clustering(ggt).get_array()
            ggt.vp[metric] = ggt.new_vp("double", vals=value)
        elif "cw" == metric:  # Clustering coefficient (weighted)
            value = local_clustering(ggt, weight=ggt.ep.weight).get_array()
            ggt.vp[metric] = ggt.new_vp("double", vals=value)
        elif "pgr" == metric:  # Page Rank
            value = pagerank(ggt).get_array()
            ggt.vp[metric] = ggt.new_vp("double", vals=value)

    ggt = gt.load_graph(str(graph_path))
    time = int(graph_path.stem.split(".")[0])
    ggt.gp.time = ggt.new_gp("int32_t", val=time)
    save_path = graph_path.parent.joinpath("../graphs_with_metrics")
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    num_edges = ggt.num_edges()
    num_nodes = ggt.num_vertices()
    for m in metrics:
        get_metric(ggt, m, num_nodes, num_edges)
    ggt.save(str(save_path.joinpath("{}.gt.xz".format(time))))
def prepare_conceptnet(
        graph_path: Union[str, Path]) -> Tuple[Graph, Dict[str, gt.Vertex]]:
    logger.info(f"Load conceptnet graph - {str(graph_path)}")
    conceptnet_graph = gt.load_graph(str(graph_path))
    logger.info(f"Loaded conceptnet graph - {str(graph_path)}")

    remove_self_loops(conceptnet_graph)
    conceptnet_graph.reindex_edges()

    logger.info(f"Generate aspect name to vertex mapping - {str(graph_path)}")
    vertices_conceptnet = dict(
        zip(
            conceptnet_graph.vertex_properties["aspect_name"],
            conceptnet_graph.vertices(),
        ))
    return Graph(conceptnet_graph), vertices_conceptnet
def load_graph_by_name(name, weighted=False, suffix=''):
    suffix = suffix.strip()
    if name == 'lattice':
        shape = (10, 10)
        g = lattice(shape)
    else:
        if weighted:
            path = 'data/{}/graph_weighted{}.gt'.format(name, suffix)
        else:
            path = 'data/{}/graph{}.gt'.format(name, suffix)
        print('load graph from {}'.format(path))
        g = load_graph(path)
    # assert not g.is_directed()
    return remove_filters(g)  # add shell
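# Assumed usage of load_graph_by_name(); only 'lattice' is handled by the function
# itself, other names depend on what exists under data/<name>/.
g = load_graph_by_name('lattice')
print(g.num_vertices(), g.num_edges())  # the 10x10 lattice has 100 vertices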
def load_wiki():
    import graph_tool as gt
    import real_world as rw

    graph_file = 'wiki_simple.gt'
    ds_file = 'wiki_dst.npy'
    k = gt.load_graph(graph_file)
    dst_mat = np.load(ds_file)
    lcc = label_largest_component(k)
    k.set_vertex_filter(lcc)
    lcc_nodes = np.where(lcc.a)[0]
    rw.read_original_graph('soc-wiki.txt')
    cexp.redensify.G = rw.G
    cexp.redensify.N = len(rw.G)
    cexp.redensify.EDGES_SIGN = rw.EDGE_SIGN
    return k, lcc_nodes, dst_mat
def test_load_graph_from_stream(self, here):
    """Should load the graph from the test.graphml from bytes"""
    # get bytes from file
    filepath = os.path.join(here, 'data', 'test.graphml')
    fmt = filepath.split('.')[-1].strip()
    with open(filepath, 'rb') as f:
        graph_txt = f.read()
    # create stream from bytes and load graph
    stream = io.BytesIO(graph_txt)
    g = gt.load_graph(stream, fmt=fmt)
    # make sure there are vertices
    vertices = list(g.vertices())
    assert len(vertices) > 200
def create_jsons_from_graphs(path, metrics):
    load_path = path
    save_path = path.parent.joinpath("metric_jsons")
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    graph_paths = sorted(load_path.glob("*.gt.xz"),
                         key=lambda x: int(x.stem.split(".")[0]))
    print("Process graphs to create JSONs")
    for i in tqdm(range(len(graph_paths))):
        p = graph_paths[i]
        G = gt.load_graph(str(p))
        labels = G.vp.label.get_2d_array([0]).tolist()[0]
        get_metrics(G, G.num_vertices(), G.num_edges(),
                    int(p.stem.split(".")[0]), str(save_path), metrics, labels)
    return sorted(save_path.glob("*.json"), key=lambda x: int(x.stem))
def prepare_conceptnet_graph(graph_path: str, relation_types: Set[str]):
    g = gt.load_graph(graph_path)
    remove_self_loops(g)
    g.reindex_edges()

    # filter relations
    e_hierarchical_relation_filter = g.new_edge_property("bool")
    relations = list(g.properties[("e", "relation")])
    for edge, edge_relation in tqdm(zip(g.edges(), relations),
                                    desc="Edge filtering...",
                                    total=len(relations)):
        e_hierarchical_relation_filter[edge] = edge_relation in relation_types
    g.set_edge_filter(e_hierarchical_relation_filter)

    vertices = dict(zip(g.vertex_properties["aspect_name"], g.vertices()))
    return g, vertices
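# Hypothetical invocation of prepare_conceptnet_graph(); the path and relation
# names are placeholders for illustration only.
g, vertices = prepare_conceptnet_graph('conceptnet.gt',
                                       relation_types={'IsA', 'PartOf'})
v = vertices.get('dog')  # look up a vertex by its aspect_name, if present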
def load(self):
    if self._mapfile[-3:] != 'shp':
        self.g = load_graph(self._mapfile)
        return
    try:
        sf = shapefile.Reader(self._mapfile)
    except Exception as e:
        print(str(e))
        return False
    roads_records = sf.shapeRecords()  # get the road segment records
    for road_record in roads_records:
        cross_s_index = self.add_cross(road_record.shape.points[0])
        cross_e_index = self.add_cross(road_record.shape.points[-1])
        self.add_road_edge(cross_s_index, cross_e_index, road_record)
        if int(road_record.record[self.DIRECTION_index]) == 0:
            # if the road segment is a two-way road
            self.add_road_edge(cross_e_index, cross_s_index, road_record)
    return True
def load_graph_from_edgelist(dataset, options={}):
    """"""
    edgelist, graph_gt = dataset['path_edgelist'], dataset['path_graph_gt']
    D = None
    # prefer graph_gt file
    if (not 'reconstruct_graph' in options or not options['reconstruct_graph']) and \
       (graph_gt and os.path.isfile(graph_gt)):
        log.info('Constructing DiGraph from gt.xz')
        D = load_graph(graph_gt)
    elif edgelist and os.path.isfile(edgelist):
        log.info('Constructing DiGraph from edgelist')
        if 'dict_hashed' in options and options['dict_hashed']:
            D = load_graph_from_csv(edgelist, directed=True, hashed=False,
                                    skip_first=False,
                                    csv_options={'delimiter': ' ', 'quotechar': '"'})
        else:
            D = load_graph_from_csv(edgelist, directed=True, hashed=True,
                                    skip_first=False,
                                    csv_options={'delimiter': ' ', 'quotechar': '"'})
        # check if graph should be dumped
        dump_graph(D, edgelist, options)
    else:
        log.error('edgelist or graph_gt file to read graph from does not exist')
        return None
    return D
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")
    c = gt.centrality.closeness(g)
    fig = plt.figure()
    res = []
    for i in c.a.tolist():
        if not str(i) == "nan":
            res.append(i)
    plt.hist(res, label="Closeness", bins=100)
    plt.legend(loc="upper right")
    plt.xlabel("Closeness")
    plt.ylabel("Count")
    fig.savefig('../pic/closeness' + str(num) + '.png')
    print 'max of closeness: ' + str(np.max(res))
    print 'min of closeness: ' + str(np.min(res))
    print 'mean of closeness: ' + str(np.mean(res))
    print 'var of closeness: ' + str(np.var(res))
def __init__(self, agent_generator, network_filename=None, largest_component=False,
             directed=False, vprop_node_id='nodeID', **kwargs):
    AgentsConnection.__init__(self, agent_generator, **kwargs)
    if network_filename is None:
        self.net = gt.Graph(directed=directed)
    else:
        try:
            self.print_f('load gt file', network_filename)
            self.net = gt.load_graph(network_filename)
            if self.net.is_directed() != directed:
                self.net.set_directed(directed)
        except:
            self.print_f('failed. fall back to read edge list')
            self.net = read_edge_list(network_filename, directed=directed)
        if largest_component:
            self.print_f('reduce network to largest component')
            lc = gt.topology.label_largest_component(self.net)
            self.net.set_vertex_filter(lc)
            self.net.purge_vertices()
        self.print_f('create agents')
        self.agent_to_vertex = dict()
        agents_pmap = self.net.new_vertex_property('object')
        node_id_pmap = self.net.new_vertex_property('int')
        try:
            node_ids = self.net.vp[vprop_node_id]
        except KeyError:
            self.print_f('No vertex property named:', vprop_node_id)
            self.print_f('Available vertex properties:', self.net.vp.keys())
            self.print_f('Please use "vprop_node_id" param to specify the right one')
            exit()
        for agent_id, v in enumerate(self.net.vertices()):
            agent = self.agent_generator.generate_agent(node_ids[v])
            self.agent_to_vertex[agent] = v
            agents_pmap[v] = agent
            node_id_pmap[v] = int(agent)
        self.net.vp["agents"] = agents_pmap
        self.net.vp["NodeId"] = node_id_pmap
    self.print_f('setup done')
def _library_load(filename, fmt):
    ''' Load the file using the library functions '''
    if nngt.get_config("backend") == "networkx":
        import networkx as nx
        if fmt == "graphml":
            return nx.read_graphml(filename)
        else:
            raise NotImplementedError
    elif nngt.get_config("backend") == "igraph":
        import igraph as ig
        if fmt == "graphml":
            return ig.Graph.Read_GraphML(filename)
        else:
            raise NotImplementedError
    elif nngt.get_config("backend") == "graph-tool":
        import graph_tool as gt
        return gt.load_graph(filename, fmt=fmt)
    else:
        raise NotImplementedError
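# Sketch of calling _library_load(); the filename is hypothetical and nngt must
# already be configured with one of the three supported backends.
g = _library_load('network.graphml', fmt='graphml')  # networkx, igraph or graph-tool object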
def run_minimize_blockmodel(mg, temp_loc):
    # save to temp
    nx.write_graphml(mg.g, temp_loc)
    # load into graph-tool from temp
    g = load_graph(temp_loc, fmt="graphml")
    total_degrees = g.get_total_degrees(g.get_vertices())
    remove_verts = np.where(total_degrees == 0)[0]
    g.remove_vertex(remove_verts)
    min_state = minimize_blockmodel_dl(g, verbose=False)

    blocks = list(min_state.get_blocks())
    verts = g.get_vertices()

    block_map = {}
    for v, b in zip(verts, blocks):
        cell_id = int(g.vertex_properties["_graphml_vertex_id"][v])
        block_map[cell_id] = int(b)

    block_series = pd.Series(block_map)
    block_series.name = "block_label"
    return block_series
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")
    indegree_list = gt.stats.vertex_hist(g, "in")[0]
    outdegree_list = gt.stats.vertex_hist(g, "out")[0]
    fig = plt.figure()
    plt.plot(indegree_list, label="Indegree distribution")
    plt.plot(outdegree_list, label="Outdegree distribution")
    plt.xlim(0, 40)
    plt.ylim(0, 100)
    plt.legend(loc="upper right")
    plt.xlabel("Degree")
    plt.ylabel("Count")
    fig.savefig("../pic/degree" + str(num) + ".png")
    print 'max of indegree : ' + str(np.max(indegree_list))
    print 'mean of indegree : ' + str(np.mean(indegree_list))
    print 'variance of indegree : ' + str(np.var(indegree_list))
    print 'max of outdegree : ' + str(np.max(outdegree_list))
    print 'mean of outdegree : ' + str(np.mean(outdegree_list))
    print 'variance of outdegree : ' + str(np.var(outdegree_list))
def get_graph(balanced=False):
    """Load the graph from BASENAME and optionally remove positive edges to
    balance the graph.
    NOTE: this only modifies the redensify structure and not graph_tool & its
    distance matrix"""
    if balanced:
        import persistent
    if os.path.isfile(BASENAME+'.gt'):
        g = graph_tool.load_graph(BASENAME+'.gt')
        dst_mat = np.load(BASENAME+'_dst.npy')
        cexp.to_python_graph(g)
        if balanced:
            to_delete = persistent.load_var(BASENAME+'_balance.my')
            for edge in to_delete:
                pot.delete_edge(redensify.G, edge, redensify.EDGES_SIGN)
        return g, dst_mat
    if not PA:
        cexp.random_signed_communities(2, 500, 13, 11.5/500, .0, .0)
        g = cexp.to_graph_tool()
    else:
        cexp.preferential_attachment(1000, gamma=1.4, m=12)
        cexp.turn_into_signed_graph_by_propagation(2)
        DEGREES = sorted(((node, len(adj))
                          for node, adj in cexp.redensify.G.items()),
                         key=lambda x: x[1])
        u, v = DEGREES[-1][0], DEGREES[-2][0]
        # order the pair so that u < v before removing that edge
        u, v = (v, u) if u > v else (u, v)
        del cexp.redensify.EDGES_SIGN[(u, v)]
        cexp.redensify.G[u].remove(v)
        cexp.redensify.G[v].remove(u)
    n = g.num_vertices()
    dst = shortest_distance(g, dense=False)
    dst_mat = np.zeros((n, n), dtype=np.uint8)
    for v in g.vertices():
        dst_mat[int(v), :] = dst[v].a.astype(np.uint8)
    g.save(BASENAME+'.gt')
    np.save(BASENAME+'_dst', dst_mat)
    return g, dst_mat
        # Add the edge and increase the progress counter
        net.add_edge(tail, head)
        # Show an update every nth node
        count += 1
        if count % 10000 == 0:
            print(count)
    print('finished reading ' + infile)
    # Write it to disk so we don't have to do this again later
    net.save(outfile)
    print('finished saving ' + outfile)
else:
    print('found saved ' + outfile)
    # Read the saved network
    net = gt.load_graph(outfile)
    # Since vertices as such can't be pickled, we need to reconstruct
    # id and id_to_gt manually
    print('finished reading ' + outfile)
    id = net.vertex_properties['id']
    id_to_gt = {id[vertex]: vertex for vertex in net.vertices()}
    #print(len(id_to_gt))

print('total vertices: ' + str(net.num_vertices()))
print('total edges: ' + str(net.num_edges()))

# # How many samples to collect?
# n_samples = 1000
# # Initialize a container for them
# samples = []
# # And set a seed
def load_data(target_output): patient_file = TCGA_root_dir + '/Clinical/Biotab/nationwidechildrens.org_clinical_patient_brca.txt' betas_file = TCGA_root_dir + '/betas.npz' processed_betas_file = TCGA_root_dir + '/betas-processed.npz' graph_dump_file = TCGA_root_dir + '/graph.xml.gz' if (os.path.isfile(betas_file)): data_file = np.load(betas_file) betas = data_file['betas'] col_names = data_file['col_names'] patient_data = data_file['patient_data'] sample_names = data_file['sample_names'] print('fount betas_file, shape: %s' % (betas.shape.__str__())) else: patient_skipped_lines = 3 patient_data = np.array(read_csv(patient_file, skip_header = False)) patient_data = patient_data[patient_skipped_lines:,] sample_names = patient_data[:,0] data_dir = TCGA_root_dir + '/DNA_Methylation/JHU_USC__HumanMethylation450/Level_3/' files = os.listdir(data_dir) col_names = np.empty(0) used_samples = np.empty(0) unused_samples = np.empty(0) multiple_data_samples = np.empty(0) i = 0 for name in sample_names: i += 1 print('processing %3d/%3d ' %(i, len(sample_names)) + name) # 01 is the primary tumor sample, A is the vial A matched = [f for f in files if f.find(name+'-01A') > -1] if (len(matched) > 1): multiple_data_samples = np.append(multiple_data_samples, name) continue elif len(matched) == 0: print('no files found.') unused_samples = np.append(unused_samples, name) continue used_samples = np.append(used_samples, name) matched = matched[0] sample_data = np.array(read_csv(data_dir + matched, skip_header = False)) data_skipped_lines = 2 sample_col_names = sample_data[data_skipped_lines:,0] if col_names.shape[0] == 0: col_names = sample_col_names betas = np.empty((0,sample_col_names.shape[0]), dtype=float) else: if all(col_names == sample_col_names) == False: raise RuntimeError("column names don't match") v = sample_data[data_skipped_lines:, 1] v[v == 'NA'] = -1 v = np.array(v, dtype=float) v[v == -1] = np.nan betas = np.vstack((betas, v.reshape(1,-1))) indices = np.array([i for i in range(betas.shape[1]) if not any(np.isnan(betas[:,i]))]) betas = betas[:,indices] col_names = col_names[indices] sample_indices = np.array([list(sample_names).index(used_samples[i]) for i in range(len(used_samples))]) patient_data = patient_data[sample_indices,:] np.savez(open(betas_file, 'wb'), betas = betas, col_names = col_names, patient_data = patient_data, sample_names = sample_names) if (os.path.isfile(processed_betas_file) and os.path.isfile(graph_dump_file)): g = gt.load_graph(graph_dump_file) data_file = np.load(processed_betas_file) X = data_file['X'] genes = data_file['genes'] patient_data = patient_data print('processed data found, X: %s' % (X.shape.__str__())) else: X, g, genes = networkize_illumina450k(betas, col_names) print (X.__class__) print (genes.__class__) print (patient_data.__class__) print (g.__class__) np.savez(open(processed_betas_file, 'wb'), X = X, genes=genes,patient_data=patient_data) g.save(graph_dump_file) if (target_output == 'ER'): # ER status is column index 43 labels = patient_data[:,43] y = np.zeros(len(patient_data), dtype=int) y[labels == 'Negative'] = -1 y[labels == 'Positive'] = 1 final_sample_indices = (y != 0) y = y[final_sample_indices] X = X[final_sample_indices,] patient_data = patient_data[final_sample_indices,:] elif (target_output == 'stage'): # Stage status is column 40 labels = patient_data[:,40] y = np.zeros(len(patient_data), dtype=int) y[labels == 'Stage I'] = -1 y[labels == 'Stage IA'] = -1 y[labels == 'Stage IB'] = -1 y[labels == 'Stage II'] = -1 y[labels == 'Stage IIA'] = -1 
y[labels == 'Stage IIB'] = -1 y[labels == 'Stage III'] = 1 y[labels == 'Stage IIIA'] = 1 y[labels == 'Stage IIIB'] = 1 final_sample_indices = (y != 0) y = y[final_sample_indices] X = X[final_sample_indices,] patient_data = patient_data[final_sample_indices,:] elif (target_output == 'T'): # T (tumor size) status is column 37 labels = patient_data[:,37] y = np.zeros(len(patient_data), dtype=int) y[labels == 'T1'] = -1 y[labels == 'T1a'] = -1 y[labels == 'T1b'] = -1 y[labels == 'T1c'] = -1 y[labels == 'T2'] = -1 y[labels == 'T2a'] = -1 y[labels == 'T2b'] = -1 y[labels == 'T3'] = 1 y[labels == 'T3a'] = 1 y[labels == 'T3b'] = 1 y[labels == 'T4'] = 1 y[labels == 'T4a'] = 1 y[labels == 'T4b'] = 1 y[labels == 'T4c'] = 1 y[labels == 'T4d'] = 1 final_sample_indices = (y != 0) y = y[final_sample_indices] X = X[final_sample_indices,] patient_data = patient_data[final_sample_indices,:] elif (target_output == 'N'): # N (lymph nodes) status is column 38 labels = patient_data[:,38] y = np.zeros(len(patient_data), dtype=int) N0s = np.array([labels[i].startswith("N0") for i in range(len(labels))]) y[N0s] = -1 N1s = np.array([labels[i].startswith("N1") for i in range(len(labels))]) y[N1s] = 1 N2s = np.array([labels[i].startswith("N2") for i in range(len(labels))]) y[N1s] = 1 N3s = np.array([labels[i].startswith("N3") for i in range(len(labels))]) y[N1s] = 1 final_sample_indices = (y != 0) y = y[final_sample_indices] X = X[final_sample_indices,] patient_data = patient_data[final_sample_indices,:] else: raise RuntimeError("target_output not in ('ER', 'T', 'N', 'stage')") return (X, y, g, patient_data, genes)
import random as r
from copy import deepcopy  # needed below for deepcopy(rw.G)

import args_experiments as ae
import convert_experiment as cexp
import real_world as rw
import redensify

parser = ae.get_parser('Compute a galaxy tree')
args = parser.parse_args()
a = ae.further_parsing(args)
basename, seeds, synthetic_data, prefix, noise, balanced = a

if synthetic_data:
    try:
        ae.load_raw(basename, redensify, args)
    except IOError:
        import graph_tool as gt
        g = gt.load_graph(basename+'.gt')
        cexp.to_python_graph(g)
else:
    rw.read_original_graph(basename, seed=args.seed, balanced=balanced)
    redensify.G = deepcopy(rw.G)
    redensify.EDGES_SIGN = deepcopy(rw.EDGE_SIGN)
suffixes = ('_bal' if args.balanced else '',
            '_short' if args.short else '',
            '_safe' if args.safe else '',
            args.seed)
outname = 'lp10/{}{}{}{}_{}'.format(args.data.lower(), *suffixes)
print(outname)
res = meta_galaxy(redensify.G, redensify.EDGES_SIGN, 10, outname,
                  safe=args.safe, short=args.short)
if args.safe:
    gold, pred, _ = res
def loadGraph(self, path):
    G = graph_tool.load_graph(path, fmt="gml")
    return G
def main(argv=None): # parse arguments (not used yet) if argv is None: argv = sys.argv # load parameters f = open(paramFn, 'r') my_params = pickle.load(f) f.close() # load graph topology g = gt.load_graph(graphFn) g.reindex_edges() num_vertices = g.num_vertices() num_edges = g.num_edges() # store vertex types vertex_types = np.array( g.vertex_properties["type"].get_array(), dtype=np.int ) # construct an edge list edge_list = np.zeros( (num_edges, 3) ) # also a lookup table for in-edges # this requires a degree list in_degrees = np.array( g.degree_property_map("in").get_array(), dtype=np.int ) max_degree = np.max( in_degrees ) if num_edges > 0: # "ragged" array of in-edges in_edges = np.zeros( (num_vertices, max_degree), dtype=np.int ) gsyn_props = g.edge_properties["gsyn"] else: in_edges = np.zeros( (num_vertices, max_degree), dtype=np.int ) gsyn_props = [] # for looping in_edge_ct = np.zeros( (num_vertices,), dtype=np.int ) i = 0 for e in g.edges(): source_index = int( e.source() ) target_index = int( e.target() ) edge_list[i,...] = [source_index, target_index, gsyn_props[e]] in_edges[ target_index, in_edge_ct[target_index] ] = i # increment indices in_edge_ct[ target_index ] += 1 i += 1 ## setup initial conditions # state will contain vertex variables & edge # variables in a 1d array N = num_vertices*num_eqns_per_vertex +\ num_edges*num_eqns_per_edge # state vector y encodes vertex and edge data y = np.zeros(N) for i in range( num_vertices ): # vertex data in 0:num_eqns_per_vertex*num_vertices-1 j = range(i*num_eqns_per_vertex, (i+1)*num_eqns_per_vertex) #print(j) y[j] = [ -0.026185387764343, 0.318012107836673, 0.760361103277830, 0.681987892188221, 0.025686471226045, 0.050058183820371, 4.998888741335261 ] offset = num_vertices*num_eqns_per_vertex for i in range( num_edges ): j = range(offset + i*num_eqns_per_edge, offset + (i+1)*num_eqns_per_edge) #print(j) y[j] = 0.000001090946631 #print(N) print y # f is the rhs with parameters evaluated def f(t, y): dydt = prebotc.rhs(t, y, vertex_types, edge_list, in_edge_ct, in_edges, my_params) return dydt # output vector of states save_state = np.zeros( (N, Nstep) ) ## hard-coded Euler method t = t0; for i in range(Nstep): dydt = f(t, y) y = y + dydt * dt # fwd Euler #save_state[:, i] = y[ 0:(num_vertices*num_eqns_per_vertex):num_eqns_per_vertex ] # just voltages save_state[:, i] = y; # all vars t = t + dt; if ( (i+1)%report_every ) == 0: print t scipy.io.savemat(outFn, mdict={'Y': save_state}, oned_as = 'col')
def poisson_disk_sample(left_hemisphere, right_hemisphere, num_regions, wmm=None, output_filename='pds_results.pickle', create_niftii=True, output_basename='randomLabels-'): # if left and right hemisphere is given as a string (filename), load if ((not isinstance(left_hemisphere, str)) or (not isinstance(right_hemisphere, str))): raise TypeError('Input error: poisson_disk_sampling(...) expects the first two arguments to be filenames.') # check if num regions is a list if (not isinstance(num_regions, list)): raise TypeError('Input error: poisson_disk_sampling(...) expects number of regions to be a list.') # ensure output_filename ends with pickle if ((not (output_filename[-7::]=='.pickle'))|(not isinstance(output_filename, str))): raise TypeError('Input error: poisson_disksampling(...) expects output name (string) ending in ".pickle".') for which, each in enumerate(num_regions): if ((each%2)!=0): print('Input error: Number of regions uneven. Fixing it by subtracting 1...') num_regions[which] = each-1 # set graph file names graph_file_left = 'graph_skel_left.xml.gz' graph_file_right = 'graph_skel_right.xml.gz' # prepare imaging data for generating voxel level graphs print('Preparing data...') imgData_left, matrix_coords_left = misc.prepare_data(left_hemisphere,wmm=wmm) imgData_right, matrix_coords_right = misc.prepare_data(right_hemisphere,wmm=wmm) # if graph files do not exist, calculate and save, else load if ((not os.path.isfile(graph_file_left)) & (not os.path.isfile(graph_file_right))): # left hemisphere graph_left = generate_graph(matrix_coords_left, imgData_left.copy()) graph_left.save(graph_file_left) #right hemisphere graph_right = generate_graph(matrix_coords_right, imgData_right.copy()) graph_right.save(graph_file_right) else: graph_left = gt.load_graph(graph_file_left) graph_right = gt.load_graph(graph_file_right) print('Finding region centres...') if os.path.isfile(output_filename): with open(output_filename,'rb') as outputfile: [regions, thresholds_left, thresholds_right, ignore] = pickle.load(outputfile) else: regions = [] thresholds_left = [] thresholds_right = [] # for each entry in the num_regions list for idx, each in enumerate(num_regions): print('Parcellating for %04d regions' %each) # find parcellations in both hemispheres thresh_left, regions_left = parcellate_hemisphere(graph_left, matrix_coords_left, int(each/2.)) # right hemisphere takes in distance estimate from left hemisphere, as a starting point thresh_right, regions_right = parcellate_hemisphere(graph_right, matrix_coords_right, int(each/2.), thresh_left) # restructure to save regions.append([regions_left[0], regions_right[0]]) thresholds_left.append(thresh_left) thresholds_right.append(thresh_right) print('Saving progress...') # save results to file with open(output_filename,'wb') as outputfile: pickle.dump([regions, thresholds_left, thresholds_right, num_regions], outputfile) if create_niftii: print('Creating niftii files...') create_parcellation(output_filename, [each], left_hemisphere, right_hemisphere, output_basename)
def load_graphtool(path, fmt):
    gt = _import_graphtool()
    graph = gt.load_graph(path, fmt=fmt)
    return cls.from_graphtool(graph)
def g():
    return load_graph('data/{}/2-6/graph.gt'.format('grid'))
def load_data(input_dir, target_labels, sample_type=None, patient_annot_file=None, final_dump_folder = None, networkize_data = False): if (sample_type == None): print("sample type must be given. For example 01A (as suffix to patient codes.)") return dump_dir = input_dir + '/processed' if (not os.path.exists(dump_dir)): os.mkdir(dump_dir) if (patient_annot_file == None): patient_file_candidates = glob.glob(input_dir + '/Clinical/Biotab/nationwidechildrens.org_clinical_patient*.txt') if (len(patient_file_candidates) != 1): print('ERROR: patient_file_candidates: ', patient_file_candidates) return(None) patient_annot_file = patient_file_candidates[0] patient_annot_processed_file = dump_dir + '/patient_annot.npz' betas_file = dump_dir + '/betas.npz' processed_betas_file = dump_dir + '/betas-processed.npz' gene_annot_file = dump_dir + '/genes.npz' graph_dump_file = dump_dir + '/graph.xml.gz' calculated_L_matrix = dump_dir + '/L.npz' ''' here we load the annotation and batch information of the samples ''' if (os.path.isfile(patient_annot_processed_file)): data_file = np.load(patient_annot_processed_file) patient_annot = data_file['patient_annot'] patient_annot_colnames = data_file['patient_annot_colnames'] patient_codes = data_file['patient_codes'] else: patient_skipped_lines = 3 patient_data = np.array(read_csv(patient_annot_file, skip_header = False)) patient_annot_colnames = patient_data[0,:] patient_annot = patient_data[patient_skipped_lines:,] patient_codes = patient_data[patient_skipped_lines:,0] xml_dir = input_dir + '/Clinical/XML' ''' here I look for the admin:batch_number key in xml files of the patients, extract that line, remove extra stuff with sed, and get a two column text with patient ids and batch numbers. ''' output = subprocess.check_output("grep \"admin:batch_number xsd_ver=\" %s/*_clinical*.xml | awk '{print $1 \"\t\" $3}' | sed \"s/.*clinical\.//g\" | sed \"s/\.xml:\t.*\\\">/\t/g\" | sed \"s/\..*//g\"" % (xml_dir), shell=True, universal_newlines=True).splitlines() patient_batches_dict = {output[i].split('\t')[0]:output[i].split('\t')[1] for i in range(len(output))} patient_batches = np.zeros(len(patient_codes), dtype=int) for i in range(len(patient_codes)): patient_batches[i] = patient_batches_dict[patient_codes[i]] patient_annot = np.hstack((patient_annot, patient_batches.reshape(-1,1))) patient_annot_colnames = np.append(patient_annot_colnames, 'batch_number') np.savez(open(patient_annot_processed_file, 'wb'), patient_annot = patient_annot, patient_annot_colnames = patient_annot_colnames, patient_codes = patient_codes) ''' in this section the methylation beta values are extracted and put into a matrix loaded from 450k illumina chip. ''' if (os.path.isfile(betas_file)): data_file = np.load(betas_file) betas = data_file['betas'] col_names = data_file['col_names'] sample_indices = data_file['methylation_45k_sample_indices'] print('fount betas_file, shape: %s' % (betas.shape.__str__())) else: data_dir = input_dir + '/DNA_Methylation/JHU_USC__HumanMethylation450/Level_3/' if (os.path.exists(data_dir)): sample_indices, col_names, betas, debug_info = \ load_450k_methylation(data_dir, patient_codes, sample_type) print(debug_info) np.savez(open(betas_file, 'wb'), betas = betas, col_names = col_names, methylation_45k_sample_indices = sample_indices) """ Don't use the PPI network if no network is needed, and return raw beta values. 
""" if not networkize_data: processed_data = dump_by_target_label(betas, target_labels, patient_annot, patient_annot_colnames, sample_indices, None, dump_dir) return (processed_data, None, col_names) ''' use the graph to map nodes to genes and get the graph itself. ''' if (os.path.isfile(processed_betas_file) and os.path.isfile(graph_dump_file) and os.path.isfile(gene_annot_file)): g = gt.load_graph(graph_dump_file) data_file = np.load(processed_betas_file) X = data_file['X'] data_file = np.load(gene_annot_file) genes = data_file['genes'] print('processed data found, X: %s' % (X.shape.__str__())) else: X, g, genes = networkize_illumina450k(betas, col_names) print (X.__class__) print (genes.__class__) print (g.__class__) np.savez(open(processed_betas_file, 'wb'), X = X) np.savez(open(gene_annot_file, 'wb'), genes=genes) g.save(graph_dump_file) if (os.path.isfile(calculated_L_matrix)): data_file = np.load(calculated_L_matrix) L = data_file['L'] print('fount L matrix, shape: %s' % (L.shape.__str__())) else: print("calculating L and transformation of the data...") B = gt.spectral.laplacian(g) M = np.identity(B.shape[0]) + Globals.beta * B M_inv = np.linalg.inv(M) L = np.linalg.cholesky(M_inv) np.savez(open(calculated_L_matrix, 'wb'), L = L) if (final_dump_folder != None): dump_dir = final_dump_folder processed_data = dump_by_target_label(X, target_labels, patient_annot, patient_annot_colnames, sample_indices, L, dump_dir) return (processed_data, g, genes)
    if (sys.argv[i] == '--cv-index'):
        cv_index = int(sys.argv[i + 1]) - 1
    if (sys.argv[i] == '--regularizer-index'):
        regularizer_index = int(sys.argv[i + 1])

print(working_dir, method, cv_index, regularizer_index, file=sys.stderr)

print("loading data...", file=sys.stderr)
data_file = np.load(working_dir + '/npdata.npz')
tmpX = data_file['tmpX']
X_prime = data_file['X_prime']
y = data_file['y']
sample_annotation = data_file['sample_annotation']
feature_annotation = data_file['feature_annotation']
g = gt.load_graph(working_dir + '/graph.xml.gz')
cvs = pickle.load(open(working_dir + '/cvs.dmp', 'rb'))

# choosing only one cross-validation fold
tmp = list()
tmp.append((cvs[cv_index]))
cvs = tmp

cpu_count = 1
max_learner_count = 3
rat_scores = dict()
all_scores = defaultdict(list)

if (method == 'others'):
    machine = svm.NuSVC(nu=0.25,
import graph_tool as gt
import graph_tool.community as gtcomm
#import graph_tool.draw as gtdraw
import graph_tool.topology as gtopo
from matplotlib.cm import OrRd_r, OrRd

from analyze_net import layout_and_plot

net = gt.load_graph('autnet0.out.gt')
core = net.vp['core']
core_vertices = [vertex for vertex in net.vertices() if core[vertex]]
print('total v: ' + str(net.num_vertices()))
print('total core: ' + str(len([vertex for vertex in net.vertices() if core[vertex]])))

# For citenet
# cutoff = 2013
# cutoff_pmap = net.new_vp('bool', vals = [net.vp['year'][vertex] > cutoff for vertex in net.vertices()])
# net.set_vertex_filter(cutoff_pmap)
# core_vertices = [vertex for vertex in net.vertices() if core[vertex]]
# print(cutoff)

# For autnets
cutoff = 0
cutoff_pmap = core.copy()
for i in range(cutoff):
    gt.infect_vertex_property(net, cutoff_pmap, vals=[True])
net.set_vertex_filter(cutoff_pmap)
core_vertices = [vertex for vertex in net.vertices() if core[vertex]]
print('cutoff v: ' + str(net.num_vertices()))
import pickle

import numpy as np

import core.FCE
import core.raccoon
import graph_tool as gt

if __name__ == '__main__':
    input_dir = '../data/TCGA-SARC/vital_status'
    data_file = np.load(input_dir + '/data.npz')
    X = data_file['X']
    X_prime = data_file['X_prime']
    y = data_file['y']
    sample_annotation = data_file['patient_annot']
    data_file = np.load(input_dir + '/../genes.npz')
    feature_annotation = data_file['genes']
    g = gt.load_graph(input_dir + '/../graph.xml.gz')
    cvs = pickle.load(open(input_dir + '/batch_cvs.dmp', 'rb'))

    # choosing only one cross-validation fold
    cvs_results = list()
    for cv_index in range(len(cvs)):
    #for cv_index in [3]:
        tmp = list()
        tmp.append((cvs[cv_index]))
        tcvs = tmp

        Xtrain = X[tcvs[0][0], ]
        ytrain = y[tcvs[0][0], ]
        Xtest = X[tcvs[0][1], ]
        ytest = y[tcvs[0][1], ]
def open_and_apply_filters(filename):
    G = gt.load_graph(filename)
    G.set_vertex_filter(G.vertex_properties['in_USPTO_tree'])
    return G
def load_data(target_output): patient_file = TCGA_root_dir + '/Clinical/Biotab/nationwidechildrens.org_clinical_patient_laml.txt' expressions_file = TCGA_root_dir + '/expressions.npz' processed_expressions_file = TCGA_root_dir + '/expressions-processed.npz' graph_dump_file = TCGA_root_dir + '/graph-geneexpression.xml.gz' if (os.path.isfile(expressions_file)): data_file = np.load(expressions_file) expressions = data_file['expressions'] col_names = data_file['col_names'] patient_data = data_file['patient_data'] sample_names = data_file['sample_names'] print('fount expressions_file, shape: %s' % (expressions.shape.__str__())) else: patient_skipped_lines = 3 patient_data = np.array(read_csv(patient_file, skip_header = False)) patient_data = patient_data[patient_skipped_lines:,] sample_names = patient_data[:,0] data_dir = TCGA_root_dir + '/Expression-Genes/WUSM__HG-U133_Plus_2/Level_3/' files = os.listdir(data_dir) col_names = np.empty(0) used_samples = np.empty(0) unused_samples = np.empty(0) multiple_data_samples = np.empty(0) i = 0 for name in sample_names: i += 1 print('processing %3d/%3d ' %(i, len(sample_names)) + name) # 03A : Primary Blood Derived Cancer - Peripheral Blood matched = [f for f in files if f.find(name+'-03A') > -1] if (len(matched) > 1): multiple_data_samples = np.append(multiple_data_samples, name) continue elif len(matched) == 0: print('no files found.') unused_samples = np.append(unused_samples, name) continue used_samples = np.append(used_samples, name) matched = matched[0] sample_data = np.array(read_csv(data_dir + matched, skip_header = False)) data_skipped_lines = 2 sample_col_names = sample_data[data_skipped_lines:,0] if col_names.shape[0] == 0: col_names = sample_col_names expressions = np.empty((0,sample_col_names.shape[0]), dtype=float) else: if all(col_names == sample_col_names) == False: raise RuntimeError("column names don't match") v = sample_data[data_skipped_lines:, 1] v[v == 'NA'] = -1 v = np.array(v, dtype=float) v[v == -1] = np.nan expressions = np.vstack((expressions, v.reshape(1,-1))) indices = np.array([i for i in range(expressions.shape[1]) if not any(np.isnan(expressions[:,i]))]) expressions = expressions[:,indices] col_names = col_names[indices] sample_indices = np.array([list(sample_names).index(used_samples[i]) for i in range(len(used_samples))]) patient_data = patient_data[sample_indices,:] np.savez(open(expressions_file, 'wb'), expressions = expressions, col_names = col_names, patient_data = patient_data, sample_names = sample_names) if (os.path.isfile(processed_expressions_file) and os.path.isfile(graph_dump_file)): g = gt.load_graph(graph_dump_file) data_file = np.load(processed_expressions_file) X = data_file['X'] genes = data_file['genes'] patient_data = patient_data print('processed data found, X: %s' % (X.shape.__str__())) else: X, g, genes = networkize_illuminaU133(expressions, col_names) print (X.__class__) print (genes.__class__) print (patient_data.__class__) print (g.__class__) np.savez(open(processed_expressions_file, 'wb'), X = X, genes=genes,patient_data=patient_data) g.save(graph_dump_file) if (target_output == 'risk_group'): # cyto_risk_group status is column index 50 labels = patient_data[:,50] y = np.zeros(len(patient_data), dtype=int) y[labels == 'Favorable'] = -1 y[labels == 'Intermediate/Normal'] = 1 y[labels == 'Poor'] = 1 final_sample_indices = (y != 0) y = y[final_sample_indices] X = X[final_sample_indices,] patient_data = patient_data[final_sample_indices,:] elif (target_output == 'vital_status'): # vital_status status 
is column index 15 labels = patient_data[:,15] y = np.zeros(len(patient_data), dtype=int) y[labels == 'Alive'] = -1 y[labels == 'Dead'] = 1 final_sample_indices = (y != 0) y = y[final_sample_indices] X = X[final_sample_indices,] patient_data = patient_data[final_sample_indices,:] else: raise RuntimeError("target_output not in ('risk_group', 'vital_status')") return (X, y, g, patient_data, genes)
ds_file = 'epi_graph_dst.npy'
orig_file = 'soc-sign-epinions.txt'
prefix = 'epi'
size = 131580
n = size
idx = int(sys.argv[1])


def print_diag(msg):
    global start, idx
    info = '{}{:.2f} seconds\n'.format
    with open('{}_out.{}'.format(prefix, idx), 'a') as f:
        f.write(info(msg.ljust(60), clock() - start))
    start = clock()


k = gt.load_graph(graph_file)
dst_mat = np.load(ds_file)
lcc = label_largest_component(k)
k.set_vertex_filter(lcc)
lcc_nodes = np.where(lcc.a)[0]
slcc = set(lcc_nodes)
all_lcc_edges = {(int(u), int(v)) for u, v in k.edges() if int(u) in slcc}
rw.read_original_graph(orig_file)
high_degree = [_[0] for _ in rw.DEGREES[-200:][::-1]]
for e, s in rw.EDGE_SIGN.items():
    rw.EDGE_SIGN[e] = 1 if s else -1
print_diag('load graph')
root = high_degree[idx]
bfs_tree = set(pot.get_bfs_tree(rw.G, root))
test_edges = all_lcc_edges - bfs_tree
test_graph = {}
import graph_tool as gt
import sys

# Specify the file to read from command line
if len(sys.argv) == 2:
    filename = sys.argv[1]
else:
    print " Usage : python reader.py graph.gml"
    exit(0)

# let's load the graph and store the vertex properties in the variable `names`.
G = gt.load_graph(filename, "xml")
names = G.vertex_properties["name"]

# iterate over vertices of graph
for i in G.vertices():
    if names[i] == "&":
        print " + and Found"
    elif names[i] == "|":
        print " + or found "
    elif names[i] == "!":
        print " + not found "
    else:
        print " + Input node found with name {0}".format(names[i])
                              attributes=EdgeAttributes)

EdgeAttributes = {}
for Edge, Value in networkx.get_edge_attributes(G=CompleteGraph,
                                                name='TraitGeneEdgeValue').iteritems():
    try:
        EdgeAttributes[Edge] = float(Value)
    except:
        EdgeAttributes[Edge] = -999.0
networkx.set_edge_attributes(G=CompleteGraph, name='TraitGeneEdgeValue',
                             attributes=EdgeAttributes)

networkx.write_graphml(G=CompleteGraph, path='Data/CompleteGraphTypeCasted.graphml',
                       encoding='utf-8', prettyprint=True)

NodeDeleteList = []
for Node in CompleteGraph.nodes():
    if ((CompleteGraph.node[Node]['NodeInENGAGEMA'] == 0) and
            (CompleteGraph.node[Node]['NodeInENGAGE'] == 0)):
        NodeDeleteList.append(Node)
for Node in NodeDeleteList:
    CompleteGraph.remove_node(Node)

networkx.write_graphml(G=CompleteGraph, path='Data/InENGAGEGraph.graphml',
                       encoding='utf-8', prettyprint=True)

InENGAGEMAGraph = graph_tool.load_graph(file_name='Data/InENGAGEGraph.graphml', fmt='xml')
print InENGAGEMAGraph
def create_parcellation(input_filename, num_regions, left_hemisphere, right_hemisphere, output_basename='randomLabels-'): # load data with open(input_filename,'rb') as input_file: [regions, thresholds_left, thresholds_right, ignore] = pickle.load(input_file) # find number of regions that exist num_regions_list = [] for each_entry in regions: if ((each_entry[0] is None) or (each_entry[1] is None)): num_regions_list.append(0) else: num_regions_list.append(len(each_entry[0])+len(each_entry[1])) # prepare imaging data for generating voxel level graphs imgData_left, matrix_coords_left = misc.prepare_data(left_hemisphere) imgData_right, matrix_coords_right = misc.prepare_data(right_hemisphere) # define graph file names graph_file_left = 'graph_voxel_left.xml.gz' graph_file_right = 'graph_voxel_right.xml.gz' # if graph files do not exist, calculate and save, else load if ((not os.path.isfile(graph_file_left)) & (not os.path.isfile(graph_file_right))): print('- Creating voxel-level graphs...') graph_left = generate_graph(matrix_coords_left, imgData_left.copy()) graph_left.save(graph_file_left) graph_right = generate_graph(matrix_coords_right, imgData_right.copy()) graph_right.save(graph_file_right) else: graph_left = gt.load_graph(graph_file_left) graph_right = gt.load_graph(graph_file_right) for number_of_regions in np.unique(num_regions): # find index for the number of regions idx_list = np.where(np.asarray(num_regions_list)==number_of_regions)[0] # check if this number of regions exist and assign indices to populate hemispheres if (len(idx_list) == 0): print('Number of regions not found. Try filling the gaps or choose different number of regions.') else: for count, idx in enumerate(idx_list): img = nib.load(left_hemisphere) print('- Populating image...') ## populate image file_template = output_basename + '%03d' % number_of_regions + '_%03d' % (count+1) + '.nii.gz' # left hemisphere left_random_label = populate_mask(graph_left, regions[idx][0], imgData_left.copy(), matrix_coords_left) outimg = nib.Nifti1Image(left_random_label, header=img.get_header(), affine=img.get_affine()) outimg.to_filename('left_' + file_template) # right hemisphere right_random_label = populate_mask(graph_right, regions[idx][1], imgData_right.copy(), matrix_coords_right, offset=left_random_label.max()) outimg = nib.Nifti1Image(right_random_label, header=img.get_header(), affine=img.get_affine()) outimg.to_filename('right_' + file_template) # combined image random_label = left_random_label + right_random_label outimg = nib.Nifti1Image(random_label, header=img.get_header(), affine=img.get_affine()) outimg.to_filename(file_template)