def SimulateSbm(sbm_data, num_vertices, num_edges, pi, prop_mat, out_degs=None):
    """Generates a stochastic block model, storing data in sbm_data.graph.

    This function uses graph_tool.generate_sbm. Refer to that documentation
    for more information on the model and parameters.

    Args:
      sbm_data: StochasticBlockModel dataclass to store result data.
      num_vertices: (int) number of nodes in the graph.
      num_edges: (int) expected number of edges in the graph.
      pi: iterable of non-zero community size proportions. Must sum to 1.0
        (checked up to floating-point rounding).
      prop_mat: square, symmetric matrix of community edge count rates.
      out_degs: Out-degree propensity for each node. If not provided, a
        constant value will be used. Note that the values will be normalized
        inside each group, if they are not already so.

    Returns:
      (none)

    Raises:
      ValueError: if pi does not sum to 1.0 or prop_mat is not k x k with
        k = len(pi).
    """
    # Proportions such as [0.7, 0.2, 0.1] sum to 0.9999999999999999 in binary
    # floating point, so an exact comparison would reject valid input.
    if not np.isclose(np.sum(pi), 1.0):
        raise ValueError("entries of pi must sum to 1.0")
    num_groups = len(pi)
    if np.shape(prop_mat) != (num_groups, num_groups):
        raise ValueError("prop_mat must be k x k where k = len(pi)")

    sbm_data.graph_memberships = _GenerateNodeMemberships(num_vertices, pi)
    edge_counts = _ComputeExpectedEdgeCounts(num_edges, num_vertices, pi,
                                             prop_mat)
    sbm_data.graph = generation.generate_sbm(sbm_data.graph_memberships,
                                             edge_counts, out_degs)
    # Reduce the sampled multigraph to a simple graph, then compact the edge
    # index range left sparse by the removals.
    stats.remove_self_loops(sbm_data.graph)
    stats.remove_parallel_edges(sbm_data.graph)
    sbm_data.graph.reindex_edges()
def load_GT_graph(graphExample, gcc=False, removeSL=False, removePE=False):
    """Load a graph_tool graph from the collection or from a .gml file.

    Input:
        - graphExample, graph example from Graph-tool collections (e.g.,
          'cond-mat-2003', 'adjnoun' 'karate' 'netscience') or a graphfile
          in .gml format
        - gcc = True if only the giant connected component should be returned
        - removeSL = True if any self-loops must be removed
        - removePE = True if any parallel-edge must be removed
    Output:
        the corresponding graph_tool graph object, made undirected
    """
    if graphExample.endswith(".gml"):
        g = load_graph(graphExample)
    else:
        g = collection.data[graphExample]

    # is_directed is a method; testing the bare attribute is always truthy.
    if g.is_directed():
        g.set_directed(False)

    if removePE:
        gtStats.remove_parallel_edges(g)
    if removeSL:
        gtStats.remove_self_loops(g)

    if gcc:
        # Keep only the largest connected component: mask everything else,
        # then drop the masked vertices from the graph.
        largest = topology.label_largest_component(g)
        g.set_vertex_filter(largest)
        g.purge_vertices()
    return g
def prepare_conceptnet(
        graph_path: Union[str, Path]) -> Tuple[Graph, Dict[str, gt.Vertex]]:
    """Load the conceptnet graph and build its aspect-name lookup.

    The graph stored at ``graph_path`` is loaded with graph_tool, its
    self-loops are removed and its edges are re-indexed.

    Returns:
        A pair ``(Graph(loaded_graph), mapping)`` where ``mapping`` pairs each
        entry of the ``"aspect_name"`` vertex property with a vertex of the
        loaded graph.
    """
    path_text = str(graph_path)

    logger.info(f"Load conceptnet graph - {path_text}")
    raw_graph = gt.load_graph(path_text)
    logger.info(f"Loaded conceptnet graph - {path_text}")

    remove_self_loops(raw_graph)
    raw_graph.reindex_edges()

    logger.info(f"Generate aspect name to vertex mapping - {path_text}")
    aspect_names = raw_graph.vertex_properties["aspect_name"]
    name_to_vertex = {
        name: vertex
        for name, vertex in zip(aspect_names, raw_graph.vertices())
    }
    return Graph(raw_graph), name_to_vertex
def prepare_conceptnet_graph(graph_path: str, relation_types: Set[str]):
    """Load a conceptnet graph restricted to the given relation types.

    Self-loops are removed and edges re-indexed, then an edge filter is
    installed that keeps only edges whose ``"relation"`` edge property value
    is contained in ``relation_types``.

    Returns:
        A pair ``(graph, mapping)`` where ``mapping`` pairs each entry of the
        ``"aspect_name"`` vertex property with a vertex of the graph.
    """
    graph = gt.load_graph(graph_path)
    remove_self_loops(graph)
    graph.reindex_edges()

    # Boolean edge mask: True for every edge whose relation is wanted.
    keep_edge = graph.new_edge_property("bool")
    edge_relations = list(graph.properties[("e", "relation")])
    progress = tqdm(
        zip(graph.edges(), edge_relations),
        desc="Edge filtering...",
        total=len(edge_relations),
    )
    for current_edge, relation in progress:
        keep_edge[current_edge] = relation in relation_types
    graph.set_edge_filter(keep_edge)

    aspect_names = graph.vertex_properties["aspect_name"]
    name_to_vertex = dict(zip(aspect_names, graph.vertices()))
    return graph, name_to_vertex
def main():
    """Attach uniformly random edge weights to a named graph and save it.

    Command-line options:
        -g/--graph        name passed to ``load_graph_by_name``
        -d/--to_directed  make the graph directed and add the reverse of
                          every existing edge
        --p_min/--p_max   bounds of the uniform edge-weight range
        -o/--output       destination path (required)

    Self-loops are removed before weighting. The weights are stored in the
    ``"weights"`` edge property and the bounds in the ``p_min`` / ``p_max``
    graph properties.
    """
    parser = argparse.ArgumentParser(
        description='Assign random edge weights to a graph and save it.')
    parser.add_argument('-g', '--graph', help='graph name')
    parser.add_argument('-d',
                        '--to_directed',
                        action='store_true',
                        help='if make directed or not')
    parser.add_argument('--p_min',
                        default=0.0,
                        type=float,
                        help='lower bound for edge weight')
    parser.add_argument('--p_max',
                        default=1.0,
                        type=float,
                        help='upper bound for edge weight')
    # Required so that a missing path is reported up front instead of failing
    # in g.save() after the graph has been loaded and weighted.
    parser.add_argument('-o',
                        '--output',
                        required=True,
                        help='path the weighted graph is saved to')
    args = parser.parse_args()

    g = load_graph_by_name(args.graph)
    remove_self_loops(g)

    if args.to_directed:
        g.set_directed(True)
        # Snapshot the edges first: edges are added while looping.
        for e in list(g.edges()):
            g.add_edge(e.target(), e.source())

    weights = g.new_edge_property('float')
    weights.a = (np.random.random(g.num_edges()) * (args.p_max - args.p_min)
                 + args.p_min)
    g.edge_properties["weights"] = weights
    g.graph_properties['p_min'] = g.new_graph_property("float", args.p_min)
    g.graph_properties['p_max'] = g.new_graph_property("float", args.p_max)

    print(g.graph_properties['p_min'], args.p_min)
    print(g.graph_properties['p_max'], args.p_max)
    print('g.num_edges()', g.num_edges())

    output_path = args.output  # 'data/{}/graph_weighted.gt'.format(args.graph)
    g.save(output_path)
    print('dumped to {}'.format(output_path))
def make_simple_graph(g, undirected=True, gcc=True):
    """Return input graph -g- without parallel edges or self-loops.

    The graph is modified in place and also returned.

    If undirected = True, returned graph is also undirected.
    If gcc = True, returned graph is giant connected component of g.
    """
    # is_directed is a method; testing the bare attribute is always truthy.
    if undirected and g.is_directed():
        g.set_directed(False)

    gtStats.remove_self_loops(g)
    gtStats.remove_parallel_edges(g)

    if gcc:
        # Keep Largest Connected Component.
        largest = topology.label_largest_component(g)
        # print() call form: the statement form is a SyntaxError on Python 3.
        print("Nodes in largest connected component: " +
              str(np.sum(largest.a)))
        g.set_vertex_filter(largest)
        g.purge_vertices()
    return g
def prepare_aspect_graph(
        experiment_paths: ExperimentPaths, ) -> Tuple[Graph, ExperimentPaths]:
    """Load the aspect graph, prune it by degree and convert it.

    The graph is loaded from ``experiment_paths.aspect_hierarchical_tree``,
    every node whose degree is greater than ``MIN_DEGREE`` is removed, and the
    result is converted with ``networkx_2_graph_tool``. Self-loops are removed
    and edges re-indexed.

    Returns:
        ``(Graph(converted_graph), experiment_paths)``; the paths object is
        passed through unchanged.
    """
    # NOTE(review): the message names aspect_to_aspect_graph but the graph is
    # loaded from aspect_hierarchical_tree - confirm which path is intended.
    logger.info(
        f"Load aspect 2 aspect graph - {str(experiment_paths.aspect_to_aspect_graph)}"
    )
    source_graph = serializer.load(experiment_paths.aspect_hierarchical_tree)
    mlflow.log_param("min_aspect_graph_degree", MIN_DEGREE)

    # NOTE(review): nodes ABOVE MIN_DEGREE are dropped although the logged
    # parameter is called a *min* degree - confirm the comparison direction.
    degree_by_node = dict(source_graph.degree())
    nodes_to_drop = []
    for node, degree in degree_by_node.items():
        if degree > MIN_DEGREE:
            nodes_to_drop.append(node)

    print(f'nodes: {len(source_graph.nodes())}')
    source_graph.remove_nodes_from(nodes_to_drop)
    print(f'nodes: {len(source_graph.nodes())}')

    converted = networkx_2_graph_tool(source_graph,
                                      node_name_property="aspect_name")
    remove_self_loops(converted)
    converted.reindex_edges()
    return Graph(converted), experiment_paths
def loadGraphWithAnnotations(graphFile):
    """Read an annotated collaboration graph and return its largest component.

    Used to read the graphs provided by RoleSim people, regarding scientific
    collaborations and the g/h index.

    File layout as consumed by this parser:
        * first line: the node count is its second whitespace-separated token;
        * one tab-separated line per node (id, name, two integer indices),
          the id being equal to the 0-based position of the line;
        * a line reading ``*Edges``;
        * one line per edge whose first two tokens are the endpoint indices.

    Returns:
        An undirected graph_tool graph with vertex properties ``"names"``,
        ``"h-Index"`` and ``"g-Index"``, without parallel edges or
        self-loops, reduced to its largest connected component.
    """
    g = Graph(directed=False)
    with open(graphFile, "r") as inF:
        num_nodes = int(inF.readline().split()[1])
        g.add_vertex(num_nodes)
        g_names = g.new_vertex_property("string")
        g_H_ind = g.new_vertex_property("int")
        g_G_ind = g.new_vertex_property("int")

        # Read Meta-Data of Nodes
        for i, line in enumerate(inF):
            # str.rstrip works on Python 2 and 3; the free function
            # rstrip() (string.rstrip) exists only on Python 2.
            line = line.rstrip()
            if line == "*Edges":
                break
            contents = line.split("\t")
            gID, name = contents[0], contents[1]
            gIndex, hIndex = int(contents[2]), int(contents[3])
            assert gID == str(i), "node id does not match its line position"
            g_names[g.vertex(i)] = name
            # NOTE(review): the value parsed as gIndex is stored in the map
            # published as "h-Index" and vice versa. Kept as in the original;
            # confirm the column order of the input files.
            g_H_ind.a[i] = gIndex
            g_G_ind.a[i] = hIndex

        # Read Edges (the same file iterator continues after "*Edges")
        for line in inF:
            tokens = line.split()
            if not tokens:
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            g.add_edge(int(tokens[0]), int(tokens[1]))

    g.vp["names"] = g_names
    g.vp["h-Index"] = g_H_ind
    g.vp["g-Index"] = g_G_ind

    gtStats.remove_parallel_edges(g)
    gtStats.remove_self_loops(g)

    # Keep Largest Connected Component
    largest = topology.label_largest_component(g)
    g.set_vertex_filter(largest)
    g.purge_vertices()
    return g
def genGraphFromFile(self,fileName):
    """Build a graph from a neighbour-list text file and register it.

    File layout as consumed by this parser:
        * optional header lines starting with ``#`` of the form
          ``# <Property> <value>``. "Weighted" is read as a boolean, "Input"
          is ignored, "Name"/"Distribution"/"Type" are kept as strings,
          "Nodes"/"Edges" are read as ints and anything else as a float;
        * one line per vertex, in vertex order. After the first space the
          line holds space-separated ``<neighbour>;<weight>`` entries.

    The sign of each weight is stored in the ``"type"`` edge property. Its
    absolute value is stored in ``"weight"`` when the graph is weighted. When
    the header does not say, the graph counts as weighted only if some weight
    is not a whole number.

    Self-loops are removed, the graph's properties are refreshed and the
    graph is handed to ``self.parent.new_graph_added``.

    Returns:
        The newly created GraphClass instance.
    """
    # Context manager so the file handle is closed once the lines are read.
    with open(fileName) as graphFile:
        lstFileStrings = [line.strip() for line in graphFile]

    # dic where will store the graph's properties
    lstStringProp = ["Name", "Distribution", "Type"]
    lstIntProp = ["Nodes", "Edges"]
    dicProp = {}
    graph = GraphClass()

    # load graph properties; i counts the lines before the neighbour list.
    # The bounds test and startswith() keep an empty line or a header-only
    # file from raising IndexError.
    i = 0
    while i < len(lstFileStrings) and lstFileStrings[i].startswith("#"):
        strHeader = lstFileStrings[i]
        # the name starts after "#" or "# " and ends at the next space
        idxPropNameStart = 2 if strHeader[1:2] == " " else 1
        nIdxEndPropName = strHeader.find(" ", idxPropNameStart)
        strProp = strHeader[idxPropNameStart:nIdxEndPropName]
        strValue = strHeader[nIdxEndPropName + 1:]
        if strProp == "Weighted":
            dicProp[strProp] = strValue == "True"
        elif strProp == "Input":
            pass
        elif strProp in lstStringProp:
            dicProp[strProp] = strValue
        elif strProp in lstIntProp:
            dicProp[strProp] = int(strValue)
        else:
            dicProp[strProp] = float(strValue)
        i += 1

    # if there was no header, patch up: size the buffers for a complete
    # directed graph on one vertex per line
    if i == 0:
        nNodes = len(lstFileStrings)
        dicProp["Nodes"] = nNodes
        dicProp["Edges"] = nNodes * (nNodes - 1)
        dicProp["Density"] = (nNodes - 1) / float(nNodes)
        dicProp["Name"] = "Graph_{}".format(len(self.parent.lstGraphs))

    graph.get_graph().add_vertex(dicProp["Nodes"])
    lstEdges = np.zeros((2, dicProp["Edges"]))
    lstWeights = np.zeros(dicProp["Edges"])

    # load graph
    idxEdge = 0
    for j in range(i, len(lstFileStrings)):
        strLine = lstFileStrings[j]
        idxNextSpace = strLine.find(" ", 0)
        # get all neighbours and connection strengths for vertex j - i
        while idxNextSpace != -1:
            lstEdges[0, idxEdge] = j - i
            idxEndVertNumber = strLine.find(";", idxNextSpace + 1)
            lstEdges[1, idxEdge] = float(
                strLine[idxNextSpace + 1:idxEndVertNumber])
            # the weight runs up to the next space, or to the end of the line
            idxNextSpace = strLine.find(" ", idxEndVertNumber)
            idxEndWeight = len(strLine) if idxNextSpace == -1 else idxNextSpace
            lstWeights[idxEdge] = float(
                strLine[idxEndVertNumber + 1:idxEndWeight])
            idxEdge += 1

    # Slots that were never filled stay (0, 0); those become self-loops and
    # are dropped by remove_self_loops below.
    graph.get_graph().add_edge_list(np.transpose(lstEdges.astype(int)))

    # add the edges' properties
    lstTypes = np.sign(lstWeights)
    lstWeights = np.absolute(lstWeights)
    epropType = graph.get_graph().new_edge_property("int", lstTypes)
    graph.get_graph().edge_properties["type"] = epropType

    if "Weighted" in dicProp:
        bWeighted = dicProp["Weighted"]
    else:
        # Not declared in the header: infer it from the weights themselves.
        bWeighted = not np.ma.allequal(np.trunc(lstWeights), lstWeights)
        graph.setProp("Weighted", bWeighted)
    if bWeighted:
        epropWeights = graph.get_graph().new_edge_property(
            "double", lstWeights)
        graph.get_graph().edge_properties["weight"] = epropWeights

    # put the graph inside the list and update comboBox
    remove_self_loops(graph.get_graph())
    graph.update_prop()
    self.parent.new_graph_added(graph,)
    return graph
def _interval_retweet_times(interval_df, last_time):
    """Map each node on a cascade edge to a retweet time; return (times, last_time)."""
    times = {}
    for _, row in interval_df.iterrows():
        if row['edge_type'] == 'historical':
            continue
        src = row['source']
        tgt = row['target']
        times[tgt] = row['retweet_time']
        # A source without a retweet of its own inherits the previous time.
        if src not in times:
            times[src] = last_time
        last_time = row['retweet_time']
    return times, last_time


def _build_interval_graph(cascade_df, historical_df):
    """Build the directed cascade + historical graph; return (graph, id_map, names)."""
    graph = gt.Graph(directed=True)
    user_to_vertex = {}  # map from user ID to graphnode ID
    vertex_names = graph.new_vertex_property("string")  # graphnode -> user

    def vertex_for(user_id):
        if user_id in user_to_vertex:
            return graph.vertex(user_to_vertex[user_id])
        # Vertices are added one at a time, so the next index is the count.
        user_to_vertex[user_id] = len(user_to_vertex)
        vertex = graph.add_vertex()
        vertex_names[vertex] = user_id
        return vertex

    # Add the cascade edges, each at most once
    for _, row in cascade_df.iterrows():
        v1 = vertex_for(row['source'])
        v2 = vertex_for(row['target'])
        if not graph.edge(v1, v2):
            graph.add_edge(v1, v2)
    gts.remove_parallel_edges(graph)

    # Add the historical diffusion edges (even if there already exists a
    # cascade edge, but only once). Both endpoints must already be part of
    # the cascade, otherwise the lookup raises KeyError.
    edges_seen = set()
    for _, row in historical_df.iterrows():
        edge = (user_to_vertex[row['source']], user_to_vertex[row['target']])
        if edge in edges_seen:
            continue
        edges_seen.add(edge)
        graph.add_edge(*edge)
    gts.remove_self_loops(graph)
    return graph, user_to_vertex, vertex_names


def _gate_exposure_nodes(node_times, thresh):
    """Drop leading (key-sorted) nodes until the product of the remaining times reaches thresh."""
    # NOTE(review): the order is by node key, as written in the original
    # sorted(time_dict.items()); confirm whether ordering by time was meant.
    ordered = sorted(node_times.items())
    while ordered and np.prod([rt for _, rt in ordered]) < thresh:
        ordered = ordered[1:]
    return [node for node, _ in ordered]


def motif_operation(mid, cnt_mid):
    """Compute the exposure nodes of every reshare in cascade ``mid``.

    The steps of computing the exposure nodes are as follows:
      1. For each pair of consecutive intervals [I-1, I] of the cascade,
         form the cascade + historical network N_I.
      2. Compute the network motifs of size 3 exhibited in N_I.
      3. For each node u activated in interval I, select the motif instances
         of patterns M4, M7, M16, M23, M25 and M31 that contain both u and
         its parent and in which u has the latest retweet time.
      4. Add the third node of each selected instance (the one that is
         neither the parent nor u) to the exposure set of u.
      5. Apply the threshold gate: nodes are dropped from the exposure set
         until the product of the remaining retweet times is at least the
         threshold.

    Reads the module-level ``dataDf`` and ``motif_patterns_dict``.

    Args:
        mid: value of the ``mid`` column selecting the cascade.
        cnt_mid: running cascade counter, only printed.

    Returns:
        The cascade-type rows of the cascade with an added ``exposureNodes``
        column holding one list of exposure nodes per row (empty when none
        was found).
    """
    # 1.
    cascade_set = dataDf[dataDf['mid'] == mid]
    print("Cascade : ", cnt_mid, " and reshare size: ", len(cascade_set))
    # NOTE(review): the interval count is read from column 'interval_2:'
    # while rows are selected on 'interval_1' - confirm the column name.
    numIntervals = np.max(np.array(list(cascade_set['interval_2:'])))

    thresh = 5 * 0.00001
    target_patterns = [
        motif_patterns_dict[name]
        for name in ('M4', 'M7', 'M16', 'M23', 'M25', 'M31')
    ]

    last_time = 0
    inf_nodes = {}  # influential nodes apart from parent, per retweeting node

    for int_idx in range(1, numIntervals):
        # Operation 1: rows of the previous and the current interval.
        cascade_intervalDf_prev = cascade_set[cascade_set['interval_1'] ==
                                              int_idx - 1]
        cascade_intervalDf_curr = cascade_set[cascade_set['interval_1'] ==
                                              int_idx]
        cascade_intervalDf = pd.concat(
            [cascade_intervalDf_prev, cascade_intervalDf_curr])
        cascade_Df = cascade_intervalDf[cascade_intervalDf['edge_type'] ==
                                        'cascade']
        historical_Df = cascade_intervalDf[cascade_intervalDf['edge_type'] ==
                                           'historical']
        vertex_rtTime_dict, last_time = _interval_retweet_times(
            cascade_intervalDf, last_time)

        # Operation 2: cascade graph + diffusion network.
        cascade_graph, node_cascades_diff_map, cascade_vertices = \
            _build_interval_graph(cascade_Df, historical_Df)

        # Operation 3: motifs of size 3 with their vertex maps.
        motifs_graph, _, vertex_maps = \
            gt.clustering.motifs(cascade_graph, 3, return_maps=True)
        # Whether a motif matches a pattern does not depend on the row being
        # processed, so decide it once per interval.
        matching_motifs = [
            idx_map for idx_map, found in enumerate(motifs_graph)
            if any(gtt.isomorphism(pattern, found)
                   for pattern in target_patterns)
        ]

        # Operation 4: influential nodes for the retweets of the current
        # interval only - NOT the previous interval.
        for _, r in cascade_intervalDf_curr.iterrows():
            src = r['source']
            tgt = r['target']
            if tgt in inf_nodes:
                continue
            # only consider the cascade retweets for (src, tgt) pair
            if r['edge_type'] == 'historical':
                continue
            src_vert = node_cascades_diff_map[src]
            tgt_vert = node_cascades_diff_map[tgt]
            src_rtTime = vertex_rtTime_dict[src]
            tgt_rtTime = vertex_rtTime_dict[tgt]

            exposure_times = {}  # exposure node -> its retweet time
            for idx_map in matching_motifs:
                for vertices in vertex_maps[idx_map]:
                    # Cond. 1: source and target are both in the instance
                    vertex_list = list(vertices.a)
                    if (src_vert not in vertex_list
                            or tgt_vert not in vertex_list):
                        continue
                    # the non-source, non-target vertex is the candidate
                    third = next((v for v in vertex_list
                                  if v != src_vert and v != tgt_vert), None)
                    if third is None:
                        continue
                    third_vertex = cascade_vertices[third]
                    # Cond. 2: the target retweeted last among the three
                    third_rtTime = vertex_rtTime_dict[third_vertex]
                    if max([tgt_rtTime, src_rtTime,
                            third_rtTime]) != tgt_rtTime:
                        continue
                    exposure_times[third_vertex] = third_rtTime

            if exposure_times:
                # Operation 5. Gated here, with the times of the interval in
                # which the nodes were found: the per-interval time dict does
                # not contain nodes seen only in earlier intervals.
                inf_nodes[tgt] = _gate_exposure_nodes(exposure_times, thresh)

    # Create a dataframe with the rows as original. Copy first so the new
    # column is not written onto a slice of cascade_set.
    cascade_Df = cascade_set[cascade_set['edge_type'] == 'cascade'].copy()
    cascade_Df['exposureNodes'] = [
        inf_nodes.get(tgt, []) for tgt in cascade_Df['target']
    ]
    return cascade_Df