Esempio n. 1
0
def SimulateSbm(sbm_data,
                num_vertices,
                num_edges,
                pi,
                prop_mat,
                out_degs=None):
    """Generates a stochastic block model, storing data in sbm_data.

    This function uses graph_tool.generate_sbm. Refer to that
    documentation for more information on the model and parameters.

    The generated graph is post-processed in place: self-loops and parallel
    edges are removed and the edges are re-indexed. The result is written to
    `sbm_data.graph`, and the per-node group assignment to
    `sbm_data.graph_memberships`.

    Args:
      sbm_data: StochasticBlockModel dataclass to store result data.
      num_vertices: (int) number of nodes in the graph.
      num_edges: (int) expected number of edges in the graph.
      pi: iterable of non-zero community size proportions. Must sum to 1.0
        (up to floating-point rounding).
      prop_mat: square, symmetric matrix of community edge count rates. Must
        expose a two-dimensional `.shape` of (len(pi), len(pi)).
      out_degs: Out-degree propensity for each node. If not provided, a constant
        value will be used. Note that the values will be normalized inside each
        group, if they are not already so.

    Returns: (none)

    Raises:
      ValueError: if the entries of `pi` do not sum to 1.0, or if `prop_mat`
        is not k x k with k = len(pi).
    """
    # Compare with a tolerance: proportions that are mathematically valid can
    # miss 1.0 by a rounding error, and an exact `!=` would reject them.
    if not np.isclose(np.sum(pi), 1.0):
        raise ValueError("entries of pi must sum to 1.0")
    num_groups = len(pi)
    if prop_mat.shape[0] != num_groups or prop_mat.shape[1] != num_groups:
        raise ValueError("prop_mat must be k x k where k = len(pi)")
    sbm_data.graph_memberships = _GenerateNodeMemberships(num_vertices, pi)
    edge_counts = _ComputeExpectedEdgeCounts(num_edges, num_vertices, pi,
                                             prop_mat)
    sbm_data.graph = generation.generate_sbm(sbm_data.graph_memberships,
                                             edge_counts, out_degs)
    # Reduce the sampled multigraph to a simple graph, then compact the edge
    # index range left sparse by the removals.
    stats.remove_self_loops(sbm_data.graph)
    stats.remove_parallel_edges(sbm_data.graph)
    sbm_data.graph.reindex_edges()
Esempio n. 2
0
def load_GT_graph(graphExample, gcc=False, removeSL=False, removePE=False):
    '''
    Load a graph and optionally simplify it. The graph is always made undirected.

    Input:  - graphExample,  graph example from Graph-tool collections  (e.g.,  'cond-mat-2003', 'adjnoun' 'karate' 'netscience') or a graphfile in .gml format
            - gcc = True if only the giant connected component should be returned
            - removeSL = True if any self-loops must be removed
            - removePE = True if any parallel-edge must be removed
    Output: the corresponding graph_tool graph object (modified in place)
    '''

    # A name ending in ".gml" is treated as a file path; anything else is
    # looked up in the graph-tool collection.
    if graphExample.endswith(".gml"):
        g = load_graph(graphExample)
    else:
        g = collection.data[graphExample]

    # is_directed is a method and has to be called: the bare attribute is a
    # bound method object, which is always truthy.
    if g.is_directed():
        g.set_directed(False)

    if removePE:
        gtStats.remove_parallel_edges(g)
    if removeSL:
        gtStats.remove_self_loops(g)
    if gcc:
        # Keep only the largest connected component.
        largest_component = topology.label_largest_component(g)
        g.set_vertex_filter(largest_component)
    # Drop the vertices hidden by the filter set above (if any).
    g.purge_vertices()

    return g
Esempio n. 3
0
def prepare_conceptnet(
        graph_path: Union[str, Path]) -> Tuple[Graph, Dict[str, gt.Vertex]]:
    """Load the ConceptNet graph and index its vertices by aspect name.

    The graph is read from `graph_path`, its self-loops are removed and its
    edges are re-indexed. Progress is reported through `logger`.

    Returns:
        A pair `(graph, vertices)`: `graph` is `Graph(...)` built from the
        cleaned graph, and `vertices` maps each value of the "aspect_name"
        vertex property to a vertex. The vertices in the mapping are taken
        from the graph as loaded, before it is wrapped in `Graph(...)`.
    """
    path_text = str(graph_path)

    logger.info(f"Load conceptnet graph - {path_text}")
    loaded = gt.load_graph(path_text)
    logger.info(f"Loaded conceptnet graph - {path_text}")

    remove_self_loops(loaded)
    loaded.reindex_edges()

    logger.info(f"Generate aspect name to vertex mapping  - {path_text}")
    aspect_names = loaded.vertex_properties["aspect_name"]
    name_to_vertex = {
        name: vertex
        for name, vertex in zip(aspect_names, loaded.vertices())
    }
    return Graph(loaded), name_to_vertex
def prepare_conceptnet_graph(graph_path: str, relation_types: Set[str]):
    """Load a ConceptNet graph restricted to the given relation types.

    Self-loops are removed and edges re-indexed, then an edge filter is
    installed so that only edges whose "relation" edge property is a member
    of `relation_types` remain visible.

    Returns:
        A pair `(graph, vertices)` where `vertices` maps each value of the
        "aspect_name" vertex property to its vertex.
    """
    graph = gt.load_graph(graph_path)
    remove_self_loops(graph)
    graph.reindex_edges()

    # Flag, edge by edge, whether its relation label is one we keep.
    keep_edge = graph.new_edge_property("bool")
    edge_relations = list(graph.properties[("e", "relation")])
    labelled_edges = tqdm(zip(graph.edges(), edge_relations),
                          desc="Edge filtering...",
                          total=len(edge_relations))
    for current_edge, relation_label in labelled_edges:
        keep_edge[current_edge] = relation_label in relation_types
    graph.set_edge_filter(keep_edge)

    aspect_names = graph.vertex_properties["aspect_name"]
    name_to_vertex = dict(zip(aspect_names, graph.vertices()))

    return graph, name_to_vertex
def main():
    """Attach uniformly random edge weights to a named graph and save it.

    Command-line options: the graph name (-g), whether to turn the graph into
    a directed one by adding a reversed copy of every edge (-d), the weight
    bounds (--p_min / --p_max) and the output path (-o). Weights are drawn
    uniformly from [p_min, p_max) and stored in the "weights" edge property;
    the bounds are stored as graph properties.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-g', '--graph', help='graph name')
    parser.add_argument('-d',
                        '--to_directed',
                        action='store_true',
                        help='if make directed or not')
    bound_options = (
        ('--p_min', 0.0, 'lower bound for edge weight'),
        ('--p_max', 1.0, 'upper bound for edge weight'),
    )
    for flag, default_value, help_text in bound_options:
        parser.add_argument(flag,
                            default=default_value,
                            type=float,
                            help=help_text)
    parser.add_argument('-o', '--output')
    args = parser.parse_args()

    g = load_graph_by_name(args.graph)
    remove_self_loops(g)

    if args.to_directed:
        g.set_directed(True)
        # Snapshot the edges first: edges are added while we walk over them.
        for existing in list(g.edges()):
            g.add_edge(existing.target(), existing.source())

    # Scale uniform [0, 1) samples into [p_min, p_max).
    weights = g.new_edge_property('float')
    span = args.p_max - args.p_min
    weights.a = np.random.random(g.num_edges()) * span + args.p_min
    g.edge_properties["weights"] = weights

    for key, bound in (('p_min', args.p_min), ('p_max', args.p_max)):
        g.graph_properties[key] = g.new_graph_property("float", bound)
    print(g.graph_properties['p_min'], args.p_min)
    print(g.graph_properties['p_max'], args.p_max)
    print('g.num_edges()', g.num_edges())

    output_path = args.output
    g.save(output_path)

    print('dumped to {}'.format(output_path))
Esempio n. 6
0
def make_simple_graph(g, undirected=True, gcc=True):
    '''
    Returns input graph -g- in a version without parallel edges or self-loops.
    If undirected = True, returned graph is also undirected.
    If gcc        = True, returned graph is giant connected component of g.

    The graph is modified in place; the returned object is -g- itself.
    '''

    # is_directed is a method and has to be called: the bare attribute is a
    # bound method object, which is always truthy.
    if undirected and g.is_directed():
        g.set_directed(False)

    gtStats.remove_self_loops(g)
    gtStats.remove_parallel_edges(g)

    if gcc:
        # Keep Largest Connected Component.
        largest_component = topology.label_largest_component(g)
        # print() with a single argument behaves the same on Python 2 and 3.
        print("Nodes in largest connected component: " +
              str(np.sum(largest_component.a)))
        g.set_vertex_filter(largest_component)
    # Drop the vertices hidden by the filter set above (if any).
    g.purge_vertices()
    return g
Esempio n. 7
0
def prepare_aspect_graph(
    experiment_paths: ExperimentPaths, ) -> Tuple[Graph, ExperimentPaths]:
    """Load the aspect graph, prune it by degree and convert it for graph-tool.

    The graph is read via `serializer.load`, every node whose degree exceeds
    `MIN_DEGREE` is removed, and the result is converted with
    `networkx_2_graph_tool` (node names stored in the "aspect_name" property).
    Self-loops are then removed and the edges re-indexed. `MIN_DEGREE` is
    logged to mlflow as "min_aspect_graph_degree".

    Returns:
        `(Graph(converted_graph), experiment_paths)`; `experiment_paths` is
        passed through unchanged.
    """
    # NOTE(review): the message reports `aspect_to_aspect_graph`, while the
    # file actually loaded is `aspect_hierarchical_tree` - confirm which is
    # intended.
    logger.info(
        f"Load aspect 2 aspect graph - {str(experiment_paths.aspect_to_aspect_graph)}"
    )
    nx_graph = serializer.load(experiment_paths.aspect_hierarchical_tree)

    mlflow.log_param("min_aspect_graph_degree", MIN_DEGREE)
    degree_of = dict(nx_graph.degree())
    # NOTE(review): this drops nodes whose degree is ABOVE MIN_DEGREE, which
    # reads oddly next to the "min degree" parameter name - confirm the
    # direction of the comparison.
    nodes_to_drop = [
        node for node in degree_of if degree_of[node] > MIN_DEGREE
    ]

    # Node count before and after pruning.
    print(f'nodes: {len(nx_graph.nodes())}')
    nx_graph.remove_nodes_from(nodes_to_drop)
    print(f'nodes: {len(nx_graph.nodes())}')

    converted = networkx_2_graph_tool(nx_graph,
                                      node_name_property="aspect_name")
    remove_self_loops(converted)
    converted.reindex_edges()
    return Graph(converted), experiment_paths
Esempio n. 8
0
def loadGraphWithAnnotations(graphFile):
    '''
    Used to read the graphs provided by RoleSim people, regarding scientific collaborations
    and the g/h index.

    File layout, as parsed here:
      - first line: second whitespace-separated token is the number of nodes;
      - one tab-separated record per node (id, name, two integer indices),
        where the id must equal the zero-based record position;
      - a line "*Edges";
      - one edge per line as two whitespace-separated integer node ids.

    Returns an undirected graph with vertex properties "names", "h-Index" and
    "g-Index", stripped of parallel edges and self-loops and reduced to its
    largest connected component.
    '''
    g = Graph(directed=False)
    with open(graphFile, "r") as inF:
        num_nodes = int(inF.readline().split()[1])
        g.add_vertex(num_nodes)
        g_names = g.new_vertex_property("string")
        g_H_ind = g.new_vertex_property("int")
        g_G_ind = g.new_vertex_property("int")

        for i, line in enumerate(inF):  # Read Meta-Data of Nodes
            # str.rstrip() instead of the free function rstrip(), which only
            # exists through Python 2's `string` module.
            record = line.rstrip()
            if record == "*Edges":
                break
            contents = record.split("\t")
            gID, name = contents[0], contents[1]
            gIndex, hIndex = int(contents[2]), int(contents[3])
            assert (gID == str(i))
            g_names[g.vertex(i)] = name
            # NOTE(review): the column parsed as gIndex is stored in the map
            # published as "h-Index" and vice versa. Either the local names or
            # these assignments are swapped - confirm against the file format.
            g_H_ind.a[i] = gIndex
            g_G_ind.a[i] = hIndex

        # The same file iterator continues right after the "*Edges" marker.
        for line in inF:  # Read Edges
            tokens = line.split()
            if not tokens:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            g.add_edge(int(tokens[0]), int(tokens[1]))

        g.vp["names"] = g_names
        g.vp["h-Index"] = g_H_ind
        g.vp["g-Index"] = g_G_ind

    gtStats.remove_parallel_edges(g)
    gtStats.remove_self_loops(g)
    # Keep Largest Connected Component
    largest_component = topology.label_largest_component(g)
    g.set_vertex_filter(largest_component)
    g.purge_vertices()

    return g
Esempio n. 9
0
	def genGraphFromFile(self,fileName):
		"""Build a graph from a neighbour-list text file and register it with the parent.

		File layout, as parsed here:
		  - optional header lines starting with "#", each "# <Property> <value>"
		    (the space after "#" is optional). "Weighted" is read as a boolean,
		    "Input" is ignored, "Name"/"Distribution"/"Type" are kept as strings,
		    "Nodes"/"Edges" are read as ints and anything else as a float;
		  - then one line per source vertex, in vertex order. After the first
		    space, each space-separated entry is "<target>;<weight>".

		The sign of each weight is stored in the "type" edge property and its
		absolute value, for weighted graphs, in the "weight" edge property.

		Args:
			fileName: path of the file to read.

		Returns:
			The new GraphClass instance, after it was handed to
			self.parent.new_graph_added.
		"""
		# read everything inside a context manager so the handle gets closed
		with open(fileName) as graphFile:
			lstFileStrings = [line.strip().rstrip(' ') for line in graphFile]
		# dic where will store the graph's properties
		lstStringProp = ["Name", "Distribution", "Type"]
		lstIntProp = ["Nodes", "Edges"]
		dicProp = {}
		graph = GraphClass()
		i = 0 # count lines before neighbour list
		# load graph properties; the bounds check and startswith keep a blank
		# line or a header-only file from raising IndexError
		while i < len(lstFileStrings) and lstFileStrings[i].startswith("#"):
			# end of the prop. name
			idxPropNameStart = 2 if lstFileStrings[i][1] == " " else 1
			nIdxEndPropName = lstFileStrings[i].find(" ",idxPropNameStart)
			strProp = lstFileStrings[i][idxPropNameStart:nIdxEndPropName]
			# prop. value
			strValue = lstFileStrings[i][nIdxEndPropName + 1:]
			if strProp == "Weighted":
				dicProp[strProp] = (strValue == "True")
			elif strProp == "Input":
				pass
			elif strProp not in lstStringProp:
				if strProp in lstIntProp:
					dicProp[strProp] = int(strValue)
				else:
					dicProp[strProp] = float(strValue)
			else:
				dicProp[strProp] = strValue
			i+=1
		# if there was no header, patch up:
		if i == 0:
			nNodes = len(lstFileStrings)
			dicProp["Nodes"] = nNodes
			dicProp["Edges"] = nNodes*(nNodes-1)
			dicProp["Density"] = (nNodes-1)/float(nNodes)
			dicProp["Name"] = "Graph_{}".format(len(self.parent.lstGraphs))
		graph.get_graph().add_vertex(dicProp["Nodes"])
		# preallocated from the declared edge count; columns that are never
		# filled stay (0, 0), i.e. self-loops on vertex 0
		lstEdges = np.zeros((2,dicProp["Edges"]))
		lstWeights = np.zeros(dicProp["Edges"])
		# load graph
		idxEdge = 0
		for j in range(i,len(lstFileStrings)):
			strLine = lstFileStrings[j]
			idxNextSpace = strLine.find(" ",0)
			# get all neighbours and connections strength for current vertex j-i
			while idxNextSpace != -1:
				# put the vertices in the edges list
				lstEdges[0,idxEdge] = j-i
				idxEndVertNumber = strLine.find(";",idxNextSpace+1)
				lstEdges[1,idxEdge] = strLine[idxNextSpace+1:idxEndVertNumber]
				# get the connection's weight
				idxNextSpace = strLine.find(" ",idxEndVertNumber)
				if idxNextSpace == -1:
					lstWeights[idxEdge] = float(strLine[idxEndVertNumber+1:len(strLine)])
				else:
					lstWeights[idxEdge] = float(strLine[idxEndVertNumber+1:idxNextSpace])
				idxEdge += 1
		graph.get_graph().add_edge_list(np.transpose(lstEdges.astype(int)))
		# add the edges' properties
		lstTypes = np.sign(lstWeights)
		lstWeights = np.absolute(lstWeights)
		epropType = graph.get_graph().new_edge_property("int",lstTypes)
		graph.get_graph().edge_properties["type"] = epropType
		if "Weighted" in dicProp:
			if dicProp["Weighted"]:
				epropWeights = graph.get_graph().new_edge_property("double",lstWeights)
				graph.get_graph().edge_properties["weight"] = epropWeights
		else:
			# the header did not say: treat the graph as weighted as soon as
			# one weight is not a whole number
			if np.ma.allequal(np.trunc(lstWeights), lstWeights):
				graph.setProp("Weighted",False)
			else:
				graph.setProp("Weighted",True)
				epropWeights = graph.get_graph().new_edge_property("double",lstWeights)
				graph.get_graph().edge_properties["weight"] = epropWeights
		# put the graph inside the list and update comboBox
		remove_self_loops(graph.get_graph())
		graph.update_prop()
		self.parent.new_graph_added(graph,)
		return graph
def motif_operation(mid, cnt_mid):
    ''' Compute, for one cascade, the exposure nodes of every reshare target.

        Arguments:
            mid:     cascade identifier; selects the rows of the module-level
                     dataframe `dataDf` whose 'mid' column equals it.
            cnt_mid: running cascade counter, only used in the progress print.

        Returns the cascade-edge rows of the selected cascade with an added
        'exposureNodes' column (one list per row, empty when nothing was found
        for the row's target).

        Besides `dataDf`, the function reads the module-level
        `motif_patterns_dict` (motif name -> pattern graph).

        The steps of computing the exposure nodes are as follows:
        1. For each pair of intervals of a cascade [I-1, I] for I in [1, I_C], form the cascade + historical network N_I.
        2. Compute the network motifs of size 3 exhibited in N_I.
        3. For each node u activated in interval I, see whether u was a part of any of
           the instances belonging to the 6 motif patterns mentioned in the paper such that u had participated in
           those instances after the other two nodes, one of which must be the source.
        4. Add the third node in those selected instances barring the parent and u (which together form the 3-motif instance)
           in the exposure set of u for the cascade C.
        5. This step imposes the AND gating constraint to remove nodes that violate the boolean TRUE threshold constraint.

        NOTE(review): step 5 cannot run as written - see the notes at
        "Operation 5" in the body.
    '''
    # 1.
    cascade_set = dataDf[dataDf['mid'] == mid]

    print("Cascade : ", cnt_mid, " and reshare size: ", len(cascade_set))

    # NOTE(review): the column read here is 'interval_2:' (with a trailing
    # colon) while every filter below uses 'interval_1' - confirm the name.
    numIntervals = np.max(np.array(list(cascade_set['interval_2:'])))
    # Carried across intervals: retweet time of the last cascade row processed.
    last_time = 0
    inf_nodes = {
    }  # stores the influential nodes apart from parent for each node retweet
    inf_edges = {
    }  # stores the influential edges between nodes apart from parent for each node retweet
    # One slot per interval, fixed at 500 slots; indexed with int_idx - 1 below.
    # Only written in this function, never read or returned.
    motif_graph_patterns = [[] for _ in range(500)]
    # range() stops before numIntervals, so the interval numbered numIntervals
    # itself is not visited.
    for int_idx in range(1, numIntervals):
        # print("Interval: ", int_idx)
        ''' Operation 1. '''
        # Rows of the previous and of the current interval, stacked.
        cascade_intervalDf_prev = cascade_set[cascade_set['interval_1'] ==
                                              int_idx - 1]
        cascade_intervalDf_curr = cascade_set[cascade_set['interval_1'] ==
                                              int_idx]
        cascade_intervalDf = pd.concat(
            [cascade_intervalDf_prev, cascade_intervalDf_curr])

        cascade_Df = cascade_intervalDf[cascade_intervalDf['edge_type'] ==
                                        'cascade']
        historical_Df = cascade_intervalDf[cascade_intervalDf['edge_type'] ==
                                           'historical']

        # Create the vertex time dictionary
        # (rebuilt for every interval; only non-historical rows contribute).
        vertex_rtTime_dict = {}
        for i, r in cascade_intervalDf.iterrows():
            if r['edge_type'] == 'historical':
                continue

            src = r['source']
            tgt = r['target']
            vertex_rtTime_dict[tgt] = r['retweet_time']
            # A source without a recorded time gets the time of the previously
            # processed row (0 for the very first one).
            if src not in vertex_rtTime_dict:
                vertex_rtTime_dict[src] = last_time

            last_time = r['retweet_time']

        # Store the cascade edges
        # NOTE(review): both lists are filled but never read afterwards.
        edges_cascade = []
        edges_historical = []
        for i, r in cascade_intervalDf.iterrows():
            src = r['source']
            tgt = r['target']

            if r['edge_type'] == 'cascade':
                edges_cascade.append((src, tgt))
            else:
                edges_historical.append((src, tgt))
        ''' Operation 2. '''
        cascade_graph = gt.Graph(directed=True)
        node_cascades_diff_map = {}
        cnt_nodes = 0
        cascade_vertices = cascade_graph.new_vertex_property("string")
        cascade_edge_prop = cascade_graph.new_edge_property("int")
        # NOTE(review): written below but never read in this function.
        cascade_map_write_file = {}

        # Add the cascade edges
        # 0 - Cascade edges
        # 1 - Diffusion edges

        for i, r in cascade_Df.iterrows():
            src = r['source']
            tgt = r['target']
            if src not in node_cascades_diff_map:
                node_cascades_diff_map[
                    src] = cnt_nodes  # map from user ID to graphnode ID
                v1 = cascade_graph.add_vertex()
                cascade_vertices[v1] = src  # map from graphnode ID to user ID
                cascade_map_write_file[cnt_nodes] = src
                cnt_nodes += 1
            else:
                v1 = cascade_graph.vertex(node_cascades_diff_map[src])

            if tgt not in node_cascades_diff_map:
                node_cascades_diff_map[
                    tgt] = cnt_nodes  # map from user ID to graphnode ID
                v2 = cascade_graph.add_vertex()
                cascade_vertices[v2] = tgt  # map from graphnode ID to user ID
                cascade_map_write_file[cnt_nodes] = tgt
                cnt_nodes += 1
            else:
                v2 = cascade_graph.vertex(node_cascades_diff_map[tgt])

            # Add each (src, tgt) cascade edge at most once.
            if cascade_graph.edge(v1, v2):
                continue
            else:
                e = cascade_graph.add_edge(v1, v2)
                cascade_edge_prop[e] = 0

        gts.remove_parallel_edges(cascade_graph)

        # Add the historical diffusion edges (even if there already exists a cascade edge, but only once)
        # NOTE(review): the lookups below raise KeyError for a historical
        # endpoint that never appeared on a cascade edge of this window, since
        # only cascade rows populate node_cascades_diff_map.
        edges_seen = []
        for i, r in historical_Df.iterrows():
            src = r['source']
            tgt = r['target']
            v1 = node_cascades_diff_map[src]
            v2 = node_cascades_diff_map[tgt]

            if (v1, v2) in edges_seen:
                continue

            edges_seen.append((v1, v2))
            e = cascade_graph.add_edge(v1, v2)
            cascade_edge_prop[e] = 1

        gts.remove_self_loops(cascade_graph)
        # gts.remove_parallel_edges(cascade_graph)
        '''' Operation 3. '''
        # FINDING THE MOTIFS IN THE CASCADE GRAPH + DIFFUSION NETWORK - SIZE 3
        motifs_graph, motifs_count, vertex_maps = \
            gt.clustering.motifs(cascade_graph, 3, return_maps=True)

        # Store the motif patterns interval wise for retrieval later
        for idx_pat in range(len(motifs_graph)):
            motif_graph_patterns[int_idx - 1].append(motifs_graph[idx_pat])
        '''' Operation 4. '''
        # Find the influential nodes for each node in the retweet cascade for the current interval only - NOT the previous interval
        for i, r in cascade_intervalDf_curr.iterrows():
            src = r['source']
            tgt = r['target']
            # A target that already has exposure nodes is not revisited.
            if tgt in inf_nodes:
                continue

            # extract the graph node IDs
            # NOTE(review): these lookups run before the 'historical' check
            # below, so a historical row with an unmapped endpoint raises
            # KeyError here instead of being skipped.
            src_vert = node_cascades_diff_map[src]
            tgt_vert = node_cascades_diff_map[tgt]
            # extract the vertex timestamps
            src_rtTime = vertex_rtTime_dict[src]
            tgt_rtTime = vertex_rtTime_dict[tgt]

            # only consider the cascade retweets for (src, tgt) pair
            edge_type = r['edge_type']
            if edge_type == 'historical':
                continue

            # find the motifs of particular patterns attached to that pair of src and tgt
            # Patterns - [M4, M7, M16, M17, M23, M25]
            # NOTE(review): the list above does not match the keys actually
            # read below (M4, M7, M16, M23, M25, M31).

            # Patterns handcoded - can be automated into lists or arrays for more efficiency
            graph_pat_act_M4 = motif_patterns_dict['M4']
            graph_pat_act_M7 = motif_patterns_dict['M7']
            graph_pat_act_M16 = motif_patterns_dict['M16']
            graph_pat_act_M23 = motif_patterns_dict['M23']
            graph_pat_act_M25 = motif_patterns_dict['M25']
            graph_pat_act_M31 = motif_patterns_dict['M31']

            # Extract the motif instances belonging to this pattern
            for idx_map in range(len(motifs_graph)):
                graph_pat_curr = motifs_graph[idx_map]
                # check if the instance belongs to any of these patterns
                # (skip the motif when it is isomorphic to none of the six)
                if (not gtt.isomorphism(graph_pat_act_M4, graph_pat_curr) ) and (not gtt.isomorphism(graph_pat_act_M7, graph_pat_curr) ) \
                    and (not gtt.isomorphism(graph_pat_act_M16, graph_pat_curr) ) and (not gtt.isomorphism(graph_pat_act_M23, graph_pat_curr) ) \
                        and (not gtt.isomorphism(graph_pat_act_M25, graph_pat_curr)) and (not gtt.isomorphism(graph_pat_act_M31, graph_pat_curr) ):
                    continue

                # for M in motif_patterns_dict:
                #     if gtt.isomorphism(motif_patterns_dict[M], graph_pat_curr):
                #         print(M, motifs_count[idx_map])

                # return

                # 1st constraint: Traverse through all the motif instances of this pattern that only contain the (src, tgt) cascade edge
                vMaps = vertex_maps[idx_map]
                # print(len(vMaps))
                # NOTE(review): cnt_maps is never used.
                cnt_maps = 0
                for vertices in vMaps:
                    # print('hello....')
                    # Cond. 1: the source and target should be in the motif instance
                    vertex_list = list(vertices.a)
                    if src_vert not in vertex_list or tgt_vert not in vertex_list:
                        continue
                    # print('hello1')

                    # Find the non-source and non-target vertex
                    # NOTE(review): if no such vertex exists, third_vertex
                    # keeps the value from an earlier instance (or is unbound
                    # on the first one).
                    for v in vertex_list:
                        if v != src_vert and v != tgt_vert:
                            third_vertex = cascade_vertices[
                                v]  # this is the potential non-parent exposure node to target node
                            break

                    # print('hello2')
                    # Cond. 2: the target vertex should have retweeted last among all the motif vertices
                    # NOTE(review): third_vertex comes out of a "string"
                    # property map, while vertex_rtTime_dict is keyed by the
                    # raw dataframe values - this lookup presumably only works
                    # when the IDs in dataDf are strings; confirm.
                    third_rtTime = vertex_rtTime_dict[third_vertex]
                    max_time = max([tgt_rtTime, src_rtTime, third_rtTime])
                    if max_time != tgt_rtTime:
                        continue

                    # For different motif patterns, need to check different types of motif edges - this is difficult !!

                    # print(tgt, third_vertex)
                    if tgt not in inf_nodes:
                        inf_nodes[tgt] = []
                        inf_edges[tgt] = []

                    # Append, then de-duplicate through a set (order of the
                    # resulting list is not preserved).
                    inf_nodes[tgt].append(third_vertex)
                    inf_nodes[tgt] = list(set(inf_nodes[tgt]))
                    # if (third_vertex, tgt) in edges_cas_curr:
                    #     inf_edges[tgt].append((third_vertex, tgt, 'cascade'))
                    # elif (third_vertex, tgt) in edges_hist_curr:
                    #     inf_edges[tgt].append((third_vertex, tgt, 'historical'))
        ''' Operation 5. '''
        # NOTE(review): this step cannot run as written and needs a decision
        # on the intended semantics before it can be repaired:
        #   - sorted(time_dict.items()) returns a list of (key, value) pairs
        #     ordered by key; a list has no .values(), so the np.prod line
        #     raises AttributeError as soon as inf_nodes is non-empty;
        #   - the while loop never recomputes `product`, so it would not
        #     terminate whenever product < thresh;
        #   - `np.find` is not a NumPy function;
        #   - inf_nodes accumulates across intervals, while vertex_rtTime_dict
        #     only holds the vertices of the current window, so the lookup of
        #     an older exposure node can raise KeyError.
        thresh = 5 * 0.00001
        for node in inf_nodes:
            time_dict = {}
            exposure_nodes = inf_nodes[node]
            for e in exposure_nodes:
                time_dict[e] = vertex_rtTime_dict[e]

            sorted_dict = sorted(time_dict.items())

            product = np.prod(np.array(list(sorted_dict.values())))
            val = list(sorted_dict.values())
            while product < thresh:
                val = val[1:]

            inf_nodes[node] = np.find(sorted_dict, val)

    #  Create a dataframe with the rows as original
    cascade_Df = cascade_set[cascade_set['edge_type'] == 'cascade']
    inf_nodes_df = []
    # NOTE(review): count_row is incremented but never read.
    count_row = 0

    # One entry per cascade row, in row order: the exposure nodes found for
    # the row's target, or an empty list.
    for idx, row in cascade_Df.iterrows():
        tgt = row['target']

        if tgt not in inf_nodes:
            inf_nodes_df.append([])
        else:
            inf_nodes_df.append(inf_nodes[tgt])

        count_row += 1

    # NOTE(review): cascade_Df is a filtered selection of cascade_set; adding a
    # column to it may trigger pandas' SettingWithCopyWarning - consider an
    # explicit .copy() when this function is next changed.
    cascade_Df['exposureNodes'] = inf_nodes_df

    return cascade_Df