def gen_test_node_wrt_changes(graph_t0, graph_t1):
    """Return the affected nodes suitable for testing.

    A node qualifies if it is touched by an edge addition or deletion
    between the two snapshots AND exists in both graphs; nodes that only
    appear in one snapshot (newly added / removed) are excluded.
    """
    from utils import unique_nodes_from_edge_set
    # work on copies to avoid problems caused by node removal on callers' graphs
    g_prev = graph_t0.copy()
    g_curr = graph_t1.copy()

    added_edges = edge_s1_minus_s0(s1=set(g_curr.edges()), s0=set(g_prev.edges()))
    removed_edges = edge_s1_minus_s0(s1=set(g_prev.edges()), s0=set(g_curr.edges()))

    touched_by_add = unique_nodes_from_edge_set(added_edges)    # unique
    touched_by_del = unique_nodes_from_edge_set(removed_edges)  # unique
    affected = list(set(touched_by_add + touched_by_del))       # unique

    # nodes that do not exist in g_prev (brand new) or g_curr (vanished)
    fresh_nodes = [n for n in touched_by_add if n not in g_prev.nodes()]
    vanished_nodes = [n for n in touched_by_del if n not in g_curr.nodes()]
    excluded = fresh_nodes + vanished_nodes

    # keep only nodes seen in both snapshots
    return [n for n in affected if n not in excluded]
def align_nodes(graph_t0, graph_t1):
    """Return a copy of graph_t1 whose node set is aligned with graph_t0.

    Nodes that newly appeared in graph_t1 are removed, and nodes that were
    deleted since graph_t0 are re-added (as isolated nodes), so the returned
    graph shares graph_t0's node universe.

    Args:
        graph_t0: previous graph snapshot.
        graph_t1: current graph snapshot (not mutated; a copy is returned).

    Returns:
        The aligned copy of graph_t1.
    """
    from utils import unique_nodes_from_edge_set
    G0 = graph_t0.copy()
    G1 = graph_t1.copy()

    # edge diffs; one may directly use edge streams if available
    edge_add = edge_s1_minus_s0(s1=set(G1.edges()), s0=set(G0.edges()))
    edge_del = edge_s1_minus_s0(s1=set(G0.edges()), s0=set(G1.edges()))

    node_affected_by_edge_add = unique_nodes_from_edge_set(edge_add)
    node_affected_by_edge_del = unique_nodes_from_edge_set(edge_del)
    # NOTE: the union of affected nodes was previously computed here but
    # never used — removed as dead code.

    node_add = [
        node for node in node_affected_by_edge_add if node not in G0.nodes()
    ]  # nodes not exist in G0
    node_del = [
        node for node in node_affected_by_edge_del if node not in G1.nodes()
    ]  # nodes not exist in G1

    # to align G1 with G0
    G1.remove_nodes_from(node_add)
    G1.add_nodes_from(node_del)
    return G1
def sampling(self,
             graph_t0,
             graph_t1,
             sampling_strategy,
             emb_dict_t0=None,
             w2v_models=None):
    """Draw one node sample per base model from the nodes affected between snapshots.

    Sampling strategies:
        1: naively repeat all affected nodes
        2: sample with replacement with equal probability
        3: sample with replacement with equal probability; partially, e.g. 80%

    Args:
        graph_t0: previous graph snapshot.
        graph_t1: current graph snapshot.
        sampling_strategy: one of {1, 2, 3}; anything else aborts.
        emb_dict_t0, w2v_models: unused here; kept for interface compatibility.

    Returns:
        (node_samples, node_add, node_del) — node_samples is a list (one entry
        per base model) of node lists; node_add / node_del are the nodes that
        appeared / vanished between the snapshots.

    Raises:
        SystemExit: if sampling_strategy is not recognized.
    """
    t1 = time.time()
    G0 = graph_t0.copy()
    G1 = graph_t1.copy()

    # edge diffs; one may directly use edge streams if available
    edge_add = edge_s1_minus_s0(s1=set(G1.edges()), s0=set(G0.edges()))
    edge_del = edge_s1_minus_s0(s1=set(G0.edges()), s0=set(G1.edges()))

    node_affected_by_edge_add = unique_nodes_from_edge_set(edge_add)
    node_affected_by_edge_del = unique_nodes_from_edge_set(edge_del)
    # nodes being directly affected between G0 and G1
    node_affected = list(
        set(node_affected_by_edge_add + node_affected_by_edge_del))
    node_add = [
        node for node in node_affected_by_edge_add if node not in G0.nodes()
    ]  # nodes not exist in G0
    node_del = [
        node for node in node_affected_by_edge_del if node not in G1.nodes()
    ]  # nodes not exist in G1

    # nodes being directly affected that still exist in current graph G1
    node_sample_pool = list(set(node_affected) - set(node_del))
    pool_size = len(node_sample_pool)
    print(f'# node being affected in current graph: {pool_size}')

    node_samples = []  # list of list
    if sampling_strategy == 1:
        # every base model sees the full affected-node pool
        print('S1: naively repeat all affected nodes')
        for _ in range(self.num_base_models):
            node_samples.append(node_sample_pool)
    elif sampling_strategy == 2:
        # bootstrap: same-size resample with replacement per base model
        print('S2: sample with replacement with equal probability')
        for _ in range(self.num_base_models):
            node_samples.append(
                list(
                    np.random.choice(node_sample_pool,
                                     size=pool_size,
                                     replace=True)))
    elif sampling_strategy == 3:
        # partial bootstrap: resample only a fraction of the pool
        print('S3: sample with replacement with equal probability; '
              'partially e.g. 80%')
        frac = 0.80
        for _ in range(self.num_base_models):
            node_samples.append(
                list(
                    np.random.choice(node_sample_pool,
                                     size=int(frac * pool_size),
                                     replace=True)))
    else:
        # equivalent to builtin exit(msg): raises SystemExit with the message
        raise SystemExit('Exit, sampling strategy not found ...')

    t2 = time.time()
    print(f'sampling time cost: {(t2-t1):.2f}')
    return node_samples, node_add, node_del
def gen_test_edge_wrt_changes(graph_t0, graph_t1, seed=None):
    """Generate **changed** testing edges for the link prediction task.

    Currently only pos_neg_ratio = 1.0 is considered: whichever side
    (added vs deleted edges) is shorter gets padded — negatives by sampled
    non-edges of graph_t0, positives by unchanged edges of graph_t1.

    Args:
        graph_t0, graph_t1: two networkx graphs (consecutive snapshots).
        seed: seed for the stdlib ``random`` module.
            NOTE(review): the start-node draw below uses ``np.random.choice``,
            which this seed does not control — confirm whether numpy's RNG
            should be seeded as well for full reproducibility.

    Returns:
        pos_edges_with_label: [[node1, node2, 1], ...]
        neg_edges_with_label: [[node3, node4, 0], ...]
    """
    G0 = graph_t0.copy()
    G1 = graph_t1.copy()  # use copy to avoid problem caused by G1.remove_node(node)
    edge_add = edge_s1_minus_s0(s1=set(G1.edges()), s0=set(G0.edges()))
    edge_del = edge_s1_minus_s0(s1=set(G0.edges()), s0=set(G1.edges()))

    # drop nodes unseen at t0, to avoid unseen nodes while testing
    unseen_nodes = set(G1.nodes()) - set(G0.nodes())
    for node in unseen_nodes:
        G1.remove_node(node)

    # likewise drop added edges incident to an unseen node
    edge_add_unseen_node = []
    for node in unseen_nodes:
        for edge in edge_add:
            if node in edge:
                edge_add_unseen_node.append(edge)
    edge_add = edge_add - set(edge_add_unseen_node)

    neg_edges_with_label = [list(item + (0, )) for item in edge_del]
    pos_edges_with_label = [list(item + (1, )) for item in edge_add]

    random.seed(seed)
    all_nodes = list(G0.nodes())
    if len(edge_add) > len(edge_del):
        # too few negatives: sample non-edges of G0 as extra negatives
        num = len(edge_add) - len(edge_del)
        start_nodes = np.random.choice(all_nodes, num, replace=True)
        i = 0
        for start_node in start_nodes:
            try:
                non_nbrs = list(nx.non_neighbors(G0, start_node))
                non_nbr = random.sample(non_nbrs, 1).pop()
                non_edge = (start_node, non_nbr)
                if non_edge not in edge_del:
                    neg_edges_with_label.append(list(non_edge + (0, )))
                    i += 1
                    if i >= num:
                        break
            except ValueError:
                # random.sample raises ValueError when non_nbrs is empty,
                # i.e. the node is connected to every other node
                print('Found a fully connected node: ', start_node,
                      'Ignore it...')
    elif len(edge_add) < len(edge_del):
        # too few positives: pad with edges of G1 that are not newly added
        num = len(edge_del) - len(edge_add)
        i = 0
        for edge in nx.edges(G1):
            if edge not in edge_add:
                pos_edges_with_label.append(list(edge + (1, )))
                i += 1
                if i >= num:
                    break
    else:  # len(edge_add) == len(edge_del): already balanced
        pass
    print('---- len(pos_edges_with_label), len(neg_edges_with_label)',
          len(pos_edges_with_label), len(neg_edges_with_label))
    return pos_edges_with_label, neg_edges_with_label
def node_selecting_scheme(graph_t0,
                          graph_t1,
                          reservoir_dict,
                          limit=0.1,
                          local_global=0.5):
    """Select the nodes whose embeddings should be updated.

    Args:
        graph_t0: previous graph at time t-1.
        graph_t1: current graph at time t.
        reservoir_dict: accumulated per-node changes, always maintained
            across calls; entries are removed once a node is updated or
            disappears from the graph.
        limit: fraction of graph_t1's nodes to update (new nodes excluded).
        local_global: balance between nodes chosen from recent changes
            (local awareness) and randomly sampled nodes (global topology).

    Returns:
        (node_update_list, reservoir_dict, node_del, node_add)
    """
    G0 = graph_t0.copy()
    G1 = graph_t1.copy()

    # added edges; one may directly use streaming added edges if possible
    edge_add = edge_s1_minus_s0(s1=set(G1.edges()), s0=set(G0.edges()))
    # deleted edges; one may directly use streaming added edges if possible
    edge_del = edge_s1_minus_s0(s1=set(G0.edges()), s0=set(G1.edges()))
    # edges whose weight changed between the two snapshots
    edge_wei, common_edge = egde_weight_changed(G1=G1, G0=G0)

    node_affected_by_edge_add = unique_nodes_from_edge_set(edge_add)  # unique
    node_affected_by_edge_del = unique_nodes_from_edge_set(edge_del)  # unique
    node_affected_by_edge_wei = unique_nodes_from_edge_set(edge_wei)  # unique
    # all nodes touched by any of the three kinds of change (unique)
    node_affected = list(
        set(node_affected_by_edge_add + node_affected_by_edge_del +
            node_affected_by_edge_wei))
    node_add = [n for n in node_affected_by_edge_add
                if n not in G0.nodes()]  # newly appeared nodes
    node_del = [n for n in node_affected_by_edge_del
                if n not in G1.nodes()]  # vanished nodes

    # vanished nodes are dropped from the reservoir as well
    if len(node_del) != 0:
        reservoir_key_list = list(reservoir_dict.keys())
        for node in node_del:
            if node in reservoir_key_list:
                del reservoir_dict[node]  # if node being deleted, also delete it from reservoir

    # affected nodes that are present in both G0 and G1
    exist_node_affected = list(
        set(node_affected) - set(node_add) - set(node_del))

    # L2 distance between each node's attribute vectors in G0 vs G1
    attri_change = {}
    for node in exist_node_affected:
        delta = (np.array(G0.nodes[node]["attribute"]) -
                 np.array(G1.nodes[node]["attribute"]))
        attri_change[node] = np.linalg.norm(delta, ord=2)

    num_limit = int(G1.number_of_nodes() * limit)  # total nodes to update
    local_limit = int(local_global * num_limit)    # nodes from local awareness
    global_limit = num_limit - local_limit         # nodes from global topology

    # pick the most-affected nodes (local part)
    most_affected_nodes, reservoir_dict = select_most_affected_nodes(
        G0, G1, attri_change, local_limit, reservoir_dict,
        exist_node_affected)
    # if the changes are relatively smaller than local_limit, sample some
    # random nodes for compensation
    lack = local_limit - len(most_affected_nodes)
    # tabu = new nodes plus already-chosen most-affected nodes
    tabu_nodes = set(node_add + most_affected_nodes)
    other_nodes = list(set(G1.nodes()) - tabu_nodes)
    # random nodes drawn from the remainder (global part)
    random_nodes = list(
        np.random.choice(other_nodes,
                         min(global_limit + lack, len(other_nodes)),
                         replace=False))

    # final list of nodes whose embeddings get updated
    node_update_list = node_add + most_affected_nodes + random_nodes

    reservoir_key_list = list(reservoir_dict.keys())
    # remove repeated nodes due to resample
    node_update_set = set(node_update_list)
    # once a node is selected for update, reset its accumulated change so
    # the reservoir starts counting afresh next round
    for node in node_update_set:
        if node in reservoir_key_list:
            del reservoir_dict[node]  # if updated, delete it

    print(
        f'num_limit {num_limit}, local_limit {local_limit}, global_limit {global_limit}, # nodes updated {len(node_update_list)}'
    )
    print(f'# nodes added {len(node_add)}, # nodes deleted {len(node_del)}')
    print(
        f'# nodes affected {len(node_affected)}, # nodes most affected {len(most_affected_nodes)}, # of random nodes {len(random_nodes)}'
    )
    print(
        f'num of nodes in reservoir with accumulated changes but not updated {len(list(reservoir_dict))}'
    )
    return node_update_list, reservoir_dict, node_del, node_add