def incremental_simulation(g, c, p, return_new_edges=False):
    """Extend a cascade by simulating additional edge activations.

    Starting from every node already infected in ``c``, the graph is explored
    breadth-first.  For each out-edge ``e`` of a dequeued node a fresh uniform
    number is drawn; the edge fires when the draw is ``<= p[e]`` and the
    target has not been reached yet.

    Args:
        g: graph exposing ``num_vertices()`` and ``vertex(u).out_edges()``.
        c: infection-time array indexed by node; ``infected_nodes(c)``
            selects the nodes that are already infected.
        p: activation probability, indexed by edge.
        return_new_edges: if true, also return the activated ``(u, v)`` pairs.

    Returns:
        ``new_c`` (a copy of ``c`` in which every newly infected node gets the
        infection time of its infector plus one), or ``(new_c, new_edges)``
        when ``return_new_edges`` is true.
    """
    from collections import deque

    seeds = list(infected_nodes(c))

    visited = {v: False for v in np.arange(g.num_vertices())}
    for v in seeds:
        visited[v] = True

    new_c = copy(c)
    if return_new_edges:
        new_edges = []

    # deque gives O(1) pops from the left; list.pop(0) is O(n)
    queue = deque(seeds)
    while queue:
        u = queue.popleft()
        for e in g.vertex(u).out_edges():
            v = int(e.target())
            # the random draw happens for every edge, visited target or not
            if np.random.random() <= p[e] and not visited[v]:
                if return_new_edges:
                    new_edges.append((u, v))
                # Read the infector's time from new_c, not c: a node infected
                # during this call is still marked uninfected in c, which
                # would give its successors a bogus infection time.
                new_c[v] = new_c[u] + 1
                visited[v] = True
                queue.append(v)

    if return_new_edges:
        return (new_c, new_edges)
    return new_c
def plot(self, c, X, probas, interception_func=None, setting_kwargs={},
         uninfected_small=False, lighten_obs=True, lighten_prediction=False,
         highlight_missing_infection=False, color_mapper=None, **kwargs):
    """Draw the graph as a heat map of ``probas``.

    The plot setting is built by ``heatmap_plot_setting`` and then adjusted
    according to the flags below before being handed to ``visualize``.

    Args:
        c: cascade array (passed to ``infected_nodes`` / ``cascade_source``).
        X: observed nodes.
        probas: per-node values used as the node colour information.
        interception_func: optional callable receiving the setting dict just
            before drawing, so the caller can tweak it in place.
        setting_kwargs: extra keyword arguments for ``heatmap_plot_setting``
            (only unpacked, never mutated here).
        uninfected_small: enlarge the observed nodes by 1.5x and shrink the
            uninfected ones to ``default / 1.5``.
        lighten_obs: set the colour entry of the observed nodes to 0.
        lighten_prediction: set colour entries equal to 1 to 0.5, then set the
            cascade source to 1.
        highlight_missing_infection: computes the infected nodes that are
            neither observed nor predicted with value 1.
            NOTE(review): the result is currently not used for anything.
        color_mapper: forwarded to ``heatmap_plot_setting``.
        **kwargs: forwarded to ``visualize``.
    """
    plot_cfg = heatmap_plot_setting(self.g, c, X, probas,
                                    color_mapper=color_mapper,
                                    **setting_kwargs)

    if uninfected_small:
        sizes = plot_cfg['node_size_info']
        healthy = set(np.arange(len(c))) - set(infected_nodes(c))
        terminals = tuple(X)
        # make terminals larger
        sizes[terminals] = sizes[terminals] * 1.5
        # make uninfected smaller
        sizes[tuple(healthy)] = sizes['default'] / 1.5

    if lighten_obs:
        plot_cfg['node_color_info'][X] = 0

    if lighten_prediction:
        colors = plot_cfg['node_color_info']
        root = cascade_source(c)
        colors[colors == 1] = 0.5
        colors[root] = 1

    if highlight_missing_infection:
        certain = set((probas == 1).nonzero()[0])
        missing = set(infected_nodes(c)) - set(X) - certain

    if interception_func is not None:
        interception_func(plot_cfg)

    visualize(self.g, self.pos, **plot_cfg, **kwargs)
def ic(g, p, source=None, stop_fraction=1.0, return_tree_edges=False):
    """Simulate a cascade snapshot (graph_tool version).

    Args:
        g: the graph.
        p: edge probabilities handed to ``sample_graph_by_p``.
        source: cascade source; drawn at random over all vertices if ``None``.
        stop_fraction: determines how large the snapshot is, as a fraction of
            the number of vertices.
        return_tree_edges: if true, also return the edges of the infection
            tree, restricted to the nodes kept in the snapshot.

    Returns:
        ``(source, times, tree_edges)`` where ``times`` is an ``np.ndarray``
        over vertices holding the infection time (uninfected nodes have -1)
        and ``tree_edges`` is ``None`` unless ``return_tree_edges`` is true.

    Raises:
        CascadeTooSmall: when fewer than ``int(stop_fraction * |V|)`` nodes
            were infected in the sampled graph.
    """
    if source is None:
        source = random.choice(np.arange(g.num_vertices(), dtype=int))

    gv = sample_graph_by_p(g, p)

    # first pass without edges: enough to decide whether the cascade is big
    # enough before doing any further work
    times = get_infection_time(gv, source, return_edges=False)
    size = len(infected_nodes(times))
    min_size = int(stop_fraction * g.num_vertices())
    if size < min_size:
        # size does not fit, early stopping to save time
        raise CascadeTooSmall('{} < {}'.format(size, min_size))

    tree_edges = None
    if return_tree_edges:
        # only recompute when the tree edges are actually needed; otherwise
        # the infection times from the first pass are reused as-is
        times, tree_edges = get_infection_time(
            gv, source, return_edges=return_tree_edges)

    # truncate the infection to fit size: push uninfected nodes past the
    # latest infection time, keep the `min_size` earliest nodes, reset the rest
    times[times == -1] = (times.max() + 1)
    uninfected = times.argsort()[min_size:]
    times[uninfected] = -1

    if tree_edges is not None:
        inf_nodes = set(infected_nodes(times))
        tree_edges = [
            (u, v) for u, v in tree_edges
            if u in inf_nodes and v in inf_nodes
        ]
    return source, times, tree_edges
def incremental_simulation(g, c, p, num_nodes, return_new_edges=False):
    """Incrementally add nodes to the given cascade.

    A graph is sampled with ``sample_graph_by_p`` and every node that shares
    a connected component (in the sampled graph) with an already infected
    node becomes infected as well.

    Args:
        g: the graph (possibly a filtered view).
        c: cascade array; ``infected_nodes(c)`` gives the current infections.
        p: edge probabilities used for sampling.
        num_nodes: length of the returned array.  It is passed explicitly
            because a vertex filter might be active on ``g``; when ``None``,
            ``g.num_vertices()`` is used instead.
        return_new_edges: no longer supported; must be false.

    Returns:
        Float array of length ``num_nodes`` with 1 for infected nodes and -1
        for all others.

    Raises:
        Exception: if ``return_new_edges`` is true.
    """
    if return_new_edges:
        # fail before doing any sampling work
        raise Exception("`return_new_edges` not supported anymore")

    if num_nodes is None:
        num_nodes = g.num_vertices()

    gv = sample_graph_by_p(g, p)
    comp = label_components(gv)[0]
    labels = comp.a

    seeds = infected_nodes(c)
    new_infected_nodes = set(seeds)
    covered_cids = set()
    for v in seeds:
        cid = comp[v]
        if cid in covered_cids:
            continue
        covered_cids.add(cid)
        new_infected_nodes |= set((labels == cid).nonzero()[0])

    # sized by num_nodes rather than g.num_vertices(), as the parameter intends
    new_c = np.ones(num_nodes) * (-1)
    new_c[list(new_infected_nodes)] = 1
    return new_c
def add_incremental_edges(self, tree_nodes):
    """Grow a set of tree nodes via ``incremental_simulation``.

    Args:
        tree_nodes: collection of node indices (a ``GraphView`` is rejected).

    Returns:
        Set of nodes infected after the incremental simulation.

    Raises:
        TypeError: if ``tree_nodes`` is a ``GraphView``.
        AssertionError: if the graph carries no edge weights.
    """
    if isinstance(tree_nodes, GraphView):
        message = ('add_incremental_edges does not support GraphView yet. '
                   + 'Please pass in a set of nodes')
        raise TypeError(message)

    # encode the node set as a cascade array: 1 = infected, -1 = not infected
    seed_cascade = -np.ones(self.num_nodes)
    seed_cascade[list(tree_nodes)] = 1

    weights = get_edge_weights(self.g)
    assert weights is not None, 'for incremental edge addition, edge weight should be given'

    grown = incremental_simulation(
        self.g, seed_cascade, weights, self.num_nodes,
        return_new_edges=False)
    return set(infected_nodes(grown))
def get_infection_time(g, source, return_edges=False):
    """Infection times for the IC model, taken as distances from ``source``.

    Distances come from ``shortest_distance``; entries equal to ``MAXINT``
    are rewritten to -1.

    Args:
        g: graph on which distances are computed.
        source: source vertex.
        return_edges: if true, also return ``(predecessor, node)`` pairs for
            the infected nodes.

    Returns:
        The infection-time array, or ``(times, edges)`` when ``return_edges``
        is true.
    """
    dist_map, pred_map = shortest_distance(g, source=source, pred_map=True)
    infection_time = np.array(dist_map.a)
    infection_time[infection_time == MAXINT] = -1

    if not return_edges:
        return infection_time

    # pair every infected node with its predecessor; skip nodes whose
    # predecessor is negative or is the node itself
    candidates = ((pred_map[v], v) for v in infected_nodes(infection_time))
    tree_edges = [
        (parent, child) for parent, child in candidates
        if parent >= 0 and parent != child
    ]
    return infection_time, tree_edges
def run_with_or_without_resampling(g, cid, c, X, n_samples, sampling_method):
    """Compare tree-sampling configurations on a single cascade.

    Three sampler configurations are evaluated ('P', 'P_new' and
    'no resampling').  For each, infection probabilities are estimated and
    scored on the nodes that are not in ``X``.

    Args:
        g: the graph.
        cid: cascade identifier, stored under the key 'cid' in both results.
        c: cascade array.
        X: observed nodes.
        n_samples: number of trees to sample.
        sampling_method: forwarded to ``TreeSamplePool``.

    Returns:
        ``(ap_ans, p_ans)``: two dicts keyed by configuration name (plus
        'cid') holding the average-precision score and the
        precision-at-cascade-size score respectively.
    """
    gi = from_gt(g, get_edge_weights(g))

    n_nodes = len(c)
    y_true = np.zeros((n_nodes, ))
    y_true[infected_nodes(c)] = 1

    # evaluate on unobserved nodes only
    observed = set(X)
    mask = np.array([(node not in observed) for node in range(n_nodes)])

    root_sampler = build_true_root_sampler(c)

    options = {
        'P': dict(with_resampling=True,
                  true_casacde_proba_func=cascade_probability_gt),
        'P_new': dict(with_resampling=True,
                      true_casacde_proba_func=ic_cascade_probability_gt),
        'no resampling': dict(with_resampling=False),
    }

    ap_ans = {}
    p_ans = {}
    for name, opt in options.items():
        sampler = TreeSamplePool(g, n_samples, sampling_method,
                                 gi=gi, return_type='nodes', **opt)
        sampler.fill(X, root_sampler=root_sampler)
        estimator = TreeBasedStatistics(g, sampler.samples)
        probas = infection_probability(g, X, sampler, estimator)

        ap_ans[name] = average_precision_score(y_true[mask], probas[mask])
        p_ans[name] = precision_at_cascade_size(y_true[mask], probas[mask])

    ap_ans['cid'] = cid
    p_ans['cid'] = cid
    return ap_ans, p_ans
def heatmap_plot_setting(g, c, X, weight, **kwargs):
    """Build a plot setting whose node colours are given by ``weight``.

    Starts from ``default_plot_setting`` and then gives observed nodes,
    hidden infections and the default entry the same node size
    (``10 * size_multiplier``).

    Args:
        g: the graph.
        c: cascade array.
        X: observed nodes.
        weight: value stored as ``node_color_info``.
        **kwargs: forwarded to ``default_plot_setting``; ``size_multiplier``
            (default 1.0) is also read here to scale node sizes.

    Returns:
        The setting dict.
    """
    hidden_infs = set(infected_nodes(c)) - set(X)
    multiplier = kwargs.get('size_multiplier', 1.0)

    s = default_plot_setting(g, c, X, **kwargs)

    uniform_size = 10 * multiplier
    sizes = s['node_size_info']
    sizes[tuple(X)] = uniform_size
    sizes[tuple(hidden_infs)] = uniform_size
    sizes['default'] = uniform_size

    s['node_color_info'] = weight
    return s
def default_plot_setting(g, c, X, size_multiplier=1.0,
                         edge_width_multiplier=1.0,
                         deemphasize_hidden_infs=False):
    """Assemble the default colour/shape/size settings for a cascade plot.

    Node settings are ordered dicts keyed by tuples of nodes (or 'default').
    Entries are inserted in a fixed order, which is preserved here.

    Args:
        g: the graph (not used in this function).
        c: cascade array.
        X: observed nodes.
        size_multiplier: scale factor applied to all node sizes.
        edge_width_multiplier: scale factor applied to the edge pen width.
        deemphasize_hidden_infs: if true, hidden infections get no dedicated
            colour or size entry.

    Returns:
        Dict with the keys 'node_color_info', 'node_shape_info',
        'node_size_info', 'edge_color_info', 'edge_pen_width_info' and
        'node_text_info'.
    """
    source_key = (cascade_source(c), )
    hidden_key = tuple(set(infected_nodes(c)) - set(X))
    obs_key = tuple(X)
    show_hidden = not deemphasize_hidden_infs

    # colours: observed, [hidden], source, default
    color_rules = [(obs_key, COLOR_BLUE)]
    if show_hidden:
        color_rules.append((hidden_key, COLOR_YELLOW))
    color_rules.append((source_key, COLOR_GREEN))
    color_rules.append(('default', COLOR_WHITE))

    # shapes: observed, default, source
    shape_rules = [
        (obs_key, SHAPE_SQUARE),
        ('default', SHAPE_CIRCLE),
        (source_key, SHAPE_PENTAGON),
    ]

    # sizes: observed, source, [hidden], default
    size_rules = [
        (obs_key, 15 * size_multiplier),
        (source_key, 20 * size_multiplier),
    ]
    if show_hidden:
        size_rules.append((hidden_key, 12.5 * size_multiplier))
    size_rules.append(('default', 6 * size_multiplier))

    return {
        'node_color_info': OrderedDict(color_rules),
        'node_shape_info': OrderedDict(shape_rules),
        'node_size_info': OrderedDict(size_rules),
        'edge_color_info': {'default': 'white'},
        'edge_pen_width_info': {'default': 2.0 * edge_width_multiplier},
        'node_text_info': {'default': ''},
    }
def accumulate_score(stuff, eval_func):
    """Score a random baseline and the 'st_naive' probabilities for every row.

    Args:
        stuff: dict mapping a root sampling method to an iterable of rows;
            each row provides 'c', 'obs' and 'st_naive_probas'.
        eval_func: callable ``eval_func(y_true, y_score)`` returning a score.

    Returns:
        Dict mapping each root sampling method to a list of
        ``{'random': score, 'st_naive': score}`` dicts, one per row.  Scores
        are computed on the nodes that are not in ``obs``.
    """
    scores_by_root_sampling_method = {}
    for root_sampling_method, data in stuff.items():
        scores = []
        scores_by_root_sampling_method[root_sampling_method] = scores
        for row in tqdm(data):
            c, obs = row['c'], row['obs']
            n_nodes = len(c)

            y_true = np.zeros((n_nodes, ))
            y_true[infected_nodes(c)] = 1

            # set membership keeps the mask construction linear in n_nodes
            observed = set(obs)
            mask = np.array([(i not in observed) for i in range(n_nodes)])

            # The baseline must have one entry per node to be indexable by
            # `mask`, so size it from the cascade itself instead of relying
            # on a module-level graph object.
            random_inf_p = np.random.random(n_nodes)

            candidates = [
                ('random', random_inf_p),
                ('st_naive', row['st_naive_probas']),
            ]
            score = {}
            for name, inf_probas in candidates:
                score[name] = eval_func(y_true[mask], inf_probas[mask])
            scores.append(score)
    return scores_by_root_sampling_method
def heatmap_plot_setting(g, c, X, weight, color_mapper=None, **kwargs):
    """Build a plot setting whose node colours are derived from ``weight``.

    Starts from ``default_plot_setting`` and then gives observed nodes,
    hidden infections and the default entry the same node size
    (``10 * size_multiplier``).

    Args:
        g: the graph.
        c: cascade array.
        X: observed nodes.
        weight: per-node values.  Stored directly as ``node_color_info`` when
            ``color_mapper`` is ``None``.
        color_mapper: optional callable mapping one weight to a colour; when
            given, ``node_color_info`` becomes a dict ``{(node,): colour}``
            built by enumerating ``weight``.
        **kwargs: forwarded to ``default_plot_setting``; ``size_multiplier``
            (default 1.0) is also read here to scale node sizes.

    Returns:
        The setting dict.
    """
    hidden_infs = set(infected_nodes(c)) - set(X)
    multiplier = kwargs.get('size_multiplier', 1.0)

    s = default_plot_setting(g, c, X, **kwargs)

    uniform_size = 10 * multiplier
    sizes = s['node_size_info']
    sizes[tuple(X)] = uniform_size
    sizes[tuple(hidden_infs)] = uniform_size
    sizes['default'] = uniform_size

    if color_mapper is None:
        s['node_color_info'] = weight
    else:
        s['node_color_info'] = {
            (n, ): color_mapper(w) for n, w in enumerate(weight)
        }
    return s
def test_gen_input(g, cascade_model, weighted, source):
    """Check that ``gen_input`` yields distinct observations and sane sizes.

    Ten inputs are generated with ``stop_fraction=0.1``.  Every pair must
    have different observation sets and, for the SI model, the infected
    fraction must not exceed 0.11.
    """
    if weighted:
        p = g.edge_properties['weights']
    else:
        p = g.new_edge_property('float')
        p.set_value(0.8)

    rows = []
    for _ in range(10):
        rows.append(gen_input(g, p=p, model=cascade_model, source=source,
                              stop_fraction=0.1))

    # make sure no two cascades are the same
    # with low probability, this fails
    for left, right in combinations(rows, 2):
        left_obs, _ = left[:2]
        right_obs, _ = right[:2]
        assert set(left_obs) != set(right_obs)

    # check for cascade size
    # only applicable for SI model
    if cascade_model == 'si':
        n_vertices_of = g.num_vertices
        for row in rows:
            cascade = row[1]
            frac = len(infected_nodes(cascade)) / n_vertices_of()
            assert frac <= 0.11
def one_run(g, norm_g, q, eps, root_sampler_name, min_size, max_size,
            observation_method="uniform", with_inc=False):
    """Generate one IC cascade and estimate infection probabilities for it.

    Args:
        g: graph with an edge property 'weights'.
        norm_g: graph used to build the ``gi`` structure for tree sampling.
        q: forwarded to ``gen_input``.
        eps: forwarded to the pagerank root sampler.
        root_sampler_name: 'pagerank', 'true', or anything else for a sampler
            that always returns ``None``.
        min_size, max_size: cascade size bounds forwarded to ``gen_input``.
        observation_method: forwarded to ``gen_input``.
        with_inc: if true, additionally sample trees, expand each one with
            ``incremental_simulation`` and estimate probabilities from the
            expanded node sets.

    Returns:
        Dict with 'c', 'obs' and 'st_naive_probas', plus
        'st_tree_inc_probas' when ``with_inc`` is true.
    """
    print("observation_method", observation_method)

    n_samples = 100
    p = g.edge_properties['weights']
    obs, c = gen_input(g, source=None, p=p, q=q, model='ic',
                       observation_method=observation_method,
                       min_size=min_size, max_size=max_size)
    print('cascade size', len(infected_nodes(c)))

    # the true source is the node with infection time 0
    source = np.nonzero(c == 0)[0][0]

    if root_sampler_name == 'pagerank':
        root_sampler = build_root_sampler_by_pagerank_score(g, obs, c, eps=eps)
    elif root_sampler_name == 'true':
        def root_sampler():
            return source
    else:
        def root_sampler():
            return None

    gi = from_gt(norm_g, weights=get_edge_weights(norm_g))

    def draw_tree_nodes():
        # one batch of Steiner-tree samples; a root is drawn on every call
        return sample_steiner_trees(g, obs, root=root_sampler(),
                                    method='cut', n_samples=n_samples,
                                    gi=gi, return_tree_nodes=True)

    # method 2:
    # vanilla steiner tree sampling
    naive_stat = TreeBasedStatistics(g, draw_tree_nodes())
    st_naive_probas = naive_stat.unconditional_proba()

    row = {'c': c, 'obs': obs, 'st_naive_probas': st_naive_probas}

    if with_inc:
        # method 3
        # with incremental cascade simulation
        def expand(nodes):
            seed_c = np.ones(g.num_vertices()) * (-1)
            seed_c[list(nodes)] = 1
            grown_c = incremental_simulation(g, seed_c, p,
                                             return_new_edges=False)
            return infected_nodes(grown_c)

        expanded_tree_nodes = [expand(nodes) for nodes in draw_tree_nodes()]
        inc_stat = TreeBasedStatistics(g, expanded_tree_nodes)
        row['st_tree_inc_probas'] = inc_stat.unconditional_proba()

    return row
cascade_fraction, obs_frac) print(dirname) g = load_graph_by_name(graph, weighted=True, suffix=suffix) gprop = g.graph_properties if 'p_min' in gprop: p_min, p_max = gprop['p_min'], gprop['p_max'] print('p_min={}, p_max={}'.format(p_min, p_max)) else: print('external weight initialization') os = [pkl.load(open(p, 'rb'))[0] for p in glob(dirname)] cs = [pkl.load(open(p, 'rb'))[1] for p in glob(dirname)] obs_sizes = [len(o) for o in os] c_sizes = [len(infected_nodes(c)) for c in cs] roots = list(map(cascade_source, cs)) print('roots freq:') print(Counter(roots).most_common(10)) obs_cnt = Counter([tuple(sorted(o)) for o in os]) print('top cascade freq:') for _, c in obs_cnt.most_common(10): print('freq:', c) print('cascade size describe:') print(pd.Series(c_sizes).describe()) print('-' * 10) print('fraction', np.mean(c_sizes) / g.num_vertices()) print('-' * 10)