def test_get_rank_index():
    """Check ``get_rank_index`` on small score arrays that contain ties."""
    tied_scores = [0, 0, 0, 1, 0, 0]
    node = 2

    # Position of ``node`` in a plain descending argsort; the test only
    # requires that this naive position is not 1.
    descending_order = np.argsort(tied_scores)[::-1]
    naive_position = np.where(descending_order == node)[0][0]
    assert naive_position != 1

    # (scores, node id, expected rank index)
    cases = [
        (tied_scores, node, 3),
        ([0, 1], 0, 1),
        ([0, 1, 0, 0, 0], 0, 2),
    ]
    for scores, node_id, expected_rank in cases:
        assert get_rank_index(scores, node_id) == expected_rank
def test_full_observation_tree_region_mst(tree_and_cascade):
    """With ``method='region_mst'`` on the tree fixture, the source has rank index 0.

    A fresh cascade is generated for each ``p`` in ``0.2, 0.3, ...`` (below
    1.0) with the third argument of ``gen_nontrivial_cascade`` fixed to 1.0
    (presumably the observation fraction, per the test name; verify).
    """
    graph = tree_and_cascade[0]
    for infection_proba in np.arange(0.2, 1.0, 0.1):
        inf_times, true_source, observed = gen_nontrivial_cascade(
            graph, infection_proba, 1.0)
        root_scores = tree_sizes_by_roots(
            graph, observed, inf_times, true_source, method='region_mst')
        source_rank = get_rank_index(root_scores, true_source)
        assert source_rank == 0
def test_best_tree_sizes_grid_tbfs(grid_and_cascade):
    """On the grid fixture, ``method='tbfs'`` gives the source a rank index of at most 10."""
    graph, _, inf_times, true_source, observed = grid_and_cascade
    root_scores = tree_sizes_by_roots(
        graph, observed, inf_times, true_source, method='tbfs')
    source_rank = get_rank_index(root_scores, true_source)
    assert source_rank <= 10
def test_full_observation_grid_region_mst(grid_and_cascade):
    """With ``method='region_mst'`` on the grid fixture, the source's rank index is <= 0.

    A fresh cascade is generated for each ``p`` in ``0.5, 0.6, ...`` (below
    1.0) with the third argument of ``gen_nontrivial_cascade`` fixed to 1.0
    (presumably the observation fraction, per the test name; verify).
    """
    graph = grid_and_cascade[0]
    for infection_proba in np.arange(0.5, 1.0, 0.1):
        # Printed so a failing run shows which probability broke.
        print('p={}'.format(infection_proba))
        inf_times, true_source, observed = gen_nontrivial_cascade(
            graph, infection_proba, 1.0)
        root_scores = tree_sizes_by_roots(
            graph, observed, inf_times, true_source, method='region_mst')
        source_rank = get_rank_index(root_scores, true_source)
        assert source_rank <= 0
def test_best_tree_sizes_grid_closure(grid_and_cascade):
    """Smoke-test ``tree_sizes_by_roots`` with ``method='closure'`` on the grid fixture.

    The fixture is read by position (first element is the graph, last three
    are infection times, source and observed nodes) rather than by
    fixed-length tuple unpacking. The other tests using ``grid_and_cascade``
    unpack five values while this one unpacked four, so one of the two
    shapes would raise ``ValueError`` at unpack time; positional access
    works for both.
    """
    g = grid_and_cascade[0]
    infection_times, source, obs_nodes = grid_and_cascade[-3:]
    scores = tree_sizes_by_roots(g, obs_nodes, infection_times, source,
                                 method='closure')
    # make sure it runs, how can we assume the source's rank?
    assert get_rank_index(scores, source) <= 10
def test_full_observation_tree_closure(tree_and_cascade):
    """With ``method='closure'`` on the tree fixture, the source has rank index 0.

    A fresh cascade is generated for each ``p`` in ``0.2, 0.3, ...`` (below
    1.0) with the third argument of ``gen_nontrivial_cascade`` fixed to 1.0
    (presumably the observation fraction, per the test name; verify).
    """
    graph = tree_and_cascade[0]
    for infection_proba in np.arange(0.2, 1.0, 0.1):
        inf_times, true_source, observed = gen_nontrivial_cascade(
            graph, infection_proba, 1.0)
        root_scores = tree_sizes_by_roots(
            graph, observed, inf_times, true_source, method='closure')
        source_rank = get_rank_index(root_scores, true_source)
        assert source_rank == 0
def test_full_observation_grid_tbfs(grid_and_cascade):
    """With ``method='tbfs'`` on the grid fixture, the source's rank index is <= 1.0.

    A fresh cascade is generated for each ``p`` in ``0.5, 0.6, ...`` (below
    1.0) with the third argument of ``gen_nontrivial_cascade`` fixed to 1.0
    (presumably the observation fraction, per the test name; verify).
    """
    graph = grid_and_cascade[0]
    for infection_proba in np.arange(0.5, 1.0, 0.1):
        # Printed so a failing run shows which probability broke.
        print('p={}'.format(infection_proba))
        inf_times, true_source, observed = gen_nontrivial_cascade(
            graph, infection_proba, 1.0)
        root_scores = tree_sizes_by_roots(
            graph, observed, inf_times, true_source, method='tbfs')
        source_rank = get_rank_index(root_scores, true_source)
        assert source_rank <= 1.0
def source_likelihood_stat(g, gvs, p, q, N1, estimation_method,
                           precond_method, eps, debug=True):
    """Simulate ``N1`` cascades and summarise how the true source is scored.

    For every round a cascade is drawn with ``gen_nontrivial_cascade(g, p, q)``
    and each node is scored, either with ``best_tree_sizes`` (when
    ``estimation_method == 'steiner-tree-exact'``) or with
    ``tree_sizes_by_roots(..., method=estimation_method)``.

    ``gvs``, ``precond_method`` and ``eps`` are accepted but not used in
    this function body.

    Returns a dict of ``pandas.Series.describe()`` summaries:

    - ``'dist'``: ``shortest_distance`` from the true source to the
      arg-max node of the scores,
    - ``'mu[s]'``: score assigned to the true source,
    - ``'rank'``: ``get_rank_index`` of the true source within the scores.
    """
    use_exact = estimation_method == 'steiner-tree-exact'

    rounds = range(N1)
    if debug:
        # Progress bar only in debug mode.
        rounds = tqdm(rounds)

    score_rows = []
    true_sources = []
    winner_distances = []
    for _ in rounds:
        infection_times, source, obs_nodes = gen_nontrivial_cascade(g, p, q)
        true_sources.append(source)

        if use_exact:
            if debug:
                print('using steiner tree exact')
            scores = best_tree_sizes(g, obs_nodes, infection_times)
        else:
            if debug:
                print('using steiner tree order ({})'.format(estimation_method))
            scores = tree_sizes_by_roots(g, obs_nodes, infection_times, source,
                                         method=estimation_method)

        top_node = np.argmax(scores)
        winner_distances.append(
            shortest_distance(g, source=source, target=top_node))
        score_rows.append(scores)

    # One row per simulation round, one column per node.
    score_matrix = np.array(score_rows, dtype=np.float64)
    indexed_sources = list(enumerate(true_sources))

    source_scores = np.array(
        [score_matrix[row, src] for row, src in indexed_sources])
    source_ranks = np.array(
        [get_rank_index(score_matrix[row, :], src)
         for row, src in indexed_sources])

    summary = {}
    summary['dist'] = pd.Series(winner_distances).describe()
    summary['mu[s]'] = pd.Series(source_scores).describe()
    summary['rank'] = pd.Series(source_ranks).describe()
    return summary
def source_likelihood_stat(g, gvs, p, q, N1, estimation_method,
                           precond_method, eps, debug=True):
    """Run ``N1`` simulated cascades and describe source-scoring quality.

    Each round draws a cascade via ``gen_nontrivial_cascade(g, p, q)`` and
    scores all nodes: ``best_tree_sizes`` is used when
    ``estimation_method`` is ``'steiner-tree-exact'``, otherwise
    ``tree_sizes_by_roots`` is called with ``method=estimation_method``.

    ``gvs``, ``precond_method`` and ``eps`` are part of the signature but
    are not read by this function body.

    The returned dict maps ``'dist'``, ``'mu[s]'`` and ``'rank'`` to
    ``pandas.Series.describe()`` summaries of, respectively, the
    ``shortest_distance`` between the true source and the top-scoring
    node, the score of the true source, and the ``get_rank_index`` of the
    true source.
    """
    is_exact = estimation_method == 'steiner-tree-exact'
    # tqdm progress bar in debug mode, plain range otherwise.
    iterations = tqdm(range(N1)) if debug else range(N1)

    all_scores, all_sources, all_dists = [], [], []
    for _ in iterations:
        infection_times, source, obs_nodes = gen_nontrivial_cascade(g, p, q)
        all_sources.append(source)

        if is_exact:
            if debug:
                print('using steiner tree exact')
            node_scores = best_tree_sizes(g, obs_nodes, infection_times)
        else:
            if debug:
                print('using steiner tree order ({})'.format(estimation_method))
            node_scores = tree_sizes_by_roots(
                g, obs_nodes, infection_times, source,
                method=estimation_method)

        best_node = np.argmax(node_scores)
        dist_to_best = shortest_distance(g, source=source, target=best_node)
        all_dists.append(dist_to_best)
        all_scores.append(node_scores)

    # Rows are simulation rounds, columns are nodes.
    scores_2d = np.array(all_scores, dtype=np.float64)

    per_round_source_score = []
    for round_idx, src in enumerate(all_sources):
        per_round_source_score.append(scores_2d[round_idx, src])
    per_round_source_score = np.array(per_round_source_score)

    per_round_source_rank = []
    for round_idx, src in enumerate(all_sources):
        per_round_source_rank.append(
            get_rank_index(scores_2d[round_idx, :], src))
    per_round_source_rank = np.array(per_round_source_rank)

    return {
        'dist': pd.Series(all_dists).describe(),
        'mu[s]': pd.Series(per_round_source_score).describe(),
        'rank': pd.Series(per_round_source_rank).describe(),
    }
def test_best_tree_sizes_tree(tree_and_cascade):
    """On the tree fixture, ``best_tree_sizes`` gives the source a rank index of at most 3."""
    graph, _, inf_times, true_source, observed = tree_and_cascade
    scores = best_tree_sizes(graph, observed, inf_times)

    # Diagnostics: number of nodes with a finite score, then the source's
    # score next to the minimum score.
    finite_count = np.sum(np.invert(np.isinf(scores)))
    print('|possible_nodes|={}'.format(finite_count))
    print(scores[true_source], scores.min())

    source_rank = get_rank_index(scores, true_source)
    assert source_rank <= 3
def mwu(g, gvs, source, obs_nodes, infection_times, o2src_time=None,
        active_method=MAX_MU, reward_method='exact', eps=0.2,
        max_iter=float('inf'), use_uninfected=True, debug=False,
        save_log=False):
    """Actively query nodes and multiplicatively re-weight source likelihoods.

    Starting from ``sll_using_pairs`` computed on ``obs_nodes``, the loop
    repeatedly picks a node to query, multiplies the per-node likelihood
    vector ``sll`` by ``eps + (1 - eps) * reward`` and renormalises it.
    After each query, the unqueried neighbours of every current arg-max
    node ("winner") are scheduled for querying. A winner is declared the
    source when it is infected and its infection time is strictly smaller
    than that of each of its infected neighbours.

    Parameters
    ----------
    g : graph
        Must provide ``num_vertices()`` and ``vertex(v).all_neighbours()``;
        also passed to ``shortest_distance`` and ``sll_using_pairs``.
    gvs : iterable
        Passed to ``get_o2src_time``, ``get_infection_time(gv, q)`` and
        ``get_reward_for_uninfected_query`` (presumably simulated cascade
        graphs; verify against callers).
    source : int
        True source. Used for debug output, forwarded to
        ``sll_using_pairs`` and asserted equal to the detected source.
    obs_nodes : iterable of int
        Initially observed nodes; they seed the reference set.
    infection_times : indexable
        Infection time per node; ``-1`` marks an uninfected node.
    o2src_time : dict-like or None
        Per-node arrays; built with ``get_o2src_time`` when ``None``.
        NOTE: a mapping passed by the caller is mutated in place.
    active_method
        ``MAX_MU`` (query the unqueried node with largest ``sll``) or
        ``RANDOM`` (uniform choice among unqueried nodes).
    reward_method : str
        The pair-update loop handles ``'time-exact'``, ``'time-order'`` and
        ``'time-diff'``. NOTE(review): the default ``'exact'`` is none of
        these and hits the ``ValueError`` in that loop; confirm the
        intended default.
    eps : float
        Smoothing term in the multiplicative update.
    max_iter : number
        Maximum number of loop iterations.
    use_uninfected : bool
        When true, an uninfected query updates ``sll`` through
        ``get_reward_for_uninfected_query``; otherwise it goes through the
        pair-based update like an infected one.
    debug : bool
        Print progress information.
    save_log : bool
        Also return the query log, per-iteration ``sll`` snapshots and
        neighbour-query flags.

    Returns
    -------
    int or tuple
        ``len(queried_nodes)``; when ``save_log`` is true,
        ``(query_count, query_log, sll_log, is_nbr_log)``.

    Raises
    ------
    ValueError
        For an unknown ``active_method`` or ``reward_method``.
    AssertionError
        If the node detected as source differs from ``source``.
    """
    if save_log:
        query_log = []
        sll_log = []
        is_nbr_log = []

    if o2src_time is None:
        o2src_time = get_o2src_time(obs_nodes, gvs, debug=debug)

    if reward_method == 'time-diff':
        # Shortest-path lengths from every observed node; only this reward
        # method consumes them.
        sp_len_dict = {o: shortest_distance(g, source=o).a for o in obs_nodes}
    else:
        sp_len_dict = None

    # init
    sll = sll_using_pairs(
        g,
        obs_nodes,
        infection_times,
        o2src_time,
        sp_len_dict=sp_len_dict,
        source=source,
        method=reward_method,
        eps=eps,
        precond_method='and',
        return_cascade=False,
        debug=debug)

    iter_i = 0
    all_nodes = set(np.arange(g.num_vertices()))
    unqueried_nodes = all_nodes - set(obs_nodes)
    obs_nodes = copy(obs_nodes)
    queried_nodes = set()

    # reference nodes to use for MWU,
    # required to be **infected**
    ref_nodes = set(obs_nodes)
    nodes_to_use = []  # nodes coming from querying the neighbors

    while iter_i < max_iter:
        iter_i += 1
        if len(unqueried_nodes) == 0:
            print('no more nodes to query')
            break

        if len(nodes_to_use) == 0:
            # No scheduled neighbour queries: pick a fresh node.
            if active_method == MAX_MU:
                q = max(unqueried_nodes, key=lambda n: sll[n])
            elif active_method == RANDOM:
                q = random.choice(list(unqueried_nodes))
            else:
                raise ValueError('available query methods are {}'.format(MAX_MU))
            if debug:
                print('query {}'.format(q))
            queried_nodes.add(q)
            unqueried_nodes.remove(q)
            if save_log:
                query_log.append(q)
                is_nbr_log.append(False)
        else:
            # Neighbour queries were already counted (and logged) when
            # they were scheduled below.
            if debug:
                print('using node from nodes_to_use')
            q = nodes_to_use.pop()
        q = int(q)

        if infection_times[q] == -1 and use_uninfected:
            # the query is uninfected
            if debug:
                print('{} is uninfected'.format(q))
            probas = get_reward_for_uninfected_query(q, gvs)
            sll *= (eps + (1 - eps) * probas)
            if np.isclose(sll.sum(), 0):
                # All mass vanished: fall back to the uniform distribution.
                print('warning: sll.sum() close to 0')
                sll = np.ones(g.num_vertices()) / g.num_vertices()
            else:
                sll /= sll.sum()
        else:
            if debug:
                print('using pairs to update sll')
            # the query is infected
            if reward_method == 'time-diff':
                sp_len_dict[q] = shortest_distance(g, source=q).a
            o2src_time[q] = np.array([get_infection_time(gv, q) for gv in gvs])

            for o in ref_nodes:
                probas = None
                tq, to = infection_times[q], infection_times[o]
                dists_q, dists_o = o2src_time[q], o2src_time[o]
                # Keep only entries where neither array holds -1.
                mask = np.logical_and(dists_q != -1, dists_o != -1)
                if reward_method == 'time-exact':
                    probas = exact_rewards(tq, to, dists_q, dists_o, mask)
                elif reward_method == 'time-order':
                    probas = order_rewards(tq, to, dists_q, dists_o, mask)
                elif reward_method == 'time-diff':
                    try:
                        probas = dist_rewards(
                            tq, to, dists_q, dists_o,
                            sp_len_dict[q], sp_len_dict[o], mask)
                    except ValueError:
                        # zero-size array to reduction operation maximum
                        # which has no identity
                        # or max_penalty = 0
                        # ignore this iteration
                        continue
                else:
                    raise ValueError('methoder is unknown')

                probas[np.isnan(probas)] = 0
                if debug and probas is not None:
                    print('source reward (without smoothing): {:.2f}'.format(probas[source]))
                    print('max reward: {}'.format(np.max(probas)))
                    # print('probas {}'.format(probas[:10]))

                sll *= (eps + (1 - eps) * probas)
                if np.isclose(sll.sum(), 0):
                    # All mass vanished: fall back to the uniform distribution.
                    print('warning: sll.sum() close to 0')
                    sll = np.ones(g.num_vertices()) / g.num_vertices()
                else:
                    sll /= sll.sum()
                if debug:
                    print('new sll[source] = {}'.format(sll[source]))

            if debug:
                if np.isclose(sll[source], 0):
                    print('warning: source sll is 0!!')

            # if the query node infection time is larger than
            # the current known earliest infection,
            # it cannot be the source
            min_inf_t = min(infection_times[n] for n in ref_nodes)
            if (infection_times[q] == -1 or infection_times[q] > min_inf_t):
                sll[q] = 0

            # when q is used for updating sll, add it to reference list
            ref_nodes.add(q)
            if debug:
                print('add q to ref_nodes (#nodes={})'.format(len(ref_nodes)))

        if save_log:
            # ``sll`` is updated in place throughout the loop, so store a
            # snapshot; appending the array itself would make earlier log
            # entries change retroactively.
            sll_log.append(sll.copy())

        if debug:
            print('source current rank = {}, {:.5f}'.format(get_rank_index(sll, source), sll[source]))

        # if some node has very large mu
        # query its neighbors
        winners = np.nonzero(sll == sll.max())[0]
        for w in winners:
            nbrs = set(map(int, g.vertex(w).all_neighbours()))
            unqueried_neighbors = nbrs - queried_nodes
            nodes_to_use += list(unqueried_neighbors)
            queried_nodes |= unqueried_neighbors
            if save_log:
                query_log += list(unqueried_neighbors)
                is_nbr_log += [True] * len(unqueried_neighbors)

            if infection_times[w] != -1:
                # Source test: strictly earlier than every infected neighbour.
                is_source = np.all([(infection_times[w] < infection_times[int(u)])
                                    for u in nbrs
                                    if infection_times[int(u)] != -1])
            else:
                # An uninfected winner is skipped without zeroing its sll.
                is_source = False
                continue

            if debug:
                print('checking source {} with winner {}'.format(source, w))
                print('winner\'s time {}'.format(infection_times[w]))
                print('winner\'s nbr infection time {}'.format([infection_times[int(u)] for u in nbrs]))

            if is_source:
                query_count = len(queried_nodes)
                if debug:
                    print('**Found source and used {} queries'.format(query_count))
                assert source == w
                if save_log:
                    return query_count, query_log, sll_log, is_nbr_log
                else:
                    return query_count
            else:
                # Infected winner that is not the source: rule it out.
                sll[w] = 0

    # Budget exhausted or nothing left to query.
    query_count = len(queried_nodes)
    if save_log:
        return query_count, query_log, sll_log, is_nbr_log
    else:
        return query_count
def test_best_tree_sizes_grid(grid_and_cascade):
    """On the grid fixture, ``best_tree_sizes`` gives the source a rank index of at most 1."""
    graph, _, inf_times, true_source, observed = grid_and_cascade
    scores = best_tree_sizes(graph, observed, inf_times)
    source_rank = get_rank_index(scores, true_source)
    assert source_rank <= 1
def test_best_tree_sizes_grid_region_mst(grid_and_cascade):
    """On the grid fixture, ``method='region_mst'`` gives the source a rank index of at most 1."""
    graph, _, inf_times, true_source, observed = grid_and_cascade
    root_scores = tree_sizes_by_roots(
        graph, observed, inf_times, true_source, method='region_mst')
    source_rank = get_rank_index(root_scores, true_source)
    assert source_rank <= 1
def mwu(g, gvs, source, obs_nodes, infection_times, o2src_time=None,
        active_method=MAX_MU, reward_method='exact', eps=0.2,
        max_iter=float('inf'), use_uninfected=True, debug=False,
        save_log=False):
    """Actively query nodes and multiplicatively re-weight source likelihoods.

    Starting from ``sll_using_pairs`` computed on ``obs_nodes``, the loop
    repeatedly picks a node to query, multiplies the per-node likelihood
    vector ``sll`` by ``eps + (1 - eps) * reward`` and renormalises it.
    After each query, the unqueried neighbours of every current arg-max
    node ("winner") are scheduled for querying. A winner is declared the
    source when it is infected and its infection time is strictly smaller
    than that of each of its infected neighbours.

    Parameters
    ----------
    g : graph
        Must provide ``num_vertices()`` and ``vertex(v).all_neighbours()``;
        also passed to ``shortest_distance`` and ``sll_using_pairs``.
    gvs : iterable
        Passed to ``get_o2src_time``, ``get_infection_time(gv, q)`` and
        ``get_reward_for_uninfected_query`` (presumably simulated cascade
        graphs; verify against callers).
    source : int
        True source. Used for debug output, forwarded to
        ``sll_using_pairs`` and asserted equal to the detected source.
    obs_nodes : iterable of int
        Initially observed nodes; they seed the reference set.
    infection_times : indexable
        Infection time per node; ``-1`` marks an uninfected node.
    o2src_time : dict-like or None
        Per-node arrays; built with ``get_o2src_time`` when ``None``.
        NOTE: a mapping passed by the caller is mutated in place.
    active_method
        ``MAX_MU`` (query the unqueried node with largest ``sll``) or
        ``RANDOM`` (uniform choice among unqueried nodes).
    reward_method : str
        The pair-update loop handles ``'time-exact'``, ``'time-order'`` and
        ``'time-diff'``. NOTE(review): the default ``'exact'`` is none of
        these and hits the ``ValueError`` in that loop; confirm the
        intended default.
    eps : float
        Smoothing term in the multiplicative update.
    max_iter : number
        Maximum number of loop iterations.
    use_uninfected : bool
        When true, an uninfected query updates ``sll`` through
        ``get_reward_for_uninfected_query``; otherwise it goes through the
        pair-based update like an infected one.
    debug : bool
        Print progress information.
    save_log : bool
        Also return the query log, per-iteration ``sll`` snapshots and
        neighbour-query flags.

    Returns
    -------
    int or tuple
        ``len(queried_nodes)``; when ``save_log`` is true,
        ``(query_count, query_log, sll_log, is_nbr_log)``.

    Raises
    ------
    ValueError
        For an unknown ``active_method`` or ``reward_method``.
    AssertionError
        If the node detected as source differs from ``source``.
    """
    if save_log:
        query_log = []
        sll_log = []
        is_nbr_log = []

    if o2src_time is None:
        o2src_time = get_o2src_time(obs_nodes, gvs, debug=debug)

    if reward_method == 'time-diff':
        # Shortest-path lengths from every observed node; only this reward
        # method consumes them.
        sp_len_dict = {o: shortest_distance(g, source=o).a for o in obs_nodes}
    else:
        sp_len_dict = None

    # init
    sll = sll_using_pairs(g,
                          obs_nodes,
                          infection_times,
                          o2src_time,
                          sp_len_dict=sp_len_dict,
                          source=source,
                          method=reward_method,
                          eps=eps,
                          precond_method='and',
                          return_cascade=False,
                          debug=debug)

    iter_i = 0
    all_nodes = set(np.arange(g.num_vertices()))
    unqueried_nodes = all_nodes - set(obs_nodes)
    obs_nodes = copy(obs_nodes)
    queried_nodes = set()

    # reference nodes to use for MWU,
    # required to be **infected**
    ref_nodes = set(obs_nodes)
    nodes_to_use = []  # nodes coming from querying the neighbors

    while iter_i < max_iter:
        iter_i += 1
        if len(unqueried_nodes) == 0:
            print('no more nodes to query')
            break

        if len(nodes_to_use) == 0:
            # No scheduled neighbour queries: pick a fresh node.
            if active_method == MAX_MU:
                q = max(unqueried_nodes, key=lambda n: sll[n])
            elif active_method == RANDOM:
                q = random.choice(list(unqueried_nodes))
            else:
                raise ValueError(
                    'available query methods are {}'.format(MAX_MU))
            if debug:
                print('query {}'.format(q))
            queried_nodes.add(q)
            unqueried_nodes.remove(q)
            if save_log:
                query_log.append(q)
                is_nbr_log.append(False)
        else:
            # Neighbour queries were already counted (and logged) when
            # they were scheduled below.
            if debug:
                print('using node from nodes_to_use')
            q = nodes_to_use.pop()
        q = int(q)

        if infection_times[q] == -1 and use_uninfected:
            # the query is uninfected
            if debug:
                print('{} is uninfected'.format(q))
            probas = get_reward_for_uninfected_query(q, gvs)
            sll *= (eps + (1 - eps) * probas)
            if np.isclose(sll.sum(), 0):
                # All mass vanished: fall back to the uniform distribution.
                print('warning: sll.sum() close to 0')
                sll = np.ones(g.num_vertices()) / g.num_vertices()
            else:
                sll /= sll.sum()
        else:
            if debug:
                print('using pairs to update sll')
            # the query is infected
            if reward_method == 'time-diff':
                sp_len_dict[q] = shortest_distance(g, source=q).a
            o2src_time[q] = np.array([get_infection_time(gv, q) for gv in gvs])

            for o in ref_nodes:
                probas = None
                tq, to = infection_times[q], infection_times[o]
                dists_q, dists_o = o2src_time[q], o2src_time[o]
                # Keep only entries where neither array holds -1.
                mask = np.logical_and(dists_q != -1, dists_o != -1)
                if reward_method == 'time-exact':
                    probas = exact_rewards(tq, to, dists_q, dists_o, mask)
                elif reward_method == 'time-order':
                    probas = order_rewards(tq, to, dists_q, dists_o, mask)
                elif reward_method == 'time-diff':
                    try:
                        probas = dist_rewards(tq, to, dists_q, dists_o,
                                              sp_len_dict[q], sp_len_dict[o],
                                              mask)
                    except ValueError:
                        # zero-size array to reduction operation maximum
                        # which has no identity
                        # or max_penalty = 0
                        # ignore this iteration
                        continue
                else:
                    raise ValueError('methoder is unknown')

                probas[np.isnan(probas)] = 0
                if debug and probas is not None:
                    print('source reward (without smoothing): {:.2f}'.format(
                        probas[source]))
                    print('max reward: {}'.format(np.max(probas)))
                    # print('probas {}'.format(probas[:10]))

                sll *= (eps + (1 - eps) * probas)
                if np.isclose(sll.sum(), 0):
                    # All mass vanished: fall back to the uniform distribution.
                    print('warning: sll.sum() close to 0')
                    sll = np.ones(g.num_vertices()) / g.num_vertices()
                else:
                    sll /= sll.sum()
                if debug:
                    print('new sll[source] = {}'.format(sll[source]))

            if debug:
                if np.isclose(sll[source], 0):
                    print('warning: source sll is 0!!')

            # if the query node infection time is larger than
            # the current known earliest infection,
            # it cannot be the source
            min_inf_t = min(infection_times[n] for n in ref_nodes)
            if (infection_times[q] == -1 or infection_times[q] > min_inf_t):
                sll[q] = 0

            # when q is used for updating sll, add it to reference list
            ref_nodes.add(q)
            if debug:
                print('add q to ref_nodes (#nodes={})'.format(len(ref_nodes)))

        if save_log:
            # ``sll`` is updated in place throughout the loop, so store a
            # snapshot; appending the array itself would make earlier log
            # entries change retroactively.
            sll_log.append(sll.copy())

        if debug:
            print('source current rank = {}, {:.5f}'.format(
                get_rank_index(sll, source), sll[source]))

        # if some node has very large mu
        # query its neighbors
        winners = np.nonzero(sll == sll.max())[0]
        for w in winners:
            nbrs = set(map(int, g.vertex(w).all_neighbours()))
            unqueried_neighbors = nbrs - queried_nodes
            nodes_to_use += list(unqueried_neighbors)
            queried_nodes |= unqueried_neighbors
            if save_log:
                query_log += list(unqueried_neighbors)
                is_nbr_log += [True] * len(unqueried_neighbors)

            if infection_times[w] != -1:
                # Source test: strictly earlier than every infected neighbour.
                is_source = np.all([
                    (infection_times[w] < infection_times[int(u)])
                    for u in nbrs
                    if infection_times[int(u)] != -1
                ])
            else:
                # An uninfected winner is skipped without zeroing its sll.
                is_source = False
                continue

            if debug:
                print('checking source {} with winner {}'.format(source, w))
                print('winner\'s time {}'.format(infection_times[w]))
                print('winner\'s nbr infection time {}'.format(
                    [infection_times[int(u)] for u in nbrs]))

            if is_source:
                query_count = len(queried_nodes)
                if debug:
                    print('**Found source and used {} queries'.format(
                        query_count))
                assert source == w
                if save_log:
                    return query_count, query_log, sll_log, is_nbr_log
                else:
                    return query_count
            else:
                # Infected winner that is not the source: rule it out.
                sll[w] = 0

    # Budget exhausted or nothing left to query.
    query_count = len(queried_nodes)
    if save_log:
        return query_count, query_log, sll_log, is_nbr_log
    else:
        return query_count
def test_full_observation_grid(grid_and_cascade):
    """On the grid fixture, ``best_tree_sizes`` gives the source rank index 0.

    A fresh cascade is generated for each ``p`` in ``0.5, 0.6, ...`` (below
    1.0) with the third argument of ``gen_nontrivial_cascade`` fixed to 1.0
    (presumably the observation fraction, per the test name; verify).
    """
    graph = grid_and_cascade[0]
    for infection_proba in np.arange(0.5, 1.0, 0.1):
        inf_times, true_source, observed = gen_nontrivial_cascade(
            graph, infection_proba, 1.0)
        scores = best_tree_sizes(graph, observed, inf_times)
        source_rank = get_rank_index(scores, true_source)
        assert source_rank == 0