def test_get_rank_index():
    """Check ``get_rank_index`` on small hand-written score lists."""
    scores = [0, 0, 0, 1, 0, 0]
    node = 2
    # Position of `node` in the reversed argsort order; the test only
    # requires that this position is not 1.
    reversed_order = np.argsort(scores)[::-1]
    assert np.where(reversed_order == node)[0][0] != 1
    assert get_rank_index(scores, node) == 3

    # Each entry: (scores, queried id, expected rank index).
    remaining_cases = (
        ([0, 1], 0, 1),
        ([0, 1, 0, 0, 0], 0, 2),
    )
    for scores, node, expected in remaining_cases:
        assert get_rank_index(scores, node) == expected
def test_full_observation_tree_region_mst(tree_and_cascade):
    """The source must have rank index 0 under ``method='region_mst'``.

    A fresh cascade is generated on the fixture's graph for every ``p`` in
    ``np.arange(0.2, 1.0, 0.1)``, with the third generator argument fixed to
    1.0 (presumably "observe everything", going by the test name -- confirm).
    """
    graph = tree_and_cascade[0]
    for prob in np.arange(0.2, 1.0, 0.1):
        cascade = gen_nontrivial_cascade(graph, prob, 1.0)
        inf_times, true_source, observed = cascade
        root_scores = tree_sizes_by_roots(
            graph, observed, inf_times, true_source, method='region_mst')
        source_rank = get_rank_index(root_scores, true_source)
        assert source_rank == 0
Beispiel #3
0
def test_best_tree_sizes_grid_tbfs(grid_and_cascade):
    """On the grid fixture, ``method='tbfs'`` ranks the source at index <= 10."""
    # The second fixture element is not needed by this test.
    graph, _, inf_times, true_source, observed = grid_and_cascade
    root_scores = tree_sizes_by_roots(
        graph, observed, inf_times, true_source, method='tbfs')
    source_rank = get_rank_index(root_scores, true_source)
    assert source_rank <= 10
def test_full_observation_grid_region_mst(grid_and_cascade):
    """The source's rank index must be <= 0 under ``method='region_mst'``.

    A fresh cascade is generated on the fixture's graph for every ``p`` in
    ``np.arange(0.5, 1.0, 0.1)``, with the third generator argument fixed to
    1.0; the current ``p`` is printed before each round.
    """
    graph = grid_and_cascade[0]
    for prob in np.arange(0.5, 1.0, 0.1):
        print('p={}'.format(prob))
        cascade = gen_nontrivial_cascade(graph, prob, 1.0)
        inf_times, true_source, observed = cascade
        root_scores = tree_sizes_by_roots(
            graph, observed, inf_times, true_source, method='region_mst')
        source_rank = get_rank_index(root_scores, true_source)
        assert source_rank <= 0
def test_best_tree_sizes_grid_closure(grid_and_cascade):
    """Smoke test for ``method='closure'`` on the grid fixture."""
    # NOTE(review): sibling tests unpack five values from `grid_and_cascade`
    # (with a throw-away second element); this one unpacks four -- confirm
    # which fixture shape is expected here.
    graph, inf_times, true_source, observed = grid_and_cascade
    root_scores = tree_sizes_by_roots(
        graph, observed, inf_times, true_source, method='closure')
    source_rank = get_rank_index(root_scores, true_source)
    # Mainly checks that the call runs; it is unclear what rank the source
    # can be assumed to have, so the bound is loose.
    assert source_rank <= 10
def test_full_observation_tree_closure(tree_and_cascade):
    """The source must have rank index 0 under ``method='closure'``.

    A fresh cascade is generated on the fixture's graph for every ``p`` in
    ``np.arange(0.2, 1.0, 0.1)``, with the third generator argument fixed to
    1.0.
    """
    graph = tree_and_cascade[0]
    for prob in np.arange(0.2, 1.0, 0.1):
        cascade = gen_nontrivial_cascade(graph, prob, 1.0)
        inf_times, true_source, observed = cascade
        root_scores = tree_sizes_by_roots(
            graph, observed, inf_times, true_source, method='closure')
        source_rank = get_rank_index(root_scores, true_source)
        assert source_rank == 0
Beispiel #7
0
def test_full_observation_grid_tbfs(grid_and_cascade):
    """The source's rank index must be <= 1.0 under ``method='tbfs'``.

    A fresh cascade is generated on the fixture's graph for every ``p`` in
    ``np.arange(0.5, 1.0, 0.1)``, with the third generator argument fixed to
    1.0; the current ``p`` is printed before each round.
    """
    graph = grid_and_cascade[0]
    for prob in np.arange(0.5, 1.0, 0.1):
        print('p={}'.format(prob))
        cascade = gen_nontrivial_cascade(graph, prob, 1.0)
        inf_times, true_source, observed = cascade
        root_scores = tree_sizes_by_roots(
            graph, observed, inf_times, true_source, method='tbfs')
        source_rank = get_rank_index(root_scores, true_source)
        assert source_rank <= 1.0
def source_likelihood_stat(g,
                           gvs,
                           p,
                           q,
                           N1,
                           estimation_method,
                           precond_method,
                           eps,
                           debug=True):
    """Summarise source-estimation quality over ``N1`` generated cascades.

    Every round draws a cascade with ``gen_nontrivial_cascade(g, p, q)``,
    scores the nodes, and records the value ``shortest_distance`` returns
    between the true source and the arg-max node, together with the score
    vector itself.

    Args:
        g: graph handed to the cascade generator, the scoring function and
            ``shortest_distance``.
        gvs: accepted but not used by this function.
        p, q: forwarded to ``gen_nontrivial_cascade``.
        N1: number of rounds.
        estimation_method: ``'steiner-tree-exact'`` selects
            ``best_tree_sizes``; any other value is forwarded as ``method``
            to ``tree_sizes_by_roots``.
        precond_method: accepted but not used by this function.
        eps: accepted but not used by this function.
        debug: when true, wrap the rounds in ``tqdm`` and print which
            scoring function is used.

    Returns:
        dict with keys ``'dist'``, ``'mu[s]'`` and ``'rank'``; each value is
        ``pd.Series(...).describe()`` over, respectively, the recorded
        distances, the score at the true source, and the source's rank
        index as given by ``get_rank_index``.
    """
    rounds = tqdm(range(N1)) if debug else range(N1)

    score_rows = []
    true_sources = []
    winner_dists = []

    for _ in rounds:
        infection_times, source, obs_nodes = gen_nontrivial_cascade(g, p, q)
        true_sources.append(source)

        exact = estimation_method == 'steiner-tree-exact'
        if debug:
            if exact:
                print('using steiner tree exact')
            else:
                print(
                    'using steiner tree order ({})'.format(estimation_method))

        if exact:
            scores = best_tree_sizes(g, obs_nodes, infection_times)
        else:
            scores = tree_sizes_by_roots(g,
                                         obs_nodes,
                                         infection_times,
                                         source,
                                         method=estimation_method)

        best_node = np.argmax(scores)
        winner_dists.append(
            shortest_distance(g, source=source, target=best_node))
        score_rows.append(scores)

    score_matrix = np.array(score_rows, dtype=np.float64)

    # Pair each round's score row with that round's true source.
    source_scores = []
    source_ranks = []
    for row, src in zip(score_matrix, true_sources):
        source_scores.append(row[src])
        source_ranks.append(get_rank_index(row, src))

    summary = {}
    summary['dist'] = pd.Series(winner_dists).describe()
    summary['mu[s]'] = pd.Series(np.array(source_scores)).describe()
    summary['rank'] = pd.Series(np.array(source_ranks)).describe()
    return summary
def source_likelihood_stat(g,
                           gvs, p, q, N1,
                           estimation_method,
                           precond_method,
                           eps,
                           debug=True):
    """Run ``N1`` cascade simulations and describe how well the source scores.

    Args:
        g: graph handed to the cascade generator, the scoring function and
            ``shortest_distance``.
        gvs: accepted but not used by this function.
        p, q: forwarded to ``gen_nontrivial_cascade``.
        N1: number of simulated cascades.
        estimation_method: ``'steiner-tree-exact'`` selects
            ``best_tree_sizes``; any other value is forwarded as ``method``
            to ``tree_sizes_by_roots``.
        precond_method: accepted but not used by this function.
        eps: accepted but not used by this function.
        debug: when true, show a ``tqdm`` progress bar and print the scoring
            function chosen in each round.

    Returns:
        dict mapping ``'dist'`` (value of ``shortest_distance`` between the
        true source and the arg-max node), ``'mu[s]'`` (score at the true
        source) and ``'rank'`` (``get_rank_index`` of the true source) to the
        ``describe()`` summary of the per-round values.
    """
    all_scores, all_sources, all_dists = [], [], []

    if debug:
        rounds = tqdm(range(N1))
    else:
        rounds = range(N1)

    for _ in rounds:
        infection_times, source, obs_nodes = gen_nontrivial_cascade(g, p, q)
        all_sources.append(source)

        if estimation_method != 'steiner-tree-exact':
            if debug:
                print('using steiner tree order ({})'.format(estimation_method))
            node_scores = tree_sizes_by_roots(
                g, obs_nodes, infection_times, source,
                method=estimation_method)
        else:
            if debug:
                print('using steiner tree exact')
            node_scores = best_tree_sizes(g, obs_nodes, infection_times)

        top_node = np.argmax(node_scores)
        all_dists.append(shortest_distance(g, source=source, target=top_node))
        all_scores.append(node_scores)

    score_matrix = np.array(all_scores, dtype=np.float64)
    # One (row, true source) pair per simulated cascade.
    paired = list(zip(score_matrix, all_sources))
    llh_at_source = np.array([row[src] for row, src in paired])
    rank_of_source = np.array([get_rank_index(row, src)
                               for row, src in paired])

    return {
        'dist': pd.Series(all_dists).describe(),
        'mu[s]': pd.Series(llh_at_source).describe(),
        'rank': pd.Series(rank_of_source).describe(),
    }
Beispiel #10
0
def test_best_tree_sizes_tree(tree_and_cascade):
    """On the tree fixture, ``best_tree_sizes`` ranks the source at index <= 3."""
    # The second fixture element is not needed by this test.
    graph, _, inf_times, true_source, observed = tree_and_cascade
    node_scores = best_tree_sizes(graph, observed, inf_times)

    # Diagnostics: number of nodes with a non-infinite score, then the
    # source's score next to the minimum score.
    finite_mask = np.invert(np.isinf(node_scores))
    print('|possible_nodes|={}'.format(np.sum(finite_mask)))
    print(node_scores[true_source], node_scores.min())

    source_rank = get_rank_index(node_scores, true_source)
    assert source_rank <= 3
def mwu(g, gvs,
        source, obs_nodes, infection_times, o2src_time=None,
        active_method=MAX_MU,
        reward_method='exact',
        eps=0.2,
        max_iter=float('inf'),
        use_uninfected=True,
        debug=False,
        save_log=False):
    """Actively query nodes and update source likelihoods multiplicatively.

    Starting from the vector returned by ``sll_using_pairs``, each iteration
    picks a node to query, multiplies ``sll`` by the smoothed reward
    ``eps + (1 - eps) * probas`` and renormalises.  Afterwards the current
    arg-max node(s) are inspected: their neighbours are scheduled as the next
    queries, and an infected winner whose infection time is strictly smaller
    than that of all its infected neighbours is declared the source.

    Args:
        g: graph; must provide ``num_vertices()`` and
            ``vertex(v).all_neighbours()`` and be accepted by
            ``shortest_distance`` (presumably a graph-tool ``Graph`` --
            confirm).
        gvs: iterable of objects passed to ``get_o2src_time``,
            ``get_reward_for_uninfected_query`` and ``get_infection_time``.
        source: index of the true source; used for debug output, forwarded to
            ``sll_using_pairs`` and checked by the final ``assert``.
        obs_nodes: initially observed nodes; they seed the reference set and
            are excluded from the candidate query pool.
        infection_times: indexable by node id; ``-1`` marks an uninfected
            node.
        o2src_time: mapping node -> array of per-``gv`` infection times.
            Computed with ``get_o2src_time`` when ``None``.  It is updated in
            place for every queried node handled by the pair update.
        active_method: ``MAX_MU`` (query the unqueried node with the largest
            ``sll``) or ``RANDOM`` (uniform choice among unqueried nodes).
        reward_method: forwarded to ``sll_using_pairs``; the pair update
            handles ``'time-exact'``, ``'time-order'`` and ``'time-diff'``.
            NOTE(review): the default ``'exact'`` is none of these, so the
            pair update raises ``ValueError`` as soon as it runs with a
            non-empty reference set -- confirm the intended default.
        eps: smoothing weight of the multiplicative update.
        max_iter: maximum number of loop iterations.
        use_uninfected: when true, an uninfected query updates ``sll`` via
            ``get_reward_for_uninfected_query``; otherwise it goes through
            the pair update like an infected one.
        debug: print progress information.
        save_log: additionally return the query / likelihood logs.

    Returns:
        ``query_count`` (``len(queried_nodes)``), or, when ``save_log`` is
        true, the tuple ``(query_count, query_log, sll_log, is_nbr_log)``.
        ``sll_log`` holds one snapshot of ``sll`` per iteration, taken before
        the winner check; ``is_nbr_log`` flags the entries of ``query_log``
        that were scheduled as neighbours of a winner.

    Raises:
        ValueError: for an unknown ``active_method`` or ``reward_method``.
        AssertionError: if the node identified as source differs from
            ``source``.
    """
    if save_log:
        query_log = []
        sll_log = []
        is_nbr_log = []
    if o2src_time is None:
        o2src_time = get_o2src_time(obs_nodes, gvs, debug=debug)

    if reward_method == 'time-diff':
        sp_len_dict = {o: shortest_distance(g, source=o).a for o in obs_nodes}
    else:
        sp_len_dict = None
    # init
    sll = sll_using_pairs(
        g,
        obs_nodes,
        infection_times,
        o2src_time,
        sp_len_dict=sp_len_dict,
        source=source,
        method=reward_method,
        eps=eps,
        precond_method='and',
        return_cascade=False,
        debug=debug)

    iter_i = 0
    all_nodes = set(np.arange(g.num_vertices()))
    unqueried_nodes = all_nodes - set(obs_nodes)

    obs_nodes = copy(obs_nodes)

    queried_nodes = set()

    # reference nodes to use for MWU,
    # required to be **infected**
    ref_nodes = set(obs_nodes)

    nodes_to_use = []  # nodes coming from querying the neighbors

    while iter_i < max_iter:
        iter_i += 1
        if not unqueried_nodes:
            print('no more nodes to query')
            break
        if not nodes_to_use:
            if active_method == MAX_MU:
                q = max(unqueried_nodes, key=lambda n: sll[n])
            elif active_method == RANDOM:
                q = random.choice(list(unqueried_nodes))
            else:
                raise ValueError(
                    'unknown active_method {!r}; available query methods '
                    'are {} and {}'.format(active_method, MAX_MU, RANDOM))

            if debug:
                print('query {}'.format(q))
            queried_nodes.add(q)
            unqueried_nodes.remove(q)
            if save_log:
                query_log.append(q)
                is_nbr_log.append(False)
        else:
            # neighbours of an earlier winner take priority; they were
            # already counted and logged when they were scheduled
            if debug:
                print('using node from nodes_to_use')
            q = nodes_to_use.pop()
        q = int(q)
        if infection_times[q] == -1 and use_uninfected:
            # the query is uninfected
            if debug:
                print('{} is uninfected'.format(q))
            probas = get_reward_for_uninfected_query(q, gvs)
            sll *= (eps + (1-eps) * probas)
            if np.isclose(sll.sum(), 0):
                # all mass vanished: fall back to the uniform distribution
                print('warning: sll.sum() close to 0')
                sll = np.ones(g.num_vertices()) / g.num_vertices()
            else:
                sll /= sll.sum()
        else:
            if debug:
                print('using pairs to update sll')
            # the query is infected
            if reward_method == 'time-diff':
                sp_len_dict[q] = shortest_distance(g, source=q).a

            o2src_time[q] = np.array([get_infection_time(gv, q) for gv in gvs])

            # one multiplicative update per (query, reference node) pair
            for o in ref_nodes:
                tq, to = infection_times[q], infection_times[o]
                dists_q, dists_o = o2src_time[q], o2src_time[o]
                mask = np.logical_and(dists_q != -1, dists_o != -1)

                if reward_method == 'time-exact':
                    probas = exact_rewards(tq, to, dists_q, dists_o, mask)
                elif reward_method == 'time-order':
                    probas = order_rewards(tq, to, dists_q, dists_o, mask)
                elif reward_method == 'time-diff':
                    try:
                        probas = dist_rewards(
                            tq, to,
                            dists_q, dists_o,
                            sp_len_dict[q], sp_len_dict[o],
                            mask)
                    except ValueError:
                        # zero-size array to reduction operation maximum which has no identity
                        # or max_penalty = 0
                        # ignore this iteration
                        continue
                else:
                    raise ValueError(
                        'unknown reward_method {!r}'.format(reward_method))

                probas[np.isnan(probas)] = 0

                if debug:
                    print('source reward (without smoothing): {:.2f}'.format(probas[source]))
                    print('max reward: {}'.format(np.max(probas)))

                sll *= (eps + (1-eps) * probas)
                if np.isclose(sll.sum(), 0):
                    # all mass vanished: fall back to the uniform distribution
                    print('warning: sll.sum() close to 0')
                    sll = np.ones(g.num_vertices()) / g.num_vertices()
                else:
                    sll /= sll.sum()

                if debug:
                    print('new sll[source] = {}'.format(sll[source]))

            if debug:
                if np.isclose(sll[source], 0):
                    print('warning: source sll is 0!!')

            # if the query node infection time is larger than
            # the current known earliest infection,
            # it cannot be the source
            min_inf_t = min(infection_times[n] for n in ref_nodes)
            if (infection_times[q] == -1 or
                    infection_times[q] > min_inf_t):
                sll[q] = 0

            # when q is used for updating sll, add it to reference list
            ref_nodes.add(q)
            if debug:
                print('add q to ref_nodes (#nodes={})'.format(len(ref_nodes)))

        if save_log:
            # `sll` is modified in place by later updates, so store a
            # snapshot rather than a reference to the live array
            sll_log.append(sll.copy())

        if debug:
            print('source current rank = {}, {:.5f}'.format(get_rank_index(sll, source), sll[source]))

        # if some node has very large mu
        # query its neighbors
        winners = np.nonzero(sll == sll.max())[0]
        for w in winners:
            nbrs = set(map(int, g.vertex(w).all_neighbours()))
            unqueried_neighbors = nbrs - queried_nodes
            nodes_to_use += list(unqueried_neighbors)
            queried_nodes |= unqueried_neighbors

            if save_log:
                query_log += list(unqueried_neighbors)
                is_nbr_log += [True] * len(unqueried_neighbors)

            if infection_times[w] == -1:
                # an uninfected winner is skipped; its sll entry is left as is
                continue

            # the winner is the source when it was infected strictly before
            # every infected neighbour
            is_source = np.all([(infection_times[w] < infection_times[int(u)])
                                for u in nbrs
                                if infection_times[int(u)] != -1])

            if debug:
                print('checking source {} with winner {}'.format(source, w))
                print('winner\'s time {}'.format(infection_times[w]))
                print('winner\'s nbr infection time {}'.format([infection_times[int(u)] for u in nbrs]))

            if is_source:
                query_count = len(queried_nodes)
                if debug:
                    print('**Found source and used {} queries'.format(query_count))
                assert source == w
                if save_log:
                    return query_count, query_log, sll_log, is_nbr_log
                else:
                    return query_count
            else:
                sll[w] = 0

    query_count = len(queried_nodes)
    if save_log:
        return query_count, query_log, sll_log, is_nbr_log
    else:
        return query_count
def test_best_tree_sizes_grid(grid_and_cascade):
    """On the grid fixture, ``best_tree_sizes`` ranks the source at index <= 1."""
    # The second fixture element is not needed by this test.
    graph, _, inf_times, true_source, observed = grid_and_cascade
    node_scores = best_tree_sizes(graph, observed, inf_times)
    source_rank = get_rank_index(node_scores, true_source)
    assert source_rank <= 1
def test_best_tree_sizes_grid_region_mst(grid_and_cascade):
    """On the grid fixture, ``method='region_mst'`` ranks the source at index <= 1."""
    # The second fixture element is not needed by this test.
    graph, _, inf_times, true_source, observed = grid_and_cascade
    root_scores = tree_sizes_by_roots(
        graph, observed, inf_times, true_source, method='region_mst')
    source_rank = get_rank_index(root_scores, true_source)
    assert source_rank <= 1
def mwu(g,
        gvs,
        source,
        obs_nodes,
        infection_times,
        o2src_time=None,
        active_method=MAX_MU,
        reward_method='exact',
        eps=0.2,
        max_iter=float('inf'),
        use_uninfected=True,
        debug=False,
        save_log=False):
    """Actively query nodes and update source likelihoods multiplicatively.

    Starting from the vector returned by ``sll_using_pairs``, each iteration
    picks a node to query, multiplies ``sll`` by the smoothed reward
    ``eps + (1 - eps) * probas`` and renormalises.  Afterwards the current
    arg-max node(s) are inspected: their neighbours are scheduled as the next
    queries, and an infected winner whose infection time is strictly smaller
    than that of all its infected neighbours is declared the source.

    Args:
        g: graph; must provide ``num_vertices()`` and
            ``vertex(v).all_neighbours()`` and be accepted by
            ``shortest_distance`` (presumably a graph-tool ``Graph`` --
            confirm).
        gvs: iterable of objects passed to ``get_o2src_time``,
            ``get_reward_for_uninfected_query`` and ``get_infection_time``.
        source: index of the true source; used for debug output, forwarded to
            ``sll_using_pairs`` and checked by the final ``assert``.
        obs_nodes: initially observed nodes; they seed the reference set and
            are excluded from the candidate query pool.
        infection_times: indexable by node id; ``-1`` marks an uninfected
            node.
        o2src_time: mapping node -> array of per-``gv`` infection times.
            Computed with ``get_o2src_time`` when ``None``.  It is updated in
            place for every queried node handled by the pair update.
        active_method: ``MAX_MU`` (query the unqueried node with the largest
            ``sll``) or ``RANDOM`` (uniform choice among unqueried nodes).
        reward_method: forwarded to ``sll_using_pairs``; the pair update
            handles ``'time-exact'``, ``'time-order'`` and ``'time-diff'``.
            NOTE(review): the default ``'exact'`` is none of these, so the
            pair update raises ``ValueError`` as soon as it runs with a
            non-empty reference set -- confirm the intended default.
        eps: smoothing weight of the multiplicative update.
        max_iter: maximum number of loop iterations.
        use_uninfected: when true, an uninfected query updates ``sll`` via
            ``get_reward_for_uninfected_query``; otherwise it goes through
            the pair update like an infected one.
        debug: print progress information.
        save_log: additionally return the query / likelihood logs.

    Returns:
        ``query_count`` (``len(queried_nodes)``), or, when ``save_log`` is
        true, the tuple ``(query_count, query_log, sll_log, is_nbr_log)``.
        ``sll_log`` holds one snapshot of ``sll`` per iteration, taken before
        the winner check; ``is_nbr_log`` flags the entries of ``query_log``
        that were scheduled as neighbours of a winner.

    Raises:
        ValueError: for an unknown ``active_method`` or ``reward_method``.
        AssertionError: if the node identified as source differs from
            ``source``.
    """
    if save_log:
        query_log = []
        sll_log = []
        is_nbr_log = []
    if o2src_time is None:
        o2src_time = get_o2src_time(obs_nodes, gvs, debug=debug)

    if reward_method == 'time-diff':
        sp_len_dict = {o: shortest_distance(g, source=o).a for o in obs_nodes}
    else:
        sp_len_dict = None
    # init
    sll = sll_using_pairs(g,
                          obs_nodes,
                          infection_times,
                          o2src_time,
                          sp_len_dict=sp_len_dict,
                          source=source,
                          method=reward_method,
                          eps=eps,
                          precond_method='and',
                          return_cascade=False,
                          debug=debug)

    iter_i = 0
    all_nodes = set(np.arange(g.num_vertices()))
    unqueried_nodes = all_nodes - set(obs_nodes)

    obs_nodes = copy(obs_nodes)

    queried_nodes = set()

    # reference nodes to use for MWU,
    # required to be **infected**
    ref_nodes = set(obs_nodes)

    nodes_to_use = []  # nodes coming from querying the neighbors

    while iter_i < max_iter:
        iter_i += 1
        if not unqueried_nodes:
            print('no more nodes to query')
            break
        if not nodes_to_use:
            if active_method == MAX_MU:
                q = max(unqueried_nodes, key=lambda n: sll[n])
            elif active_method == RANDOM:
                q = random.choice(list(unqueried_nodes))
            else:
                raise ValueError(
                    'unknown active_method {!r}; available query methods '
                    'are {} and {}'.format(active_method, MAX_MU, RANDOM))

            if debug:
                print('query {}'.format(q))
            queried_nodes.add(q)
            unqueried_nodes.remove(q)
            if save_log:
                query_log.append(q)
                is_nbr_log.append(False)
        else:
            # neighbours of an earlier winner take priority; they were
            # already counted and logged when they were scheduled
            if debug:
                print('using node from nodes_to_use')
            q = nodes_to_use.pop()
        q = int(q)
        if infection_times[q] == -1 and use_uninfected:
            # the query is uninfected
            if debug:
                print('{} is uninfected'.format(q))
            probas = get_reward_for_uninfected_query(q, gvs)
            sll *= (eps + (1 - eps) * probas)
            if np.isclose(sll.sum(), 0):
                # all mass vanished: fall back to the uniform distribution
                print('warning: sll.sum() close to 0')
                sll = np.ones(g.num_vertices()) / g.num_vertices()
            else:
                sll /= sll.sum()
        else:
            if debug:
                print('using pairs to update sll')
            # the query is infected
            if reward_method == 'time-diff':
                sp_len_dict[q] = shortest_distance(g, source=q).a

            o2src_time[q] = np.array([get_infection_time(gv, q) for gv in gvs])

            # one multiplicative update per (query, reference node) pair
            for o in ref_nodes:
                tq, to = infection_times[q], infection_times[o]
                dists_q, dists_o = o2src_time[q], o2src_time[o]
                mask = np.logical_and(dists_q != -1, dists_o != -1)

                if reward_method == 'time-exact':
                    probas = exact_rewards(tq, to, dists_q, dists_o, mask)
                elif reward_method == 'time-order':
                    probas = order_rewards(tq, to, dists_q, dists_o, mask)
                elif reward_method == 'time-diff':
                    try:
                        probas = dist_rewards(tq, to, dists_q, dists_o,
                                              sp_len_dict[q], sp_len_dict[o],
                                              mask)
                    except ValueError:
                        # zero-size array to reduction operation maximum which has no identity
                        # or max_penalty = 0
                        # ignore this iteration
                        continue
                else:
                    raise ValueError(
                        'unknown reward_method {!r}'.format(reward_method))

                probas[np.isnan(probas)] = 0

                if debug:
                    print('source reward (without smoothing): {:.2f}'.format(
                        probas[source]))
                    print('max reward: {}'.format(np.max(probas)))

                sll *= (eps + (1 - eps) * probas)
                if np.isclose(sll.sum(), 0):
                    # all mass vanished: fall back to the uniform distribution
                    print('warning: sll.sum() close to 0')
                    sll = np.ones(g.num_vertices()) / g.num_vertices()
                else:
                    sll /= sll.sum()

                if debug:
                    print('new sll[source] = {}'.format(sll[source]))

            if debug:
                if np.isclose(sll[source], 0):
                    print('warning: source sll is 0!!')

            # if the query node infection time is larger than
            # the current known earliest infection,
            # it cannot be the source
            min_inf_t = min(infection_times[n] for n in ref_nodes)
            if (infection_times[q] == -1 or infection_times[q] > min_inf_t):
                sll[q] = 0

            # when q is used for updating sll, add it to reference list
            ref_nodes.add(q)
            if debug:
                print('add q to ref_nodes (#nodes={})'.format(len(ref_nodes)))

        if save_log:
            # `sll` is modified in place by later updates, so store a
            # snapshot rather than a reference to the live array
            sll_log.append(sll.copy())

        if debug:
            print('source current rank = {}, {:.5f}'.format(
                get_rank_index(sll, source), sll[source]))

        # if some node has very large mu
        # query its neighbors
        winners = np.nonzero(sll == sll.max())[0]
        for w in winners:
            nbrs = set(map(int, g.vertex(w).all_neighbours()))
            unqueried_neighbors = nbrs - queried_nodes
            nodes_to_use += list(unqueried_neighbors)
            queried_nodes |= unqueried_neighbors

            if save_log:
                query_log += list(unqueried_neighbors)
                is_nbr_log += [True] * len(unqueried_neighbors)

            if infection_times[w] == -1:
                # an uninfected winner is skipped; its sll entry is left as is
                continue

            # the winner is the source when it was infected strictly before
            # every infected neighbour
            is_source = np.all([
                (infection_times[w] < infection_times[int(u)])
                for u in nbrs if infection_times[int(u)] != -1
            ])

            if debug:
                print('checking source {} with winner {}'.format(source, w))
                print('winner\'s time {}'.format(infection_times[w]))
                print('winner\'s nbr infection time {}'.format(
                    [infection_times[int(u)] for u in nbrs]))

            if is_source:
                query_count = len(queried_nodes)
                if debug:
                    print('**Found source and used {} queries'.format(
                        query_count))
                assert source == w
                if save_log:
                    return query_count, query_log, sll_log, is_nbr_log
                else:
                    return query_count
            else:
                sll[w] = 0

    query_count = len(queried_nodes)
    if save_log:
        return query_count, query_log, sll_log, is_nbr_log
    else:
        return query_count
def test_full_observation_grid(grid_and_cascade):
    """``best_tree_sizes`` must give the source rank index 0 on the grid.

    A fresh cascade is generated on the fixture's graph for every ``p`` in
    ``np.arange(0.5, 1.0, 0.1)``, with the third generator argument fixed to
    1.0.
    """
    graph = grid_and_cascade[0]
    for prob in np.arange(0.5, 1.0, 0.1):
        cascade = gen_nontrivial_cascade(graph, prob, 1.0)
        inf_times, true_source, observed = cascade
        node_scores = best_tree_sizes(graph, observed, inf_times)
        source_rank = get_rank_index(node_scores, true_source)
        assert source_rank == 0
Beispiel #16
0
def test_best_tree_sizes_grid(grid_and_cascade):
    """Source rank index from ``best_tree_sizes`` on the grid fixture is <= 1."""
    # Only the graph, infection times, source and observed nodes are used;
    # the second fixture element is discarded.
    graph, _, inf_times, true_source, observed = grid_and_cascade
    node_scores = best_tree_sizes(graph, observed, inf_times)
    source_rank = get_rank_index(node_scores, true_source)
    assert source_rank <= 1
Beispiel #17
0
def test_full_observation_grid(grid_and_cascade):
    """Source rank index from ``best_tree_sizes`` is 0 for each generated cascade.

    Cascades are generated on the fixture's graph for ``p`` in
    ``np.arange(0.5, 1.0, 0.1)`` with the third generator argument set to 1.0.
    """
    graph = grid_and_cascade[0]
    for prob in np.arange(0.5, 1.0, 0.1):
        cascade = gen_nontrivial_cascade(graph, prob, 1.0)
        inf_times, true_source, observed = cascade
        node_scores = best_tree_sizes(graph, observed, inf_times)
        source_rank = get_rank_index(node_scores, true_source)
        assert source_rank == 0
def test_best_tree_sizes_tree(tree_and_cascade):
    """Source rank index from ``best_tree_sizes`` on the tree fixture is <= 3."""
    # The second fixture element is discarded.
    graph, _, inf_times, true_source, observed = tree_and_cascade
    node_scores = best_tree_sizes(graph, observed, inf_times)

    # Diagnostics: count of nodes whose score is not infinite, then the
    # source's score alongside the smallest score.
    finite_mask = np.invert(np.isinf(node_scores))
    print('|possible_nodes|={}'.format(np.sum(finite_mask)))
    print(node_scores[true_source], node_scores.min())

    source_rank = get_rank_index(node_scores, true_source)
    assert source_rank <= 3