Example #1
0
    def setUp(self):
        """Build the 9-node fixture tree and the ``AdaptiveSampler`` under test."""
        # Fixed seed so any use of ``random`` during a test is reproducible.
        random.seed(123456)

        edges = [
            (0, 1), (1, 2), (1, 3), (2, 4), (2, 5),
            (0, 6), (6, 7), (0, 8)
        ]
        g = nx.DiGraph()
        g.add_edges_from(edges)
        self.assign_g_attrs(g)
        self.tree = g

        # One timestamp per depth level: root at 0, its children at 1, etc.
        nodes_by_time = [(0, ), (1, 6, 8), (2, 3, 7), (4, 5)]
        for timestamp, level in enumerate(nodes_by_time):
            for node in level:
                g.node[node]['datetime'] = timestamp

        self.s = AdaptiveSampler(
            g,
            B=3,
            timespan_secs=1,
            node_score_func=lambda p, c: p**2 / c
        )
Example #2
0
class AdaptiveSamplerTest(unittest.TestCase):
    """Tests for ``AdaptiveSampler`` on a small hand-built directed tree."""

    def setUp(self):
        """Build the 9-node fixture tree and the sampler under test."""
        # Fixed seed so any use of ``random`` during a test is reproducible.
        random.seed(123456)

        edges = [
            (0, 1), (1, 2), (1, 3), (2, 4), (2, 5),
            (0, 6), (6, 7), (0, 8)
        ]
        g = nx.DiGraph()
        g.add_edges_from(edges)
        self.assign_g_attrs(g)
        self.tree = g

        # One timestamp per depth level: root at 0, its children at 1, etc.
        nodes_by_time = [(0, ), (1, 6, 8), (2, 3, 7), (4, 5)]
        for timestamp, level in enumerate(nodes_by_time):
            for node in level:
                g.node[node]['datetime'] = timestamp

        self.s = AdaptiveSampler(
            g,
            B=3,
            timespan_secs=1,
            node_score_func=lambda p, c: p**2 / c
        )

    def assign_g_attrs(self, tree):
        """Set edge attribute ``c`` and node attribute ``r`` to 1 everywhere."""
        for src, dst in tree.edges_iter():
            tree[src][dst]['c'] = 1
        for node in tree.nodes_iter():
            tree.node[node]['r'] = 1

    def _partial_result_tree(self):
        """Return a new graph with edges 0->1, 0->6, 1->3 and unit attributes."""
        partial = nx.DiGraph()
        partial.add_edges_from([(0, 1), (0, 6), (1, 3)])
        self.assign_g_attrs(partial)
        return partial

    def test_sampler_init(self):
        """Check the sampler's state right after construction."""
        sampler = self.s
        # presumably the per-root values behind the expected ordering:
        # {0: 4, 1: 3, 2: 3, 6: 2}
        assert_equal([0, 2, 1, 6], sampler.roots_sorted_by_upperbound)
        assert_equal(1.0, sampler.explore_proba)
        assert_equal(4, sampler.n_nodes_to_cover)

    def test_update(self):
        """Check sampler state after one update, then after a larger tree."""
        sampler = self.s
        partial = self._partial_result_tree()

        sampler.update(0, partial)
        assert_equal(0.5, sampler.explore_proba)
        assert_equal({0, 1}, sampler.covered_nodes)
        assert_equal({1: 2 ** 2}, sampler.node2score)

        # case: score of node 1 increases
        partial.add_edge(1, 2)
        self.assign_g_attrs(partial)

        sampler.update(0, partial)
        assert_equal({1: 3 ** 2 / 2}, sampler.node2score)

    def test_update_border_case(self):
        """Updating with the full fixture tree drives ``explore_proba`` to 0."""
        sampler = self.s
        sampler.update(0, self.tree)

        assert_equal({0, 1, 2, 6}, sampler.covered_nodes)
        assert_equal(0, sampler.explore_proba)

    def test_explore_proba(self):
        """``explore_proba`` goes from 1 to 2/4 after one partial update."""
        sampler = self.s
        assert_equal(1, sampler.explore_proba)

        sampler.update(0, self._partial_result_tree())

        assert_almost_equal(2 / 4., sampler.explore_proba)

    def test_take_via_explore(self):
        """With no updates applied, successive takes return roots 0, 2, 1."""
        sampler = self.s

        # The take happens before random_action() is queried; keep this
        # order, since both may consume the seeded random stream.
        root, dag = sampler.take()
        assert_equal('explore', sampler.random_action())
        assert_equal(0, root)
        expected_edges = sorted([(0, 1), (0, 6), (0, 8)])
        assert_equal(expected_edges, sorted(dag.edges()))

        # on and on
        root, dag = sampler.take()
        assert_equal(2, root)

        root, dag = sampler.take()
        assert_equal(1, root)

    def test_take_via_exploit(self):
        """After a full-tree update the sampler exploits and returns root 1."""
        sampler = self.s

        # round 1
        sampler.update(0, self.tree)
        assert_equal('exploit', sampler.random_action())
        root, dag = sampler.take()
        assert_equal(1, root)

        # round 2
        sampler.update(root, dag)
        assert_true(root not in sampler.node2score)
def run(
        gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {
                'topics': 0.2,
                'bow': 0.8
            },
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    """Sample roots from the meta graph, compute one tree per root, pickle them.

    Steps:

    1. load the interactions (and, unless ``given_topics``, the LDA model
       and dictionary),
    2. build the meta graph, or load it from its pickle cache,
    3. repeatedly take a ``(root, dag)`` from a root sampler, run
       ``calc_tree`` on it and feed the result back to the sampler,
    4. pickle the list of trees and a dict of the paths involved.

    Args:
        gen_tree_func: forwarded to ``calc_tree``.
        msg_ids_path: text file with one message id per line; only read
            when the meta graph is (re)calculated.
        root_sampling_method: ``'random'`` uses ``RandomSampler``,
            ``'upperbound'`` uses ``UBSampler``, any other value uses
            ``AdaptiveSampler``.  When ``roots`` is given the sampler is a
            ``DeterministicSampler``; this value is then only used in the
            output file names.
        interaction_path: ``.json`` or ``.pkl`` file of interactions.  A
            ``.json`` file that fails to parse as one document is re-read
            with ``load_json_by_line``.
        lda_model_path: LDA model to load unless ``given_topics``.
        corpus_dict_path: gensim dictionary to load unless ``given_topics``.
        meta_graph_pkl_path_prefix: prefix of the meta graph pickle path.
        meta_graph_pkl_suffix: suffix of the meta graph pickle path.
        cand_tree_number: passed with ``cand_tree_percent`` to
            ``get_number_and_percentage`` when ``roots`` is not given.
        cand_tree_percent: see ``cand_tree_number``.
        result_pkl_path_prefix: prefix of the trees pickle path.
        result_suffix: suffix of the trees pickle path.
        all_paths_pkl_prefix: prefix of the paths-dict pickle path.
        all_paths_pkl_suffix: suffix of the paths-dict pickle path.
        true_events_path: only recorded in the returned dict.
        meta_graph_kws: extra keyword arguments for
            ``IU.get_topic_meta_graph``.  Must contain ``'preprune_secs'``;
            a ``timedelta`` there is converted to seconds.
        gen_tree_kws: forwarded to ``calc_tree``.  Must contain
            ``'timespan'`` (``timedelta`` or seconds) and ``'U'``.
        convert_time: forwarded to ``IU.get_topic_meta_graph``.
        roots: optional explicit roots; when truthy, one tree per root.
        calculate_graph: rebuild the meta graph even if its pickle exists.
        given_topics: skip loading the LDA model and dictionary.
        print_summary: log a summary of the graph; forwarded to
            ``calc_tree``.
        should_binarize_dag: forwarded to ``calc_tree``.

    Returns:
        dict with keys ``'interactions'``, ``'meta_graph'``, ``'result'``,
        ``'true_events'`` and ``'self'``, each mapping to a path.

    Raises:
        ValueError: ``interaction_path`` ends in neither ``.json`` nor
            ``.pkl``.
        AssertionError: the meta graph has no nodes.
    """
    # NOTE(review): ``meta_graph_kws`` and ``gen_tree_kws`` are mutable
    # defaults shared across calls.  This function does not modify them,
    # but ``gen_tree_kws`` is handed to ``calc_tree`` -- confirm that it
    # does not mutate it either.
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']

    if interaction_path.endswith(".json"):
        try:
            with open(interaction_path) as f:
                interactions = json.load(f)
        except ValueError:
            # Not a single JSON document; fall back to the line-wise loader.
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        # Pickles are byte streams, so open in binary mode.
        # NOTE: unpickling is only safe for trusted files.
        with open(interaction_path, 'rb') as f:
            interactions = pickle.load(f)
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path))
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path))
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix, experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix)
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        # Work on a copy so the caller's (or the default) dict is untouched.
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        preprune_secs = meta_graph_kws_copied['preprune_secs']
        if isinstance(preprune_secs, timedelta):
            meta_graph_kws_copied['preprune_secs'] = (
                preprune_secs.total_seconds())
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied)

        logger.info('pickling...')
        nx.write_gpickle(IU.compactize_meta_graph(g, map_nodes=False),
                         meta_graph_pkl_path)
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(), cand_tree_number, cand_tree_percent)
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(cand_tree_number /
                                                float(g.number_of_nodes())))

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            # The sampler ran out of candidate roots; keep what we have.
            logger.warning('not enough root to take, terminate')
            break
        dags.append(dag)

        start = datetime.now()
        tree = calc_tree(i,
                         root,
                         dag,
                         U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        # Record the wall-clock time spent in calc_tree on the tree itself.
        tree.graph['calculation_time'] = (datetime.now() -
                                          start).total_seconds()

        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        # Encode the experiment parameters into the output file name.
        return "{}--{}----{}----{}{}.pkl".format(
            prefix, experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(cand_tree_percent=cand_tree_percent,
                                 root_sampling=root_sampling_method), suffix)

    result_pkl_path = make_detailed_path(result_pkl_path_prefix, result_suffix)

    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    # HIGHEST_PROTOCOL is a binary format, so the file must be binary too.
    with open(result_pkl_path, 'wb') as f:
        pickle.dump(trees, f, protocol=pickle.HIGHEST_PROTOCOL)
    if False:
        # for debugging purpose
        with open(result_pkl_path + '.dag', 'wb') as f:
            pickle.dump(dags, f, protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {
        'interactions': interaction_path,
        'meta_graph': meta_graph_pkl_path,
        'result': result_pkl_path,
        'true_events': true_events_path,
        'self': all_paths_pkl_path
    }
    with open(all_paths_pkl_path, 'wb') as f:
        pickle.dump(paths_dict, f)
    return paths_dict