Example #1
    def test_bs_ensure_result_is_tree(self):
        params = pkl.load(
            open(make_path('test/data/quota_test_cases/params.pkl')))[0]

        root = params['roots'][0]
        preprune_secs = params['preprune_secs']
        mg = IU.get_topic_meta_graph_from_synthetic(
            make_path('test/data/quota_test_cases/interactions.json'),
            preprune_secs)
        dag = IU.get_rooted_subgraph_within_timespan(mg, root, preprune_secs)
        t = charikar_algo(dag, root, dag.nodes(), k=20, level=2)
        assert_true(nx.is_arborescence(t))
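The only property checked here is that the result is an arborescence; a minimal self-contained sketch of that same nx.is_arborescence check on a toy graph (independent of the quota test fixtures):

import networkx as nx

t = nx.DiGraph()
t.add_edges_from([(0, 1), (0, 2), (1, 3)])   # rooted tree: every node has in-degree <= 1
print(nx.is_arborescence(t))                 # True

t.add_edge(2, 3)                             # node 3 now has two parents
print(nx.is_arborescence(t))                 # False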
def run_with_context(interactions_path, candidate_tree_path, dirname=None, to_original_graph=False, undirected=False):
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    try:
        interactions = json.load(open(interactions_path))
    except ValueError as e:
        print(e)
        interactions = load_json_by_line(interactions_path)

    interactions = IU.clean_interactions(interactions, undirected=undirected)

    output_path = get_output_path(candidate_tree_path, dirname)

    K = 5
    events = detect_events_given_path(candidate_tree_path, K)

    contexted_events = []
    for e in events:
        context_dag = extract_event_context(interactions, e, undirected=undirected)

        if to_original_graph:
            context_dag = convert_to_original_graph(context_dag)
            e = convert_to_original_graph(e)

        contexted_events.append(add_subgraph_specific_attributes_to_graph(context_dag, [(e, {"event": True})]))
    d3_events = [to_d3_graph(ce) for ce in contexted_events]

    print("writing to {}".format(output_path))
    json_dump(d3_events, output_path)
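The try/except above parses the input either as one JSON document or as JSON lines; a standalone sketch of that fallback, assuming load_json_by_line behaves roughly like the helper below:

import json

def load_json_or_json_lines(path):
    try:
        with open(path) as f:
            return json.load(f)          # whole file is a single JSON document
    except ValueError:
        with open(path) as f:            # fall back to one JSON object per line
            return [json.loads(line) for line in f if line.strip()]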
def main():
    K, M, C, H = 'K', 'M', 'C', 'H'
    interactions = [
        {'sender_id': K, 'recipient_ids': (M, C), 'datetime': 1, 'message_id': 'K->(M, C): code(1)'},
        {'sender_id': M, 'recipient_ids': [K], 'datetime': 3, 'message_id': 'M->K: read(3)'},
        {'sender_id': K, 'recipient_ids': [M], 'datetime': 4, 'message_id': 'K->M: read(3)'},
        {'sender_id': C, 'recipient_ids': [H], 'datetime': 2, 'message_id': 'C->H: eat(2)'},
        {'sender_id': H, 'recipient_ids': [C], 'datetime': 3, 'message_id': 'H->C: eat(2)'},
    ]
    InteractionsUtil.decompose_interactions(interactions)
    node_names, sources, targets, time_stamps = InteractionsUtil.unzip_interactions(
        InteractionsUtil.decompose_interactions(interactions)
    )
    graph = convert_to_meta_graph(node_names, sources, targets, time_stamps)

    print graph.edges()
    def test_bs_ensure_result_is_tree(self):
        params = pkl.load(
            open(make_path('test/data/quota_test_cases/params.pkl'))
        )[0]

        root = params['roots'][0]
        preprune_secs = params['preprune_secs']
        mg = IU.get_topic_meta_graph_from_synthetic(
            make_path('test/data/quota_test_cases/interactions.json'),
            preprune_secs            
        )
        dag = IU.get_rooted_subgraph_within_timespan(
            mg, root, preprune_secs
        )
        t = charikar_algo(dag, root, dag.nodes(),
                          k=20, level=2)
        assert_true(nx.is_arborescence(t))
    def __init__(self, g, B, timespan_secs):
        super(UBSampler, self).__init__(g, timespan_secs)
        non_leaf_roots = (n for n in g.nodes_iter() if g.out_degree(n) > 0)

        self.nodes_sorted_by_upperbound = sorted(
            non_leaf_roots,
            key=lambda r: quota_upperbound(
                IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs), r,
                B),
            reverse=True)
Example #6
    def __init__(self, g, B, timespan_secs):
        super(UBSampler, self).__init__(g, timespan_secs)
        non_leaf_roots = (n for n in g.nodes_iter() if g.out_degree(n) > 0)

        self.nodes_sorted_by_upperbound = sorted(
            non_leaf_roots,
            key=lambda r: quota_upperbound(
                IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs),
                r, B),
            reverse=True
        )
def draw_example_meta_graph(output_path):
    _, _, ints = load_meta_graph_necessities()
    g = IU.get_meta_graph(ints, decompose_interactions=True)
    plt.clf()
    pos = nx.spring_layout(g)
    nx.draw_networkx_nodes(g, pos=pos)
    nx.draw_networkx_edges(g, pos=pos)
    nx.draw_networkx_labels(g,
                            pos=pos,
                            labels=dict(zip(g.nodes(), g.nodes())),
                            font_size=8)
    plt.savefig(output_path)
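The drawing recipe is plain networkx + matplotlib; a self-contained sketch on a toy DiGraph (the output filename is arbitrary):

import networkx as nx
import matplotlib
matplotlib.use('Agg')    # render to file without a display
import matplotlib.pyplot as plt

g = nx.DiGraph([('a', 'b'), ('b', 'c'), ('a', 'c')])
plt.clf()
pos = nx.spring_layout(g)
nx.draw_networkx_nodes(g, pos=pos)
nx.draw_networkx_edges(g, pos=pos)
nx.draw_networkx_labels(g, pos=pos, labels={n: n for n in g.nodes()}, font_size=8)
plt.savefig('toy_meta_graph.png')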
    def _topic_divergence(self, msg_ids, id2msg, dictionary, lda):
        raw_topics = [
            lda.get_document_topics(dictionary.doc2bow(
                IU.tokenize_document(id2msg[id_])),
                                    minimum_probability=0) for id_ in msg_ids
        ]
        topic_vects = np.array([[v for _, v in topics]
                                for topics in raw_topics])
        mean_topic_vect = np.mean(topic_vects, axis=0)
        diffs = [scipy.stats.entropy(mean_topic_vect, v) for v in topic_vects]

        return np.mean(diffs)
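scipy.stats.entropy(p, q) computes the KL divergence D(p || q), so the score above is the mean KL divergence from the average topic vector to each message's topic vector; a standalone sketch with made-up topic vectors:

import numpy as np
import scipy.stats

topic_vects = np.array([[0.7, 0.2, 0.1],
                        [0.6, 0.3, 0.1],
                        [0.1, 0.1, 0.8]])
mean_topic_vect = topic_vects.mean(axis=0)
diffs = [scipy.stats.entropy(mean_topic_vect, v) for v in topic_vects]
print(np.mean(diffs))    # higher value = topically more heterogeneous messages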
def main1():
    C, P, T1, T2 = ('CEO', 'PM', 'T1', 'T2')
    p = 'progress'
    s = 'suggestion'
    f = 'football'
    correct_edge_to_color = {
        ('a', 'b'): 'red',
        ('b', 'c'): 'red',
        ('c', 'd'): 'red',
        ('e', 'f'): 'green'
    }

    interactions = [('a', C, [P], p, 1), ('b', P, [T1, T2], p, 2),
                    ('c', T1, [P], p, 3), ('d', P, [C], p, 4),
                    ('e', T2, [P], s, 3), ('f', P, [C], p, 5),
                    ('g', T2, [T1], f, 4)]
    new_interactions = []
    for msg_id, sender, recs, topic, time in interactions:
        new_interactions.append(
            {
                'sender_id': sender,
                'recipient_ids': recs,
                'datetime': time,
                'message_id': msg_id
            }, )

    node_names, sources, targets, time_stamps = InteractionsUtil.unzip_interactions(
        new_interactions)
    graph = convert_to_meta_graph(node_names, sources, targets, time_stamps)
    # nx.write_dot(graph, 'tmp/illustration.dot')
    print """digraph {
    node [fontsize=20];
"""
    for u, v in graph.edges():
        print "{} -> {}[color={}];".format(
            u,
            v,
            # correct_edge_to_color.get((u, v), 'gray')
            'black')
    print "}"

    df = pd.DataFrame(new_interactions,
                      columns=['sender_id', 'recipient_ids', 'datetime'],
                      index=[i[0] for i in interactions])
    df = df.rename(columns={
        'sender_id': 'sender',
        'recipient_ids': 'recipients',
        'datetime': 'time'
    })

    mapping = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri'}
    df['time'] = df['time'].map(lambda t: mapping[t])
    df.to_latex('tmp/example.tex')
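The LaTeX table at the end is produced with stock pandas calls; a minimal sketch of that rename/map/to_latex step on a one-row frame (printed instead of written to tmp/example.tex):

import pandas as pd

df = pd.DataFrame([{'sender_id': 'CEO', 'recipient_ids': ['PM'], 'datetime': 1}],
                  index=['a'])
df = df.rename(columns={'sender_id': 'sender',
                        'recipient_ids': 'recipients',
                        'datetime': 'time'})
mapping = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri'}
df['time'] = df['time'].map(lambda t: mapping[t])
print(df.to_latex())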
def main():
    K, M, C, H = 'K', 'M', 'C', 'H'
    interactions = [
        {
            'sender_id': K,
            'recipient_ids': (M, C),
            'datetime': 1,
            'message_id': 'K->(M, C): code(1)'
        },
        {
            'sender_id': M,
            'recipient_ids': [K],
            'datetime': 3,
            'message_id': 'M->K: read(3)'
        },
        {
            'sender_id': K,
            'recipient_ids': [M],
            'datetime': 4,
            'message_id': 'K->M: read(3)'
        },
        {
            'sender_id': C,
            'recipient_ids': [H],
            'datetime': 2,
            'message_id': 'C->H: eat(2)'
        },
        {
            'sender_id': H,
            'recipient_ids': [C],
            'datetime': 3,
            'message_id': 'H->C: eat(2)'
        },
    ]
    InteractionsUtil.decompose_interactions(interactions)
    node_names, sources, targets, time_stamps = InteractionsUtil.unzip_interactions(
        InteractionsUtil.decompose_interactions(interactions))
    graph = convert_to_meta_graph(node_names, sources, targets, time_stamps)

    print graph.edges()
    def frequent_terms(self, interactions, top_k=10):
        id2msg = {}
        for m in interactions:
            id2msg[m['message_id']] = u"{} {}".format(m['subject'], m['body'])

        # topic_dist
        message_ids = [self.g.node[n]['message_id'] for n in self.g.nodes()]
        concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
        tokens = IU.tokenize_document(concated_msg)
        freqs = Counter(tokens)
        terms = [t for t, _ in freqs.most_common(top_k)]
        print 'frequent_terms', terms
        return terms
Example #12
def draw_example_meta_graph(output_path):
    _, _, ints = load_meta_graph_necessities()
    g = IU.get_meta_graph(
        ints,
        decompose_interactions=True
    )
    plt.clf()
    pos = nx.spring_layout(g)
    nx.draw_networkx_nodes(g, pos=pos)
    nx.draw_networkx_edges(g, pos=pos)
    nx.draw_networkx_labels(g, pos=pos,
                            labels=dict(zip(g.nodes(), g.nodes())),
                            font_size=8)
    plt.savefig(output_path)
    def tfidf_terms(self, interactions, dictionary, top_k=10):
        text = '\n'.join(
            ['{} {}'.format(m['subject'], m['body']) for m in interactions])
        tfidf_vec = pkl.load(open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl'))
        counts = dictionary.doc2bow(IU.tokenize_document(text))
        raw_vect = np.zeros(len(dictionary.keys()))
        for word, cnt in counts:
            raw_vect[word] = cnt

        vect = tfidf_vec.transform([raw_vect])
        vect = np.asarray(vect.todense()).flatten()

        tfidf_terms = [dictionary[i] for i in np.argsort(vect)[::-1][:top_k]]
        print 'tfidf_terms', tfidf_terms
        return tfidf_terms
Example #14
    def frequent_terms(self, interactions, top_k=10):
        id2msg = {}
        for m in interactions:
            id2msg[m['message_id']] = u"{} {}".format(
                m['subject'], m['body']
            )

        # topic_dist
        message_ids = [self.g.node[n]['message_id']
                       for n in self.g.nodes()]
        concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
        tokens = IU.tokenize_document(concated_msg)
        freqs = Counter(tokens)
        terms = [t for t, _ in freqs.most_common(top_k)]
        print 'frequent_terms', terms
        return terms
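The ranking is a plain collections.Counter over the token stream; a minimal standalone version with a whitespace tokenizer standing in for IU.tokenize_document:

from collections import Counter

tokens = 'report budget report meeting budget report'.split()
terms = [t for t, _ in Counter(tokens).most_common(2)]
print(terms)    # ['report', 'budget']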
Example #15
    def topics(self, interactions, dictionary, lda, top_k=10):
        id2msg = {}
        for m in interactions:
            id2msg[m['message_id']] = u"{} {}".format(
                m['subject'], m['body']
            )

        # topic_dist
        message_ids = [self.g.node[n]['message_id']
                       for n in self.g.nodes()]
        concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
        bow = dictionary.doc2bow(IU.tokenize_document(concated_msg))
        topic_dist = lda.__getitem__(bow, iterations=100)
        print("topic inference done")
        # topic_dist = lda.get_document_topics(
        #     bow,
        #     minimum_probability=0
        # )
        topic_dist = np.asarray([v for _, v in topic_dist])

        # some mask to filter out trivial topics
        topic_dist[topic_dist < 0.05] = 0

        # topic_terms
        if not hasattr(lda, 'wordtopics'):
            lda.load_word_topics()
        beta = lda.wordtopics
        # beta = lda.state.get_lambda()

        # normalize and weight by beta dist
        weighted_terms = (
            beta / beta.sum(axis=1)[:, None] * topic_dist[:, None]
        ).sum(axis=0)

        bestn = np.argsort(weighted_terms)[::-1][:top_k]

        topic_terms = [lda.id2word[id] for id in bestn]
        
        top_topics = np.nonzero(topic_dist)  # np.argsort(topic_dist)[::-1][:3]
        print('top_topics', top_topics)
        # topic_divergence = self._topic_divergence(message_ids, id2msg,
        #                                           dictionary, lda)
        return {# 'topic_dist': topic_dist,
                'topic_terms': topic_terms,
                'top_topics': top_topics
                # 'topic_divergence': topic_divergence
                }
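The term-weighting step normalizes each row of beta into a word distribution, scales it by the inferred topic probability and sums over topics; a standalone numpy sketch with tiny made-up matrices:

import numpy as np

beta = np.array([[4.0, 1.0, 1.0],     # topic 0 favours word 0
                 [1.0, 1.0, 8.0]])    # topic 1 favours word 2
topic_dist = np.array([0.9, 0.1])     # document is mostly topic 0

weighted_terms = (beta / beta.sum(axis=1)[:, None] * topic_dist[:, None]).sum(axis=0)
print(np.argsort(weighted_terms)[::-1][:2])    # word indices, best first: [0 2]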
def extract_event_context(interactions, event_tree, undirected=False):
    span = MetaGraphStat(event_tree).time_span()
    start = span['start_time']
    end = span['end_time']

    filtered_interactions = []
    for i in interactions:
        assert 'datetime' in i
        dt = i['datetime']
        assert isinstance(dt, datetime)
        if dt >= start and dt <= end:
            filtered_interactions.append(i)
    context_dag = IU.get_meta_graph(filtered_interactions,
                                    decompose_interactions=False,
                                    remove_singleton=True,
                                    undirected=undirected)
    return context_dag
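The context extraction keeps only the interactions whose datetime falls inside the event's [start_time, end_time] span before rebuilding a meta graph; a standalone sketch of that filter:

from datetime import datetime

start, end = datetime(2001, 5, 1), datetime(2001, 5, 31)
interactions = [{'message_id': 1, 'datetime': datetime(2001, 5, 10)},
                {'message_id': 2, 'datetime': datetime(2001, 6, 2)}]
in_span = [i for i in interactions if start <= i['datetime'] <= end]
print([i['message_id'] for i in in_span])    # [1]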
Example #17
    def _topic_divergence(self, msg_ids, id2msg, dictionary, lda):
        raw_topics = [
            lda.get_document_topics(
                dictionary.doc2bow(
                    IU.tokenize_document(id2msg[id_])
                ),
                minimum_probability=0
            )
            for id_ in msg_ids
        ]
        topic_vects = np.array([[v for _, v in topics]
                                for topics in raw_topics])
        mean_topic_vect = np.mean(topic_vects, axis=0)
        diffs = [scipy.stats.entropy(mean_topic_vect, v)
                 for v in topic_vects]

        return np.mean(diffs)
    def __init__(self, g, B, timespan_secs, node_score_func=log_x_density):
        super(AdaptiveSampler, self).__init__(g, timespan_secs)

        non_leaf_roots = [n for n in g.nodes_iter() if g.out_degree(n) > 0]
        print("AdaptiveSampler: #roots to explore {}".format(
            len(non_leaf_roots)))

        print("AdaptiveSampler: getting upperbounds...")
        upperbounds = map(
            lambda r: quota_upperbound(
                IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs), r,
                B), non_leaf_roots)

        print("AdaptiveSampler: sorting the roots by upperbound... ")
        inds = np.argsort(np.asarray(upperbounds))[::-1]  # descending order
        self.roots_sorted_by_upperbound = [non_leaf_roots[i] for i in inds]
        self.root2upperbound = {
            r: u
            for r, u in zip(non_leaf_roots, upperbounds)
        }

        # self.roots_sorted_by_upperbound = sorted(
        #     non_leaf_roots,
        #     key=lambda r: quota_upperbound(
        #         IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs),
        #         r, B),
        #     reverse=True
        # )
        self.node_score_func = node_score_func
        # self.root2upperbound = {r: quota_upperbound(
        #     IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs),
        #     r, B)
        #                         for r in non_leaf_roots
        # }

        # updated at each iteration
        # nodes that are partially/fully computed
        # excluding leaves
        self.covered_nodes = set()

        # exclude leaves
        # self.roots_to_explore = sorted((n for n in g.nodes_iter()
        #                                if g.out_degree(n) > 0))
        self.n_nodes_to_cover = len(self.roots_sorted_by_upperbound)

        self.node2score = {}
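Both samplers order candidate roots by their quota upper bound, descending; a standalone sketch of that ordering with placeholder scores standing in for quota_upperbound:

import numpy as np

roots = ['r1', 'r2', 'r3']
upperbounds = [3.0, 10.0, 7.5]                     # stand-ins for quota_upperbound(...)
inds = np.argsort(np.asarray(upperbounds))[::-1]   # descending order
roots_sorted_by_upperbound = [roots[i] for i in inds]
root2upperbound = dict(zip(roots, upperbounds))
print(roots_sorted_by_upperbound)                  # ['r2', 'r3', 'r1']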
Example #19
    def tfidf_terms(self, interactions, dictionary, top_k=10):
        text = '\n'.join(['{} {}'.format(m['subject'], m['body'])
                   for m in interactions])
        tfidf_vec = pkl.load(open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl'))
        counts = dictionary.doc2bow(
            IU.tokenize_document(text)
            )
        raw_vect = np.zeros(len(dictionary.keys()))
        for word, cnt in counts:
            raw_vect[word] = cnt

        vect = tfidf_vec.transform([raw_vect])
        vect = np.asarray(vect.todense()).flatten()

        tfidf_terms = [dictionary[i]
                       for i in np.argsort(vect)[::-1][:top_k]]
        print 'tfidf_terms', tfidf_terms
        return tfidf_terms
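The method ranks dictionary terms by their TF-IDF weight in the concatenated text; a standalone sketch of the same ranking using scikit-learn's TfidfVectorizer as a stand-in for the pickled vectorizer and gensim dictionary (assuming scikit-learn >= 1.0 for get_feature_names_out):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['budget meeting notes', 'football scores', 'budget review meeting']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

vect = np.asarray(X[0].todense()).flatten()    # TF-IDF weights of the first document
terms = np.array(vectorizer.get_feature_names_out())
print(terms[np.argsort(vect)[::-1][:3]])       # top-3 terms of that document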
Example #20
def test_gen_event_with_known_tree_structure():
    event_size = 100
    participants_n = 10
    event = gen_event_with_known_tree_structure(
        event_size=event_size,
        participants=range(participants_n),
        start_time=10, end_time=110,
        event_topic_param=random_topic(10, topic_noise=0.0001)[0],
        topic_noise=1,
        alpha=1.0, tau=0.8,
        forward_proba=0.3,
        reply_proba=0.5,
        create_new_proba=0.2
    )

    for n in event.nodes_iter():
        sid, rid = event.node[n]['sender_id'], event.node[n]['recipient_ids'][0]
        assert_true(sid != rid)

    for s, t in event.edges_iter():
        sid1, rid1 = event.node[s]['sender_id'], event.node[s]['recipient_ids'][0]
        sid2, rid2 = event.node[t]['sender_id'], event.node[t]['recipient_ids'][0]
        c_type = event[s][t]['c_type']
        if c_type == 'r':
            assert_equal(sid1, rid2)
            assert_equal(sid2, rid1)
        elif c_type == 'f':
            assert_equal(rid1, sid2)
            assert_true(rid2 != sid1)
        else:
            assert_equal(sid1, sid2)

    interactions = [event.node[n] for n in event.nodes_iter()]
    g = IU.get_meta_graph(
        interactions,
        decompose_interactions=False,
        remove_singleton=True,
        given_topics=True,
        convert_time=False
    )
    assert_equal(1, len(get_roots(g)))
    assert_equal(event_size, len(interactions))
    
    assert_true(nx.is_arborescence(event))
def build_default_summary_kws(interactions,
                              people_info,
                              dictionary,
                              lda,
                              people_repr_template,
                              undirected=False):
    interactions = IU.clean_interactions(interactions, undirected=undirected)
    summary_kws = {
        'basic_structure_stats': {},
        'time_span': {},
        # Deprecated
        'topics': {
            'interactions': interactions,
            'dictionary': dictionary,
            'lda': lda,
            'top_k': 10
        },
        'email_content': {
            'interactions': interactions,
            'top_k': 5
        },
        'participants': {
            'people_info': people_info,
            'interactions': interactions,
            'top_k': 5,
            'people_repr_template': people_repr_template,
            'undirected': undirected
        },
        'link_type_freq': {
            'interactions': interactions,
            'undirected': undirected
        },
        # 'frequent_terms': {
        #     'interactions': interactions,
        #     'top_k': 10
        # },
        # 'tfidf_terms': {
        #     'interactions': interactions,
        #     'dictionary': dictionary,
        #     'top_k': 10
        # }
    }
    return summary_kws
    def topics(self, interactions, dictionary, lda, top_k=10):
        id2msg = {}
        for m in interactions:
            id2msg[m['message_id']] = u"{} {}".format(m['subject'], m['body'])

        # topic_dist
        message_ids = [self.g.node[n]['message_id'] for n in self.g.nodes()]
        concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
        bow = dictionary.doc2bow(IU.tokenize_document(concated_msg))
        topic_dist = lda.__getitem__(bow, iterations=100)
        print("topic inference done")
        # topic_dist = lda.get_document_topics(
        #     bow,
        #     minimum_probability=0
        # )
        topic_dist = np.asarray([v for _, v in topic_dist])

        # some mask to filter out trivial topics
        topic_dist[topic_dist < 0.05] = 0

        # topic_terms
        if not hasattr(lda, 'wordtopics'):
            lda.load_word_topics()
        beta = lda.wordtopics
        # beta = lda.state.get_lambda()

        # normalize and weight by beta dist
        weighted_terms = (beta / beta.sum(axis=1)[:, None] *
                          topic_dist[:, None]).sum(axis=0)

        bestn = np.argsort(weighted_terms)[::-1][:top_k]

        topic_terms = [lda.id2word[id] for id in bestn]

        top_topics = np.nonzero(topic_dist)  # np.argsort(topic_dist)[::-1][:3]
        print('top_topics', top_topics)
        # topic_divergence = self._topic_divergence(message_ids, id2msg,
        #                                           dictionary, lda)
        return {  # 'topic_dist': topic_dist,
            'topic_terms': topic_terms,
            'top_topics': top_topics
            # 'topic_divergence': topic_divergence
        }
Example #23
    def __init__(self, g, B, timespan_secs, node_score_func=log_x_density):
        super(AdaptiveSampler, self).__init__(g, timespan_secs)

        non_leaf_roots = [n for n in g.nodes_iter() if g.out_degree(n) > 0]
        print("AdaptiveSampler: #roots to explore {}".format(len(non_leaf_roots)))

        print("AdaptiveSampler: getting upperbounds...")
        upperbounds = map(lambda r: quota_upperbound(
                IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs),
                r, B),
                          non_leaf_roots)

        print("AdaptiveSampler: sorting the roots by upperbound... ")
        inds = np.argsort(np.asarray(upperbounds))[::-1]  # descending order
        self.roots_sorted_by_upperbound = [non_leaf_roots[i] for i in inds]
        self.root2upperbound = {r: u
                                for r, u in zip(non_leaf_roots, upperbounds)}
        
        # self.roots_sorted_by_upperbound = sorted(
        #     non_leaf_roots,
        #     key=lambda r: quota_upperbound(
        #         IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs),
        #         r, B),
        #     reverse=True
        # )
        self.node_score_func = node_score_func
        # self.root2upperbound = {r: quota_upperbound(
        #     IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs),
        #     r, B)
        #                         for r in non_leaf_roots
        # }

        # updated at each iteration
        # nodes that are partially/fully computed
        # excluding leaves
        self.covered_nodes = set()

        # exclude leaves
        # self.roots_to_explore = sorted((n for n in g.nodes_iter()
        #                                if g.out_degree(n) > 0))
        self.n_nodes_to_cover = len(self.roots_sorted_by_upperbound)

        self.node2score = {}
def run_with_context(interactions_path,
                     candidate_tree_path,
                     dirname=None,
                     to_original_graph=False,
                     undirected=False):
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    try:
        interactions = json.load(open(interactions_path))
    except ValueError as e:
        print(e)
        interactions = load_json_by_line(interactions_path)

    interactions = IU.clean_interactions(interactions,
                                         undirected=undirected)

    output_path = get_output_path(candidate_tree_path, dirname)

    K = 5
    events = detect_events_given_path(candidate_tree_path, K)

    contexted_events = []
    for e in events:
        context_dag = extract_event_context(
            interactions, e,
            undirected=undirected
        )

        if to_original_graph:
            context_dag = convert_to_original_graph(context_dag)
            e = convert_to_original_graph(e)

        contexted_events.append(
            add_subgraph_specific_attributes_to_graph(
                context_dag, [(e, {'event': True})])
        )
    d3_events = [to_d3_graph(ce)
                 for ce in contexted_events]
    
    print('writing to {}'.format(output_path))
    json_dump(d3_events, output_path)
Example #25
def test_get_gen_cand_tree_params():
    event_size = 100
    participants_n = 10
    event = gen_event_with_known_tree_structure(
        event_size=event_size,
        participants=range(participants_n),
        start_time=10, end_time=110,
        event_topic_param=random_topic(10, topic_noise=0.1)[0],
        topic_noise=1,
        alpha=1.0, tau=0.8,
        forward_proba=0.3,
        reply_proba=0.5,
        create_new_proba=0.2
    )
    event = IU.assign_edge_weights(event, cosine)
    params = get_gen_cand_tree_params(event)

    assert_true(params['U'] > 0)
    assert_equal(99, params['preprune_secs'])
    assert_equal([0], params['roots'])
Example #26
def build_default_summary_kws(interactions, people_info,
                              dictionary, lda, people_repr_template,
                              undirected=False):
    interactions = IU.clean_interactions(interactions,
                                         undirected=undirected)
    summary_kws = {
        'basic_structure_stats': {},
        'time_span': {},
        # Deprecated
        'topics': {
            'interactions': interactions,
            'dictionary': dictionary,
            'lda': lda,
            'top_k': 10
        },
        'email_content': {
            'interactions': interactions,
            'top_k': 5
        },
        'participants': {
            'people_info': people_info,
            'interactions': interactions,
            'top_k': 5,
            'people_repr_template': people_repr_template,
            'undirected': undirected
        },
        'link_type_freq': {
            'interactions': interactions,
            'undirected': undirected
        },
        # 'frequent_terms': {
        #     'interactions': interactions,
        #     'top_k': 10
        # },
        # 'tfidf_terms': {
        #     'interactions': interactions,
        #     'dictionary': dictionary,
        #     'top_k': 10
        # }
    }
    return summary_kws
Example #27
    def test_make_artificial_data(self):
        events, all_interactions, params = make_artificial_data(**self.params)
        assert_equal(self.params['n_events'],
                     len(params))
        assert_equal(
            self.params['n_events'],
            len(events)
        )
        assert_equal(
            self.params['event_size_mu'] * self.params['n_events'] +
            self.params['n_noisy_interactions'],
            len(all_interactions)
        )
        for i in all_interactions:
            assert_true('message_id' in i)
            # make sure it's jsonable
            assert_true(isinstance(i['topics'], list))

        # all ids are unique
        all_ids = list(itertools.chain(*[e.nodes() for e in events]))
        assert_equal(len(all_ids), len(set(all_ids)))

        for e in events:
            # make sure nodes are relabeled
            for n in e.nodes_iter():
                assert_equal(n, e.node[n]['message_id'])

            interactions = [e.node[n] for n in e.nodes_iter()]
            assert_equal(len(interactions),
                         IU.get_meta_graph(
                             interactions,
                             decompose_interactions=False,
                             remove_singleton=True,
                             given_topics=True).number_of_nodes())
            for i in interactions:
                assert_true(isinstance(i['topics'], list))

        for i in all_interactions:
            assert_true(i['sender_id'].startswith('u-'))
def sample_rooted_binary_graphs_within_timespan(
        meta_graph_pickle_path,
        sample_number,
        timespan,
        output_path):
    g = nx.read_gpickle(meta_graph_pickle_path)
    roots = sample_nodes(g, sample_number)
    results = []
    for i, r in enumerate(roots):
        print('done:', i)
        sub_g = InteractionsUtil.get_rooted_subgraph_within_timespan(
            g, r, timespan
        )
        binary_sub_g = binarize_dag(sub_g,
                                    InteractionsUtil.VERTEX_REWARD_KEY,
                                    InteractionsUtil.EDGE_COST_KEY,
                                    dummy_node_name_prefix="d_")
        
        if len(binary_sub_g.edges()) > 0:
            results.append(binary_sub_g)

    pkl.dump(results, open(output_path, 'w'))
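The function relies on networkx gpickle I/O plus a node sampler; a standalone sketch of that pattern, assuming networkx < 3.0 (write_gpickle/read_gpickle were removed in 3.0) and random.sample in place of sample_nodes:

import random
import networkx as nx

g = nx.gnp_random_graph(20, 0.2, directed=True)
nx.write_gpickle(g, 'meta_graph.pkl')

g = nx.read_gpickle('meta_graph.pkl')
roots = random.sample(list(g.nodes()), 5)
print(roots)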
    def root_and_dag(self, r):
        return r, IU.get_rooted_subgraph_within_timespan(
            self.g, r, self.timespan_secs)
Example #30
def make_artificial_data(
        # for main events
        n_events,
        event_size_mu, event_size_sigma,
        participant_mu, participant_sigma,
        # for minor events
        n_minor_events,
        minor_event_size_mu, minor_event_size_sigma,
        minor_event_participant_mu, minor_event_participant_sigma,
        # shared
        n_total_participants,
        min_time, max_time, event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        n_noisy_interactions, n_noisy_interactions_fraction,
        alpha, tau,
        forward_proba,
        reply_proba,
        create_new_proba,
        dist_func):
    events, taboo_topics = random_events(
        n_events, event_size_mu, event_size_sigma,
        n_total_participants, participant_mu, participant_sigma,
        min_time, max_time, event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        alpha, tau,
        forward_proba,
        reply_proba,
        create_new_proba,
        accumulate_taboo=True
    )

    minor_events, _ = random_events(
        n_minor_events, minor_event_size_mu, minor_event_size_sigma,
        n_total_participants, minor_event_participant_mu,
        minor_event_participant_sigma,
        min_time, max_time, event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        alpha, tau,
        forward_proba,
        reply_proba,
        create_new_proba,
        taboo_topics=taboo_topics,
        accumulate_taboo=False
    )
    
    (n_noisy_interactions, _) = get_number_and_percentage(
        sum([1 for e in events for _ in e]),
        n_noisy_interactions, n_noisy_interactions_fraction
    )
    noisy_interactions = random_noisy_interactions(
        n_noisy_interactions,
        min_time, max_time,
        n_total_participants,
        n_topics, topic_noise,
        taboo_topics
    )

    event_interactions = [e.node[n] for e in events
                          for n in e.nodes_iter()]
    minor_event_interactions = [e.node[n] for e in minor_events
                                for n in e.nodes_iter()]
    all_interactions = (event_interactions + minor_event_interactions
                        + noisy_interactions)

    # add interaction id
    for i, intr in enumerate(all_interactions):
        intr['message_id'] = i
        intr['topics'] = intr['topics'].tolist()

    # relabel the nodes
    relabeled_events = []
    for e in events:
        mapping = {n: e.node[n]['message_id'] for n in e.nodes_iter()}
        relabeled_events.append(nx.relabel_nodes(e, mapping))

    for e in events:
        e = IU.assign_edge_weights(e, dist_func)

    gen_cand_trees_params = [get_gen_cand_tree_params(e)
                             for e in events]
    return relabeled_events, all_interactions, gen_cand_trees_params
Example #31
def random_events(n_events, event_size_mu, event_size_sigma,
                  n_total_participants, participant_mu, participant_sigma,
                  min_time, max_time, event_duration_mu, event_duration_sigma,
                  n_topics, topic_scaling_factor, topic_noise,
                  alpha, tau,
                  forward_proba,
                  reply_proba,
                  create_new_proba,
                  taboo_topics=set(),
                  accumulate_taboo=False):
    # add main events
    events = []
    taboo_topics = set(taboo_topics)
    
    for i in xrange(n_events):
        # randomly select a topic and add some noise to it
        event = []

        event_topic_param, topic_id = random_topic(
            n_topics,
            topic_noise,
            taboo_topics
        )

        if accumulate_taboo:
            taboo_topics.add(topic_id)

        print('event_topic_param:', event_topic_param)
        event_size = 0
        while event_size <= 0:
            event_size = int(round(
                np.random.normal(event_size_mu, event_size_sigma)
            ))
        assert event_size > 0

        # randomly select participants
        n_participants = 0
        while n_participants <= 2:
            n_participants = int(round(
                np.random.normal(participant_mu, participant_sigma)
            ))
        assert n_participants > 2
        
        participants = np.random.permutation(
            n_total_participants
        )[:n_participants]
        print('participants:', participants)

        # event timespan
        start_time = np.random.uniform(min_time, max_time - event_duration_mu)
        end_time = start_time + np.random.normal(event_duration_mu,
                                                 event_duration_sigma)
        if end_time > max_time:
            end_time = max_time

        event = gen_event_with_known_tree_structure(
            event_size, participants, start_time, end_time,
            event_topic_param,
            topic_noise,
            alpha, tau,
            forward_proba,
            reply_proba,
            create_new_proba
        )

        # some checking
        g = IU.get_meta_graph(
            [event.node[n] for n in event.nodes_iter()],
            decompose_interactions=False,
            remove_singleton=True,
            given_topics=True,
            convert_time=False)
        n_interactions_in_mg = g.number_of_nodes()

        if n_interactions_in_mg == len(event):
            roots = [n
                     for n, d in g.in_degree(g.nodes_iter()).items()
                     if d == 0]
            if len(roots) > 1:
                print(roots)
                for r in roots:
                    print(event[r])
                print("WARNING: roots number {}".format(len(roots)))
                raise RuntimeError('meta graph has {} roots, expected 1'.format(len(roots)))
        else:
            print(
                'invalid meta graph. {} < {}'.format(
                    n_interactions_in_mg,
                    len(event)
                ))
            raise RuntimeError('invalid meta graph')
        events.append(event)

    return events, taboo_topics
Example #32
    def setUp(self):
        self.interactions = IU.clean_interactions(
            json_load(make_path('test/data/enron_test.json')))
def main1():
    C, P, T1, T2 = ('CEO', 'PM', 'T1', 'T2')
    p = 'progress'
    s = 'suggestion'
    f = 'football'
    correct_edge_to_color = {
        ('a', 'b'): 'red',
        ('b', 'c'): 'red',
        ('c', 'd'): 'red',
        ('e', 'f'): 'green'
    }

    interactions = [
        ('a', C, [P], p, 1),
        ('b', P, [T1, T2], p, 2),
        ('c', T1, [P], p, 3),
        ('d', P, [C], p, 4),
        ('e', T2, [P], s, 3),
        ('f', P, [C], p, 5),
        ('g', T2, [T1], f, 4)
    ]
    new_interactions = []
    for msg_id, sender, recs, topic, time in interactions:
        new_interactions.append(
            {'sender_id': sender,
             'recipient_ids': recs,
             'datetime': time,
             'message_id': msg_id},
        )
    
    node_names, sources, targets, time_stamps = InteractionsUtil.unzip_interactions(
        new_interactions
    )
    graph = convert_to_meta_graph(node_names, sources, targets, time_stamps)
    # nx.write_dot(graph, 'tmp/illustration.dot')
    print """digraph {
    node [fontsize=20];
"""
    for u, v in  graph.edges():
        print "{} -> {}[color={}];".format(
            u, v,
            # correct_edge_to_color.get((u, v), 'gray')
            'black'
        )
    print "}"
    
    df = pd.DataFrame(new_interactions,
                      columns=['sender_id', 'recipient_ids', 'datetime'],
                      index=[i[0] for i in interactions])
    df = df.rename(columns={'sender_id': 'sender',
                       'recipient_ids': 'recipients',
                       'datetime': 'time'})

    mapping = {
        1: 'Mon',
        2: 'Tue',
        3: 'Wed',
        4: 'Thu',
        5: 'Fri'
    }
    df['time'] = df['time'].map(lambda t: mapping[t])
    df.to_latex('tmp/example.tex')
Example #34
    def setUp(self):
        self.interactions = IU.clean_interactions(
            json_load(
                make_path('test/data/enron_test.json')
            )
        )
def run(gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {'topics': 0.2,
                                 'bow': 0.8},
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']
        
    if interaction_path.endswith(".json"):
        try:
            interactions = json.load(open(interaction_path))
        except ValueError:
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        interactions = pickle.load(open(interaction_path))
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))


    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path)
        )
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path)
        )
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix,
        experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix
    )
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = meta_graph_kws['preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied
        )

        logger.info('pickling...')
        nx.write_gpickle(
            IU.compactize_meta_graph(g, map_nodes=False),
            meta_graph_pkl_path
        )
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)
        
    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(),
            cand_tree_number,
            cand_tree_percent
        )
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)
    
    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(
        cand_tree_number / float(g.number_of_nodes()))
    )

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warn('not enough root to take, terminate')
            break
        dags.append(dag)
        
        
        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        tree.graph['calculation_time'] = (datetime.now() - start).total_seconds()
        
        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        return "{}--{}----{}----{}{}.pkl".format(
            prefix,
            experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(
                cand_tree_percent=cand_tree_percent,
                root_sampling=root_sampling_method
            ),
            suffix
        )
    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)

    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    pickle.dump(trees,
                open(result_pkl_path, 'w'),
                protocol=pickle.HIGHEST_PROTOCOL)
    if False:
        # for debugging purpose
        pickle.dump(dags,
                    open(result_pkl_path+'.dag', 'w'),
                    protocol=pickle.HIGHEST_PROTOCOL)
    
    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {'interactions': interaction_path,
                  'meta_graph': meta_graph_pkl_path,
                  'result': result_pkl_path,
                  'true_events': true_events_path,
                  'self': all_paths_pkl_path
    }
    pickle.dump(
        paths_dict,
        open(all_paths_pkl_path, 'w')
    )
    return paths_dict
def run(
        gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {
                'topics': 0.2,
                'bow': 0.8
            },
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']

    if interaction_path.endswith(".json"):
        try:
            interactions = json.load(open(interaction_path))
        except ValueError:
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        interactions = pickle.load(open(interaction_path))
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path))
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path))
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix, experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix)
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = meta_graph_kws[
                'preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied)

        logger.info('pickling...')
        nx.write_gpickle(IU.compactize_meta_graph(g, map_nodes=False),
                         meta_graph_pkl_path)
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(), cand_tree_number, cand_tree_percent)
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(cand_tree_number /
                                                float(g.number_of_nodes())))

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warn('not enough root to take, terminate')
            break
        dags.append(dag)

        start = datetime.now()
        tree = calc_tree(i,
                         root,
                         dag,
                         U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        tree.graph['calculation_time'] = (datetime.now() -
                                          start).total_seconds()

        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        return "{}--{}----{}----{}{}.pkl".format(
            prefix, experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(cand_tree_percent=cand_tree_percent,
                                 root_sampling=root_sampling_method), suffix)

    result_pkl_path = make_detailed_path(result_pkl_path_prefix, result_suffix)

    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    pickle.dump(trees,
                open(result_pkl_path, 'w'),
                protocol=pickle.HIGHEST_PROTOCOL)
    if False:
        # for debugging purpose
        pickle.dump(dags,
                    open(result_pkl_path + '.dag', 'w'),
                    protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {
        'interactions': interaction_path,
        'meta_graph': meta_graph_pkl_path,
        'result': result_pkl_path,
        'true_events': true_events_path,
        'self': all_paths_pkl_path
    }
    pickle.dump(paths_dict, open(all_paths_pkl_path, 'w'))
    return paths_dict
Example #37
    def root_and_dag(self, r):
        return r, IU.get_rooted_subgraph_within_timespan(
            self.g, r, self.timespan_secs
        )
Example #38
def random_events(n_events,
                  event_size_mu,
                  event_size_sigma,
                  n_total_participants,
                  participant_mu,
                  participant_sigma,
                  min_time,
                  max_time,
                  event_duration_mu,
                  event_duration_sigma,
                  n_topics,
                  topic_scaling_factor,
                  topic_noise,
                  alpha,
                  tau,
                  forward_proba,
                  reply_proba,
                  create_new_proba,
                  taboo_topics=set(),
                  accumulate_taboo=False):
    # add main events
    events = []
    taboo_topics = set(taboo_topics)

    for i in xrange(n_events):
        # randomly select a topic and add some noise to it
        event = []

        event_topic_param, topic_id = random_topic(n_topics, topic_noise,
                                                   taboo_topics)

        if accumulate_taboo:
            taboo_topics.add(topic_id)

        print('event_topic_param:', event_topic_param)
        event_size = 0
        while event_size <= 0:
            event_size = int(
                round(np.random.normal(event_size_mu, event_size_sigma)))
        assert event_size > 0

        # randomly select participants
        n_participants = 0
        while n_participants <= 2:
            n_participants = int(
                round(np.random.normal(participant_mu, participant_sigma)))
        assert n_participants > 2

        participants = np.random.permutation(
            n_total_participants)[:n_participants]
        print('participants:', participants)

        # event timespan
        start_time = np.random.uniform(min_time, max_time - event_duration_mu)
        end_time = start_time + np.random.normal(event_duration_mu,
                                                 event_duration_sigma)
        if end_time > max_time:
            end_time = max_time

        event = gen_event_with_known_tree_structure(event_size, participants,
                                                    start_time, end_time,
                                                    event_topic_param,
                                                    topic_noise, alpha, tau,
                                                    forward_proba, reply_proba,
                                                    create_new_proba)

        # some checking
        g = IU.get_meta_graph([event.node[n] for n in event.nodes_iter()],
                              decompose_interactions=False,
                              remove_singleton=True,
                              given_topics=True,
                              convert_time=False)
        n_interactions_in_mg = g.number_of_nodes()

        if n_interactions_in_mg == len(event):
            roots = [
                n for n, d in g.in_degree(g.nodes_iter()).items() if d == 0
            ]
            if len(roots) > 1:
                print(roots)
                for r in roots:
                    print(event[r])
                print("WARNING: roots number {}".format(len(roots)))
                raise RuntimeError('meta graph has {} roots, expected 1'.format(len(roots)))
        else:
            print('invalid meta graph. {} < {}'.format(n_interactions_in_mg,
                                                       len(event)))
            raise RuntimeError('invalid meta graph')
        events.append(event)

    return events, taboo_topics
Example #39
def make_artificial_data(
        # for main events
        n_events,
        event_size_mu,
        event_size_sigma,
        participant_mu,
        participant_sigma,
        # for minor events
        n_minor_events,
        minor_event_size_mu,
        minor_event_size_sigma,
        minor_event_participant_mu,
        minor_event_participant_sigma,
        # shared
        n_total_participants,
        min_time,
        max_time,
        event_duration_mu,
        event_duration_sigma,
        n_topics,
        topic_scaling_factor,
        topic_noise,
        n_noisy_interactions,
        n_noisy_interactions_fraction,
        alpha,
        tau,
        forward_proba,
        reply_proba,
        create_new_proba,
        dist_func):
    events, taboo_topics = random_events(n_events,
                                         event_size_mu,
                                         event_size_sigma,
                                         n_total_participants,
                                         participant_mu,
                                         participant_sigma,
                                         min_time,
                                         max_time,
                                         event_duration_mu,
                                         event_duration_sigma,
                                         n_topics,
                                         topic_scaling_factor,
                                         topic_noise,
                                         alpha,
                                         tau,
                                         forward_proba,
                                         reply_proba,
                                         create_new_proba,
                                         accumulate_taboo=True)

    minor_events, _ = random_events(n_minor_events,
                                    minor_event_size_mu,
                                    minor_event_size_sigma,
                                    n_total_participants,
                                    minor_event_participant_mu,
                                    minor_event_participant_sigma,
                                    min_time,
                                    max_time,
                                    event_duration_mu,
                                    event_duration_sigma,
                                    n_topics,
                                    topic_scaling_factor,
                                    topic_noise,
                                    alpha,
                                    tau,
                                    forward_proba,
                                    reply_proba,
                                    create_new_proba,
                                    taboo_topics=taboo_topics,
                                    accumulate_taboo=False)

    (n_noisy_interactions,
     _) = get_number_and_percentage(sum([1 for e in events for _ in e]),
                                    n_noisy_interactions,
                                    n_noisy_interactions_fraction)
    noisy_interactions = random_noisy_interactions(n_noisy_interactions,
                                                   min_time, max_time,
                                                   n_total_participants,
                                                   n_topics, topic_noise,
                                                   taboo_topics)

    event_interactions = [e.node[n] for e in events for n in e.nodes_iter()]
    minor_event_interactions = [
        e.node[n] for e in minor_events for n in e.nodes_iter()
    ]
    all_interactions = (event_interactions + minor_event_interactions +
                        noisy_interactions)

    # add interaction id
    for i, intr in enumerate(all_interactions):
        intr['message_id'] = i
        intr['topics'] = intr['topics'].tolist()

    # relabel the nodes
    relabeled_events = []
    for e in events:
        mapping = {n: e.node[n]['message_id'] for n in e.nodes_iter()}
        relabeled_events.append(nx.relabel_nodes(e, mapping))

    for e in events:
        e = IU.assign_edge_weights(e, dist_func)

    gen_cand_trees_params = [get_gen_cand_tree_params(e) for e in events]
    return relabeled_events, all_interactions, gen_cand_trees_params
different_weights = [
    {'topics': 0.2,
     'bow': 0.8},
    {'topics': 1.0},
    {'bow': 1.0},
]

for weights in different_weights:
    meta_graph_kws = {
        'distance_weights': weights,       
    }

    g = IU.get_topic_meta_graph(
        interactions,
        lda_model=lda_model,
        dictionary=dictionary,
        undirected=False,
        given_topics=False,
        decompose_interactions=False,
        dist_func=cosine,
        preprune_secs=timedelta(weeks=4).total_seconds(),
        apply_pagerank=False,
        **meta_graph_kws
    )
    
    print('weights: {}\n'.format(weights))

    out_degrees = g.out_degree(g.nodes())
    sorted_nodes = sorted(out_degrees,
                         key=lambda k: out_degrees[k],
                         reverse=True)
    print('\n'.join(map(lambda n: g.node[n]['subject'], sorted_nodes)[:10]))

    node = sorted_nodes[5]
different_weights = [
    {
        'topics': 0.2,
        'bow': 0.8
    },
    {
        'topics': 1.0
    },
    {
        'bow': 1.0
    },
]

for weights in different_weights:
    meta_graph_kws = {
        'distance_weights': weights,
    }

    g = IU.get_topic_meta_graph(
        interactions,
        lda_model=lda_model,
        dictionary=dictionary,
        undirected=False,
        given_topics=False,
        decompose_interactions=False,
        dist_func=cosine,
        preprune_secs=timedelta(weeks=4).total_seconds(),
        apply_pagerank=False,
        **meta_graph_kws)

    print('weights: {}\n'.format(weights))

    out_degrees = g.out_degree(g.nodes())
    sorted_nodes = sorted(out_degrees,
                          key=lambda k: out_degrees[k],
                          reverse=True)
    print('\n'.join(map(lambda n: g.node[n]['subject'], sorted_nodes)[:10]))

    node = sorted_nodes[5]
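The out-degree ranking at the end of each iteration only needs a dict of degrees; a standalone sketch that works on networkx 2.x/3.x, where out_degree() returns a view that can be wrapped in dict():

import networkx as nx

g = nx.DiGraph([('a', 'b'), ('a', 'c'), ('b', 'c')])
out_degrees = dict(g.out_degree())
sorted_nodes = sorted(out_degrees, key=lambda n: out_degrees[n], reverse=True)
print(sorted_nodes[:10])    # nodes ranked by out-degree, highest first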