def test_bs_ensure_result_is_tree(self):
    params = pkl.load(
        open(make_path('test/data/quota_test_cases/params.pkl'))
    )[0]
    root = params['roots'][0]
    preprune_secs = params['preprune_secs']
    mg = IU.get_topic_meta_graph_from_synthetic(
        make_path('test/data/quota_test_cases/interactions.json'),
        preprune_secs
    )
    dag = IU.get_rooted_subgraph_within_timespan(
        mg, root, preprune_secs
    )
    t = charikar_algo(dag, root, dag.nodes(), k=20, level=2)
    assert_true(nx.is_arborescence(t))
def __init__(self, g, B, timespan_secs):
    super(UBSampler, self).__init__(g, timespan_secs)
    non_leaf_roots = (n for n in g.nodes_iter() if g.out_degree(n) > 0)
    self.nodes_sorted_by_upperbound = sorted(
        non_leaf_roots,
        key=lambda r: quota_upperbound(
            IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs),
            r, B),
        reverse=True
    )
def main():
    K, M, C, H = 'K', 'M', 'C', 'H'
    interactions = [
        {'sender_id': K, 'recipient_ids': (M, C), 'datetime': 1,
         'message_id': 'K->(M, C): code(1)'},
        {'sender_id': M, 'recipient_ids': [K], 'datetime': 3,
         'message_id': 'M->K: read(3)'},
        {'sender_id': K, 'recipient_ids': [M], 'datetime': 4,
         'message_id': 'K->M: read(3)'},
        {'sender_id': C, 'recipient_ids': [H], 'datetime': 2,
         'message_id': 'C->H: eat(2)'},
        {'sender_id': H, 'recipient_ids': [C], 'datetime': 3,
         'message_id': 'H->C: eat(2)'},
    ]
    node_names, sources, targets, time_stamps = \
        InteractionsUtil.unzip_interactions(
            InteractionsUtil.decompose_interactions(interactions)
        )
    graph = convert_to_meta_graph(node_names, sources, targets, time_stamps)
    print(graph.edges())
def draw_example_meta_graph(output_path):
    _, _, ints = load_meta_graph_necessities()
    g = IU.get_meta_graph(ints, decompose_interactions=True)
    plt.clf()
    pos = nx.spring_layout(g)
    nx.draw_networkx_nodes(g, pos=pos)
    nx.draw_networkx_edges(g, pos=pos)
    nx.draw_networkx_labels(g, pos=pos,
                            labels=dict(zip(g.nodes(), g.nodes())),
                            font_size=8)
    plt.savefig(output_path)
def frequent_terms(self, interactions, top_k=10):
    id2msg = {}
    for m in interactions:
        id2msg[m['message_id']] = u"{} {}".format(m['subject'], m['body'])

    # count token frequencies over the concatenated messages
    message_ids = [self.g.node[n]['message_id'] for n in self.g.nodes()]
    concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
    tokens = IU.tokenize_document(concated_msg)
    freqs = Counter(tokens)
    terms = [t for t, _ in freqs.most_common(top_k)]
    print('frequent_terms: {}'.format(terms))
    return terms
def extract_event_context(interactions, event_tree, undirected=False):
    span = MetaGraphStat(event_tree).time_span()
    start = span['start_time']
    end = span['end_time']

    # keep only the interactions that fall inside the event's time span
    filtered_interactions = []
    for i in interactions:
        assert 'datetime' in i
        dt = i['datetime']
        assert isinstance(dt, datetime)
        if start <= dt <= end:
            filtered_interactions.append(i)

    context_dag = IU.get_meta_graph(filtered_interactions,
                                    decompose_interactions=False,
                                    remove_singleton=True,
                                    undirected=undirected)
    return context_dag
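# A minimal, self-contained sketch (not part of the pipeline) of the time
# windowing used in extract_event_context above: only interactions whose
# datetime falls inside the event's [start, end] span survive. The toy
# timestamps are made up for illustration.
def _demo_time_window():
    from datetime import datetime
    interactions = [{'datetime': datetime(2001, 5, 1)},
                    {'datetime': datetime(2001, 6, 1)}]
    start, end = datetime(2001, 4, 15), datetime(2001, 5, 15)
    kept = [i for i in interactions if start <= i['datetime'] <= end]
    assert len(kept) == 1  # only the May 1st interaction is in the span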
def _topic_divergence(self, msg_ids, id2msg, dictionary, lda):
    raw_topics = [
        lda.get_document_topics(
            dictionary.doc2bow(IU.tokenize_document(id2msg[id_])),
            minimum_probability=0
        )
        for id_ in msg_ids
    ]
    topic_vects = np.array([[v for _, v in topics]
                            for topics in raw_topics])
    mean_topic_vect = np.mean(topic_vects, axis=0)
    diffs = [scipy.stats.entropy(mean_topic_vect, v) for v in topic_vects]
    return np.mean(diffs)
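# Self-contained sketch of what _topic_divergence measures: the mean KL
# divergence from the averaged topic vector to each message's topic vector.
# The toy vectors are made up; a larger value means the messages are
# topically more diverse.
def _demo_topic_divergence():
    import numpy as np
    import scipy.stats
    topic_vects = np.array([[0.7, 0.2, 0.1],
                            [0.6, 0.3, 0.1],
                            [0.1, 0.1, 0.8]])
    mean_topic_vect = np.mean(topic_vects, axis=0)
    diffs = [scipy.stats.entropy(mean_topic_vect, v) for v in topic_vects]
    print(np.mean(diffs))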
def tfidf_terms(self, interactions, dictionary, top_k=10):
    text = '\n'.join(['{} {}'.format(m['subject'], m['body'])
                      for m in interactions])
    tfidf_vec = pkl.load(open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl'))
    counts = dictionary.doc2bow(IU.tokenize_document(text))
    raw_vect = np.zeros(len(dictionary.keys()))
    for word, cnt in counts:
        raw_vect[word] = cnt

    vect = tfidf_vec.transform([raw_vect])
    vect = np.asarray(vect.todense()).flatten()

    tfidf_terms = [dictionary[i] for i in np.argsort(vect)[::-1][:top_k]]
    print('tfidf_terms: {}'.format(tfidf_terms))
    return tfidf_terms
def test_gen_event_with_known_tree_structure():
    event_size = 100
    participants_n = 10
    event = gen_event_with_known_tree_structure(
        event_size=event_size,
        participants=range(participants_n),
        start_time=10, end_time=110,
        event_topic_param=random_topic(10, topic_noise=0.0001)[0],
        topic_noise=1,
        alpha=1.0, tau=0.8,
        forward_proba=0.3,
        reply_proba=0.5,
        create_new_proba=0.2
    )
    # no self-interaction
    for n in event.nodes_iter():
        sid, rid = event.node[n]['sender_id'], event.node[n]['recipient_ids'][0]
        assert_true(sid != rid)

    for s, t in event.edges_iter():
        sid1, rid1 = event.node[s]['sender_id'], event.node[s]['recipient_ids'][0]
        sid2, rid2 = event.node[t]['sender_id'], event.node[t]['recipient_ids'][0]
        c_type = event[s][t]['c_type']
        if c_type == 'r':  # reply: sender and recipient are swapped
            assert_equal(sid1, rid2)
            assert_equal(sid2, rid1)
        elif c_type == 'f':  # forward: the recipient passes it on
            assert_equal(rid1, sid2)
            assert_true(rid2 != sid1)
        else:  # new message by the same sender
            assert_equal(sid1, sid2)

    interactions = [event.node[n] for n in event.nodes_iter()]
    g = IU.get_meta_graph(
        interactions,
        decompose_interactions=False,
        remove_singleton=True,
        given_topics=True,
        convert_time=False
    )
    assert_equal(1, len(get_roots(g)))
    assert_equal(event_size, len(interactions))
    assert_true(nx.is_arborescence(event))
def build_default_summary_kws(interactions, people_info,
                              dictionary, lda, people_repr_template,
                              undirected=False):
    interactions = IU.clean_interactions(interactions,
                                         undirected=undirected)
    summary_kws = {
        'basic_structure_stats': {},
        'time_span': {},  # deprecated
        'topics': {
            'interactions': interactions,
            'dictionary': dictionary,
            'lda': lda,
            'top_k': 10
        },
        'email_content': {
            'interactions': interactions,
            'top_k': 5
        },
        'participants': {
            'people_info': people_info,
            'interactions': interactions,
            'top_k': 5,
            'people_repr_template': people_repr_template,
            'undirected': undirected
        },
        'link_type_freq': {
            'interactions': interactions,
            'undirected': undirected
        },
        # 'frequent_terms': {
        #     'interactions': interactions,
        #     'top_k': 10
        # },
        # 'tfidf_terms': {
        #     'interactions': interactions,
        #     'dictionary': dictionary,
        #     'top_k': 10
        # }
    }
    return summary_kws
def topics(self, interactions, dictionary, lda, top_k=10):
    id2msg = {}
    for m in interactions:
        id2msg[m['message_id']] = u"{} {}".format(m['subject'], m['body'])

    # infer the topic distribution of the concatenated messages
    message_ids = [self.g.node[n]['message_id'] for n in self.g.nodes()]
    concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
    bow = dictionary.doc2bow(IU.tokenize_document(concated_msg))
    # called via __getitem__ so that `iterations` can be passed (LdaMallet)
    topic_dist = lda.__getitem__(bow, iterations=100)
    print("topic inference done")
    topic_dist = np.asarray([v for _, v in topic_dist])
    # mask out trivial topics
    topic_dist[topic_dist < 0.05] = 0

    # topic_terms: normalize each topic's word distribution and
    # weight it by the inferred topic distribution
    if not hasattr(lda, 'wordtopics'):
        lda.load_word_topics()
    beta = lda.wordtopics
    weighted_terms = (beta / beta.sum(axis=1)[:, None]
                      * topic_dist[:, None]).sum(axis=0)
    bestn = np.argsort(weighted_terms)[::-1][:top_k]
    topic_terms = [lda.id2word[id] for id in bestn]

    top_topics = np.nonzero(topic_dist)
    print('top_topics: {}'.format(top_topics))
    return {
        'topic_terms': topic_terms,
        'top_topics': top_topics
    }
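# Toy illustration (not part of the class) of the beta weighting in topics()
# above: each topic's word distribution is normalized, weighted by the
# inferred topic proportions, and summed, so the top-scoring word ids
# reflect the dominant topics. The counts below are made up.
def _demo_weighted_terms():
    import numpy as np
    beta = np.array([[5., 1., 1.],   # word counts for topic 0
                     [1., 1., 5.]])  # word counts for topic 1
    topic_dist = np.array([0.8, 0.2])
    weighted_terms = (beta / beta.sum(axis=1)[:, None]
                      * topic_dist[:, None]).sum(axis=0)
    print(np.argsort(weighted_terms)[::-1])  # word 0 ranks first here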
def __init__(self, g, B, timespan_secs, node_score_func=log_x_density):
    super(AdaptiveSampler, self).__init__(g, timespan_secs)
    non_leaf_roots = [n for n in g.nodes_iter() if g.out_degree(n) > 0]
    print("AdaptiveSampler: #roots to explore {}".format(
        len(non_leaf_roots)))

    print("AdaptiveSampler: getting upperbounds...")
    upperbounds = map(
        lambda r: quota_upperbound(
            IU.get_rooted_subgraph_within_timespan(g, r, timespan_secs),
            r, B),
        non_leaf_roots)

    print("AdaptiveSampler: sorting the roots by upperbound...")
    inds = np.argsort(np.asarray(upperbounds))[::-1]  # descending order
    self.roots_sorted_by_upperbound = [non_leaf_roots[i] for i in inds]
    self.root2upperbound = {r: u
                            for r, u in zip(non_leaf_roots, upperbounds)}

    self.node_score_func = node_score_func

    # updated at each iteration:
    # nodes that are partially/fully computed, excluding leaves
    self.covered_nodes = set()

    self.n_nodes_to_cover = len(self.roots_sorted_by_upperbound)
    self.node2score = {}
def run_with_context(interactions_path,
                     candidate_tree_path,
                     dirname=None,
                     to_original_graph=False,
                     undirected=False):
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    try:
        interactions = json.load(open(interactions_path))
    except ValueError as e:
        print(e)
        interactions = load_json_by_line(interactions_path)

    interactions = IU.clean_interactions(interactions,
                                         undirected=undirected)

    output_path = get_output_path(candidate_tree_path, dirname)

    K = 5
    events = detect_events_given_path(candidate_tree_path, K)

    contexted_events = []
    for e in events:
        context_dag = extract_event_context(
            interactions, e, undirected=undirected
        )
        if to_original_graph:
            context_dag = convert_to_original_graph(context_dag)
            e = convert_to_original_graph(e)

        contexted_events.append(
            add_subgraph_specific_attributes_to_graph(
                context_dag, [(e, {'event': True})])
        )
    d3_events = [to_d3_graph(ce) for ce in contexted_events]

    print('writing to {}'.format(output_path))
    json_dump(d3_events, output_path)
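# Hypothetical invocation of run_with_context (the paths below are made up
# for illustration; actual candidate-tree pickles are produced by run()
# further down):
#
#     run_with_context('data/enron.json',
#                      'tmp/results--....pkl',
#                      dirname='tmp/events',
#                      to_original_graph=True)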
def test_get_gen_cand_tree_params():
    event_size = 100
    participants_n = 10
    event = gen_event_with_known_tree_structure(
        event_size=event_size,
        participants=range(participants_n),
        start_time=10, end_time=110,
        event_topic_param=random_topic(10, topic_noise=0.1)[0],
        topic_noise=1,
        alpha=1.0, tau=0.8,
        forward_proba=0.3,
        reply_proba=0.5,
        create_new_proba=0.2
    )
    event = IU.assign_edge_weights(event, cosine)
    params = get_gen_cand_tree_params(event)

    assert_true(params['U'] > 0)
    assert_equal(99, params['preprune_secs'])
    assert_equal([0], params['roots'])
def test_make_artificial_data(self):
    events, all_interactions, params = make_artificial_data(**self.params)
    assert_equal(self.params['n_events'], len(params))
    assert_equal(self.params['n_events'], len(events))
    assert_equal(
        self.params['event_size_mu'] * self.params['n_events']
        + self.params['n_noisy_interactions'],
        len(all_interactions)
    )
    for i in all_interactions:
        assert_true('message_id' in i)
        # make sure it's jsonable
        assert_true(isinstance(i['topics'], list))

    # all ids are unique
    all_ids = list(itertools.chain(*[e.nodes() for e in events]))
    assert_equal(len(all_ids), len(set(all_ids)))

    for e in events:
        # make sure nodes are relabeled
        for n in e.nodes_iter():
            assert_equal(n, e.node[n]['message_id'])
        interactions = [e.node[n] for n in e.nodes_iter()]
        assert_equal(len(interactions),
                     IU.get_meta_graph(
                         interactions,
                         decompose_interactions=False,
                         remove_singleton=True,
                         given_topics=True).number_of_nodes())
        for i in interactions:
            assert_true(isinstance(i['topics'], list))

    for i in all_interactions:
        assert_true(i['sender_id'].startswith('u-'))
def sample_rooted_binary_graphs_within_timespan(meta_graph_pickle_path,
                                                sample_number,
                                                timespan,
                                                output_path):
    g = nx.read_gpickle(meta_graph_pickle_path)
    roots = sample_nodes(g, sample_number)

    results = []
    for i, r in enumerate(roots):
        print('done: {}'.format(i))
        sub_g = InteractionsUtil.get_rooted_subgraph_within_timespan(
            g, r, timespan
        )
        binary_sub_g = binarize_dag(sub_g,
                                    InteractionsUtil.VERTEX_REWARD_KEY,
                                    InteractionsUtil.EDGE_COST_KEY,
                                    dummy_node_name_prefix="d_")
        if len(binary_sub_g.edges()) > 0:
            results.append(binary_sub_g)

    pkl.dump(results, open(output_path, 'w'))
def main1():
    C, P, T1, T2 = ('CEO', 'PM', 'T1', 'T2')
    p = 'progress'
    s = 'suggestion'
    f = 'football'
    correct_edge_to_color = {
        ('a', 'b'): 'red',
        ('b', 'c'): 'red',
        ('c', 'd'): 'red',
        ('e', 'f'): 'green'
    }
    interactions = [
        ('a', C, [P], p, 1),
        ('b', P, [T1, T2], p, 2),
        ('c', T1, [P], p, 3),
        ('d', P, [C], p, 4),
        ('e', T2, [P], s, 3),
        ('f', P, [C], p, 5),
        ('g', T2, [T1], f, 4)
    ]
    new_interactions = []
    for msg_id, sender, recs, topic, time in interactions:
        new_interactions.append(
            {'sender_id': sender,
             'recipient_ids': recs,
             'datetime': time,
             'message_id': msg_id}
        )
    node_names, sources, targets, time_stamps = \
        InteractionsUtil.unzip_interactions(new_interactions)
    graph = convert_to_meta_graph(node_names, sources, targets, time_stamps)
    # nx.write_dot(graph, 'tmp/illustration.dot')
    print("""digraph {
    node [fontsize=20];
    """)
    for u, v in graph.edges():
        print("{} -> {}[color={}];".format(
            u, v,
            # correct_edge_to_color.get((u, v), 'gray')
            'black'
        ))
    print("}")

    df = pd.DataFrame(new_interactions,
                      columns=['sender_id', 'recipient_ids', 'datetime'],
                      index=[i[0] for i in interactions])
    df = df.rename(columns={'sender_id': 'sender',
                            'recipient_ids': 'recipients',
                            'datetime': 'time'})
    mapping = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri'}
    df['time'] = df['time'].map(lambda t: mapping[t])
    df.to_latex('tmp/example.tex')
def setUp(self):
    self.interactions = IU.clean_interactions(
        json_load(make_path('test/data/enron_test.json'))
    )
def run(gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {'topics': 0.2, 'bow': 0.8},
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']

    if interaction_path.endswith(".json"):
        try:
            interactions = json.load(open(interaction_path))
        except ValueError:
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        interactions = pickle.load(open(interaction_path))
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path))
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path))
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix,
        experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix)
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]
        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = meta_graph_kws[
                'preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied)

        logger.info('pickling...')
        nx.write_gpickle(IU.compactize_meta_graph(g, map_nodes=False),
                         meta_graph_pkl_path)
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'
    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(),
            cand_tree_number,
            cand_tree_percent)
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(
        cand_tree_number / float(g.number_of_nodes())))

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warn('not enough root to take, terminate')
            break
        dags.append(dag)

        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        tree.graph['calculation_time'] = (
            datetime.now() - start).total_seconds()

        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        return "{}--{}----{}----{}{}.pkl".format(
            prefix,
            experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(cand_tree_percent=cand_tree_percent,
                                 root_sampling=root_sampling_method),
            suffix)

    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)
    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    pickle.dump(trees,
                open(result_pkl_path, 'w'),
                protocol=pickle.HIGHEST_PROTOCOL)
    if False:  # for debugging purpose
        pickle.dump(dags,
                    open(result_pkl_path + '.dag', 'w'),
                    protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {
        'interactions': interaction_path,
        'meta_graph': meta_graph_pkl_path,
        'result': result_pkl_path,
        'true_events': true_events_path,
        'self': all_paths_pkl_path
    }
    pickle.dump(paths_dict, open(all_paths_pkl_path, 'w'))
    return paths_dict
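# Hypothetical end-to-end call of run() (arguments are made up for
# illustration; `my_gen_tree_func` stands in for whichever candidate-tree
# construction function this repo passes in):
#
#     run(my_gen_tree_func,
#         'data/msg_ids.txt',
#         root_sampling_method='upperbound',
#         cand_tree_number=100)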
def root_and_dag(self, r):
    return r, IU.get_rooted_subgraph_within_timespan(
        self.g, r, self.timespan_secs
    )
def random_events(n_events, event_size_mu, event_size_sigma,
                  n_total_participants, participant_mu, participant_sigma,
                  min_time, max_time,
                  event_duration_mu, event_duration_sigma,
                  n_topics, topic_scaling_factor, topic_noise,
                  alpha, tau,
                  forward_proba, reply_proba, create_new_proba,
                  taboo_topics=set(), accumulate_taboo=False):
    # add main events
    events = []
    taboo_topics = set(taboo_topics)
    for i in xrange(n_events):
        # randomly select a topic and add some noise to it
        event_topic_param, topic_id = random_topic(n_topics,
                                                   topic_noise,
                                                   taboo_topics)
        if accumulate_taboo:
            taboo_topics.add(topic_id)

        print('event_topic_param: {}'.format(event_topic_param))

        # draw a positive event size
        event_size = 0
        while event_size <= 0:
            event_size = int(
                round(np.random.normal(event_size_mu, event_size_sigma)))
        assert event_size > 0

        # randomly select participants (at least 3)
        n_participants = 0
        while n_participants <= 2:
            n_participants = int(
                round(np.random.normal(participant_mu, participant_sigma)))
        assert n_participants > 2

        participants = np.random.permutation(
            n_total_participants)[:n_participants]
        print('participants: {}'.format(participants))

        # event timespan
        start_time = np.random.uniform(min_time,
                                       max_time - event_duration_mu)
        end_time = start_time + np.random.normal(event_duration_mu,
                                                 event_duration_sigma)
        if end_time > max_time:
            end_time = max_time

        event = gen_event_with_known_tree_structure(event_size,
                                                    participants,
                                                    start_time, end_time,
                                                    event_topic_param,
                                                    topic_noise,
                                                    alpha, tau,
                                                    forward_proba,
                                                    reply_proba,
                                                    create_new_proba)
        # sanity checks on the induced meta graph
        g = IU.get_meta_graph([event.node[n] for n in event.nodes_iter()],
                              decompose_interactions=False,
                              remove_singleton=True,
                              given_topics=True,
                              convert_time=False)
        n_interactions_in_mg = g.number_of_nodes()
        if n_interactions_in_mg == len(event):
            roots = [n for n, d in g.in_degree(g.nodes_iter()).items()
                     if d == 0]
            if len(roots) > 1:
                print(roots)
                for r in roots:
                    print(event[r])
                print("WARNING: roots number {}".format(len(roots)))
                raise ValueError('event has more than one root')
        else:
            print('invalid meta graph. {} < {}'.format(
                n_interactions_in_mg, len(event)))
            raise ValueError('invalid meta graph')

        events.append(event)
    return events, taboo_topics
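# Sketch of the rejection loop used in random_events above: redraw from a
# normal distribution until the rounded value is positive, i.e. a crude
# truncated normal. The mu/sigma defaults are toy numbers.
def _demo_positive_size(mu=20.0, sigma=5.0):
    import numpy as np
    size = 0
    while size <= 0:
        size = int(round(np.random.normal(mu, sigma)))
    return size  # always >= 1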
def make_artificial_data(
        # for main events
        n_events, event_size_mu, event_size_sigma,
        participant_mu, participant_sigma,
        # for minor events
        n_minor_events,
        minor_event_size_mu, minor_event_size_sigma,
        minor_event_participant_mu, minor_event_participant_sigma,
        # shared
        n_total_participants,
        min_time, max_time,
        event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        n_noisy_interactions, n_noisy_interactions_fraction,
        alpha, tau,
        forward_proba, reply_proba, create_new_proba,
        dist_func):
    events, taboo_topics = random_events(
        n_events, event_size_mu, event_size_sigma,
        n_total_participants, participant_mu, participant_sigma,
        min_time, max_time,
        event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        alpha, tau,
        forward_proba, reply_proba, create_new_proba,
        accumulate_taboo=True)
    minor_events, _ = random_events(
        n_minor_events, minor_event_size_mu, minor_event_size_sigma,
        n_total_participants,
        minor_event_participant_mu, minor_event_participant_sigma,
        min_time, max_time,
        event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        alpha, tau,
        forward_proba, reply_proba, create_new_proba,
        taboo_topics=taboo_topics,
        accumulate_taboo=False)

    (n_noisy_interactions, _) = get_number_and_percentage(
        sum([1 for e in events for _ in e]),
        n_noisy_interactions,
        n_noisy_interactions_fraction)
    noisy_interactions = random_noisy_interactions(
        n_noisy_interactions,
        min_time, max_time,
        n_total_participants,
        n_topics, topic_noise,
        taboo_topics)

    event_interactions = [e.node[n]
                          for e in events
                          for n in e.nodes_iter()]
    minor_event_interactions = [e.node[n]
                                for e in minor_events
                                for n in e.nodes_iter()]
    all_interactions = (event_interactions
                        + minor_event_interactions
                        + noisy_interactions)

    # add interaction id
    for i, intr in enumerate(all_interactions):
        intr['message_id'] = i
        intr['topics'] = intr['topics'].tolist()

    # relabel the nodes by message id
    relabeled_events = []
    for e in events:
        mapping = {n: e.node[n]['message_id'] for n in e.nodes_iter()}
        relabeled_events.append(nx.relabel_nodes(e, mapping))

    for e in events:
        e = IU.assign_edge_weights(e, dist_func)

    gen_cand_trees_params = [get_gen_cand_tree_params(e) for e in events]

    return relabeled_events, all_interactions, gen_cand_trees_params
different_weights = [
    {'topics': 0.2, 'bow': 0.8},
    {'topics': 1.0},
    {'bow': 1.0},
]
for weights in different_weights:
    meta_graph_kws = {
        'distance_weights': weights,
    }
    g = IU.get_topic_meta_graph(
        interactions,
        lda_model=lda_model,
        dictionary=dictionary,
        undirected=False,
        given_topics=False,
        decompose_interactions=False,
        dist_func=cosine,
        preprune_secs=timedelta(weeks=4).total_seconds(),
        apply_pagerank=False,
        **meta_graph_kws
    )
    print('weights: {}\n'.format(weights))
    out_degrees = g.out_degree(g.nodes())
    sorted_nodes = sorted(out_degrees,
                          key=lambda k: out_degrees[k],
                          reverse=True)
    print('\n'.join(map(lambda n: g.node[n]['subject'], sorted_nodes)[:10]))
    node = sorted_nodes[5]
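# Toy check (assuming the networkx 1.x API used throughout this repo, where
# out_degree(nodes) returns a dict) of the out-degree ranking above:
def _demo_out_degree_ranking():
    import networkx as nx
    g = nx.DiGraph([('a', 'b'), ('a', 'c'), ('b', 'c')])
    out_degrees = g.out_degree(g.nodes())
    sorted_nodes = sorted(out_degrees,
                          key=lambda k: out_degrees[k],
                          reverse=True)
    print(sorted_nodes)  # ['a', 'b', 'c']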