def main():
    """CLI: dump event / non-event message timestamps as a JSON series list.

    Loads a pickled result, keeps the ``k`` best trees, and writes one
    record per message: ``{'series': 'event-<i>' | 'non-event',
    'datetime': <ISO-8601 string>}`` to ``--output_path``.

    Relies on module-level names: pkl, pd, json, dt, k_best_trees.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path')
    parser.add_argument('--interactions_path')
    parser.add_argument('--output_path')
    parser.add_argument('--non_event_sample_n', type=int)
    parser.add_argument('--k', type=int)
    args = parser.parse_args()

    # 'rb' + context manager: pickle requires a binary-mode file on
    # Python 3, and the original leaked the file handle.
    with open(args.result_path, 'rb') as f:
        result = pkl.load(f)
    trees = k_best_trees(result, args.k)

    df = pd.read_json(args.interactions_path)

    dt_format = '%Y-%m-%dT%H:%M:%S.000Z'
    data = []
    event_nodes = set()
    for i, t in enumerate(trees):
        for n in t.nodes_iter():
            event_nodes.add(n)
            data.append({
                'series': 'event-{}'.format(i + 1),
                'datetime': t.node[n]['datetime'].strftime(dt_format)
            })

    # for enron:
    df = df[df['datetime'] > dt(2000, 6, 1)]
    # for ukraine:
    # df = df[df['datetime'] > dt(2015, 2, 26)]
    # for baltimore
    # df = df[df['datetime'] >= dt(2015, 4, 27)]

    if args.non_event_sample_n:
        print(df.shape)
        df = df[df['message_id'].map(lambda m: m not in event_nodes)]
        df = df.sample(n=args.non_event_sample_n)
        print(df.shape)

    for _, r in df.iterrows():
        # drop messages already attributed to an event tree
        if r['message_id'] not in event_nodes:
            data.append({
                'series': 'non-event',
                'datetime': r['datetime'].strftime(dt_format)
            })

    with open(args.output_path, 'w') as f:
        json.dump(data, f)
def main():
    """CLI: dump event / non-event message timestamps as a JSON series list.

    Same pipeline as the sibling script: load pickled candidate trees,
    keep the ``k`` best, emit ``{'series', 'datetime'}`` records.

    Relies on module-level names: pkl, pd, json, dt, k_best_trees.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path')
    parser.add_argument('--interactions_path')
    parser.add_argument('--output_path')
    parser.add_argument('--non_event_sample_n', type=int)
    parser.add_argument('--k', type=int)
    args = parser.parse_args()

    # binary mode for pickle (required on Python 3, harmless on 2);
    # context managers close the handles the original leaked.
    with open(args.result_path, 'rb') as f:
        result = pkl.load(f)
    trees = k_best_trees(result, args.k)

    df = pd.read_json(args.interactions_path)

    dt_format = '%Y-%m-%dT%H:%M:%S.000Z'
    data = []
    event_nodes = set()
    for i, t in enumerate(trees):
        for n in t.nodes_iter():
            event_nodes.add(n)
            data.append({
                'series': 'event-{}'.format(i + 1),
                'datetime': t.node[n]['datetime'].strftime(dt_format)
            })

    # for enron:
    df = df[df['datetime'] > dt(2000, 6, 1)]
    # for ukraine:
    # df = df[df['datetime'] > dt(2015, 2, 26)]
    # for baltimore
    # df = df[df['datetime'] >= dt(2015, 4, 27)]

    if args.non_event_sample_n:
        print(df.shape)
        df = df[df['message_id'].map(lambda m: m not in event_nodes)]
        df = df.sample(n=args.non_event_sample_n)
        print(df.shape)

    for _, r in df.iterrows():
        # skip messages already attributed to an event tree
        if r['message_id'] not in event_nodes:
            data.append({
                'series': 'non-event',
                'datetime': r['datetime'].strftime(dt_format)
            })

    with open(args.output_path, 'w') as f:
        json.dump(data, f)
def main():
    """CLI: dump per-event message counts bucketed at ``--freq`` resolution.

    Loads pickled candidate trees, keeps the ``k`` best, and for each
    event (plus one 'non-event' residual series) emits
    ``{'key': ..., 'values': [{'ts': <epoch ms>, 'c': <count>}, ...]}``
    aligned on the full timestamp index of the whole interaction set.

    Relies on module-level names: pkl, pd, json, k_best_trees.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path')
    parser.add_argument('--interactions_path')
    parser.add_argument('--output_path')
    parser.add_argument('--non_event_sample_n', type=int)
    parser.add_argument('--freq')
    parser.add_argument('--k', type=int)
    args = parser.parse_args()

    # binary mode for pickle (required on Python 3); 'with' closes the
    # handle the original leaked.
    with open(args.result_path, 'rb') as f:
        result = pkl.load(f)
    trees = k_best_trees(result, args.k)

    try:
        df = pd.read_json(args.interactions_path)
    except ValueError:
        # not JSON — fall back to a pickled DataFrame
        df = pd.read_pickle(args.interactions_path)

    # for enron:
    # df = df[df['datetime'] > dt(2000, 6, 1)]

    # shared time axis: every series is reindexed onto these buckets
    timestamps = df.groupby(pd.Grouper(
        key='datetime', freq=args.freq))['message_id'].count().index

    def values(counts):
        """Align a bucket-count Series onto the shared axis (0-filled).

        ``Timestamp.value`` is in nanoseconds; ``// 1000000`` yields
        integer epoch milliseconds (matches the Python 2 ``/`` the
        original relied on, and avoids a float on Python 3).
        """
        return [{'ts': ts.value // 1000000,
                 'c': counts[ts] if ts in counts else 0}
                for ts in timestamps]

    data = []
    event_nodes = set()
    for i, t in enumerate(trees):
        nids = set(t.nodes())
        event_df = df[df['message_id'].apply(lambda m: m in nids)]
        groups = event_df.groupby(pd.Grouper(key='datetime',
                                             freq=args.freq))
        counts = groups['message_id'].count()
        data.append({
            'key': 'event-{}'.format(i + 1),
            'values': values(counts)
        })
        event_nodes |= nids

    # residual series: everything not claimed by any event tree
    df = df[df['message_id'].map(lambda m: m not in event_nodes)]
    if args.non_event_sample_n:
        df = df.sample(n=args.non_event_sample_n)

    counts = df.groupby(pd.Grouper(key='datetime',
                                   freq=args.freq))['message_id'].count()
    data.append({'key': 'non-event', 'values': values(counts)})

    with open(args.output_path, 'w') as f:
        json.dump(data, f)
def run(cand_trees, k, summary_kws, undirected):
    """Build a timeline payload for the ``k`` best event trees.

    Returns a dict with:
      - 'items':  one entry per message plus one 'background' entry per
        event, each carrying a sequential integer id;
      - 'groups': per-event metadata (topic terms, participants, span);
      - 'start'/'end': formatted overall time span across all events.

    ``undirected`` is accepted for interface compatibility; it is not
    read in this function.
    """
    msg_by_id = {row['message_id']: row
                 for row in summary_kws['topics']['interactions']}
    trees = k_best_trees(cand_trees, k)
    summaries = [MetaGraphStat(tree, summary_kws).summary_dict()
                 for tree in trees]

    timeline_items = []
    group_infos = []
    span_starts = []
    span_ends = []
    seen_ids = Counter()
    next_id = 0

    # group ids are 1-based
    for group_id, (summ, tree) in enumerate(zip(summaries, trees), 1):
        # one timeline item per message in this event tree
        for mid in tree.nodes_iter():
            next_id += 1
            msg = msg_by_id[mid]
            label = (msg['subject'].strip()
                     if msg['subject'] else msg['body'])
            timeline_items.append({
                'id': next_id,
                'content': label,
                'start': format_time(msg['datetime']),
                'group': group_id
            })
            seen_ids[mid] += 1

        span = summ['time_span']

        # background item spanning the whole event
        next_id += 1
        timeline_items.append({
            'id': next_id,
            'start': format_time(span['start_time']),
            'end': format_time(span['end_time']),
            'content': 'Event {}'.format(group_id),
            'group': group_id,
            'type': 'background'
        })

        info = {
            'id': group_id,
            'terms': summ['topics']['topic_terms'],
            'participants': dict(
                summ['participants']['participant_count']),
            'start': format_time(span['start_time']),
            'end': format_time(span['end_time']),
            'days': (span['end_time'] - span['start_time']).days,
            'link_type_freq': summ['link_type_freq']
        }
        if 'hashtags' in summ:
            info['hashtags'] = summ['hashtags']
        group_infos.append(info)

        span_starts.append(span['start_time'])
        span_ends.append(span['end_time'])

    return {
        'items': timeline_items,
        'groups': group_infos,
        'start': format_time(min(span_starts)),
        'end': format_time(max(span_ends))
    }
def eval_setcover_obj_func(cand_trees, k):
    """Set-cover objective: count distinct nodes covered by the k best trees."""
    covered = set()
    for tree in k_best_trees(cand_trees, k):
        covered.update(tree.nodes_iter())
    return len(covered)
recipients = mg.node[n]['recipient_ids'] people_network.add_edges_from([(sender, r) for r in recipients]) print('people_network.number_of_nodes():', people_network.number_of_nodes()) colors = ['red', 'blue', 'green', 'yellow', 'black', 'orange'] colors = defaultdict(lambda: 'gray') for c, t in zip(colors, trees): for n in t.nodes_iter(): colors[n] = c pos = nx.spring_layout(people_network) nx.draw_networkx(people_network, pos) nx.draw_networkx_nodes(people_network, pos, node_color=colors) plt.savefig(output_path) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('-m', '--meta_graph_path') parser.add_argument('-r', '--result_path') parser.add_argument('-k', '--k', type=int) parser.add_argument('-o', '--output_path') args = parser.parse_args() draw(nx.read_gpickle(args.meta_graph_path), k_best_trees(pkl.load(open(args.result_path)), args.k), args.output_path)
def get_meta_data_of_f1(acc_trees, true_trees, k):
    """Return the raw counts behind an F1 score over tree nodes.

    Returns ``(correct, n_pred, n_true)`` where ``correct`` is the number
    of nodes shared by the predicted and true trees. ``correct`` is cast
    to float so callers can divide safely under Python 2 semantics.
    """
    pred_trees = k_best_trees(acc_trees, k)
    # set comprehensions instead of set([listcomp]): no throwaway list
    pred_nodes = {n for t in pred_trees for n in t.nodes_iter()}
    true_nodes = {n for t in true_trees for n in t.nodes_iter()}
    correct_nodes = pred_nodes & true_nodes
    return float(len(correct_nodes)), len(pred_nodes), len(true_nodes)
def k_max_setcover(acc_trees, true_trees, k):
    """Return the number of distinct nodes covered by the k best trees.

    ``true_trees`` is not read here; it is kept for signature parity
    with the other evaluation helpers.
    """
    trees = k_best_trees(acc_trees, k)
    # set comprehension instead of set([listcomp]): no throwaway list
    return len({n for t in trees for n in t.nodes_iter()})
sender = mg.node[n]['sender_id'] recipients = mg.node[n]['recipient_ids'] people_network.add_edges_from([(sender, r) for r in recipients]) print('people_network.number_of_nodes():', people_network.number_of_nodes()) colors = ['red', 'blue', 'green', 'yellow', 'black', 'orange'] colors = defaultdict(lambda: 'gray') for c, t in zip(colors, trees): for n in t.nodes_iter(): colors[n] = c pos = nx.spring_layout(people_network) nx.draw_networkx(people_network, pos) nx.draw_networkx_nodes(people_network, pos, node_color=colors) plt.savefig(output_path) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('-m', '--meta_graph_path') parser.add_argument('-r', '--result_path') parser.add_argument('-k', '--k', type=int) parser.add_argument('-o', '--output_path') args = parser.parse_args() draw(nx.read_gpickle(args.meta_graph_path), k_best_trees(pkl.load(open(args.result_path)), args.k), args.output_path)
def run(cand_trees, k, summary_kws, undirected):
    """Build a timeline payload for the ``k`` best event trees.

    Returns a dict with 'items' (one entry per message plus one
    'background' entry per event), 'groups' (per-event metadata) and
    the overall formatted 'start'/'end' across all events.

    ``undirected`` is accepted but not read in this function — presumably
    kept for interface parity with a caller; TODO confirm.
    """
    interactions = summary_kws['topics']['interactions']
    # message_id -> interaction record, for O(1) lookup per node
    mid2i = {
        i['message_id']: i
        for i in interactions
    }
    trees = k_best_trees(cand_trees, k)
    summaries = [MetaGraphStat(t, summary_kws).summary_dict()
                 for t in trees]
    items = []
    groups = []
    start_times = []
    end_times = []
    # counts how often each message id was added; built but not
    # returned — NOTE(review): looks like leftover debugging state
    added_id_count = Counter()
    # sequential integer id shared by message and background items
    counter = 0
    for group_id, (summ, t) in enumerate(zip(summaries, trees)):
        # make group ids 1-based
        group_id += 1
        for i in t.nodes_iter():
            counter += 1
            items.append({
                'id': counter,
                # prefer the (stripped) subject; fall back to the body
                'content': (mid2i[i]['subject'].strip()
                            if mid2i[i]['subject']
                            else mid2i[i]['body']),
                'start': format_time(mid2i[i]['datetime']),
                'group': group_id
            })
            added_id_count[i] += 1
        # background item spanning the whole event's time range
        counter += 1
        items.append(
            {
                'id': counter,
                # 'id': 'event_{}'.format(group_id),
                'start': format_time(summ['time_span']['start_time']),
                'end': format_time(summ['time_span']['end_time']),
                'content': 'Event {}'.format(group_id),
                'group': group_id,
                'type': 'background'
            })
        # per-event group metadata
        g = {
            'id': group_id,
            'terms': summ['topics']['topic_terms'],
            # 'terms': summ['frequent_terms'],
            # 'terms': summ['tdidf_terms'],
            'participants': dict(
                summ['participants']['participant_count']
            ),
            'start': format_time(summ['time_span']['start_time']),
            'end': format_time(summ['time_span']['end_time']),
            'days': (summ['time_span']['end_time'] -
                     summ['time_span']['start_time']).days,
            'link_type_freq': summ['link_type_freq']
        }
        if 'hashtags' in summ:
            g['hashtags'] = summ['hashtags']
        groups.append(g)
        start_times.append(summ['time_span']['start_time'])
        end_times.append(summ['time_span']['end_time'])
    return {
        'items': items,
        'groups': groups,
        'start': format_time(min(start_times)),
        'end': format_time(max(end_times))
    }