def run_with_context(interactions_path,
                     candidate_tree_path,
                     dirname=None,
                     to_original_graph=False,
                     undirected=False):
    """Detect the top events from a candidate-tree pickle and dump them,
    together with their surrounding interaction context, as d3-ready JSON.

    Parameters
    ----------
    interactions_path : str
        Path to the interactions file: either one JSON document or one
        JSON object per line (the latter is used as a fallback).
    candidate_tree_path : str
        Path to the pickled candidate trees; also determines the output
        file name via ``get_output_path``.
    dirname : str
        Output directory (created if missing).  Required despite the
        ``None`` default, which is kept for interface compatibility.
    to_original_graph : bool
        If True, map the meta-graph nodes back to the original
        interaction graph before dumping.
    undirected : bool
        Treat interactions as undirected.

    Side effects: writes the d3 graph list to the derived output path.
    """
    if dirname is None:
        # Fail early with a clear message instead of the obscure
        # TypeError that os.path.exists(None) would raise.
        raise ValueError('dirname is required')
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    # The file may be a single JSON document or JSON-per-line;
    # try the former first and fall back on parse failure.
    try:
        with open(interactions_path) as f:
            interactions = json.load(f)
    except ValueError as e:
        print(e)
        interactions = load_json_by_line(interactions_path)

    interactions = IU.clean_interactions(interactions, undirected=undirected)

    output_path = get_output_path(candidate_tree_path, dirname)

    K = 5  # number of top events to extract
    events = detect_events_given_path(candidate_tree_path, K)

    contexted_events = []
    for e in events:
        context_dag = extract_event_context(
            interactions, e, undirected=undirected
        )
        if to_original_graph:
            context_dag = convert_to_original_graph(context_dag)
            e = convert_to_original_graph(e)
        contexted_events.append(
            add_subgraph_specific_attributes_to_graph(
                context_dag, [(e, {"event": True})]
            )
        )

    d3_events = [to_d3_graph(ce) for ce in contexted_events]

    print("writing to {}".format(output_path))
    json_dump(d3_events, output_path)
def clean_interaction_data(input_path, output_path):
    """Normalise interaction records and write them back out as JSON.

    The raw epoch value from the ``datetime`` field is preserved in a new
    ``timestamp`` column, and ``datetime`` is replaced by its
    human-readable string form.
    """
    records = load_json_by_line(input_path)
    frame = DataFrame(records)

    # Keep the original epoch seconds before overwriting the column.
    frame['timestamp'] = frame['datetime']

    def _to_string(epoch):
        # human-readable local-time representation of the epoch value
        return str(datetime.fromtimestamp(epoch))

    frame['datetime'] = frame['timestamp'].map(_to_string)
    frame.to_json(output_path, orient="records")
def run_with_context(interactions_path,
                     candidate_tree_path,
                     dirname=None,
                     to_original_graph=False,
                     undirected=False):
    """Load interactions, detect the top events from the candidate-tree
    pickle, attach each event's surrounding context DAG and dump the
    result as d3-style JSON graphs under ``dirname``.
    """
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    # Input may be one big JSON document or one JSON object per line.
    try:
        interactions = json.load(open(interactions_path))
    except ValueError as e:
        print(e)
        interactions = load_json_by_line(interactions_path)

    interactions = IU.clean_interactions(interactions,
                                         undirected=undirected)
    output_path = get_output_path(candidate_tree_path, dirname)

    K = 5  # how many events to detect
    events = detect_events_given_path(candidate_tree_path, K)

    contexted_events = []
    for event in events:
        dag = extract_event_context(interactions, event,
                                    undirected=undirected)
        if to_original_graph:
            # map meta-graph nodes back onto the original graph
            dag = convert_to_original_graph(dag)
            event = convert_to_original_graph(event)
        decorated = add_subgraph_specific_attributes_to_graph(
            dag, [(event, {'event': True})])
        contexted_events.append(decorated)

    d3_events = [to_d3_graph(graph) for graph in contexted_events]

    print('writing to {}'.format(output_path))
    json_dump(d3_events, output_path)
def run(gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {'topics': 0.2,
                                 'bow': 0.8},
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    """Build (or load from cache) the topic meta graph, sample candidate
    roots and grow one candidate tree per root.

    Parameters (main ones)
    ----------------------
    gen_tree_func : callable
        Tree-growing algorithm, forwarded to ``calc_tree``.
    msg_ids_path : str
        File with one message id per line; restricts meta-graph nodes.
    root_sampling_method : {'random', 'upperbound', <other>}
        Any other value selects the adaptive sampler.  Ignored when
        explicit ``roots`` are given.
    cand_tree_number / cand_tree_percent :
        How many roots to sample; the absolute number wins over the
        percentage when both are given.
    roots : list or None
        Explicit root nodes; when given, sampling is deterministic.
    calculate_graph : bool
        Force recomputation of the meta graph even when a cached pickle
        exists.

    Returns
    -------
    dict
        Paths to the interactions, meta-graph pickle, result pickle,
        true-events file and the paths pickle itself.

    NOTE: ``meta_graph_kws``/``gen_tree_kws`` are shared mutable default
    arguments.  They are only read here (``meta_graph_kws`` is deep-copied
    before being modified), so this is safe, but callers must not mutate
    them.
    """
    # timespan may be given as a timedelta or already in seconds
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']

    # interactions come either as .json (single document or one JSON
    # object per line) or as a pickle
    if interaction_path.endswith(".json"):
        try:
            with open(interaction_path) as f:
                interactions = json.load(f)
        except ValueError:
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        # binary mode: pickle data is bytes, text mode can corrupt it
        with open(interaction_path, 'rb') as f:
            interactions = pickle.load(f)
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path)
        )
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path)
        )
    else:
        # topics are assumed to be attached to the interactions already
        lda_model = None
        dictionary = None

    # cache file name encodes the meta-graph parameters
    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix,
        experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix
    )
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]
        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = \
                meta_graph_kws['preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied
        )
        logger.info('pickling...')
        nx.write_gpickle(
            IU.compactize_meta_graph(g, map_nodes=False),
            meta_graph_pkl_path
        )
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        # decide how many roots to sample and how to sample them
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(), cand_tree_number, cand_tree_percent
        )
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(
        cand_tree_number / float(g.number_of_nodes()))
    )

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warn('not enough root to take, terminate')
            break
        dags.append(dag)

        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func, gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        # record wall-clock time spent growing this tree
        tree.graph['calculation_time'] = \
            (datetime.now() - start).total_seconds()
        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        # encode all experiment parameters into the output file name
        return "{}--{}----{}----{}{}.pkl".format(
            prefix,
            experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(
                cand_tree_percent=cand_tree_percent,
                root_sampling=root_sampling_method
            ),
            suffix
        )

    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)
    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    # 'wb': pickle output is binary; text mode breaks it on some platforms
    with open(result_pkl_path, 'wb') as f:
        pickle.dump(trees, f, protocol=pickle.HIGHEST_PROTOCOL)
    if False:  # retained for debugging purpose: also dump the sampled DAGs
        with open(result_pkl_path + '.dag', 'wb') as f:
            pickle.dump(dags, f, protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {'interactions': interaction_path,
                  'meta_graph': meta_graph_pkl_path,
                  'result': result_pkl_path,
                  'true_events': true_events_path,
                  'self': all_paths_pkl_path
                  }
    with open(all_paths_pkl_path, 'wb') as f:
        pickle.dump(paths_dict, f)

    return paths_dict
import numpy as np
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from networkx.drawing.nx_pylab import draw_spring

from meta_graph import convert_to_original_graph
from events import detect_events_given_path
from util import load_json_by_line

# Directory containing this module; data paths below are resolved
# relative to it.
CURDIR = os.path.dirname(os.path.abspath(__file__))

# Loaded once at import time: the raw interaction records and the
# people metadata (reads two files from data/).
interactions = load_json_by_line(CURDIR + '/data/enron.json')
people_info = load_json_by_line(CURDIR + '/data/people.json')

# person id -> (name, email) lookup built from the people records
peopleid2info = {r['id']: (r['name'], r['email'])
                 for r in people_info}

# Default summary options: only participant information is enabled.
summary_kws = {
    'temporal_traffic': False,
    'topics': False,
    'email_content': False,
    'participants': {
        'people_info': people_info,
        'interactions': interactions
    }
}


# NOTE(review): the body of this function is not present in this chunk;
# the header is kept as-is.
def draw_kws_graphs(g):
import numpy as np
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from networkx.drawing.nx_pylab import draw_spring

from meta_graph import convert_to_original_graph
from events import detect_events_given_path
from util import load_json_by_line

# Directory containing this module; data paths below are resolved
# relative to it.
CURDIR = os.path.dirname(os.path.abspath(__file__))

# Loaded once at import time: the raw interaction records and the
# people metadata (reads two files from data/).
interactions = load_json_by_line(CURDIR + '/data/enron.json')
people_info = load_json_by_line(CURDIR + '/data/people.json')

# person id -> (name, email) lookup built from the people records
peopleid2info = {r['id']: (r['name'], r['email'])
                 for r in people_info}

# Default summary options: only participant information is enabled.
summary_kws = {
    'temporal_traffic': False,
    'topics': False,
    'email_content': False,
    'participants': {
        'people_info': people_info,
        'interactions': interactions
    }
}
def run(
        gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {
                'topics': 0.2,
                'bow': 0.8
            },
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    """Build (or load from cache) the topic meta graph, sample candidate
    roots, grow one candidate tree per root, and pickle the trees plus a
    dictionary of all produced file paths; that dictionary is returned.

    NOTE(review): ``meta_graph_kws``/``gen_tree_kws`` are shared mutable
    default arguments.  They appear to be read-only here (the former is
    deep-copied before modification); callers should not mutate them.
    """
    # timespan may be given as a timedelta or already in seconds
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']

    # interactions come either as .json (single document or one JSON
    # object per line) or as a pickle
    if interaction_path.endswith(".json"):
        try:
            interactions = json.load(open(interaction_path))
        except ValueError:
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        interactions = pickle.load(open(interaction_path))
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path))
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path))
    else:
        # topics are assumed to be attached to the interactions already
        lda_model = None
        dictionary = None

    # cache file name encodes the meta-graph parameters
    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix, experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix)
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]
        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = meta_graph_kws[
                'preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(interactions,
                                    msg_ids=msg_ids,
                                    lda_model=lda_model,
                                    dictionary=dictionary,
                                    undirected=False,  # deprecated
                                    given_topics=given_topics,
                                    decompose_interactions=False,
                                    convert_time=convert_time,
                                    **meta_graph_kws_copied)
        logger.info('pickling...')
        nx.write_gpickle(IU.compactize_meta_graph(g, map_nodes=False),
                         meta_graph_pkl_path)
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        # decide how many roots to sample and how to sample them
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(), cand_tree_number, cand_tree_percent)
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(cand_tree_number /
                                                float(g.number_of_nodes())))

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warn('not enough root to take, terminate')
            break
        dags.append(dag)

        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func, gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        # record wall-clock time spent growing this tree
        tree.graph['calculation_time'] = (datetime.now() -
                                          start).total_seconds()
        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        # encode all experiment parameters into the output file name
        return "{}--{}----{}----{}{}.pkl".format(
            prefix, experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(cand_tree_percent=cand_tree_percent,
                                 root_sampling=root_sampling_method),
            suffix)

    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)
    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    pickle.dump(trees,
                open(result_pkl_path, 'w'),
                protocol=pickle.HIGHEST_PROTOCOL)
    if False:  # for debugging purpose
        pickle.dump(dags,
                    open(result_pkl_path + '.dag', 'w'),
                    protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {
        'interactions': interaction_path,
        'meta_graph': meta_graph_pkl_path,
        'result': result_pkl_path,
        'true_events': true_events_path,
        'self': all_paths_pkl_path
    }
    pickle.dump(paths_dict, open(all_paths_pkl_path, 'w'))

    return paths_dict