# Imports assumed by the snippets on this page (imported at module level in
# the original files; `pkl` is the pickle module and `dt` the datetime class):
import json
import pickle as pkl
from collections import Counter, defaultdict
from datetime import datetime as dt

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

# k_best_trees, MetaGraphStat and format_time come from the surrounding
# project and are not reproduced here.


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path')
    parser.add_argument('--interactions_path')
    parser.add_argument('--output_path')
    parser.add_argument('--non_event_sample_n', type=int)
    parser.add_argument('--k', type=int)

    args = parser.parse_args()
    result = pkl.load(open(args.result_path, 'rb'))  # pickles need binary mode
    trees = k_best_trees(result, args.k)
    df = pd.read_json(args.interactions_path)
    
    dt_format = '%Y-%m-%dT%H:%M:%S.000Z'

    data = []
    event_nodes = set()
    for i, t in enumerate(trees):
        for n in t.nodes_iter():
            event_nodes.add(n)
            # print(t.node[n]['datetime'])
            data.append(
                {
                    'series': 'event-{}'.format(i+1),
                    'datetime': t.node[n]['datetime'].strftime(dt_format)
                }
            )
    # for enron:
    df = df[df['datetime'] > dt(2000, 6, 1)]

    # for ukraine:
    # df = df[df['datetime'] > dt(2015, 2, 26)]

    # for baltimore
    # df = df[df['datetime'] >= dt(2015, 4, 27)]

    if args.non_event_sample_n:
        print(df.shape)
        # Drop messages that already belong to an event tree before sampling.
        df = df[df['message_id'].map(lambda m: m not in event_nodes)]

        df = df.sample(n=args.non_event_sample_n)
        print(df.shape)

    for i, r in df.iterrows():
        # print(r)
        # print(r['datetime'])
        if r['message_id'] not in event_nodes:
            data.append(
                {
                    'series': 'non-event',
                    'datetime': r['datetime'].strftime(dt_format)
                }
            )
        else:
            # print "drop"
            pass

    with open(args.output_path, 'w') as f:
        json.dump(data, f)
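
# For reference, a minimal sketch of reading the dumped file back for plotting;
# 'event_points.json' is a placeholder file name, not taken from the source.
import json

import pandas as pd

with open('event_points.json') as f:
    points = pd.DataFrame(json.load(f))

# Each record carries a 'series' label ('event-1', ..., 'non-event') and an
# ISO-8601 'datetime' string.
points['datetime'] = pd.to_datetime(points['datetime'])
print(points.groupby('series').size())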
Example 2
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path')
    parser.add_argument('--interactions_path')
    parser.add_argument('--output_path')
    parser.add_argument('--non_event_sample_n', type=int)
    parser.add_argument('--k', type=int)

    args = parser.parse_args()
    result = pkl.load(open(args.result_path, 'rb'))
    trees = k_best_trees(result, args.k)
    df = pd.read_json(args.interactions_path)

    dt_format = '%Y-%m-%dT%H:%M:%S.000Z'

    data = []
    event_nodes = set()
    for i, t in enumerate(trees):
        for n in t.nodes_iter():
            event_nodes.add(n)
            # print(t.node[n]['datetime'])
            data.append({
                'series': 'event-{}'.format(i + 1),
                'datetime': t.node[n]['datetime'].strftime(dt_format)
            })
    # for enron:
    df = df[df['datetime'] > dt(2000, 6, 1)]

    # for ukraine:
    # df = df[df['datetime'] > dt(2015, 2, 26)]

    # for baltimore
    # df = df[df['datetime'] >= dt(2015, 4, 27)]

    if args.non_event_sample_n:
        print(df.shape)
        # Drop messages that already belong to an event tree before sampling.
        df = df[df['message_id'].map(lambda m: m not in event_nodes)]

        df = df.sample(n=args.non_event_sample_n)
        print(df.shape)

    for i, r in df.iterrows():
        # print(r)
        # print(r['datetime'])
        if r['message_id'] not in event_nodes:
            data.append({
                'series': 'non-event',
                'datetime': r['datetime'].strftime(dt_format)
            })
        else:
            # print "drop"
            pass

    with open(args.output_path, 'w') as f:
        json.dump(data, f)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path')
    parser.add_argument('--interactions_path')
    parser.add_argument('--output_path')
    parser.add_argument('--non_event_sample_n', type=int)
    parser.add_argument('--freq')
    parser.add_argument('--k', type=int)

    args = parser.parse_args()
    result = pkl.load(open(args.result_path, 'rb'))
    trees = k_best_trees(result, args.k)
    try:
        df = pd.read_json(args.interactions_path)
    except ValueError:
        # interactions may be stored as a pickled DataFrame instead of JSON
        df = pd.read_pickle(args.interactions_path)

    # for enron:
    # df = df[df['datetime'] > dt(2000, 6, 1)]

    timestamps = df.groupby(pd.Grouper(
        key='datetime', freq=args.freq))['message_id'].count().index

    def values(counts):
        # ts.value is nanoseconds since the epoch; emit milliseconds for JSON.
        return [{
            'ts': ts.value // 1000000,
            'c': counts[ts] if ts in counts else 0
        } for ts in timestamps]
    data = []
    event_nodes = set()
    for i, t in enumerate(trees):
        nids = set(t.nodes())
        event_df = df[df['message_id'].apply(lambda m: m in nids)]
        groups = event_df.groupby(pd.Grouper(key='datetime', freq=args.freq))
        counts = groups['message_id'].count()

        data.append({
            'key': 'event-{}'.format(i + 1),
            'values': values(counts)
        })

        event_nodes |= nids

    df = df[df['message_id'].map(lambda m: m not in event_nodes)]

    if args.non_event_sample_n:
        df = df.sample(n=args.non_event_sample_n)

    counts = df.groupby(pd.Grouper(key='datetime',
                                   freq=args.freq))['message_id'].count()
    data.append({'key': 'non-event', 'values': values(counts)})
    # print(data)
    with open(args.output_path, 'w') as f:
        json.dump(data, f)
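
# For reference, a small self-contained sketch of the bucket-counting pattern
# used above (demo data and the daily frequency are made up, not from the
# source; relies on the pandas import at the top of the page): count messages
# per time bucket and emit epoch-millisecond/count pairs, with missing buckets
# reported as zero.
demo = pd.DataFrame({
    'message_id': [1, 2, 3],
    'datetime': pd.to_datetime(['2000-06-01', '2000-06-01', '2000-06-03'])
})
demo_counts = demo.groupby(
    pd.Grouper(key='datetime', freq='D'))['message_id'].count()
print([{'ts': ts.value // 10**6, 'c': int(demo_counts.get(ts, 0))}
       for ts in demo_counts.index])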
def run(cand_trees, k, summary_kws, undirected):
    interactions = summary_kws['topics']['interactions']
    mid2i = {i['message_id']: i for i in interactions}
    trees = k_best_trees(cand_trees, k)
    summaries = [MetaGraphStat(t, summary_kws).summary_dict() for t in trees]

    items = []
    groups = []
    start_times = []
    end_times = []
    added_id_count = Counter()
    counter = 0
    for group_id, (summ, t) in enumerate(zip(summaries, trees), 1):
        for i in t.nodes_iter():
            counter += 1
            items.append({
                'id': counter,
                'content': (mid2i[i]['subject'].strip()
                            if mid2i[i]['subject'] else mid2i[i]['body']),
                'start': format_time(mid2i[i]['datetime']),
                'group': group_id
            })
            added_id_count[i] += 1
        counter += 1
        items.append({
            'id': counter,
            # 'id': 'event_{}'.format(group_id),
            'start': format_time(summ['time_span']['start_time']),
            'end': format_time(summ['time_span']['end_time']),
            'content': 'Event {}'.format(group_id),
            'group': group_id,
            'type': 'background'
        })
        g = {
            'id': group_id,
            'terms': summ['topics']['topic_terms'],
            # 'terms': summ['frequent_terms'],
            # 'terms': summ['tdidf_terms'],
            'participants': dict(summ['participants']['participant_count']),
            'start': format_time(summ['time_span']['start_time']),
            'end': format_time(summ['time_span']['end_time']),
            'days': (summ['time_span']['end_time'] -
                     summ['time_span']['start_time']).days,
            'link_type_freq': summ['link_type_freq']
        }
        if 'hashtags' in summ:
            g['hashtags'] = summ['hashtags']
        groups.append(g)

        start_times.append(summ['time_span']['start_time'])
        end_times.append(summ['time_span']['end_time'])

    return {
        'items': items,
        'groups': groups,
        'start': format_time(min(start_times)),
        'end': format_time(max(end_times))
    }
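
# Hedged usage sketch for run(); the call below is illustrative only, since
# cand_trees and summary_kws come from the surrounding project and are not
# defined on this page:
#
#     timeline = run(cand_trees, k=5, summary_kws=summary_kws, undirected=False)
#     with open('timeline.json', 'w') as f:
#         json.dump(timeline, f, indent=2)
#
# The returned 'items'/'groups' shape resembles a vis.js-Timeline payload (an
# assumption based on the keys used above): per-message items, a background
# item spanning each event, and per-event groups with terms, participants,
# duration in days and link-type frequencies.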
def eval_setcover_obj_func(cand_trees, k):
    trees = k_best_trees(cand_trees, k)
    return len(set(n for t in trees for n in t.nodes_iter()))
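
# Toy illustration of the objective computed above: the number of distinct
# nodes covered by the union of the selected trees (the graphs below are made
# up; uses the networkx import at the top of the page).
_t1, _t2 = nx.DiGraph(), nx.DiGraph()
_t1.add_edges_from([(1, 2), (1, 3)])
_t2.add_edges_from([(3, 4)])
print(len(set(n for t in (_t1, _t2) for n in t.nodes())))  # -> 4 covered nodes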


# NOTE: the next snippet is truncated in the source; the function header and
# the loop down to add_edges_from are a reconstruction based on the call in
# the __main__ block at the end of this example.
def draw(mg, trees, output_path):
    people_network = nx.Graph()
    for t in trees:
        for n in t.nodes_iter():
            sender = mg.node[n]['sender_id']
            recipients = mg.node[n]['recipient_ids']
            people_network.add_edges_from([(sender, r) for r in recipients])

    print('people_network.number_of_nodes():',
          people_network.number_of_nodes())
    # Keep a separate palette: in the source the list was immediately
    # overwritten by the defaultdict, so no event colour was ever applied.
    palette = ['red', 'blue', 'green', 'yellow', 'black', 'orange']
    colors = defaultdict(lambda: 'gray')
    for c, t in zip(palette, trees):
        for n in t.nodes_iter():
            colors[n] = c
    pos = nx.spring_layout(people_network)
    nx.draw_networkx(people_network, pos)
    # node_color expects one colour per node; `colors` is keyed by message id
    # (as in the source), so person nodes fall back to gray.
    nx.draw_networkx_nodes(people_network, pos,
                           node_color=[colors[n] for n in people_network.nodes()])
    plt.savefig(output_path)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--meta_graph_path')
    parser.add_argument('-r', '--result_path')
    parser.add_argument('-k', '--k', type=int)
    parser.add_argument('-o', '--output_path')

    args = parser.parse_args()

    draw(nx.read_gpickle(args.meta_graph_path),
         k_best_trees(pkl.load(open(args.result_path, 'rb')), args.k),
         args.output_path)
Example 7
def get_meta_data_of_f1(acc_trees, true_trees, k):
    # Returns (#correct, #predicted, #true) node counts, from which
    # precision, recall and F1 can be derived.
    pred_trees = k_best_trees(acc_trees, k)
    pred_nodes = set(n for t in pred_trees for n in t.nodes_iter())
    true_nodes = set(n for t in true_trees for n in t.nodes_iter())
    correct_nodes = pred_nodes & true_nodes
    return float(len(correct_nodes)), len(pred_nodes), len(true_nodes)
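

# Hedged sketch (not from the source): turning the (n_correct, n_pred, n_true)
# triple returned above into precision, recall and F1.
def f1_from_counts(n_correct, n_pred, n_true):
    # n_correct arrives as a float, so the divisions stay floating point on
    # both Python 2 and 3.
    precision = n_correct / n_pred if n_pred else 0.0
    recall = n_correct / n_true if n_true else 0.0
    if precision + recall == 0.0:
        return 0.0
    return 2 * precision * recall / (precision + recall)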
Example 8
def k_max_setcover(acc_trees, true_trees, k):
    trees = k_best_trees(acc_trees, k)
    return len(set([n for t in trees for n in t.nodes_iter()]))


# NOTE: this snippet is also truncated in the source; as above, the function
# header and the loop down to add_edges_from are reconstructed from the call
# in the __main__ block below.
def draw(mg, trees, output_path):
    people_network = nx.Graph()
    for t in trees:
        for n in t.nodes_iter():
            sender = mg.node[n]['sender_id']
            recipients = mg.node[n]['recipient_ids']
            people_network.add_edges_from([(sender, r) for r in recipients])

    print('people_network.number_of_nodes():',
          people_network.number_of_nodes())
    palette = ['red', 'blue', 'green', 'yellow', 'black', 'orange']
    colors = defaultdict(lambda: 'gray')  # nodes outside any tree stay gray
    for c, t in zip(palette, trees):
        for n in t.nodes_iter():
            colors[n] = c
    pos = nx.spring_layout(people_network)
    nx.draw_networkx(people_network, pos)
    nx.draw_networkx_nodes(people_network, pos,
                           node_color=[colors[n] for n in people_network.nodes()])
    plt.savefig(output_path)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--meta_graph_path')
    parser.add_argument('-r', '--result_path')
    parser.add_argument('-k', '--k', type=int)
    parser.add_argument('-o', '--output_path')

    args = parser.parse_args()

    draw(nx.read_gpickle(args.meta_graph_path),
         k_best_trees(pkl.load(open(args.result_path, 'rb')), args.k),
         args.output_path)
Example 10
def run(cand_trees, k, summary_kws, undirected):
    interactions = summary_kws['topics']['interactions']
    mid2i = {
        i['message_id']: i
        for i in interactions
    }
    trees = k_best_trees(cand_trees, k)
    summaries = [MetaGraphStat(t, summary_kws).summary_dict() for t in trees]

    items = []
    groups = []
    start_times = []
    end_times = []
    added_id_count = Counter()
    counter = 0
    for group_id, (summ, t) in enumerate(zip(summaries, trees), 1):
        for i in t.nodes_iter():
            counter += 1
            items.append({
                'id': counter,
                'content': (mid2i[i]['subject'].strip()
                            if mid2i[i]['subject'] else
                            mid2i[i]['body']),
                'start': format_time(mid2i[i]['datetime']),
                'group': group_id
            })
            added_id_count[i] += 1
        counter += 1
        items.append(
            {
                'id': counter,
                # 'id': 'event_{}'.format(group_id),
                'start': format_time(summ['time_span']['start_time']),
                'end': format_time(summ['time_span']['end_time']),
                'content': 'Event {}'.format(group_id),
                'group': group_id,
                'type': 'background'
            })
        g = {
            'id': group_id,
            'terms': summ['topics']['topic_terms'],
            # 'terms': summ['frequent_terms'],
            # 'terms': summ['tdidf_terms'],
            'participants': dict(
                summ['participants']['participant_count']
            ),
            'start': format_time(summ['time_span']['start_time']),
            'end': format_time(summ['time_span']['end_time']),
            'days': (summ['time_span']['end_time'] - summ['time_span']['start_time']).days,
            'link_type_freq': summ['link_type_freq']
        }
        if 'hashtags' in summ:
            g['hashtags'] = summ['hashtags']
        groups.append(g)

        start_times.append(summ['time_span']['start_time'])
        end_times.append(summ['time_span']['end_time'])

    return {
        'items': items,
        'groups': groups,
        'start': format_time(min(start_times)),
        'end': format_time(max(end_times))
    }