Example 1
    STARTING_TIME = utils.now_str()
    log('Starting at %s\n' % STARTING_TIME)

    log('reading graph %s' % g_file)
    in_g = nx.read_graphml(g_file)

    t_file = opts.tweets_file
    log('reading tweets %s' % t_file)
    in_f = (gzip.open(t_file, 'rt', encoding='utf-8')
            if t_file[-1] in 'zZ' else open(t_file, 'r', encoding='utf-8'))
    post_counts = {}
    retweeted = {}  # ot_id : [rting_acct_ids]
    line_count = 0
    for line in in_f:
        line_count = utils.log_row_count(line_count, DEBUG)
        t = json.loads(line)
        uid = t['user']['id_str']
        post_counts[uid] = post_counts.get(uid, 0.0) + 1.0
        if utils.is_rt(t):
            # record which accounts retweeted each original tweet
            ot_id = utils.get_ot_from_rt(t)['id_str']
            retweeted.setdefault(ot_id, []).append(uid)
    if DEBUG: utils.eprint()  # end the progress-dot output with a newline
    in_f.close()

    ids = set(map(str, post_counts.keys()))
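The snippets above lean on a project-local `utils` module that is not shown. The sketch below reconstructs the helpers from their call sites; every signature and body here is an assumption, not the project's actual code:

    import sys
    from datetime import datetime

    def now_str():
        # hypothetical: human-readable timestamp for the startup log line
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def eprint(*args, **kwargs):
        # log to stderr so progress output stays out of piped stdout
        print(*args, file=sys.stderr, **kwargs)

    def log_row_count(count, debug, lines_per_dot=50, lines_per_nl=5000):
        # hypothetical progress ticker: a dot every lines_per_dot rows,
        # a newline with the running total every lines_per_nl rows
        count += 1
        if debug:
            if count % lines_per_nl == 0:
                eprint(' %d' % count)
            elif count % lines_per_dot == 0:
                eprint('.', end='', flush=True)
        return count

    def is_rt(tweet):
        # Twitter's v1.1 JSON embeds the original tweet in a retweet
        return 'retweeted_status' in tweet

    def get_ot_from_rt(rt):
        # the original tweet ("OT") carried inside a retweet object
        return rt['retweeted_status']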
Example 2
    window_s = utils.parse_window_cli_arg(opts.window_str)

    STARTING_TIME = utils.now_str()
    log('Starting at %s\n' % STARTING_TIME)

    if tweets_file[-1] in 'zZ':
        in_f = gzip.open(tweets_file, 'rt', encoding='utf-8')
    else:
        in_f = open(tweets_file, 'r', encoding='utf-8')

    line_count = 0
    tweets = {}  # tweet ID : { ts, tweet ID, conv_root ID, replier ID, conv_root author ID }
    if ira:
        csv_reader = csv.DictReader(in_f)
        for row in csv_reader:
            line_count = utils.log_row_count(line_count, DEBUG)
            parse_ira_tweet(row, tweets)
    else:
        for line in in_f:
            line_count = utils.log_row_count(line_count, DEBUG)
            t = json.loads(line)
            parse_tweet_obj(t, tweets)

    # tally how many of the collected replies each account posted
    post_counts = {}
    for t in tweets.values():
        uid = t['source_uid']
        post_counts[uid] = post_counts.get(uid, 0.0) + 1.0

    log('')
    log('Found %d replies' % len(tweets))
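Example 2 also calls `utils.parse_window_cli_arg`, seen only at its call site. Since the result is stored in `window_s`, it plausibly converts a shorthand such as '15m' or '6h' into seconds; the suffix table below is an assumption:

    def parse_window_cli_arg(window_str):
        # hypothetical: '30s' -> 30, '15m' -> 900, '6h' -> 21600, '1d' -> 86400
        multipliers = {'s': 1, 'm': 60, 'h': 60 * 60, 'd': 24 * 60 * 60}
        suffix = window_str[-1].lower()
        if suffix in multipliers:
            return int(window_str[:-1]) * multipliers[suffix]
        return int(window_str)  # bare number: treat as seconds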
Example 3
    fn = sys.argv[1]
    start_ts = utils.extract_ts_s(sys.argv[2], fmt=utils.DCW_TS_FORMAT)
    end_ts = utils.extract_ts_s(sys.argv[3], fmt=utils.DCW_TS_FORMAT)

    # open before the try block so in_f is always bound when finally runs
    if fn[-1] in 'zZ':
        in_f = gzip.open(fn, 'rt', encoding='utf-8')
    else:
        in_f = open(fn, 'r', encoding='utf-8')

    try:
        csv_reader = csv.DictReader(in_f)

        users = set()
        tweets = 0
        rts = 0
        row_count = 0
        for row in csv_reader:
            row_count = utils.log_row_count(row_count, True)
            ts = utils.extract_ts_s(row['tweet_time'], fmt=utils.IRA_TS_FORMAT)
            if ts < start_ts or ts > end_ts:
                continue  # may not be in timestamp order
            tweets += 1
            users.add(row['userid'])
            if row['is_retweet'].lower() == 'true': rts += 1

        print('\nTweets:   %10d' % tweets)
        print('Retweets: %10d' % rts)
        print('Accounts: %10d' % len(users))

    finally:
        in_f.close()
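The `fn[-1] in 'zZ'` test recurs in Examples 1 to 3: any filename ending in 'z' or 'Z' (e.g. a .gz archive) is opened through gzip in text mode, anything else through plain open. It could be factored into a single helper; a minimal sketch, not part of the original code:

    import gzip

    def open_maybe_gzipped(path, encoding='utf-8'):
        # 'rt' gives gzip.open the same decoded, text-mode interface
        # that the plain open() branch provides
        if path[-1] in 'zZ':
            return gzip.open(path, 'rt', encoding=encoding)
        return open(path, 'r', encoding=encoding)

Used as a context manager (with open_maybe_gzipped(fn) as in_f: ...) it would also make the explicit try/finally in Example 3 unnecessary.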
Example 4
    STARTING_TIME = utils.now_str()
    log('Starting at %s' % STARTING_TIME)

    in_dir = opts.lcn_dir
    out_f = opts.out_file

    g_files = list(filter(lambda f: f.endswith('graphml'), os.listdir(in_dir)))

    uber_g = nx.Graph(post_count=0.0)

    line_count = 0
    known_reason_types = set()
    for g_file in g_files:
        g = nx.read_graphml(os.path.join(in_dir, g_file))

        line_count = utils.log_row_count(line_count, debug=DEBUG, lines_per_dot=10, lines_per_nl=50)

        try:
            uber_g.graph['post_count'] += float(g.graph['post_count'])
        except KeyError as e:
            print(g_file)  # which file caused the issue?
            raise e

        for n, d in g.nodes(data=True):
            if n not in uber_g:
                new_d = dict(d)  # shallow copy so the window graph is not mutated
                new_d['post_count'] = float(new_d['post_count'])
                uber_g.add_node(n, **new_d)
            else:
                uber_g.nodes[n]['post_count'] += float(d['post_count'])
        for u, v, d in g.edges(data=True):
            # assumed continuation (the snippet is truncated here): by analogy
            # with the node merge above, copy edge attributes on first sight
            # and accumulate an additive weight afterwards
            if not uber_g.has_edge(u, v):
                uber_g.add_edge(u, v, **dict(d))
            else:
                uber_g[u][v]['weight'] += d.get('weight', 1.0)
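In Example 4, `post_count` is the one additive attribute: a node's attributes are copied wholesale the first time it appears, and only `post_count` is summed on later sightings. A tiny self-contained illustration of that behaviour, with made-up data:

    import networkx as nx

    g1 = nx.Graph(post_count=3.0)
    g1.add_node('alice', post_count=2.0, label='alice')
    g2 = nx.Graph(post_count=5.0)
    g2.add_node('alice', post_count=4.0, label='alice')

    uber_g = nx.Graph(post_count=0.0)
    for g in (g1, g2):
        uber_g.graph['post_count'] += float(g.graph['post_count'])
        for n, d in g.nodes(data=True):
            if n not in uber_g:
                uber_g.add_node(n, **dict(d))
            else:
                uber_g.nodes[n]['post_count'] += float(d['post_count'])

    print(uber_g.graph['post_count'])           # 8.0
    print(uber_g.nodes['alice']['post_count'])  # 6.0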
Example 5
    alpha_str = f_to_s(alpha)

    if not dry_run:
        # https://stackoverflow.com/a/273227
        Path(in_dir).mkdir(parents=True, exist_ok=True)
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        log_file = net_log_utils.open_log_file(out_dir)

    # sort so the sliding window consumes the graphs in chronological order
    g_files = sorted(f for f in os.listdir(in_dir) if f.endswith('graphml'))
    g_cache = {}  # to keep loaded graphs
    curr_windows = []
    line_count = 0
    for i, g_file in enumerate(g_files):

        line_count = utils.log_row_count(line_count, debug=DEBUG)

        curr_windows.append(g_file)
        if i >= num_windows:
            del g_cache[curr_windows.pop(0)]

        in_g_file = os.path.join(in_dir, g_file)
        # log('Loading %s' % in_g_file)
        g_cache[g_file] = nx.read_graphml(in_g_file)

        combined_g = combine(curr_windows, g_cache, alpha, property)

        out_g_file = os.path.join(out_dir,
                                  tweak_fn(g_file, alpha_str, num_windows))
        # log('Writing %s' % out_g_file)
        if not dry_run:
            # assumed continuation (the snippet is truncated here): write the
            # combined graph, as the commented log line above suggests
            nx.write_graphml(combined_g, out_g_file)
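Example 5 never shows `combine` itself. Given an `alpha` and a list of window file names ordered oldest to newest, one plausible reading is an exponentially decayed sum of an edge property across the cached window graphs. The sketch below implements that reading; it is an assumption about the intended behaviour, not the real `combine`:

    import networkx as nx

    def combine(windows, g_cache, alpha, prop):
        # hypothetical: the newest window contributes at full strength,
        # each older window is discounted by a further factor of alpha
        combined_g = nx.Graph()
        for age, w_file in enumerate(reversed(windows)):  # newest first
            decay = alpha ** age
            for u, v, d in g_cache[w_file].edges(data=True):
                contribution = d.get(prop, 1.0) * decay
                if combined_g.has_edge(u, v):
                    combined_g[u][v][prop] += contribution
                else:
                    combined_g.add_edge(u, v, **{prop: contribution})
        return combined_g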