def process_json_tweet(t, tweets, retweets):
    u_id = t['user']['id_str']
    urls = utils.expanded_urls_from(t)
    ot = utils.get_ot_from_rt(t)
    is_reply = utils.is_reply(t)
    t_info = {
        't_id': t['id_str'],
        'u_id': u_id,
        'u_sn': t['user']['screen_name'],
        'u_dn': t['user']['name'],
        'u_desc': t['user']['description'],
        't_ts_sec': utils.extract_ts_s(t['created_at']),
        'hashtags': utils.lowered_hashtags_from(t),
        'mentioned_ids': [m['id_str'] for m in utils.mentions_from(t)],
        'urls': urls,
        'domains': [utils.extract_domain(u, lower=True) for u in urls],
        'is_rt': ot is not None,
        'retweeted_t_id': ot['id_str'] if ot else None,
        'retweeted_u_id': ot['user']['id_str'] if ot else None,
        'is_reply': is_reply,
        'replied_to_t_id': t['in_reply_to_status_id_str'] if is_reply else None,
        'replied_to_u_id': t['in_reply_to_user_id_str'] if is_reply else None,
        'text': utils.extract_text(t)
    }
    if u_id not in tweets:
        tweets[u_id] = [t_info]
    else:
        tweets[u_id].append(t_info)
    if t_info['is_rt'] and t_info['retweeted_t_id'] not in retweets:
        retweets[t_info['retweeted_t_id']] = {
            'user_id': t_info['retweeted_u_id'],
            'rt_text': t_info['text']
        }
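
# Example (hypothetical usage sketch, not part of the original module): feeding
# process_json_tweet() a line-oriented JSON (NDJSON) corpus to build the
# per-user tweet map and the retweet map. The helper name and path argument are
# illustrative; process_json_tweet and utils are this repo's own code.
def _example_process_json_corpus(ndjson_path):
    import json
    tweets = {}    # user_id -> [t_info, ...]
    retweets = {}  # retweeted tweet_id -> {'user_id': ..., 'rt_text': ...}
    with open(ndjson_path, 'r', encoding='utf-8') as f:
        for line in f:
            process_json_tweet(json.loads(line), tweets, retweets)
    return tweets, retweets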
def write_rows_from_tweet(csv_f, t, topic, excl_rts):
    global REPLY_COUNT  # declare that we want to change REPLY_COUNT
    try:
        ts = utils.extract_ts_s(t['created_at'])
    except TypeError:
        # there's a chance 'created_at' is epoch milliseconds (e.g. from twarc)
        ts = int(int(t['created_at']) / 1000)
    t_id = t['id_str']
    source = utils.get_uid(t)
    if topic in ['RETWEET', 'RETWEETS', 'RT', 'RTS'] and utils.is_rt(t):
        ot = utils.get_ot_from_rt(t)
        target = utils.get_uid(ot)
        rt_id = t_id
        ot_id = ot['id_str']
        csv_f.writerow([ts, source, target, 'RETWEET', rt_id, ot_id])
    elif topic in ['QUOTE', 'QUOTES'] and not utils.is_rt(t) and utils.is_qt(t):
        ot = t['quoted_status']
        target = utils.get_uid(ot)
        qt_id = t_id
        ot_id = ot['id_str']
        csv_f.writerow([ts, source, target, 'QUOTE', qt_id, ot_id])
    elif topic in ['REPLY', 'REPLIES']:
        target = t['in_reply_to_user_id_str']
        ot_id = t['in_reply_to_status_id_str']
        if target and ot_id:
            csv_f.writerow([ts, source, target, 'REPLY', t_id, ot_id])
    elif topic in ['HASHTAG', 'HASHTAGS', 'ALL_HASHTAGS']:
        hashtags = utils.lowered_hashtags_from(t, include_retweet=True)
        if is_empty(hashtags):
            return
        if topic == 'ALL_HASHTAGS':
            csv_f.writerow([ts, source, ' '.join(hashtags), 'ALL_HASHTAGS', t_id])
        else:
            for ht in hashtags:
                csv_f.writerow([ts, source, ht, 'HASHTAG', t_id])
    elif topic in ['URL', 'URLS', 'POST_URL', 'POST_URLS', 'ALL_URLS',
                   'ALL_POST_URLS', 'DOMAIN', 'DOMAINS', 'ALL_DOMAINS']:
        for url in set(utils.expanded_urls_from(t, include_retweet=True)):
            write_url_row(csv_f, topic, url, ts, source, t_id)
    elif topic in ['MENTION', 'MENTIONS', 'ALL_MENTIONS']:
        if excl_rts and utils.is_rt(t):
            return
        mention_objs = utils.mentions_from(t, include_retweet=True)
        if is_empty(mention_objs):
            return
        if topic == 'ALL_MENTIONS':
            mentioned_ids_str = ' '.join([m['id_str'] for m in mention_objs])
            mentioned_sns_str = ' '.join([m['screen_name'] for m in mention_objs])
            csv_f.writerow([ts, source, mentioned_ids_str, 'ALL_MENTIONS', t_id, mentioned_sns_str])
        else:
            for m in mention_objs:
                csv_f.writerow([ts, source, m['id_str'], 'MENTION', t_id, m['screen_name']])
    elif topic in ['TIMESTAMP', 'TIMESTAMPS', 'TS']:
        csv_f.writerow([ts, source, t_id])
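
# Example (hypothetical usage sketch): streaming an NDJSON corpus into a CSV of
# hashtag-use rows via write_rows_from_tweet(). The paths and header column
# names are illustrative; csv and json are standard library, and
# write_rows_from_tweet is defined above.
def _example_write_hashtag_rows(ndjson_path, csv_path):
    import csv
    import json
    with open(ndjson_path, 'r', encoding='utf-8') as in_f, \
         open(csv_path, 'w', encoding='utf-8', newline='') as out_f:
        writer = csv.writer(out_f)
        writer.writerow(['timestamp', 'source', 'target', 'interaction', 'tweet_id'])
        for line in in_f:
            write_rows_from_tweet(writer, json.loads(line), 'HASHTAGS', excl_rts=False)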
def add_edge(g, from_id, to_id, tweet_id, ts_str, int_type, **kwargs):
    add_node(g, from_id, 'USER', True)
    add_node(g, to_id, n_type=node_type_for(int_type))
    t = utils.extract_ts_s(ts_str) - t_0  # seconds since the corpus start, t_0
    attrs = {'time_t': t, 'tweet_id': tweet_id, 'interaction': int_type}
    key = '%s %s %s in %s' % (from_id, int_type, to_id, tweet_id)
    g.add_edge(from_id, to_id, key=key, **{**attrs, **kwargs})
def write_rows_from_ira_row(csv_f, r, topic, excl_rts):
    ts = utils.extract_ts_s(r['tweet_time'], fmt=utils.IRA_TS_FORMAT)
    t_id = r['tweetid']
    source = r['userid']
    if topic in ['RETWEET', 'RETWEETS', 'RT', 'RTS'] and r['is_retweet'] == 'true':
        target = r['retweet_userid']
        rt_id = r['tweetid']
        ot_id = r['retweet_tweetid']
        csv_f.writerow([ts, source, target, 'RETWEET', rt_id, ot_id])
    elif topic in ['QUOTE', 'QUOTES'] and r['quoted_tweet_tweetid']:
        # the field holds a tweet ID, so test for presence rather than == 'true'
        print('QUOTE is unsupported for IRA datasets')
        sys.exit()
    elif topic in ['REPLY', 'REPLIES']:
        target = r['in_reply_to_userid']
        ot_id = r['in_reply_to_tweetid']
        if target and ot_id:
            csv_f.writerow([ts, source, target, 'REPLY', t_id, ot_id])
    elif topic in ['HASHTAG', 'HASHTAGS', 'ALL_HASHTAGS']:
        hashtags = parse_ira_hashtags(r['hashtags'])
        if is_empty(hashtags):
            return
        if topic == 'ALL_HASHTAGS':
            # label matches write_rows_from_tweet's ALL_HASHTAGS rows
            csv_f.writerow([ts, source, ' '.join(hashtags), 'ALL_HASHTAGS', t_id])
        else:
            for ht in hashtags:
                csv_f.writerow([ts, source, ht, 'HASHTAG', t_id])
    elif topic in ['URL', 'URLS', 'POST_URL', 'POST_URLS', 'ALL_URLS',
                   'ALL_POST_URLS', 'DOMAIN', 'DOMAINS', 'ALL_DOMAINS']:
        # some of the URLs in the RU-IRA dataset are a bit wonky
        urls = set(parse_ira_urls(r['urls']))
        if all(check_url(u) for u in urls):
            for url in urls:
                write_url_row(csv_f, topic, url, ts, source, t_id)
        else:
            # one URL was split on spurious commas: stitch it back together
            write_url_row(csv_f, topic, ''.join(urls), ts, source, t_id)
    elif topic in ['MENTION', 'MENTIONS', 'ALL_MENTIONS']:
        if excl_rts and r['is_retweet'] == 'true':
            return
        mention_ids = parse_ira_mentions(r['user_mentions'])
        if is_empty(mention_ids):
            return
        if topic == 'ALL_MENTIONS':
            m_ids_str = ' '.join(mention_ids)
            # label matches write_rows_from_tweet's ALL_MENTIONS rows
            csv_f.writerow([ts, source, m_ids_str, 'ALL_MENTIONS', t_id, m_ids_str])
        else:
            for m_id in mention_ids:
                csv_f.writerow([ts, source, m_id, 'MENTION', t_id, m_id])
    elif topic in ['TIMESTAMP', 'TIMESTAMPS', 'TS']:
        csv_f.writerow([ts, source, t_id])
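
# Example (hypothetical usage sketch): driving write_rows_from_ira_row() from
# an RU-IRA CSV with csv.DictReader. The column names the function reads
# (tweet_time, tweetid, userid, ...) come from the IRA data release; the paths
# and helper name here are illustrative.
def _example_ira_retweet_rows(ira_csv_path, out_csv_path):
    import csv
    with open(ira_csv_path, 'r', encoding='utf-8') as in_f, \
         open(out_csv_path, 'w', encoding='utf-8', newline='') as out_f:
        writer = csv.writer(out_f)
        for row in csv.DictReader(in_f):
            write_rows_from_ira_row(writer, row, 'RETWEETS', excl_rts=False)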
def parse_ira_tweet(t, tweets):
    REPLY_KEY = 'in_reply_to_tweetid'
    is_a_reply = REPLY_KEY in t and t[REPLY_KEY]
    t_id = t['tweetid']
    tweets[t_id] = {
        'timestamp': utils.extract_ts_s(t['tweet_time'], fmt=utils.IRA_TS_FORMAT),
        'reply_id': t_id,
        'source': t['userid'],
        'interaction': 'IN_CONV',
        'target': None,
        'ot_id': None,
        'in_reply_to_t_id': t[REPLY_KEY] if is_a_reply else None,
        'in_reply_to_u_id': t['in_reply_to_userid'] if is_a_reply else None
    }
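
# Example (hypothetical illustration): the maps built by parse_ira_tweet() and
# parse_tweet_obj() record 'in_reply_to_t_id' links, so a reply chain can be
# walked back as far as the corpus allows. This helper is a sketch, not part
# of the original code.
def _example_reply_chain(tweets, t_id):
    chain = [t_id]
    # follow the parent links while the parent tweet is present in the map
    while tweets[chain[-1]]['in_reply_to_t_id'] in tweets:
        chain.append(tweets[chain[-1]]['in_reply_to_t_id'])
    return chain  # [t_id, parent, grandparent, ...]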
def parse_tweet_obj(t, tweets):
    REPLY_KEY = 'in_reply_to_status_id_str'
    is_a_reply = REPLY_KEY in t and t[REPLY_KEY]
    t_id = t['id_str']
    tweets[t_id] = {
        'timestamp': utils.extract_ts_s(t['created_at']),
        'reply_id': t_id,
        'source': utils.get_uid(t),
        'interaction': 'IN_CONV',
        'target': None,
        'ot_id': None,
        'in_reply_to_t_id': t[REPLY_KEY] if is_a_reply else None,
        'in_reply_to_u_id': t['in_reply_to_user_id_str'] if is_a_reply else None
    }
def parse_tweet_obj(t, tweets):
    # variant of parse_tweet_obj that also keeps screen names and the raw tweet
    REPLY_KEY = 'in_reply_to_status_id_str'
    is_a_reply = REPLY_KEY in t and t[REPLY_KEY]
    t_id = t['id_str']
    tweets[t_id] = {
        'timestamp': utils.extract_ts_s(t['created_at']),
        'reply_tid': t_id,
        'source_uid': utils.get_uid(t),
        'source_sn': t['user']['screen_name'],
        'interaction': 'IN_CONV',
        'target_tid': None,
        'target_uid': None,
        'orig_tid': None,
        'in_reply_to_tid': t[REPLY_KEY] if is_a_reply else None,
        'in_reply_to_uid': t['in_reply_to_user_id_str'] if is_a_reply else None,
        'raw': t,
        'type': 'TWITTER'
    }
def process_ira_tweet(t, tweets, retweets):
    u_id = t['userid']
    urls = utils.parse_ira_urls(t['urls'])
    try:
        domains = [utils.extract_domain(u, lower=True) for u in urls]
    except ValueError:
        # assume some junk in the 'urls' field, so treat it as one URL
        domains = [utils.extract_domain(t['urls'], lower=True)]
    is_rt = t['is_retweet'] == 'true'
    is_reply = t['in_reply_to_tweetid'] != ''
    t_info = {
        't_id': t['tweetid'],
        'u_id': u_id,
        'u_sn': t['user_screen_name'],
        'u_dn': t['user_display_name'],
        'u_desc': t['user_profile_description'],
        't_ts_sec': utils.extract_ts_s(t['tweet_time'], fmt=utils.IRA_TS_FORMAT),
        'hashtags': utils.parse_ira_hashtags(t['hashtags']),
        'mentioned_ids': utils.parse_ira_mentions(t['user_mentions']),
        'urls': urls,
        'domains': domains,
        'is_rt': is_rt,
        'retweeted_t_id': t['retweet_tweetid'] if is_rt else None,
        'retweeted_u_id': t['retweet_userid'] if is_rt else None,
        'is_reply': is_reply,
        'replied_to_t_id': t['in_reply_to_tweetid'] if is_reply else None,
        'replied_to_u_id': t['in_reply_to_userid'] if is_reply else None,
        'text': t['tweet_text']
    }
    if u_id not in tweets:
        tweets[u_id] = [t_info]
    else:
        tweets[u_id].append(t_info)
    if t_info['is_rt'] and t_info['retweeted_t_id'] not in retweets:
        retweets[t_info['retweeted_t_id']] = {
            'user_id': t_info['retweeted_u_id'],
            'rt_text': t_info['text']
        }
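
# Example (hypothetical usage sketch): process_ira_tweet() and
# process_json_tweet() emit the same t_info schema, so one loader can populate
# a single pair of maps from either source. The 'is_ira' flag and path are
# illustrative.
def _example_load_corpus(path, is_ira):
    import csv
    import json
    tweets, retweets = {}, {}
    with open(path, 'r', encoding='utf-8') as f:
        if is_ira:
            for row in csv.DictReader(f):
                process_ira_tweet(row, tweets, retweets)
        else:
            for line in f:
                process_json_tweet(json.loads(line), tweets, retweets)
    return tweets, retweets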
def build_activity_graph(tweets, t_0):
    # tweets is a tweet map { tweet_id : tweet }
    first_tweet_ts_str = utils.ts_to_str(t_0, fmt=utils.TWITTER_TS_FORMAT)
    first_tweet_ts = utils.epoch_seconds_2_ts(t_0)
    g = nx.MultiDiGraph(post_count=len(tweets))

    def add_node(g, n_id, n_type='USER', is_author=False):
        if n_id not in g:
            g.add_node(n_id, n_type=n_type, label=n_id, is_author=is_author)
        elif is_author:
            g.nodes[n_id]['is_author'] = is_author

    def node_type_for(interaction):
        if interaction == 'HASHTAG' or interaction == 'URL':
            return interaction
        else:
            return 'USER'

    def add_edge(g, from_id, to_id, tweet_id, ts_str, int_type, **kwargs):
        add_node(g, from_id, 'USER', True)
        add_node(g, to_id, n_type=node_type_for(int_type))
        t = utils.extract_ts_s(ts_str) - t_0  # seconds since the corpus start
        attrs = {'time_t': t, 'tweet_id': tweet_id, 'interaction': int_type}
        key = '%s %s %s in %s' % (from_id, int_type, to_id, tweet_id)
        g.add_edge(from_id, to_id, key=key, **{**attrs, **kwargs})

    # Build networks
    # edge types: REPOST, MENTION, REPLY, QUOTE, URL, HASHTAG
    observed_user_ids = set()
    for tweet_id in tweets:
        tweet = tweets[tweet_id]
        hashtags = lowered_hashtags_from(tweet)
        urls = expanded_urls_from(tweet)
        mentions = mentioned_ids_from(tweet)
        tweet_text = extract_text(tweet)
        tweet_ts = tweet['created_at']
        tweet_id = tweet['id_str']
        tweeter_id = tweet['user']['id_str']
        observed_user_ids.add(tweeter_id)
        for ht in hashtags:
            add_edge(g, tweeter_id, ht, tweet_id, tweet_ts, 'HASHTAG')
        for url in urls:
            # extended tweets include a URL to their own extended form
            if not embedded_extended_tweet_url(tweet_id, url):
                add_edge(g, tweeter_id, url, tweet_id, tweet_ts, 'URL')
        for mentioned_id in mentions:
            observed_user_ids.add(mentioned_id)
            add_edge(g, tweeter_id, mentioned_id, tweet_id, tweet_ts, 'MENTION')
        if 'retweeted_status' in tweet:
            retweeter = tweeter_id
            retweetee = tweet['retweeted_status']['user']['id_str']
            observed_user_ids.add(retweetee)
            add_edge(
                g, retweeter, retweetee, tweet_id, tweet_ts, 'REPOST',
                original_tweet_id=tweet['retweeted_status']['id_str'],
                original_tweet_ts=tweet['retweeted_status']['created_at'],
                posting_delay_sec=(
                    utils.extract_ts_s(tweet['retweeted_status']['created_at']) -
                    utils.extract_ts_s(tweet_ts)
                )
            )
        elif 'quoted_status' in tweet and 'retweeted_status' not in tweet:
            quoter = tweeter_id
            quotee = tweet['quoted_status']['user']['id_str']
            observed_user_ids.add(quotee)
            add_edge(
                g, quoter, quotee, tweet_id, tweet_ts, 'QUOTE',
                original_tweet_id=tweet['quoted_status']['id_str'],
                original_tweet_ts=tweet['quoted_status']['created_at'],
                posting_delay_sec=(
                    utils.extract_ts_s(tweet['quoted_status']['created_at']) -
                    utils.extract_ts_s(tweet_ts)
                )
            )
        elif ('in_reply_to_status_id_str' in tweet and
              tweet['in_reply_to_status_id_str'] in tweets):
            # only consider replies that appear in the corpus
            # basic reply info
            replier = tweeter_id
            replied_to = tweet['in_reply_to_user_id_str']
            observed_user_ids.add(replied_to)
            replied_to_status = tweets[tweet['in_reply_to_status_id_str']]
            replied_to_status_ts = replied_to_status['created_at']
            posting_delay_sec = (utils.extract_ts_s(replied_to_status_ts) -
                                 utils.extract_ts_s(tweet_ts))
            add_edge(g, replier, replied_to, tweet_id, tweet_ts, 'REPLY',
                     original_tweet_id=tweet['in_reply_to_status_id_str'],
                     original_tweet_ts=replied_to_status_ts,
                     posting_delay_sec=posting_delay_sec)
            # in conversation: follow the reply chain as far as we can
            conversation_root = root_of_conversation(
                tweet['in_reply_to_status_id_str'], tweets)
            # conversation_root may not be in the corpus - it's still a link though
            conv_root_ts = first_tweet_ts_str
            posting_delay_sec = (utils.ts_2_epoch_seconds(first_tweet_ts) -
                                 utils.extract_ts_s(tweet_ts))
            if conversation_root in tweets:
                observed_user_ids.add(tweets[conversation_root]['user']['id_str'])
                conv_root_ts = tweets[conversation_root]['created_at']
                posting_delay_sec = (utils.extract_ts_s(conv_root_ts) -
                                     utils.extract_ts_s(tweet_ts))
            add_edge(g, replier, conversation_root, tweet_id, tweet_ts,
                     'IN_CONVERSATION',
                     original_tweet_id=conversation_root,
                     original_tweet_ts=conv_root_ts,
                     posting_delay_sec=posting_delay_sec)
    return g
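
# Example (hypothetical usage sketch): building the activity graph from a tweet
# map keyed by tweet ID, taking t_0 as the earliest tweet's epoch seconds, then
# saving it as GraphML (nx.write_graphml is used elsewhere in this repo). The
# helper name and output path are illustrative.
def _example_build_and_save(tweets_by_id, out_path='activity.graphml'):
    t_0 = min(utils.extract_ts_s(t['created_at']) for t in tweets_by_id.values())
    g = build_activity_graph(tweets_by_id, t_0)
    nx.write_graphml(g, out_path)
    return g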
        r[key] = row[key]
    users[r['node_id']] = r
    communities[r['community_id']].append(r['node_id'])

tweets = {uid: [] for uid in users}
earliest_ts = sys.maxsize
latest_ts = 0

f = gzip.open(opts.tweets_file, 'rt', encoding='utf-8') if opts.tweets_file[-1] in 'zZ' \
    else open(opts.tweets_file, 'r', encoding='utf-8')
for l in f:
    tweet = json.loads(l.strip())
    tweet['ts'] = utils.extract_ts_s(tweet['created_at'])
    if tweet['ts'] < earliest_ts:
        earliest_ts = tweet['ts']
    if tweet['ts'] > latest_ts:
        latest_ts = tweet['ts']
    user_id = tweet['user']['id_str']
    if user_id in users:
        tweets[user_id].append(tweet)
f.close()

collection_period_mins = (latest_ts - earliest_ts) / 60
user_feature_vectors = {}
for user_id in tweets:
    tweets[user_id].sort(key=lambda t: t['ts'])
    user_feature_vectors[user_id] = build_user_feature_vector(
        user_id, tweets[user_id], collection_period_mins)
log('Found %d accounts, %d offenders in %d tweets' %
    (len(screen_names), len(offenders), tweet_count))

log('Processing offenders')
profiles = {}  # id : { profile: ..., activity: [(ts, name)], changer: boolean }
line_count = 0
tweet_count = 0
in_f.seek(0)  # reset file reading cursor to start of file
for l in in_f:
    line_count = utils.log_row_count(line_count, DEBUG)
    t = json.loads(l)
    uid = t['user']['id_str']
    if uid in offenders:
        tweet_count += 1  # count only the offenders' tweets for the log below
        if uid not in profiles:
            profiles[uid] = {'tweets': [], 'names_at_time': []}
        pdata = profiles[uid]
        t['dcw_ts'] = utils.extract_ts_s(t['created_at'])
        pdata['tweets'].append(t)
        pdata['names_at_time'].append((t['dcw_ts'], t['user']['screen_name']))
log('Extracted %d tweets by %d offenders' % (tweet_count, len(offenders)))

with open(out_file, 'w', encoding='utf-8') as out_f:
    for uid in profiles:
        out_f.write(json.dumps(profiles[uid]))
        out_f.write('\n')

log('\nHaving started at %s,' % STARTING_TIME)
log('now ending at %s' % utils.now_str())
#!/usr/bin/env python3

import csv
import gzip
import sys

import utils

# Trawls through the RU-IRA dataset and counts basic statistics within a
# specified time window.

if __name__ == '__main__':
    fn = sys.argv[1]
    start_ts = utils.extract_ts_s(sys.argv[2], fmt=utils.DCW_TS_FORMAT)
    end_ts = utils.extract_ts_s(sys.argv[3], fmt=utils.DCW_TS_FORMAT)
    try:
        if fn[-1] in 'zZ':
            in_f = gzip.open(fn, 'rt', encoding='utf-8')
        else:
            in_f = open(fn, 'r', encoding='utf-8')
        csv_reader = csv.DictReader(in_f)
        users = set()
        tweets = 0
        rts = 0
        row_count = 0
        for row in csv_reader:
            row_count = utils.log_row_count(row_count, True)
            ts = utils.extract_ts_s(row['tweet_time'], fmt=utils.IRA_TS_FORMAT)
            if ts < start_ts or ts > end_ts:
                continue  # rows may not be in timestamp order
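
# Example invocation (hypothetical script name and paths). The window bounds
# use utils.DCW_TS_FORMAT, which the graph filenames elsewhere in this repo
# (e.g. lcn-20180303_000000-15m.graphml) suggest is YYYYmmdd_HHMMSS:
#
#   ./count_ira_stats.py ira_tweets.csv.gz 20170801_000000 20170831_235959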
curr_windows = []
line_count = 0
for i in range(len(g_files)):
    g_file = g_files[i]
    line_count = utils.log_row_count(line_count, debug=DEBUG)
    curr_windows.append(g_file)
    if i >= num_windows:
        del g_cache[curr_windows.pop(0)]
    in_g_file = os.path.join(in_dir, g_file)
    g_cache[g_file] = nx.read_graphml(in_g_file)
    combined_g = combine(curr_windows, g_cache, alpha, property)
    out_g_file = os.path.join(out_dir, tweak_fn(g_file, alpha_str, num_windows))
    if not dry_run:
        nx.write_graphml(combined_g, out_g_file)
    # filenames look like lcn-20180303_000000-15m.graphml; characters 4-19
    # hold the window's timestamp. TODO make more efficient
    fn_ts = utils.extract_ts_s(g_file[4:19], fmt=utils.DCW_TS_FORMAT)
    net_log_utils.log_g(fn_ts, combined_g, combined_g.graph['post_count'],
                        log_file, dry_run)

log('\nHaving started at %s,' % STARTING_TIME)
log('now ending at %s' % utils.now_str())
def s_to_ts(ts_str):
    return utils.extract_ts_s(ts_str)
log(f'Opening {opts.tweets_file}')
f = gzip.open(opts.tweets_file, 'rt', encoding='utf-8') if opts.tweets_file[-1] in 'zZ' \
    else open(opts.tweets_file, 'r', encoding='utf-8')
csv_reader = csv.DictReader(f, delimiter=',', quotechar='"')
for row in csv_reader:
    line_count = utils.log_row_count(line_count, DEBUG)
    tweet = dict(row)  # copy the row so we can add derived fields
    # IRA CSVs use 'tweet_time' where Twitter JSON uses 'created_at'
    tweet['ts'] = utils.extract_ts_s(tweet['tweet_time'], fmt=utils.IRA_TS_FORMAT)
    min_ts = utils.safe_min(min_ts, tweet['ts'])
    max_ts = utils.safe_max(max_ts, tweet['ts'])
    user_id = str(tweet['userid'])  # IRA CSVs use 'userid', not ['user']['id_str']
    accts[user_id] = update_dummy_profile(
        accts[user_id] if user_id in accts else None, tweet)
f.close()
utils.eprint('')

corpus_duration_s = max_ts - min_ts