def process_json_tweet(t, tweets, retweets):
    u_id = t['user']['id_str']
    urls = utils.expanded_urls_from(t)
    ot = utils.get_ot_from_rt(t)
    is_reply = utils.is_reply(t)
    t_info = {
        't_id': t['id_str'],
        'u_id': u_id,
        'u_sn': t['user']['screen_name'],
        'u_dn': t['user']['name'],
        'u_desc': t['user']['description'],
        't_ts_sec': utils.extract_ts_s(t['created_at']),
        'hashtags': utils.lowered_hashtags_from(t),
        'mentioned_ids': [m['id_str'] for m in utils.mentions_from(t)],
        'urls': urls,
        'domains': [utils.extract_domain(u, lower=True) for u in urls],
        'is_rt': ot is not None,
        'retweeted_t_id': ot['id_str'] if ot else None,
        'retweeted_u_id': ot['user']['id_str'] if ot else None,
        'is_reply': is_reply,
        'replied_to_t_id': t['in_reply_to_status_id_str'] if is_reply else None,
        'replied_to_u_id': t['in_reply_to_user_id_str'] if is_reply else None,
        'text': utils.extract_text(t)
    }
    if u_id not in tweets:
        tweets[u_id] = [t_info]
    else:
        tweets[u_id].append(t_info)
    if t_info['is_rt'] and t_info['retweeted_t_id'] not in retweets:
        retweets[t_info['retweeted_t_id']] = {
            'user_id': t_info['retweeted_u_id'],
            'rt_text': t_info['text']
        }
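
# Example (hypothetical usage sketch, not part of the original module): feeding
# process_json_tweet() a line-oriented JSON (NDJSON) corpus to build the
# per-user tweet map and the retweet map. The helper name and path argument are
# illustrative; process_json_tweet and utils are this repo's own code.
def _example_process_json_corpus(ndjson_path):
    import json
    tweets = {}    # user_id -> [t_info, ...]
    retweets = {}  # retweeted tweet_id -> {'user_id': ..., 'rt_text': ...}
    with open(ndjson_path, 'r', encoding='utf-8') as f:
        for line in f:
            process_json_tweet(json.loads(line), tweets, retweets)
    return tweets, retweets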
def write_rows_from_tweet(csv_f, t, topic, excl_rts):
    global REPLY_COUNT  # declare that we want to change REPLY_COUNT
    try:
        ts = utils.extract_ts_s(t['created_at'])
    except TypeError:
        # there's a chance 'created_at' is epoch milliseconds (e.g. from twarc)
        ts = int(int(t['created_at']) / 1000)
    t_id = t['id_str']
    source = utils.get_uid(t)
    if topic in ['RETWEET', 'RETWEETS', 'RT', 'RTS'] and utils.is_rt(t):
        ot = utils.get_ot_from_rt(t)
        target = utils.get_uid(ot)
        rt_id = t_id
        ot_id = ot['id_str']
        csv_f.writerow([ts, source, target, 'RETWEET', rt_id, ot_id])
    elif topic in ['QUOTE', 'QUOTES'] and not utils.is_rt(t) and utils.is_qt(t):
        ot = t['quoted_status']
        target = utils.get_uid(ot)
        qt_id = t_id
        ot_id = ot['id_str']
        csv_f.writerow([ts, source, target, 'QUOTE', qt_id, ot_id])
    elif topic in ['REPLY', 'REPLIES']:
        target = t['in_reply_to_user_id_str']
        ot_id = t['in_reply_to_status_id_str']
        if target and ot_id:
            csv_f.writerow([ts, source, target, 'REPLY', t_id, ot_id])
    elif topic in ['HASHTAG', 'HASHTAGS', 'ALL_HASHTAGS']:
        hashtags = utils.lowered_hashtags_from(t, include_retweet=True)
        if is_empty(hashtags):
            return
        if topic == 'ALL_HASHTAGS':
            csv_f.writerow([ts, source, ' '.join(hashtags), 'ALL_HASHTAGS', t_id])
        else:
            for ht in hashtags:
                csv_f.writerow([ts, source, ht, 'HASHTAG', t_id])
    elif topic in ['URL', 'URLS', 'POST_URL', 'POST_URLS', 'ALL_URLS',
                   'ALL_POST_URLS', 'DOMAIN', 'DOMAINS', 'ALL_DOMAINS']:
        for url in set(utils.expanded_urls_from(t, include_retweet=True)):
            write_url_row(csv_f, topic, url, ts, source, t_id)
    elif topic in ['MENTION', 'MENTIONS', 'ALL_MENTIONS']:
        if excl_rts and utils.is_rt(t):
            return
        mention_objs = utils.mentions_from(t, include_retweet=True)
        if is_empty(mention_objs):
            return
        if topic == 'ALL_MENTIONS':
            mentioned_ids_str = ' '.join([m['id_str'] for m in mention_objs])
            mentioned_sns_str = ' '.join([m['screen_name'] for m in mention_objs])
            csv_f.writerow([ts, source, mentioned_ids_str, 'ALL_MENTIONS', t_id, mentioned_sns_str])
        else:
            for m in mention_objs:
                csv_f.writerow([ts, source, m['id_str'], 'MENTION', t_id, m['screen_name']])
    elif topic in ['TIMESTAMP', 'TIMESTAMPS', 'TS']:
        csv_f.writerow([ts, source, t_id])
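
# Example (hypothetical usage sketch): streaming an NDJSON corpus into a CSV of
# hashtag-use rows via write_rows_from_tweet(). The paths and header column
# names are illustrative; csv and json are standard library, and
# write_rows_from_tweet is defined above.
def _example_write_hashtag_rows(ndjson_path, csv_path):
    import csv
    import json
    with open(ndjson_path, 'r', encoding='utf-8') as in_f, \
         open(csv_path, 'w', encoding='utf-8', newline='') as out_f:
        writer = csv.writer(out_f)
        writer.writerow(['timestamp', 'source', 'target', 'interaction', 'tweet_id'])
        for line in in_f:
            write_rows_from_tweet(writer, json.loads(line), 'HASHTAGS', excl_rts=False)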
def add_edge(g, from_id, to_id, tweet_id, ts_str, int_type, **kwargs):
    add_node(g, from_id, 'USER', True)
    add_node(g, to_id, n_type=node_type_for(int_type))
    t = utils.extract_ts_s(ts_str) - t_0  # seconds since the corpus start, t_0
    attrs = {'time_t': t, 'tweet_id': tweet_id, 'interaction': int_type}
    key = '%s %s %s in %s' % (from_id, int_type, to_id, tweet_id)
    g.add_edge(from_id, to_id, key=key, **{**attrs, **kwargs})
def write_rows_from_ira_row(csv_f, r, topic, excl_rts):
    ts = utils.extract_ts_s(r['tweet_time'], fmt=utils.IRA_TS_FORMAT)
    t_id = r['tweetid']
    source = r['userid']
    if topic in ['RETWEET', 'RETWEETS', 'RT', 'RTS'] and r['is_retweet'] == 'true':
        target = r['retweet_userid']
        rt_id = r['tweetid']
        ot_id = r['retweet_tweetid']
        csv_f.writerow([ts, source, target, 'RETWEET', rt_id, ot_id])
    elif topic in ['QUOTE', 'QUOTES'] and r['quoted_tweet_tweetid']:
        # the field holds a tweet ID, so test for presence rather than == 'true'
        print('QUOTE is unsupported for IRA datasets')
        sys.exit()
    elif topic in ['REPLY', 'REPLIES']:
        target = r['in_reply_to_userid']
        ot_id = r['in_reply_to_tweetid']
        if target and ot_id:
            csv_f.writerow([ts, source, target, 'REPLY', t_id, ot_id])
    elif topic in ['HASHTAG', 'HASHTAGS', 'ALL_HASHTAGS']:
        hashtags = parse_ira_hashtags(r['hashtags'])
        if is_empty(hashtags):
            return
        if topic == 'ALL_HASHTAGS':
            # label matches write_rows_from_tweet's ALL_HASHTAGS rows
            csv_f.writerow([ts, source, ' '.join(hashtags), 'ALL_HASHTAGS', t_id])
        else:
            for ht in hashtags:
                csv_f.writerow([ts, source, ht, 'HASHTAG', t_id])
    elif topic in ['URL', 'URLS', 'POST_URL', 'POST_URLS', 'ALL_URLS',
                   'ALL_POST_URLS', 'DOMAIN', 'DOMAINS', 'ALL_DOMAINS']:
        # some of the URLs in the RU-IRA dataset are a bit wonky
        urls = set(parse_ira_urls(r['urls']))
        if all(check_url(u) for u in urls):
            for url in urls:
                write_url_row(csv_f, topic, url, ts, source, t_id)
        else:
            # one URL was split on spurious commas: stitch it back together
            write_url_row(csv_f, topic, ''.join(urls), ts, source, t_id)
    elif topic in ['MENTION', 'MENTIONS', 'ALL_MENTIONS']:
        if excl_rts and r['is_retweet'] == 'true':
            return
        mention_ids = parse_ira_mentions(r['user_mentions'])
        if is_empty(mention_ids):
            return
        if topic == 'ALL_MENTIONS':
            m_ids_str = ' '.join(mention_ids)
            # label matches write_rows_from_tweet's ALL_MENTIONS rows
            csv_f.writerow([ts, source, m_ids_str, 'ALL_MENTIONS', t_id, m_ids_str])
        else:
            for m_id in mention_ids:
                csv_f.writerow([ts, source, m_id, 'MENTION', t_id, m_id])
    elif topic in ['TIMESTAMP', 'TIMESTAMPS', 'TS']:
        csv_f.writerow([ts, source, t_id])
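
# Example (hypothetical usage sketch): driving write_rows_from_ira_row() from
# an RU-IRA CSV with csv.DictReader. The column names the function reads
# (tweet_time, tweetid, userid, ...) come from the IRA data release; the paths
# and helper name here are illustrative.
def _example_ira_retweet_rows(ira_csv_path, out_csv_path):
    import csv
    with open(ira_csv_path, 'r', encoding='utf-8') as in_f, \
         open(out_csv_path, 'w', encoding='utf-8', newline='') as out_f:
        writer = csv.writer(out_f)
        for row in csv.DictReader(in_f):
            write_rows_from_ira_row(writer, row, 'RETWEETS', excl_rts=False)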
def parse_ira_tweet(t, tweets):
    REPLY_KEY = 'in_reply_to_tweetid'
    is_a_reply = REPLY_KEY in t and t[REPLY_KEY]
    t_id = t['tweetid']
    tweets[t_id] = {
        'timestamp': utils.extract_ts_s(t['tweet_time'], fmt=utils.IRA_TS_FORMAT),
        'reply_id': t_id,
        'source': t['userid'],
        'interaction': 'IN_CONV',
        'target': None,
        'ot_id': None,
        'in_reply_to_t_id': t[REPLY_KEY] if is_a_reply else None,
        'in_reply_to_u_id': t['in_reply_to_userid'] if is_a_reply else None
    }
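
# Example (hypothetical illustration): the maps built by parse_ira_tweet() and
# parse_tweet_obj() record 'in_reply_to_t_id' links, so a reply chain can be
# walked back as far as the corpus allows. This helper is a sketch, not part
# of the original code.
def _example_reply_chain(tweets, t_id):
    chain = [t_id]
    # follow the parent links while the parent tweet is present in the map
    while tweets[chain[-1]]['in_reply_to_t_id'] in tweets:
        chain.append(tweets[chain[-1]]['in_reply_to_t_id'])
    return chain  # [t_id, parent, grandparent, ...]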
def parse_tweet_obj(t, tweets):
    REPLY_KEY = 'in_reply_to_status_id_str'
    is_a_reply = REPLY_KEY in t and t[REPLY_KEY]
    t_id = t['id_str']
    tweets[t_id] = {
        'timestamp': utils.extract_ts_s(t['created_at']),
        'reply_id': t_id,
        'source': utils.get_uid(t),
        'interaction': 'IN_CONV',
        'target': None,
        'ot_id': None,
        'in_reply_to_t_id': t[REPLY_KEY] if is_a_reply else None,
        'in_reply_to_u_id': t['in_reply_to_user_id_str'] if is_a_reply else None
    }
def parse_tweet_obj(t, tweets):
    # variant of parse_tweet_obj that also keeps screen names and the raw tweet
    REPLY_KEY = 'in_reply_to_status_id_str'
    is_a_reply = REPLY_KEY in t and t[REPLY_KEY]
    t_id = t['id_str']
    tweets[t_id] = {
        'timestamp': utils.extract_ts_s(t['created_at']),
        'reply_tid': t_id,
        'source_uid': utils.get_uid(t),
        'source_sn': t['user']['screen_name'],
        'interaction': 'IN_CONV',
        'target_tid': None,
        'target_uid': None,
        'orig_tid': None,
        'in_reply_to_tid': t[REPLY_KEY] if is_a_reply else None,
        'in_reply_to_uid': t['in_reply_to_user_id_str'] if is_a_reply else None,
        'raw': t,
        'type': 'TWITTER'
    }
def process_ira_tweet(t, tweets, retweets):
    u_id = t['userid']
    urls = utils.parse_ira_urls(t['urls'])
    try:
        domains = [utils.extract_domain(u, lower=True) for u in urls]
    except ValueError:
        # assume some junk in the 'urls' field, so treat it as one URL
        domains = [utils.extract_domain(t['urls'], lower=True)]
    is_rt = t['is_retweet'] == 'true'
    is_reply = t['in_reply_to_tweetid'] != ''
    t_info = {
        't_id': t['tweetid'],
        'u_id': u_id,
        'u_sn': t['user_screen_name'],
        'u_dn': t['user_display_name'],
        'u_desc': t['user_profile_description'],
        't_ts_sec': utils.extract_ts_s(t['tweet_time'], fmt=utils.IRA_TS_FORMAT),
        'hashtags': utils.parse_ira_hashtags(t['hashtags']),
        'mentioned_ids': utils.parse_ira_mentions(t['user_mentions']),
        'urls': urls,
        'domains': domains,
        'is_rt': is_rt,
        'retweeted_t_id': t['retweet_tweetid'] if is_rt else None,
        'retweeted_u_id': t['retweet_userid'] if is_rt else None,
        'is_reply': is_reply,
        'replied_to_t_id': t['in_reply_to_tweetid'] if is_reply else None,
        'replied_to_u_id': t['in_reply_to_userid'] if is_reply else None,
        'text': t['tweet_text']
    }
    if u_id not in tweets:
        tweets[u_id] = [t_info]
    else:
        tweets[u_id].append(t_info)
    if t_info['is_rt'] and t_info['retweeted_t_id'] not in retweets:
        retweets[t_info['retweeted_t_id']] = {
            'user_id': t_info['retweeted_u_id'],
            'rt_text': t_info['text']
        }
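
# Example (hypothetical usage sketch): process_ira_tweet() and
# process_json_tweet() emit the same t_info schema, so one loader can populate
# a single pair of maps from either source. The 'is_ira' flag and path are
# illustrative.
def _example_load_corpus(path, is_ira):
    import csv
    import json
    tweets, retweets = {}, {}
    with open(path, 'r', encoding='utf-8') as f:
        if is_ira:
            for row in csv.DictReader(f):
                process_ira_tweet(row, tweets, retweets)
        else:
            for line in f:
                process_json_tweet(json.loads(line), tweets, retweets)
    return tweets, retweets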
def build_activity_graph(tweets, t_0):
    # tweets is a tweet map { tweet_id : tweet }
    first_tweet_ts_str = utils.ts_to_str(t_0, fmt=utils.TWITTER_TS_FORMAT)
    first_tweet_ts = utils.epoch_seconds_2_ts(t_0)
    g = nx.MultiDiGraph(post_count=len(tweets))

    def add_node(g, n_id, n_type='USER', is_author=False):
        if n_id not in g:
            g.add_node(n_id, n_type=n_type, label=n_id, is_author=is_author)
        elif is_author:
            g.nodes[n_id]['is_author'] = is_author

    def node_type_for(interaction):
        if interaction == 'HASHTAG' or interaction == 'URL':
            return interaction
        else:
            return 'USER'

    def add_edge(g, from_id, to_id, tweet_id, ts_str, int_type, **kwargs):
        add_node(g, from_id, 'USER', True)
        add_node(g, to_id, n_type=node_type_for(int_type))
        t = utils.extract_ts_s(ts_str) - t_0  # seconds since the corpus start
        attrs = {'time_t': t, 'tweet_id': tweet_id, 'interaction': int_type}
        key = '%s %s %s in %s' % (from_id, int_type, to_id, tweet_id)
        g.add_edge(from_id, to_id, key=key, **{**attrs, **kwargs})

    # Build networks
    # edge types: REPOST, MENTION, REPLY, QUOTE, URL, HASHTAG
    observed_user_ids = set()
    for tweet_id in tweets:
        tweet = tweets[tweet_id]
        hashtags = lowered_hashtags_from(tweet)
        urls = expanded_urls_from(tweet)
        mentions = mentioned_ids_from(tweet)
        tweet_text = extract_text(tweet)
        tweet_ts = tweet['created_at']
        tweet_id = tweet['id_str']
        tweeter_id = tweet['user']['id_str']
        observed_user_ids.add(tweeter_id)
        for ht in hashtags:
            add_edge(g, tweeter_id, ht, tweet_id, tweet_ts, 'HASHTAG')
        for url in urls:
            # extended tweets include a URL to their own extended form
            if not embedded_extended_tweet_url(tweet_id, url):
                add_edge(g, tweeter_id, url, tweet_id, tweet_ts, 'URL')
        for mentioned_id in mentions:
            observed_user_ids.add(mentioned_id)
            add_edge(g, tweeter_id, mentioned_id, tweet_id, tweet_ts, 'MENTION')
        if 'retweeted_status' in tweet:
            retweeter = tweeter_id
            retweetee = tweet['retweeted_status']['user']['id_str']
            observed_user_ids.add(retweetee)
            add_edge(
                g, retweeter, retweetee, tweet_id, tweet_ts, 'REPOST',
                original_tweet_id=tweet['retweeted_status']['id_str'],
                original_tweet_ts=tweet['retweeted_status']['created_at'],
                posting_delay_sec=(
                    utils.extract_ts_s(tweet['retweeted_status']['created_at']) -
                    utils.extract_ts_s(tweet_ts)
                )
            )
        elif 'quoted_status' in tweet and 'retweeted_status' not in tweet:
            quoter = tweeter_id
            quotee = tweet['quoted_status']['user']['id_str']
            observed_user_ids.add(quotee)
            add_edge(
                g, quoter, quotee, tweet_id, tweet_ts, 'QUOTE',
                original_tweet_id=tweet['quoted_status']['id_str'],
                original_tweet_ts=tweet['quoted_status']['created_at'],
                posting_delay_sec=(
                    utils.extract_ts_s(tweet['quoted_status']['created_at']) -
                    utils.extract_ts_s(tweet_ts)
                )
            )
        elif ('in_reply_to_status_id_str' in tweet and
              tweet['in_reply_to_status_id_str'] in tweets):
            # only consider replies that appear in the corpus
            # basic reply info
            replier = tweeter_id
            replied_to = tweet['in_reply_to_user_id_str']
            observed_user_ids.add(replied_to)
            replied_to_status = tweets[tweet['in_reply_to_status_id_str']]
            replied_to_status_ts = replied_to_status['created_at']
            posting_delay_sec = (utils.extract_ts_s(replied_to_status_ts) -
                                 utils.extract_ts_s(tweet_ts))
            add_edge(g, replier, replied_to, tweet_id, tweet_ts, 'REPLY',
                     original_tweet_id=tweet['in_reply_to_status_id_str'],
                     original_tweet_ts=replied_to_status_ts,
                     posting_delay_sec=posting_delay_sec)
            # in conversation: follow the reply chain as far as we can
            conversation_root = root_of_conversation(
                tweet['in_reply_to_status_id_str'], tweets)
            # conversation_root may not be in the corpus - it's still a link though
            conv_root_ts = first_tweet_ts_str
            posting_delay_sec = (utils.ts_2_epoch_seconds(first_tweet_ts) -
                                 utils.extract_ts_s(tweet_ts))
            if conversation_root in tweets:
                observed_user_ids.add(tweets[conversation_root]['user']['id_str'])
                conv_root_ts = tweets[conversation_root]['created_at']
                posting_delay_sec = (utils.extract_ts_s(conv_root_ts) -
                                     utils.extract_ts_s(tweet_ts))
            add_edge(g, replier, conversation_root, tweet_id, tweet_ts,
                     'IN_CONVERSATION',
                     original_tweet_id=conversation_root,
                     original_tweet_ts=conv_root_ts,
                     posting_delay_sec=posting_delay_sec)
    return g
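
# Example (hypothetical usage sketch): building the activity graph from a tweet
# map keyed by tweet ID, taking t_0 as the earliest tweet's epoch seconds, then
# saving it as GraphML (nx.write_graphml is used elsewhere in this repo). The
# helper name and output path are illustrative.
def _example_build_and_save(tweets_by_id, out_path='activity.graphml'):
    t_0 = min(utils.extract_ts_s(t['created_at']) for t in tweets_by_id.values())
    g = build_activity_graph(tweets_by_id, t_0)
    nx.write_graphml(g, out_path)
    return g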
        r[key] = row[key]
    users[r['node_id']] = r
    communities[r['community_id']].append(r['node_id'])

tweets = {uid: [] for uid in users}
earliest_ts = sys.maxsize
latest_ts = 0

f = gzip.open(opts.tweets_file, 'rt', encoding='utf-8') if opts.tweets_file[-1] in 'zZ' \
    else open(opts.tweets_file, 'r', encoding='utf-8')
for l in f:
    tweet = json.loads(l.strip())
    tweet['ts'] = utils.extract_ts_s(tweet['created_at'])
    if tweet['ts'] < earliest_ts:
        earliest_ts = tweet['ts']
    if tweet['ts'] > latest_ts:
        latest_ts = tweet['ts']
    user_id = tweet['user']['id_str']
    if user_id in users:
        tweets[user_id].append(tweet)
f.close()

collection_period_mins = (latest_ts - earliest_ts) / 60
user_feature_vectors = {}
for user_id in tweets:
    tweets[user_id].sort(key=lambda t: t['ts'])
    user_feature_vectors[user_id] = build_user_feature_vector(
        user_id, tweets[user_id], collection_period_mins)
log('Found %d accounts, %d offenders in %d tweets' %
    (len(screen_names), len(offenders), tweet_count))

log('Processing offenders')
profiles = {}  # id : { profile: ..., activity: [(ts, name)], changer: boolean }
line_count = 0
tweet_count = 0
in_f.seek(0)  # reset file reading cursor to start of file
for l in in_f:
    line_count = utils.log_row_count(line_count, DEBUG)
    t = json.loads(l)
    uid = t['user']['id_str']
    if uid in offenders:
        tweet_count += 1  # count only the offenders' tweets for the log below
        if uid not in profiles:
            profiles[uid] = {'tweets': [], 'names_at_time': []}
        pdata = profiles[uid]
        t['dcw_ts'] = utils.extract_ts_s(t['created_at'])
        pdata['tweets'].append(t)
        pdata['names_at_time'].append((t['dcw_ts'], t['user']['screen_name']))
log('Extracted %d tweets by %d offenders' % (tweet_count, len(offenders)))

with open(out_file, 'w', encoding='utf-8') as out_f:
    for uid in profiles:
        out_f.write(json.dumps(profiles[uid]))
        out_f.write('\n')

log('\nHaving started at %s,' % STARTING_TIME)
log('now ending at %s' % utils.now_str())
#!/usr/bin/env python3

import csv
import gzip
import sys

import utils

# Trawls through the RU-IRA dataset and counts basic statistics within a
# specified time window.

if __name__ == '__main__':
    fn = sys.argv[1]
    start_ts = utils.extract_ts_s(sys.argv[2], fmt=utils.DCW_TS_FORMAT)
    end_ts = utils.extract_ts_s(sys.argv[3], fmt=utils.DCW_TS_FORMAT)
    try:
        if fn[-1] in 'zZ':
            in_f = gzip.open(fn, 'rt', encoding='utf-8')
        else:
            in_f = open(fn, 'r', encoding='utf-8')
        csv_reader = csv.DictReader(in_f)
        users = set()
        tweets = 0
        rts = 0
        row_count = 0
        for row in csv_reader:
            row_count = utils.log_row_count(row_count, True)
            ts = utils.extract_ts_s(row['tweet_time'], fmt=utils.IRA_TS_FORMAT)
            if ts < start_ts or ts > end_ts:
                continue  # rows may not be in timestamp order
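
# Example invocation (hypothetical script name and paths). The window bounds
# use utils.DCW_TS_FORMAT, which the graph filenames elsewhere in this repo
# (e.g. lcn-20180303_000000-15m.graphml) suggest is YYYYmmdd_HHMMSS:
#
#   ./count_ira_stats.py ira_tweets.csv.gz 20170801_000000 20170831_235959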
curr_windows = []
line_count = 0
for i in range(len(g_files)):
    g_file = g_files[i]
    line_count = utils.log_row_count(line_count, debug=DEBUG)
    curr_windows.append(g_file)
    if i >= num_windows:
        del g_cache[curr_windows.pop(0)]
    in_g_file = os.path.join(in_dir, g_file)
    g_cache[g_file] = nx.read_graphml(in_g_file)
    combined_g = combine(curr_windows, g_cache, alpha, property)
    out_g_file = os.path.join(out_dir, tweak_fn(g_file, alpha_str, num_windows))
    if not dry_run:
        nx.write_graphml(combined_g, out_g_file)
    # filenames look like lcn-20180303_000000-15m.graphml; characters 4-19
    # hold the window's timestamp. TODO make more efficient
    fn_ts = utils.extract_ts_s(g_file[4:19], fmt=utils.DCW_TS_FORMAT)
    net_log_utils.log_g(fn_ts, combined_g, combined_g.graph['post_count'],
                        log_file, dry_run)

log('\nHaving started at %s,' % STARTING_TIME)
log('now ending at %s' % utils.now_str())
def s_to_ts(ts_str):
    return utils.extract_ts_s(ts_str)
log(f'Opening {opts.tweets_file}')
f = gzip.open(opts.tweets_file, 'rt', encoding='utf-8') if opts.tweets_file[-1] in 'zZ' \
    else open(opts.tweets_file, 'r', encoding='utf-8')
csv_reader = csv.DictReader(f, delimiter=',', quotechar='"')
for row in csv_reader:
    line_count = utils.log_row_count(line_count, DEBUG)
    tweet = dict(row)  # copy the row so we can add derived fields
    # IRA CSVs use 'tweet_time' where Twitter JSON uses 'created_at'
    tweet['ts'] = utils.extract_ts_s(tweet['tweet_time'], fmt=utils.IRA_TS_FORMAT)
    min_ts = utils.safe_min(min_ts, tweet['ts'])
    max_ts = utils.safe_max(max_ts, tweet['ts'])
    user_id = str(tweet['userid'])  # IRA CSVs use 'userid', not ['user']['id_str']
    accts[user_id] = update_dummy_profile(
        accts[user_id] if user_id in accts else None, tweet)
f.close()
utils.eprint('')

corpus_duration_s = max_ts - min_ts