STARTING_TIME = utils.now_str()
log('Starting at %s\n' % STARTING_TIME)

log('reading graph %s' % g_file)
in_g = nx.read_graphml(g_file)

t_file = opts.tweets_file
log('reading tweets %s' % t_file)
# gzipped input is detected by filename suffix ('z'/'Z')
in_f = (gzip.open(t_file, 'rt', encoding='utf-8')
        if t_file[-1] in 'zZ'
        else open(t_file, 'r', encoding='utf-8'))

post_counts = {}  # acct_id : number of tweets posted
retweeted = {}    # ot_id : [rting_acct_ids]
line_count = 0
for l in in_f:
    line_count = utils.log_row_count(line_count, DEBUG)
    t = json.loads(l)
    uid = t['user']['id_str']
    if uid not in post_counts:
        post_counts[uid] = 0.0
    post_counts[uid] += 1.0
    if utils.is_rt(t):
        # map each original tweet to the accounts that retweeted it
        ot_id = utils.get_ot_from_rt(t)['id_str']
        if ot_id not in retweeted:
            retweeted[ot_id] = [uid]
        else:
            retweeted[ot_id].append(uid)
if DEBUG: utils.eprint()  # terminate the progress-dot line
in_f.close()

ids = set(map(str, post_counts.keys()))
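# utils.is_rt and utils.get_ot_from_rt are not shown in this listing. A
# minimal sketch of what they are assumed to do, based on the standard
# Twitter JSON payload, where a retweet embeds the original tweet in its
# 'retweeted_status' field (these bodies are assumptions, not the actual
# utils code):
def is_rt(t):
    """True if tweet object t is a retweet."""
    return 'retweeted_status' in t

def get_ot_from_rt(t):
    """Return the original tweet embedded in retweet t."""
    return t['retweeted_status']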
window_s = utils.parse_window_cli_arg(opts.window_str)

STARTING_TIME = utils.now_str()
log('Starting at %s\n' % STARTING_TIME)

if tweets_file[-1] in 'zZ':
    in_f = gzip.open(tweets_file, 'rt', encoding='utf-8')
else:
    in_f = open(tweets_file, 'r', encoding='utf-8')

line_count = 0
tweets = {}  # tweet ID : { ts, tweet ID, conv_root ID, replier ID, conv_root author ID }
if ira:
    csv_reader = csv.DictReader(in_f)
    for row in csv_reader:
        line_count = utils.log_row_count(line_count, DEBUG)
        parse_ira_tweet(row, tweets)
else:
    for l in in_f:
        line_count = utils.log_row_count(line_count, DEBUG)
        t = json.loads(l)
        parse_tweet_obj(t, tweets)

post_counts = {}
for t in tweets.values():
    if t['source_uid'] not in post_counts:
        post_counts[t['source_uid']] = 0.0
    post_counts[t['source_uid']] += 1.0

log('')
log('Found %d replies' % len(tweets))
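# parse_tweet_obj (and its CSV twin parse_ira_tweet) are defined elsewhere;
# this sketch shows the record shape the loops above rely on, assuming the
# standard Twitter reply fields. Only the 'source_uid' key is confirmed by
# the code above; the other key names and the field mapping are illustrative:
def parse_tweet_obj(t, tweets):
    if t.get('in_reply_to_status_id_str'):  # keep replies only
        tweets[t['id_str']] = {
            'ts': t['created_at'],
            'tweet_id': t['id_str'],
            'conv_root_id': t['in_reply_to_status_id_str'],  # conversation root
            'source_uid': t['user']['id_str'],               # the replier
            'conv_root_uid': t['in_reply_to_user_id_str'],   # root's author
        }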
fn = sys.argv[1]
start_ts = utils.extract_ts_s(sys.argv[2], fmt=utils.DCW_TS_FORMAT)
end_ts = utils.extract_ts_s(sys.argv[3], fmt=utils.DCW_TS_FORMAT)

# open before the try block so the finally-close below can't hit an
# unbound in_f if the open itself fails
if fn[-1] in 'zZ':
    in_f = gzip.open(fn, 'rt', encoding='utf-8')
else:
    in_f = open(fn, 'r', encoding='utf-8')
try:
    csv_reader = csv.DictReader(in_f)
    users = set()
    tweets = 0
    rts = 0
    row_count = 0
    for row in csv_reader:
        row_count = utils.log_row_count(row_count, True)
        ts = utils.extract_ts_s(row['tweet_time'], fmt=utils.IRA_TS_FORMAT)
        if ts < start_ts or ts > end_ts:
            continue  # rows may not be in timestamp order, so keep scanning
        tweets += 1
        users.add(row['userid'])
        if row['is_retweet'].lower() == 'true':
            rts += 1
    print('\nTweets:   %10d' % tweets)
    print('Retweets: %10d' % rts)
    print('Accounts: %10d' % len(users))
finally:
    in_f.close()
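# utils.extract_ts_s converts a timestamp string to epoch seconds; a
# plausible implementation assuming the fmt constants are strptime patterns.
# The IRA_TS_FORMAT value below matches the 'tweet_time' column of Twitter's
# election-integrity CSVs (e.g. '2016-12-13 13:55'); both the constant and
# the function body are assumptions:
from datetime import datetime, timezone

IRA_TS_FORMAT = '%Y-%m-%d %H:%M'

def extract_ts_s(ts_str, fmt=IRA_TS_FORMAT):
    dt = datetime.strptime(ts_str, fmt).replace(tzinfo=timezone.utc)
    return int(dt.timestamp())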
STARTING_TIME = utils.now_str()
log('Starting at %s' % STARTING_TIME)

in_dir = opts.lcn_dir
out_f = opts.out_file

g_files = list(filter(lambda f: f.endswith('graphml'), os.listdir(in_dir)))

uber_g = nx.Graph(post_count=0.0)
line_count = 0
known_reason_types = set()
for g_file in g_files:
    g = nx.read_graphml(os.path.join(in_dir, g_file))
    line_count = utils.log_row_count(line_count, debug=DEBUG, lines_per_dot=10, lines_per_nl=50)
    try:
        uber_g.graph['post_count'] += float(g.graph['post_count'])
    except KeyError as e:
        print(g_file)  # which file caused the issue?
        raise e
    for n, d in g.nodes(data=True):
        if n not in uber_g:
            new_d = dict(d)  # shallow copy of the node's attributes
            new_d['post_count'] = float(new_d['post_count'])
            uber_g.add_node(n, **new_d)
        else:
            uber_g.nodes[n]['post_count'] += float(d['post_count'])
    for u, v, d in g.edges(data=True):
        # the listing is truncated here; a minimal merge consistent with the
        # node handling above, assuming edge weights simply accumulate:
        if uber_g.has_edge(u, v):
            uber_g[u][v]['weight'] += float(d.get('weight', 1.0))
        else:
            uber_g.add_edge(u, v, **d)
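# utils.log_row_count is the progress logger used by all of these scripts; a
# minimal sketch consistent with how it is called: it returns the incremented
# count and, when debug is on, prints a dot every lines_per_dot rows and the
# running count plus a newline every lines_per_nl rows. Only the signature is
# taken from the calls above; the body and defaults are assumptions:
import sys

def log_row_count(count, debug=False, lines_per_dot=50, lines_per_nl=2500):
    count += 1
    if debug:
        if count % lines_per_nl == 0:
            sys.stderr.write(' %d\n' % count)
        elif count % lines_per_dot == 0:
            sys.stderr.write('.')
            sys.stderr.flush()
    return count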
alpha_str = f_to_s(alpha)

if not dry_run:
    # https://stackoverflow.com/a/273227
    Path(in_dir).mkdir(parents=True, exist_ok=True)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

log_file = net_log_utils.open_log_file(out_dir)

g_files = list(filter(lambda f: f.endswith('graphml'), os.listdir(in_dir)))

g_cache = {}  # sliding window of loaded graphs, keyed by filename
curr_windows = []
line_count = 0
for i in range(len(g_files)):
    g_file = g_files[i]
    line_count = utils.log_row_count(line_count, debug=DEBUG)
    curr_windows.append(g_file)
    if i >= num_windows:
        del g_cache[curr_windows.pop(0)]  # evict the oldest window
    in_g_file = os.path.join(in_dir, g_file)
    # log('Loading %s' % in_g_file)
    g_cache[g_file] = nx.read_graphml(in_g_file)
    combined_g = combine(curr_windows, g_cache, alpha, property)
    out_g_file = os.path.join(out_dir, tweak_fn(g_file, alpha_str, num_windows))
    # log('Writing %s' % out_g_file)
    if not dry_run:
        # the listing is truncated here; presumably the combined graph is
        # written to the path just constructed:
        nx.write_graphml(combined_g, out_g_file)
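# combine() and tweak_fn() are defined elsewhere. combine merges the graphs
# currently in the window into one, presumably down-weighting older windows
# by the decay factor alpha; the exponential weighting below is an assumption
# about its behaviour -- only the signature comes from the call above:
import networkx as nx

def combine(curr_windows, g_cache, alpha, prop):
    combined = nx.Graph()
    # the newest window (last in the list) gets full weight
    for age, g_file in enumerate(reversed(curr_windows)):
        w = (1.0 - alpha) ** age
        for u, v, d in g_cache[g_file].edges(data=True):
            contrib = w * float(d.get(prop, 1.0))
            if combined.has_edge(u, v):
                combined[u][v][prop] += contrib
            else:
                combined.add_edge(u, v, **{prop: contrib})
    return combined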