def top_retweeted_users():
    """Sum, per non-bot user, the ``child`` counts of that user's tweets.

    Reads every JSON file in ``RetweetNew/`` (each file maps an id to a tweet
    record carrying ``user``, ``origin_tweet``, ``cascade`` and ``child``),
    prints corpus-level totals and writes the per-user totals to
    ``Data/top_retweeted_users`` as JSON.

    NOTE(review): the source had lost its indentation. ``tweet_num`` and
    ``all_retweet_num`` are assumed to be updated for non-bot tweets only
    (i.e. inside the bot check) -- confirm against the original layout.
    """
    Bot = bot.load_bot()
    dir_name = "RetweetNew/"

    tweet_num = 0
    all_retweet_num = 0
    users = {}        # user id -> summed 'child' value over the user's tweets
    cascade = {}      # every origin tweet id seen (used as a set)
    one_cascade = {}  # origin ids whose record reports cascade == 1

    for postid in os.listdir(dir_name):
        # Load first, then release the file handle before doing the counting.
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)

        for tweet in tweets.values():
            user = tweet['user']
            origin = tweet['origin_tweet']
            cascade[origin] = 1
            if tweet['cascade'] == 1:
                one_cascade[origin] = 1
            if bot.check_bot(Bot, user) == 0:
                users[user] = users.get(user, 0) + tweet['child']
                tweet_num += 1
                all_retweet_num += tweet['child']

    print('all users ', len(users))
    print('all tweets ', tweet_num)
    print('all cascades ', len(cascade))
    print('one cascade ', len(one_cascade))
    print('all retweet num ', all_retweet_num)

    with open('Data/top_retweeted_users', 'w') as f:
        json.dump(users, f)
def top_participated_users(users):
    """Pick the most active non-bot users, ranked by participation count.

    Users are ranked by ``len(user_participation[user])`` in descending order.
    Ranks are positions in the full ranking (bots included); bots are skipped
    when filling the result lists.

    NOTE(review): ``user_participation`` is not defined in this function and
    the ``users`` parameter is never read. Presumably ``user_participation``
    is a module-level mapping of user id -> collection of post ids, or the
    parameter was meant to carry it -- confirm with the caller.
    NOTE(review): the source had lost its indentation; the ``top_100`` check is
    assumed to sit inside the non-bot branch like the other two lists.

    Returns:
        tuple: ``(top_100, top_0_1, top_1)`` -- non-bot users with rank below
        100, 200 and 2000 respectively. (Previously these lists were built and
        then discarded; callers that ignore the return value are unaffected.)
    """
    Bot = bot.load_bot()

    ranked = sorted(
        user_participation,
        key=lambda k: len(user_participation[k]),
        reverse=True)

    top_100 = []
    top_0_1 = []
    top_1 = []
    for rank, user in enumerate(ranked):
        if bot.check_bot(Bot, user) != 0:
            continue
        if rank < 200:
            top_0_1.append(user)
        if rank < 2000:
            top_1.append(user)
        else:
            # First non-bot user past the cut-off: report its participation
            # count and stop scanning.
            print('top 1% ', len(user_participation[user]))
            break
        if rank < 100:
            top_100.append(user)

    return top_100, top_0_1, top_1
def political_alignment_pearson():
    """Correlate user political scores with the polarity of their content.

    For every non-bot user found in the ``RetweetNew`` files, pairs the score
    returned by ``get_polarity(user)`` (rounded to 4 decimals) with the value
    stored for that user in ``Data/user_content_polarity.json``. Users are
    split into echo-chamber and non-echo-chamber groups according to the post
    in which they are first encountered. Prints the Pearson correlation for
    each group and writes the four score maps to
    ``Data/user_polarity_content_polarity.json``.
    """
    with open('Data/user_content_polarity.json', 'r') as f:
        content_polarity = json.load(f)

    echo_chamber_users = e_util.get_echo_chamber_users('Data/echo_chamber2.json')
    files = os.listdir('RetweetNew')

    e_user = {}
    ne_user = {}
    e_source = {}
    ne_source = {}
    Bot = bot.load_bot()

    for ccc, postid in enumerate(files):
        with open('RetweetNew/' + postid, 'r') as f:
            tweets = json.load(f)
        print(ccc, postid, len(tweets))

        echo_users = echo_chamber_users[postid]
        for tweet in tweets.values():
            user = tweet['user']
            if bot.check_bot(Bot, user) == 1:
                continue
            # Each user is scored once, in the first post where they appear.
            if user in e_user or user in ne_user:
                continue

            raw_score = get_polarity(user)
            content_politic_score = content_polarity.get(user, None)
            # Check for a missing score *before* rounding: round(None, 4)
            # raises TypeError, which made the original None check unreachable.
            if raw_score is None or content_politic_score is None:
                continue
            user_politic_score = round(raw_score, 4)

            if user in echo_users:
                e_user[user] = user_politic_score
                e_source[user] = content_politic_score
            else:
                ne_user[user] = user_politic_score
                ne_source[user] = content_politic_score

    # Build both vectors from the same key sequence so the pairs line up.
    e_keys = e_user.keys()
    ne_keys = ne_user.keys()
    print('echo', stats.pearsonr([e_user[key] for key in e_keys],
                                 [e_source[key] for key in e_keys]))
    print('necho', stats.pearsonr([ne_user[key] for key in ne_keys],
                                  [ne_source[key] for key in ne_keys]))

    with open('Data/user_polarity_content_polarity.json', 'w') as f:
        json.dump({'e_user': e_user, 'ne_user': ne_user,
                   'e_source': e_source, 'ne_source': ne_source}, f)
def bot_participation():
    """Box-plot the bot-to-non-bot tweet ratio of every post.

    For each file in ``RetweetNew/`` the tweets are classified with
    ``bot.check_bot`` and the ratio ``(#tweets flagged 1) / (#tweets flagged 0)``
    is collected. The ratios are drawn with ``BoxPlot`` and saved to
    ``Image/bot_ratio_box.png``.

    Posts without any tweet flagged 0 are skipped: their ratio is undefined and
    previously raised ``ZeroDivisionError``.
    """
    Bot = bot.load_bot()
    dir_name = "RetweetNew/"
    files = os.listdir(dir_name)

    bot_list = []
    for postid in files:
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)

        flags = [bot.check_bot(Bot, tweet['user']) for tweet in tweets.values()]
        non_bot_count = flags.count(0)
        if non_bot_count == 0:
            # Empty file or bot-only post: no meaningful ratio.
            continue
        bot_list.append(flags.count(1) / non_bot_count)

    box = BoxPlot(1)
    box.set_data(bot_list, '')
    box.set_xticks('bot_ratio')
    box.save_image('Image/bot_ratio_box.png')
def update():
    """
    Update retweet graph with Cascade, Bot information

    Pass 1 reads every file and counts, per post, how many tweets share each
    origin tweet and how many tweets name each tweet as their parent.
    Pass 2 writes those counts back into every record as 'cascade' and 'child',
    adds the 'bot' flag from bot.check_bot, and rewrites the file in place.

    `files` and `dir_name` are not defined here; they are read from the
    enclosing (module) scope.
    """
    # Pass 1: cascade size per origin tweet, direct-child count per tweet.
    cascade = {}
    child = {}
    for postid in files:
        origin_sizes = cascade[postid] = {}
        child_counts = child[postid] = {}
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        for record in tweets.values():
            origin_id = record['origin_tweet']
            origin_sizes[origin_id] = origin_sizes.get(origin_id, 0) + 1
            parent_id = record['parent_tweet']
            # A tweet that is its own parent is a root, not a child.
            if parent_id != record['tweet']:
                child_counts[parent_id] = child_counts.get(parent_id, 0) + 1

    # Pass 2: annotate the records and write them back.
    Bot = bot.load_bot()
    for postid in files:
        print(postid)
        path = dir_name + postid
        with open(path, 'r') as f:
            tweets = json.load(f)

        roots = set()
        for record in tweets.values():
            record['cascade'] = cascade[postid][record['origin_tweet']]
            record['bot'] = bot.check_bot(Bot, record['user'])
            record['child'] = child[postid].get(record['tweet'], 0)
            roots.add(record['origin_tweet'])
        print('unique root', len(roots))

        with open(path, 'w') as f:
            json.dump(tweets, f)
def get_echo_chamber_users(file_name):
    """Return the non-bot echo-chamber users of every post.

    ``file_name`` is a JSON file mapping a key of post ids joined by ``'_'`` to
    the users of that echo chamber. The result maps each post id to a dict
    ``{user: 1}`` of the users for which ``bot.check_bot`` returns 0; post ids
    with no such user get no entry.

    Results for ``echo_chamber2.json`` are cached in
    ``Data/echo_chamber_users2.json``. The cache is now only *written* for that
    same input: previously any input overwrote it, so a later call with
    ``echo_chamber2.json`` could return users computed from a different file.

    NOTE(review): the source had lost its indentation; ``count`` is assumed to
    be incremented once per (post id, non-bot user) pair -- confirm.
    """
    cache_path = 'Data/echo_chamber_users2.json'
    cacheable = 'echo_chamber2.json' in file_name

    print(file_name)
    if cacheable and os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            echo_chamber_users = json.load(f)
        print('echo chamber size %s' % len(echo_chamber_users))
        return echo_chamber_users

    with open(file_name) as f:
        echo_chambers = json.load(f)

    Bot = bot.load_bot()
    echo_chamber_users = {}
    count = 0
    for key in echo_chambers:
        # The bot verdict depends on the user only, so filter once per echo
        # chamber instead of once per post id.
        non_bots = [user for user in echo_chambers[key]
                    if bot.check_bot(Bot, user) == 0]
        for postid in key.split('_'):
            for user in non_bots:
                echo_chamber_users.setdefault(postid, {})[user] = 1
                count += 1

    print('echo chamber size %s' % count)
    if cacheable:
        with open(cache_path, 'w') as f:
            json.dump(echo_chamber_users, f)
    return echo_chamber_users
def propagation_parent_to_child():
    """Compare child counts and first-child delay for echo vs. non-echo users.

    For every non-bot tweet whose record reports ``cascade >= 2``:

    * its ``child`` value is collected per ``depth`` (1..19), split by whether
      the tweet's user is an echo-chamber user of that post;
    * if its depth is > 1, the delay in minutes to its parent tweet is stored
      for the parent (keyed by the parent's depth), split by whether the
      ``parent`` user is an echo-chamber user. Tweets are visited in time
      order and only the first delay per parent is kept.

    Two box plots are saved under ``Image/<folder>/`` and the collected data is
    written to ``Data/Figure/5_3_1.json``.

    ``folder`` is not defined here; it is read from the enclosing (module)
    scope. A record with a depth outside 1..19 raises ``KeyError``, as before.
    """
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)

    # Set to None to run without any echo-chamber users.
    filename = 'Data/echo_chamber2.json'
    if filename is None:
        echo_chamber_users = {postid: {} for postid in files}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    depths = range(1, 20)
    e_child = {d: [] for d in depths}
    ne_child = {d: [] for d in depths}
    e_time = {d: {} for d in depths}   # depth -> {parent tweet id: minutes}
    ne_time = {d: {} for d in depths}

    print(len(echo_chamber_users.keys()))
    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)

        # Parse every timestamp once; reused for ordering and for the delays.
        times = {key: parser.parse(tweet['time'])
                 for key, tweet in tweets.items()}
        sorted_ids = sorted(times, key=times.get)

        e_users = echo_chamber_users[postid]
        print(len(e_users))

        for tid in sorted_ids:
            record = tweets[tid]
            parent = record['parent']
            userid = record['user']
            ptid = record['parent_tweet']

            if record['cascade'] < 2:
                continue
            # bot filter
            if bot.check_bot(Bot, userid) != 0:
                continue

            depth = record['depth']
            if userid in e_users:
                e_child[depth].append(record['child'])
            else:
                ne_child[depth].append(record['child'])

            if depth > 1:
                diff = (times[tid] - times[ptid]).total_seconds() / 60
                parent_depth = tweets[ptid]['depth']

                # Diagnostic: a later child with a smaller delay than the one
                # already stored for this parent.
                if e_time[parent_depth].get(ptid, -1) > diff:
                    print(e_time[parent_depth][ptid], diff)

                # Keep the first delay per parent. setdefault replaces the old
                # "-1 means unset" sentinel, which mistook a real delay of
                # exactly -1 for a missing entry.
                if parent in e_users:
                    e_time[parent_depth].setdefault(ptid, diff)
                else:
                    ne_time[parent_depth].setdefault(ptid, diff)

    # remove child 0 count
    for d in depths:
        e_child[d] = [x for x in e_child[d] if x != 0]
        ne_child[d] = [x for x in ne_child[d] if x != 0]

    box = BoxPlot(1)
    box.set_multiple_data([e_child, ne_child])
    box.set_ylog()
    box.set_label('Depth', 'Child Count')
    box.save_image('Image/%s/child_num_wo_propagation.png' % folder)

    # Materialise as lists: dict views cannot be serialised by json.dump.
    for d in depths:
        e_time[d] = list(e_time[d].values())
        ne_time[d] = list(ne_time[d].values())

    box = BoxPlot(1)
    box.set_multiple_data([e_time, ne_time])
    box.set_ylog()
    box.set_label('Depth', 'Propagation Time')
    box.save_image('Image/%s/child_time_propagation.png' % folder)

    with open('Data/Figure/5_3_1.json', 'w') as f:
        json.dump(
            {
                'e_time': e_time,
                'ne_time': ne_time,
                'e_child': e_child,
                'ne_child': ne_child
            }, f)
def propagation_time_to_group(filename):
    """Collect propagation delays, grouped by echo-chamber membership.

    Args:
        filename: echo-chamber JSON passed to ``e_util.get_echo_chamber_users``,
            or ``None`` to treat every post as having no echo-chamber users.

    Returns:
        tuple: ``(echo_p, necho_p, echo_r, necho_r)`` -- lists of delays in
        minutes for every non-bot, non-root tweet. ``*_p`` is the delay to the
        parent tweet and ``*_r`` the delay to the origin tweet. A delay goes to
        the ``echo_*`` list when the parent (resp. origin) tweet was posted by
        a non-bot echo-chamber user of that post and its record reports
        ``cascade >= 2``; otherwise it goes to the ``necho_*`` list.
    """
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)

    if filename is None:
        echo_chamber_users = {postid: {} for postid in files}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    echo_p = []
    echo_r = []
    necho_p = []
    necho_r = []

    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)

        # Parse every timestamp once; the delays below reuse these values.
        times = {key: parser.parse(tweet['time'])
                 for key, tweet in tweets.items()}

        # Ids of tweets posted by echo-chamber users of this post. Only
        # membership is needed, so the tweets need not be visited in time order.
        echo_chamber_tweet = set()
        for tid, record in tweets.items():
            if record['cascade'] < 2:
                continue
            # bot filter
            if bot.check_bot(Bot, record['user']) != 0:
                continue
            # Looked up lazily so a post without qualifying tweets does not
            # need an entry in echo_chamber_users.
            if record['user'] in echo_chamber_users[postid]:
                echo_chamber_tweet.add(tid)

        for record in tweets.values():
            tid = record['tweet']
            pid = record['parent_tweet']
            rid = record['origin_tweet']

            # bot filter
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:
                continue
            if tid == pid:
                # root tweet: nothing propagated to it
                continue

            r_time = (times[tid] - times[rid]).total_seconds() / 60
            p_time = (times[tid] - times[pid]).total_seconds() / 60

            if pid in echo_chamber_tweet:
                echo_p.append(p_time)
            else:
                necho_p.append(p_time)

            if rid in echo_chamber_tweet:
                echo_r.append(r_time)
            else:
                necho_r.append(r_time)

        if ccc % 10 == 0:
            print(ccc)

    return echo_p, necho_p, echo_r, necho_r
def rumor_propagation_velocity(filename):
    """Measure how quickly each tweet's children arrive, echo vs. non-echo.

    For every parent tweet a list ``[parent time, child time, ...]`` is built
    from the non-bot tweets whose record reports ``cascade >= 2``, visited in
    time order. From that list two figures are derived, both in minutes:

    * spread -- ``max(times) - min(times)``;
    * median offset -- median of ``child time - first list entry``.

    Parents whose spread is 0 (no children, or all at the same instant) are
    ignored.

    Args:
        filename: echo-chamber JSON passed to ``e_util.get_echo_chamber_users``,
            or ``None`` to treat every post as having no echo-chamber users.

    Returns:
        tuple: ``(echo_v, necho_v, echo_p, necho_p)``. ``*_v`` are flat lists
        of spreads; ``*_p`` map the parent's depth (1..19) to a list of median
        offsets. A parent counts as "echo" when the ``parent`` user of one of
        its visited tweets is an echo-chamber user of the post.

    NOTE(review): the source had lost its indentation. The echo-chamber check
    is assumed to run for every visited tweet, not only for cascade roots --
    confirm against the original layout.
    """
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)

    if filename is None:
        echo_chamber_users = {postid: {} for postid in files}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    echo_v = []
    necho_v = []
    # propagation time to all node's children, keyed by the parent's depth
    echo_p = {d: [] for d in range(1, 20)}
    necho_p = {d: [] for d in range(1, 20)}
    tweet_depth = {}  # parent tweet id -> depth; shared across all posts

    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)

        # Parse every timestamp once; reused for ordering and for the lists.
        times = {key: parser.parse(tweet['time'])
                 for key, tweet in tweets.items()}
        sorted_ids = sorted(times, key=times.get)

        parent_child = {}            # parent id -> [parent time, child times...]
        echo_chamber_parent = set()

        for tid in sorted_ids:
            record = tweets[tid]
            tweet_id = record['tweet']
            parent = record['parent_tweet']

            if record['cascade'] < 2:
                continue
            # bot filter
            if bot.check_bot(Bot, record['user']) != 0:
                continue

            if tweet_id != parent:
                child_times = parent_child.setdefault(parent, [])
                if not child_times:
                    # The parent may have been skipped (e.g. as a bot), so
                    # seed index 0 with its own time.
                    child_times.append(times[parent])
                child_times.append(times[tid])
                tweet_depth[parent] = tweets[parent]['depth']
            else:
                # root tweet of cascade
                parent_child[parent] = [times[tid]]

            # parent is echo chamber or not
            if (len(tweets[parent]) != 0
                    and record['parent'] in echo_chamber_users[postid]):
                echo_chamber_parent.add(parent)

        for key, child_times in parent_child.items():
            spread = (max(child_times) - min(child_times)).total_seconds() / 60
            if spread == 0:
                continue

            start_time = child_times[0]
            offsets = [(moment - start_time).total_seconds() / 60
                       for moment in child_times[1:]]
            median_offset = np.median(offsets)

            if key in echo_chamber_parent:
                echo_p[tweet_depth[key]].append(median_offset)
                echo_v.append(spread)
            else:
                necho_p[tweet_depth[key]].append(median_offset)
                necho_v.append(spread)

    return echo_v, necho_v, echo_p, necho_p
def get_tweet(path):
    """Build the retweet records of one post from a file of tweet JSON lines.

    Every line of ``path`` is parsed as one tweet. A tweet that carries a
    ``retweeted_status`` (or, failing that, a ``quoted_status``) produces two
    records: the tweet itself at depth 2 and the embedded source tweet at
    depth 1. Any other tweet produces a single depth-1 record that is its own
    parent and origin. Tweets from users flagged as bots are skipped.

    Fix: the bot check now receives the user id. It used to receive the tweet
    id, unlike every other ``bot.check_bot`` call in this file.

    Returns:
        tuple: ``(status, tweets)``.

        * ``(0, None)`` -- at most 100 records were built.
        * ``(1, t)`` -- follower and friend files exist for every user seen.
        * ``(2, t)`` -- only the follower files are complete.
        * ``(3, t)`` -- only the friend files are complete.
        * ``(0, t)`` -- neither is complete.
    """

    def make_record(tweet_id, user_id, name, text, created,
                    src_tweet_id, src_user_id, src_name, depth):
        # One node of the retweet graph; depth 1 marks an original tweet.
        return {
            'user': user_id,
            'parent': src_user_id,
            'origin': src_user_id,
            'confirm': depth == 1,
            'text': text,
            'origin_tweet': src_tweet_id,
            'parent_tweet': src_tweet_id,
            'tweet': tweet_id,
            'screen_name': name,
            'origin_name': src_name,
            'time': created,
            'depth': depth
        }

    t = {}
    unique_u = {}  # every user id seen, bots included (used as a set)

    with open(path, 'r') as f:
        Bot = bot.load_bot()
        for line in f:
            tweet = Tweet(json.loads(line))
            t_id1 = tweet['id_str']
            u_id1 = tweet['user']['id_str']
            tweet1 = tweet['text']
            screen_name = tweet['user']['screen_name']
            time1 = tweet['created_at']

            unique_u[u_id1] = 1
            if bot.check_bot(Bot, u_id1) == 1:
                continue

            own_root = make_record(t_id1, u_id1, screen_name, tweet1, time1,
                                   t_id1, u_id1, screen_name, 1)
            try:
                retweet = tweet.get('retweeted_status', None)
                if retweet is None:
                    retweet = tweet.get('quoted_status', None)

                if retweet is None:
                    t[t_id1] = own_root
                else:
                    tweet2 = retweet['text']
                    t_id2 = retweet['id_str']
                    u_id2 = retweet['user']['id_str']
                    origin_name = retweet['user']['screen_name']
                    time2 = retweet['created_at']

                    t[t_id1] = make_record(t_id1, u_id1, screen_name, tweet1,
                                           time1, t_id2, u_id2, origin_name, 2)
                    t[t_id2] = make_record(t_id2, u_id2, origin_name, tweet2,
                                           time2, t_id2, u_id2, origin_name, 1)
                    unique_u[u_id2] = 1
            except KeyError:
                # The embedded tweet is incomplete: keep this one as a root.
                print("Key Error Exception!!!!")
                t[t_id1] = own_root

    # Too few tweets: skip the per-user file-system checks altogether.
    if len(t) <= 100:
        return 0, None

    # Count the users whose follower / friend lists have been collected.
    f_count = sum(os.path.exists('../Data/followers/followers/' + uid)
                  for uid in unique_u)
    fr_count = sum(os.path.exists('../Data/friends/friends/' + uid)
                   for uid in unique_u)

    print(path)
    print(
        'unique_users : %s , collected followers : %s, collected friends : %s'
        % (len(unique_u), f_count, fr_count))

    user_total = len(unique_u)
    if f_count == user_total and fr_count == user_total:
        print('%s : %s tweets' % (path, len(t)))
        return 1, t
    elif f_count == user_total:
        print('%s : %s tweets' % (path, len(t)))
        return 2, t
    elif fr_count == user_total:
        return 3, t
    else:
        return 0, t