def get_depth(politic=None, veracity=None, echo_chamber=False):
    # Max depth per cascade, optionally filtered by topic/veracity, and split
    # into cascades with and without echo chamber participants.
    print(politic, veracity, echo_chamber)
    dir_name = "RetweetNew/"
    files = os.listdir(dir_name)
    unique_d = {}  # all cascades: root tweet id -> max depth
    e_d = {}       # cascades echo chamber users participated in
    ne_d = {}      # cascades without echo chamber participants
    echo_chamber_users = {}
    if echo_chamber:
        echo_chamber_users = e_util.get_echo_chamber_users('Data/echo_chamber2.json')
    breadth, depth, unique_users = e_util.get_cascade_max_breadth()
    for postid in files:
        if veracity is not None and not get_veracity(postid, veracity):
            continue
        if politic is True and not util.is_politics(postid):
            continue
        if politic is False and not util.is_non_politics(postid):
            continue
        echo_chamber_cascade_root = {}
        unique_root = {}
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        for tweet in tweets.values():
            if tweet['user'] in echo_chamber_users.get(postid, {}):
                echo_chamber_cascade_root[tweet['origin_tweet']] = 1
            unique_root[tweet['origin_tweet']] = 1
        print(len(unique_root), len(echo_chamber_cascade_root))
        echo_chamber_cascades = echo_chamber_cascade_root.keys()
        for key in unique_root.keys():
            unique_d[key] = depth[key]
            if key in echo_chamber_cascades:
                e_d[key] = depth[key]
            else:
                ne_d[key] = depth[key]
    return unique_d, e_d, ne_d
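# A quick way to check whether the two per-cascade depth distributions that
# get_depth() returns actually differ: a two-sample Kolmogorov-Smirnov test.
# This driver is an added sketch, not part of the original pipeline; it
# assumes the RetweetNew/ layout used above.
def _sketch_compare_depths():
    from scipy import stats as sps
    _, e_d, ne_d = get_depth(echo_chamber=True)
    e_vals = list(e_d.values())
    ne_vals = list(ne_d.values())
    # KS statistic and p-value for echo vs. non-echo cascade depths
    print('KS:', sps.ks_2samp(e_vals, ne_vals))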
def political_alignment_pearson():
    # Pearson correlation between each user's follow-based political score and
    # the mean political score of the sources they share, for echo chamber vs.
    # non-echo chamber users.
    with open('Data/user_content_polarity.json', 'r') as f:
        content_polarity = json.load(f)
    echo_chamber_users = e_util.get_echo_chamber_users('Data/echo_chamber2.json')
    files = os.listdir('RetweetNew')
    e_user = {}
    ne_user = {}
    e_source = {}
    ne_source = {}
    Bot = bot.load_bot()
    for ccc, postid in enumerate(files):
        with open('RetweetNew/' + postid, 'r') as f:
            tweets = json.load(f)
        print(ccc, postid, len(tweets))
        echo_users = echo_chamber_users[postid]
        for tweet in tweets.values():
            user = tweet['user']
            if bot.check_bot(Bot, user) == 1:  # bot filter
                continue
            if user in e_user or user in ne_user:  # already scored
                continue
            user_politic_score = get_polarity(user)
            content_politic_score = content_polarity.get(user, None)
            if user_politic_score is not None and content_politic_score is not None:
                user_politic_score = round(user_politic_score, 4)
                if user in echo_users:
                    e_user[user] = user_politic_score
                    e_source[user] = content_politic_score
                else:
                    ne_user[user] = user_politic_score
                    ne_source[user] = content_politic_score
    e_keys = list(e_user.keys())
    ne_keys = list(ne_user.keys())
    print('echo', stats.pearsonr([e_user[key] for key in e_keys],
                                 [e_source[key] for key in e_keys]))
    print('necho', stats.pearsonr([ne_user[key] for key in ne_keys],
                                  [ne_source[key] for key in ne_keys]))
    with open('Data/user_polarity_content_polarity.json', 'w') as f:
        json.dump({'e_user': e_user, 'ne_user': ne_user,
                   'e_source': e_source, 'ne_source': ne_source}, f)
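# Whether the echo and non-echo Pearson correlations printed above differ
# significantly can be tested with Fisher's z transformation. This helper is
# an added sketch; r1/r2 are the two correlation coefficients and n1/n2 the
# corresponding sample sizes.
def _sketch_fisher_z_diff(r1, n1, r2, n2):
    import math
    from scipy import stats as sps
    z1, z2 = math.atanh(r1), math.atanh(r2)
    se = math.sqrt(1.0 / (n1 - 3) + 1.0 / (n2 - 3))
    z = (z1 - z2) / se
    # two-sided p-value under the normal approximation
    return z, 2 * (1 - sps.norm.cdf(abs(z)))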
def propagation_to_depth_politic(filename):
    # Time-to-depth and users-to-depth curves for political vs. other rumors,
    # split by echo chamber participation (plus a degree-ranked variant).
    echo_chamber_values = {}
    non_echo_chamber_values = {}
    echo_politics = {}; echo_non_politics = {}
    non_echo_politics = {}; non_echo_non_politics = {}
    ranked_echo_politics = {}; ranked_echo_non_politics = {}
    politics = {}
    non_politics = {}
    for item in ['time_depth', 'user_depth']:
        for d in (echo_chamber_values, non_echo_chamber_values, politics,
                  non_politics, echo_politics, echo_non_politics,
                  non_echo_politics, non_echo_non_politics,
                  ranked_echo_politics, ranked_echo_non_politics):
            d[item] = {}
            for i in range(1, 20):
                d[item][i] = []
    echo_chamber_cascade_root = {}         # cascades echo chamber users participated in
    ranked_echo_chamber_cascade_root = {}  # same, for degree-ranked echo chamber users
    cascade_veracity = {}
    politic_cascade = {}      # root tweets of political rumors
    non_politic_cascade = {}  # root tweets of non-political rumors
    echo_chamber_users = e_util.get_echo_chamber_users(filename)
    with open('Data/degree_ranked_users.json', 'r') as f:
        ranked_echo_chamber_users = json.load(f)
    print(ranked_echo_chamber_users.keys())
    files = os.listdir('RetweetNew')
    for postid in files:
        # optional veracity filter, e.g.:
        # if not get_veracity(postid, 'False'): continue
        v = veracity_type(postid).title()
        # classify the story: 1 = politics, 2 = non-politics, 0 = unknown
        politic_num = 0
        if util.is_politics(postid):
            politic_num = 1
        elif util.is_non_politics(postid):
            politic_num = 2
        with open('RetweetNew/%s' % postid, 'r') as f:
            tweets = json.load(f)
        for tweet in tweets.values():
            try:
                if tweet['user'] in echo_chamber_users[postid].keys():
                    echo_chamber_cascade_root[tweet['origin_tweet']] = 1
            except KeyError:
                pass
            try:
                if tweet['user'] in ranked_echo_chamber_users[postid].keys():
                    ranked_echo_chamber_cascade_root[tweet['origin_tweet']] = 1
            except KeyError:
                pass
            cascade_veracity[tweet['origin_tweet']] = v
            if politic_num == 1:
                politic_cascade[tweet['origin_tweet']] = 1
            elif politic_num == 2:
                non_politic_cascade[tweet['origin_tweet']] = 1
    print("echo chamber cascade extraction done")
    _, _, time_depth, _, user_depth = get_depth_time_series('False')
    print("time series data load done")
    echo_chamber_cascades = echo_chamber_cascade_root.keys()
    ranked_echo_chamber_cascades = ranked_echo_chamber_cascade_root.keys()
    print(len(ranked_echo_chamber_cascades))
    political_cascades = politic_cascade.keys()
    non_political_cascades = non_politic_cascade.keys()
    for key in time_depth.keys():
        if key in political_cascades:  # political rumors
            echo = 1 if key in echo_chamber_cascades else 0
            ranked = 1 if key in ranked_echo_chamber_cascades else 0
            for i in range(1, max(time_depth[key].keys()) + 1):
                try:
                    politics['time_depth'][i].append(time_depth[key][i])
                    politics['user_depth'][i].append(user_depth[key][i])
                    if echo == 1:  # echo political
                        echo_politics['time_depth'][i].append(time_depth[key][i])
                        echo_politics['user_depth'][i].append(user_depth[key][i])
                    else:          # non-echo political
                        non_echo_politics['time_depth'][i].append(time_depth[key][i])
                        non_echo_politics['user_depth'][i].append(user_depth[key][i])
                    if ranked == 1:
                        ranked_echo_politics['time_depth'][i].append(time_depth[key][i])
                        ranked_echo_politics['user_depth'][i].append(user_depth[key][i])
                except KeyError:
                    pass
        if key in non_political_cascades:
            echo = 1 if key in echo_chamber_cascades else 0
            ranked = 1 if key in ranked_echo_chamber_cascades else 0
            for i in range(1, max(time_depth[key].keys()) + 1):
                try:
                    non_politics['time_depth'][i].append(time_depth[key][i])
                    non_politics['user_depth'][i].append(user_depth[key][i])
                    if echo == 1:  # echo non-political
                        echo_non_politics['time_depth'][i].append(time_depth[key][i])
                        echo_non_politics['user_depth'][i].append(user_depth[key][i])
                    else:          # non-echo non-political
                        non_echo_non_politics['time_depth'][i].append(time_depth[key][i])
                        non_echo_non_politics['user_depth'][i].append(user_depth[key][i])
                    if ranked == 1:
                        ranked_echo_non_politics['time_depth'][i].append(time_depth[key][i])
                        ranked_echo_non_politics['user_depth'][i].append(user_depth[key][i])
                except KeyError:
                    pass
    # time-to-depth / users-to-depth: politics vs. other
    draw_time_to_depth_echo_chamber(
        [politics['time_depth'], non_politics['time_depth']],
        ['Politics', 'Other'], 'Median Minutes', 'time_depth_politics_line')
    draw_time_to_depth_echo_chamber(
        [politics['user_depth'], non_politics['user_depth']],
        ['Politics', 'Other'], 'Median Unique Users', 'user_depth_politics_line')
    # echo chamber vs. non-echo chamber cascades, within each topic group
    draw_time_to_depth_echo_chamber(
        [echo_politics['time_depth'], non_echo_politics['time_depth']],
        ['Echo Chamber', 'Non Echo Chamber'], 'Median Minutes',
        'time_depth_politics_echo_line')
    draw_time_to_depth_echo_chamber(
        [echo_politics['user_depth'], non_echo_politics['user_depth']],
        ['Echo Chamber', 'Non Echo Chamber'], 'Median Unique Users',
        'user_depth_politics_echo_line')
    draw_time_to_depth_echo_chamber(
        [echo_non_politics['time_depth'], non_echo_non_politics['time_depth']],
        ['Echo Chamber', 'Non Echo Chamber'], 'Median Minutes',
        'time_depth_non_politics_echo_line')
    draw_time_to_depth_echo_chamber(
        [echo_non_politics['user_depth'], non_echo_non_politics['user_depth']],
        ['Echo Chamber', 'Non Echo Chamber'], 'Median Unique Users',
        'user_depth_non_politics_echo_line')
    # same comparison with the degree-ranked echo chamber variant added
    draw_time_to_depth_echo_chamber(
        [echo_politics['time_depth'], non_echo_politics['time_depth'],
         ranked_echo_politics['time_depth']],
        ['Echo Chamber', 'Non Echo Chamber', 'Ranked Echo Chamber'],
        'Median Minutes', 'time_depth_politics_echo_line_ranked')
    draw_time_to_depth_echo_chamber(
        [echo_non_politics['time_depth'], non_echo_non_politics['time_depth'],
         ranked_echo_non_politics['time_depth']],
        ['Echo Chamber', 'Non Echo Chamber', 'Ranked Echo Chamber'],
        'Median Minutes', 'time_depth_non_politics_echo_line_ranked')
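# draw_time_to_depth_echo_chamber() is defined elsewhere in this module. As a
# rough sketch of the aggregation it performs: each input is a dict mapping
# depth -> list of values, and the plotted line is the median per depth. The
# helper below is an illustrative assumption about that behavior (names and
# styling included), not the module's actual plotting code.
def _sketch_median_per_depth(series_list, legends, ylabel, name):
    import numpy as np
    import matplotlib.pyplot as plt
    for series, legend in zip(series_list, legends):
        depths = sorted(d for d in series if series[d])  # skip empty depths
        medians = [np.median(series[d]) for d in depths]
        plt.plot(depths, medians, marker='o', label=legend)
    plt.xlabel('Depth')
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig('Image/%s.png' % name)
    plt.close()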
def time_to_depth_echo_chamber(filename):
    # Minutes-to-depth and unique-users-to-depth for cascades with vs. without
    # echo chamber participants, also broken down by veracity.
    _, _, time_depth, _, user_depth = get_depth_time_series(None)
    print(len(time_depth))
    # (optionally cache/load time_depth and user_depth via
    # Data/time_series_data.json)
    print("time series data load done")
    echo_chamber_values = {}
    non_echo_chamber_values = {}
    for item in ['time_depth', 'user_depth']:
        echo_chamber_values[item] = {}
        non_echo_chamber_values[item] = {}
        for i in range(1, 20):
            echo_chamber_values[item][i] = []
            non_echo_chamber_values[item][i] = []
    Bot = bot.load_bot()
    echo_chamber_cascade_root = {}
    cascade_veracity = {}
    echo_chamber_users = e_util.get_echo_chamber_users(filename)
    files = os.listdir('RetweetNew')
    # collect the cascades echo chamber users participate in
    for postid in files:
        v = veracity_type(postid).title()
        with open('RetweetNew/%s' % postid, 'r') as f:
            tweets = json.load(f)
        for tweet in tweets.values():
            try:
                if tweet['user'] in echo_chamber_users[postid].keys():
                    echo_chamber_cascade_root[tweet['origin_tweet']] = 1
            except KeyError:
                pass
            cascade_veracity[tweet['origin_tweet']] = v
    print("echo chamber cascade extraction done")
    echo_chamber_cascades = echo_chamber_cascade_root.keys()
    print('echo chamber cascades')
    # e: echo, n: non-echo, r: ranked echo (kept for symmetry, unused here)
    e = {}; n = {}; r = {}
    for item in ['True', 'False', 'Mixed']:
        e[item] = {}; n[item] = {}; r[item] = {}
        for d_type in ['user_depth', 'time_depth']:
            e[item][d_type] = {}; n[item][d_type] = {}; r[item][d_type] = {}
            for i in range(1, 20):
                e[item][d_type][i] = []
                n[item][d_type][i] = []
                r[item][d_type][i] = []
    for key in time_depth.keys():
        v = cascade_veracity[key]
        if v != 'True' and v != 'False':
            v = 'Mixed'
        if key in echo_chamber_cascades:
            for i in range(1, max(time_depth[key].keys()) + 1):
                try:
                    echo_chamber_values['time_depth'][i].append(time_depth[key][i])
                    echo_chamber_values['user_depth'][i].append(user_depth[key][i])
                    e[v]['time_depth'][i].append(time_depth[key][i])
                    e[v]['user_depth'][i].append(user_depth[key][i])
                except KeyError:
                    pass
        else:
            for i in range(1, max(time_depth[key].keys()) + 1):
                try:
                    non_echo_chamber_values['time_depth'][i].append(time_depth[key][i])
                    non_echo_chamber_values['user_depth'][i].append(user_depth[key][i])
                    n[v]['time_depth'][i].append(time_depth[key][i])
                    n[v]['user_depth'][i].append(user_depth[key][i])
                except KeyError:
                    pass
    box = BoxPlot(1)
    box.set_multiple_data([echo_chamber_values['time_depth'],
                           non_echo_chamber_values['time_depth']])
    box.set_ylog()
    box.set_label('Depth', 'Minutes to Depth')
    box.save_image('%s/time_depth_echo_chamber_box.png' % foldername)
    print(echo_chamber_values['time_depth'])
    # time-to-depth / users-to-depth lines for echo chamber vs. non-echo
    # chamber cascades
    with open('Data/Figure/5_2_1.json', 'w') as f:
        json.dump([echo_chamber_values['time_depth'],
                   non_echo_chamber_values['time_depth']], f)
    draw_time_to_depth_echo_chamber(
        [echo_chamber_values['time_depth'], non_echo_chamber_values['time_depth']],
        ['echo chamber', 'no echo chamber'], 'median minutes',
        'time_depth_echo_chamber_line')
    draw_time_to_depth_echo_chamber(
        [echo_chamber_values['user_depth'], non_echo_chamber_values['user_depth']],
        ['echo chamber', 'no echo chamber'], 'median unique users',
        'user_depth_echo_chamber_line')
    with open('Data/Figure/5_2_time.json', 'w') as f:
        json.dump({'e': echo_chamber_values['time_depth'][1],
                   'ne': non_echo_chamber_values['time_depth'][1]}, f)
    # CDF of the time to reach depth 1
    cdf = CDFPlot()
    cdf.set_label('Propagation Time', 'CDF')
    cdf.set_log(True)
    cdf.set_data(echo_chamber_values['time_depth'][1], '')
    cdf.set_data(non_echo_chamber_values['time_depth'][1], '')
    cdf.save_image('Image/20181105/depth_propagation_time_cdf.png')
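# CDFPlot is a plotting wrapper defined elsewhere in the repo. For reference,
# the empirical CDF of the same depth-1 propagation times can be computed
# directly; this helper is an added illustration, not the module's plotting
# path.
def _sketch_empirical_cdf(values):
    import numpy as np
    xs = np.sort(np.asarray(values, dtype=float))
    ys = np.arange(1, len(xs) + 1) / len(xs)  # ys[i] = P(X <= xs[i])
    return xs, ys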
def echo_chamber_diversity(filename):
    # Compare tweet-origin and news-source diversity of echo chamber vs.
    # non-echo chamber users, computed from their timeline files.
    Bot = bot.load_bot()
    dirname = 'Retweet/'
    files = os.listdir(dirname)
    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)
    echo_tweet_diversity = []; echo_source_diversity = []
    necho_tweet_diversity = []; necho_source_diversity = []
    for postid in files:
        with open(dirname + postid) as f:
            tweets = json.load(f)
        # collect the non-echo chamber users of this story
        non_echo_users = {}
        for tweet in tweets.values():
            user = tweet['user']
            if user not in echo_chamber_users[postid]:
                non_echo_users[user] = 1
        print(len(echo_chamber_users[postid]), len(non_echo_users))
        timeline_dir = '../Timeline/'
        err = 0; nerr = 0  # users whose timeline file is missing
        # echo chamber users' diversity
        for user in echo_chamber_users[postid]:
            try:
                with open(timeline_dir + user, 'r') as f:
                    user_tweets = json.load(f)
            except IOError:
                err += 1
                continue
            tweet_diversity, source_diversity = get_diversity(user_tweets)
            if tweet_diversity is not None:
                echo_tweet_diversity.append(tweet_diversity)
            if source_diversity is not None:
                echo_source_diversity.append(source_diversity)
        # non-echo chamber users' diversity
        for user in non_echo_users:
            try:
                with open(timeline_dir + user, 'r') as f:
                    user_tweets = json.load(f)
            except IOError:
                nerr += 1
                continue
            tweet_diversity, source_diversity = get_diversity(user_tweets)
            if tweet_diversity is not None:
                necho_tweet_diversity.append(tweet_diversity)
            if source_diversity is not None:
                necho_source_diversity.append(source_diversity)
    # CDFs
    cdf = CDFPlot()
    cdf.set_label('Retweet Origin Diversity', 'CDF')
    cdf.set_data(echo_tweet_diversity, 'Echo Chamber')
    cdf.set_data(necho_tweet_diversity, 'Non Echo Chamber')
    cdf.set_legends(['Echo Chamber', 'Non Echo Chamber'], 'User Type')
    cdf.save_image('Image/20181002/source_diversity_retweet_cdf.png')
    cdf = CDFPlot()
    cdf.set_label('Source News Diversity', 'CDF')
    cdf.set_data(echo_source_diversity, 'Echo Chamber')
    cdf.set_data(necho_source_diversity, 'Non Echo Chamber')
    cdf.set_legends(['Echo Chamber', 'Non Echo Chamber'], 'User Type')
    cdf.save_image('Image/20181002/source_diversity_news_cdf.png')
    # box plots
    box = BoxPlot(1)
    box.set_data([echo_tweet_diversity, necho_tweet_diversity], '')
    box.set_xticks(['Echo Chamber', 'Non Echo Chamber'])
    box.set_label('', 'Retweet Origin Diversity')
    box.save_image('Image/20181002/source_diversity_retweet.png')
    box = BoxPlot(1)
    box.set_data([echo_source_diversity, necho_source_diversity], '')
    box.set_xticks(['Echo Chamber', 'Non Echo Chamber'])
    box.set_label('', 'Source News Diversity')
    box.save_image('Image/20181002/source_diversity_news.png')
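# get_diversity() is implemented elsewhere in the repo. One common way to
# score the diversity of a user's timeline is the Shannon entropy of the
# domains they share; the sketch below is an assumption about the metric,
# not the module's actual definition (the domain list is whatever the
# module's own URL handling produces).
def _sketch_shannon_diversity(domains):
    import math
    from collections import Counter
    counts = Counter(domains)
    total = sum(counts.values())
    if total == 0:
        return None
    # entropy in bits; higher means the user draws on more varied sources
    return -sum((c / total) * math.log(c / total, 2) for c in counts.values())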
def political_alignment(filename):
    # Per-story selective exposure: correlate users' follow-based political
    # scores with the mean political score of the sources in their timelines.
    source_politic_score = {}
    echo_chamber_users = e_util.get_echo_chamber_users(filename)
    # load source scores (top500.tab maps a domain to a score in [-1, 1];
    # rescale to [0, 1])
    with open('Data/top500.tab', 'r') as f:
        for i, row in enumerate(f):
            if i == 0:  # skip header
                continue
            items = row.split('\t')
            items[0] = items[0].replace("www.", "")
            source_politic_score[items[0]] = (float(items[1]) + 1) / 2
    # score the users who spread each rumor
    files = os.listdir(dir_name)
    timeline_dir = '../Timeline/'
    user_content_polarity = {}    # user -> mean source polarity
    postid_content_polarity = {}  # postid -> {user -> mean source polarity}
    all_echo_user_score = []; all_non_echo_user_score = []
    all_echo_source_score = []; all_non_echo_source_score = []
    for postid in files:
        user_score = []; echo_user_score = []; non_echo_user_score = []
        source_score = []; echo_source_score = []; non_echo_source_score = []
        path = '%s/selective_exposure_%s' % (folder, postid)
        if postid != '142256':  # debug: restrict to a single story
            continue
        postid_content_polarity[postid] = {}
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        print(postid, len(tweets))
        users = list(set([tweet['user'] for tweet in tweets.values()]))
        err = 0               # users whose timeline is missing or unparsable
        count_zero_users = 0  # users with no URL from a scored source
        for user in users:
            if user_content_polarity.get(user, None) is not None:
                postid_content_polarity[postid][user] = user_content_polarity.get(user)
                continue
            try:
                with open(timeline_dir + user, 'r') as f:
                    user_tweets = json.load(f)
            except (IOError, ValueError):
                err += 1
                continue
            urls, expanded_urls = timeline_urls(user_tweets)
            if len(urls) == 0:
                continue
            count = 0
            p_sum = 0
            for url in urls:
                if url in source_politic_score:
                    count += 1
                    p_sum += source_politic_score[url]
            if count == 0:
                count_zero_users += 1
                continue
            p_mean = round(p_sum / count, 4)
            user_politic_score = get_polarity(user)
            if user_politic_score is not None:
                user_politic_score = round(user_politic_score, 4)
                user_score.append(user_politic_score)
                source_score.append(p_mean)
                if user in echo_chamber_users[postid]:
                    echo_user_score.append(user_politic_score)
                    echo_source_score.append(p_mean)
                    all_echo_user_score.append(user_politic_score)
                    all_echo_source_score.append(p_mean)
                else:
                    non_echo_user_score.append(user_politic_score)
                    non_echo_source_score.append(p_mean)
                    all_non_echo_user_score.append(user_politic_score)
                    all_non_echo_source_score.append(p_mean)
            user_content_polarity[user] = p_mean
            postid_content_polarity[postid][user] = p_mean
            # (optionally cap the number of scored users per story for speed)
        print('count zero users : %s' % count_zero_users)
        print('save selective exposure file')
        filefolder = 'Data/SelectiveExposure/'
        if not os.path.exists(filefolder):
            os.makedirs(filefolder)
        print('echo', stats.pearsonr(echo_user_score, echo_source_score)[0])
        print('necho', stats.pearsonr(non_echo_user_score, non_echo_source_score)[0])
        datapath = filefolder + postid + '_polarity'
        with open(datapath, 'w') as f:
            json.dump({'necho_user': non_echo_user_score,
                       'necho_source': non_echo_source_score,
                       'echo_user': echo_user_score,
                       'echo_source': echo_source_score}, f)
        snsplot.draw_echo_plot(non_echo_user_score, non_echo_source_score,
                               echo_user_score, echo_source_score, path)
        with open(filefolder + postid, 'w') as f:
            json.dump(postid_content_polarity[postid], f)
    print('echo', stats.pearsonr(all_echo_user_score, all_echo_source_score))
    print('necho', stats.pearsonr(all_non_echo_user_score, all_non_echo_source_score))
    with open('Data/user_content_polarity.json', 'w') as f:
        json.dump(user_content_polarity, f)
    with open('Data/user_content_polarity_postid.json', 'w') as f:
        json.dump(postid_content_polarity, f)
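# timeline_urls() is defined elsewhere in the repo. A plausible sketch of the
# domain extraction it needs is below; the field path
# 'entities' -> 'urls' -> 'expanded_url' is an assumption based on the
# standard Twitter API payload, not on this module's data format.
def _sketch_extract_domains(user_tweets):
    from urllib.parse import urlparse
    domains = []
    for tw in user_tweets:
        for u in tw.get('entities', {}).get('urls', []):
            netloc = urlparse(u.get('expanded_url', '')).netloc
            if netloc:
                # normalize to match the top500.tab keys, which strip "www."
                domains.append(netloc.replace('www.', ''))
    return domains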
def propagation_parent_to_child():
    # Per-depth child counts and parent-to-child propagation delays, split by
    # whether the tweeting user belongs to an echo chamber.
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)
    filename = 'Data/echo_chamber2.json'
    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)
    e_child = {}   # depth -> child counts, echo chamber users
    ne_child = {}  # depth -> child counts, other users
    e_time = {}    # depth -> {parent tweet id -> first-child delay}, echo parents
    ne_time = {}   # same, non-echo parents
    for i in range(1, 20):
        e_child[i] = []
        ne_child[i] = []
        e_time[i] = {}
        ne_time[i] = {}
    print(len(echo_chamber_users.keys()))
    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)
        # optional topic/veracity filters, e.g.:
        # if not util.is_politics(postid): continue
        # sort tweet ids by time
        sort = {}
        for key in tweets.keys():
            sort[key] = parser.parse(tweets[key]['time'])
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]
        e_users = echo_chamber_users.get(postid, {})
        print(len(e_users))
        for tid in sorted_ids:
            cascade = tweets[tid]['cascade']
            userid = tweets[tid]['user']
            ptid = tweets[tid]['parent_tweet']
            if cascade < 2:  # skip singleton cascades
                continue
            if bot.check_bot(Bot, userid) != 0:  # bot filter
                continue
            if userid in e_users:
                e_child[tweets[tid]['depth']].append(tweets[tid]['child'])
            else:
                ne_child[tweets[tid]['depth']].append(tweets[tid]['child'])
            if tweets[tid]['depth'] > 1:
                # minutes from the parent tweet to this retweet; only the
                # first (fastest) child per parent is kept
                diff = (parser.parse(tweets[tid]['time']) -
                        parser.parse(tweets[ptid]['time'])).total_seconds() / 60
                # sanity check: tweets are time-sorted, so a later child
                # should never be faster than the recorded one
                if e_time[tweets[ptid]['depth']].get(ptid, -1) > diff:
                    print(e_time[tweets[ptid]['depth']][ptid], diff)
                if tweets[tid]['parent'] in e_users:
                    if e_time[tweets[ptid]['depth']].get(ptid, -1) == -1:
                        e_time[tweets[ptid]['depth']][ptid] = diff
                else:
                    if ne_time[tweets[ptid]['depth']].get(ptid, -1) == -1:
                        ne_time[tweets[ptid]['depth']][ptid] = diff
    # drop zero child counts
    for i in range(1, 20):
        e_child[i] = [x for x in e_child[i] if x != 0]
        ne_child[i] = [x for x in ne_child[i] if x != 0]
    box = BoxPlot(1)
    box.set_multiple_data([e_child, ne_child])
    box.set_ylog()
    box.set_label('Depth', 'Child Count')
    box.save_image('Image/%s/child_num_wo_propagation.png' % folder)
    for i in range(1, 20):
        e_time[i] = list(e_time[i].values())
        ne_time[i] = list(ne_time[i].values())
    box = BoxPlot(1)
    box.set_multiple_data([e_time, ne_time])
    box.set_ylog()
    box.set_label('Depth', 'Propagation Time')
    box.save_image('Image/%s/child_time_propagation.png' % folder)
    with open('Data/Figure/5_3_1.json', 'w') as f:
        json.dump({'e_time': e_time, 'ne_time': ne_time,
                   'e_child': e_child, 'ne_child': ne_child}, f)
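# A per-depth significance check for the child-count comparison above. This
# is an added sketch using SciPy's Mann-Whitney U test, not part of the
# original plotting pipeline; it takes the e_child/ne_child dicts built by
# propagation_parent_to_child().
def _sketch_per_depth_test(e_child, ne_child):
    from scipy import stats as sps
    for i in sorted(e_child):
        if e_child[i] and ne_child[i]:
            u, p = sps.mannwhitneyu(e_child[i], ne_child[i],
                                    alternative='two-sided')
            print('depth %d: U=%.1f p=%.4g' % (i, u, p))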
def propagation_time_to_group(filename):
    # Delay from each retweet to its parent / cascade root, split by whether
    # the parent (or root) tweet was posted by an echo chamber user.
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)
    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)
    echo_p = []   # minutes to parent, parent tweeted by echo chamber user
    echo_r = []   # minutes to root, root tweeted by echo chamber user
    necho_p = []
    necho_r = []
    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)
        # sort tweet ids by time
        sort = {}
        for key in tweets.keys():
            sort[key] = parser.parse(tweets[key]['time'])
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]
        parent_start = {}
        root_start = {}
        echo_chamber_tweet = {}  # tweet ids posted by echo chamber users
        for i, tid in enumerate(sorted_ids):
            parent = tweets[tid]['parent_tweet']
            root = tweets[tid]['origin_tweet']
            cascade = tweets[tid]['cascade']
            if cascade < 2:  # skip singleton cascades
                continue
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:  # bot filter
                continue
            # record every parent / root start time
            if root_start.get(root, None) is None:
                root_start[root] = new_list[i][1]
            if parent_start.get(parent, None) is None:
                parent_start[parent] = new_list[i][1]
            if tweets[tid]['user'] in echo_chamber_users[postid]:
                echo_chamber_tweet[tid] = 1
        for tweet in tweets.values():
            tid = tweet['tweet']
            pid = tweet['parent_tweet']
            rid = tweet['origin_tweet']
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:  # bot filter
                continue
            if tid != pid:  # not a cascade root
                r_time = (parser.parse(tweets[tid]['time']) -
                          parser.parse(tweets[rid]['time'])).total_seconds() / 60
                p_time = (parser.parse(tweets[tid]['time']) -
                          parser.parse(tweets[pid]['time'])).total_seconds() / 60
                if tweet['parent_tweet'] in echo_chamber_tweet.keys():
                    echo_p.append(p_time)
                else:
                    necho_p.append(p_time)
                if tweet['origin_tweet'] in echo_chamber_tweet.keys():
                    echo_r.append(r_time)
                else:
                    necho_r.append(r_time)
        if ccc % 10 == 0:
            print(ccc)
    return echo_p, necho_p, echo_r, necho_r
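# Example driver for the function above: summarize the parent- and root-delay
# distributions. The call, the input file, and the percentile choice are
# illustrative additions, not original output.
def _sketch_time_to_group_summary():
    import numpy as np
    echo_p, necho_p, echo_r, necho_r = propagation_time_to_group(
        'Data/echo_chamber2.json')
    for name, vals in [('echo parent', echo_p), ('non-echo parent', necho_p),
                       ('echo root', echo_r), ('non-echo root', necho_r)]:
        if vals:
            print(name, 'median=%.1f min, p90=%.1f min' %
                  (np.median(vals), np.percentile(vals, 90)))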
def rumor_propagation_velocity(filename):
    # For every parent tweet, measure how long its children keep arriving:
    # the total span (parent to last child) and the per-depth median child
    # delay, split by whether the parent user is an echo chamber member.
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)
    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)
    echo_v = []   # parent-to-last-child span, echo chamber parents
    necho_v = []  # same, other parents
    echo_p = {}   # depth -> median child delays, echo chamber parents
    necho_p = {}
    for i in range(1, 20):
        echo_p[i] = []
        necho_p[i] = []
    tweet_depth = {}
    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)
        # sort tweet ids by time
        sort = {}
        for key in tweets.keys():
            sort[key] = parser.parse(tweets[key]['time'])
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]
        parent_child = {}         # parent tweet id -> [parent time, child times...]
        echo_chamber_parent = {}  # parent tweet ids from echo chamber users
        for i, tid in enumerate(sorted_ids):
            tweet = tweets[tid]['tweet']
            parent = tweets[tid]['parent_tweet']
            cascade = tweets[tid]['cascade']
            if cascade < 2:  # skip singleton cascades
                continue
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:  # bot filter
                continue
            if tweet != parent:  # a retweet: attach it to its parent
                parent_child[parent] = parent_child.get(parent, [])
                if len(parent_child[parent]) == 0:
                    # the parent always precedes its children, so its time
                    # goes into index 0
                    parent_child[parent].append(
                        parser.parse(tweets[parent]['time']))
                parent_child[parent].append(new_list[i][1])
                tweet_depth[parent] = tweets[parent]['depth']
            else:  # cascade root
                parent_child[parent] = [new_list[i][1]]
            # is the parent user an echo chamber member?
            if tweets[tid]['parent'] in echo_chamber_users[postid]:
                echo_chamber_parent[parent] = 1
        # time spans per parent
        parent_child_diff = {}         # first-to-last span in minutes
        parent_child_median_diff = {}  # per-child delays from the parent
        for key in parent_child.keys():
            times = parent_child[key]
            parent_child_diff[key] = (max(times) - min(times)).total_seconds() / 60
            parent_child_median_diff[key] = []
            start_time = times[0]
            for time in times[1:]:
                parent_child_median_diff[key].append(
                    (time - start_time).total_seconds() / 60)
        for key in parent_child_diff:
            if key in echo_chamber_parent.keys():
                if parent_child_diff[key] == 0:  # parent with no children
                    continue
                echo_p[tweet_depth[key]].append(
                    np.median(parent_child_median_diff[key]))
                echo_v.append(parent_child_diff[key])
            else:
                if parent_child_diff[key] == 0:
                    continue
                necho_p[tweet_depth[key]].append(
                    np.median(parent_child_median_diff[key]))
                necho_v.append(parent_child_diff[key])
    return echo_v, necho_v, echo_p, necho_p
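# Example driver: compare the overall propagation spans for echo vs. non-echo
# parents. The summary shown here is an added illustration of how the return
# values might be consumed.
def _sketch_velocity_summary():
    import numpy as np
    echo_v, necho_v, echo_p, necho_p = rumor_propagation_velocity(
        'Data/echo_chamber2.json')
    print('echo parents:     median span %.1f min (n=%d)' %
          (np.median(echo_v), len(echo_v)))
    print('non-echo parents: median span %.1f min (n=%d)' %
          (np.median(necho_v), len(necho_v)))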
def mean_edge_homogeneity2(filename):
    # Edge homogeneity restricted to within-group edges: an echo chamber
    # parent only counts echo chamber children, and a non-echo parent only
    # counts non-echo children.
    e_homogeneity = []
    ne_homogeneity = []
    retweet_cache = {}
    echo_chamber_users = e_util.get_echo_chamber_users(filename)
    parent_child = {}
    files = os.listdir('RetweetNew')
    # pass 1: build the parent -> children map for every cascade
    for postid in files:
        parent_child[postid] = {}
        if retweet_cache.get(postid, None) is None:
            with open(dir_name + '%s' % postid, 'r') as f:
                tweets = json.load(f)
            retweet_cache[postid] = tweets
        else:
            tweets = retweet_cache[postid]
        echo_users = echo_chamber_users[postid]
        for tweet in tweets.values():
            if tweet['cascade'] == 1:  # skip singleton cascades
                continue
            if tweet['parent'] != tweet['user']:
                parent_child[postid][tweet['parent']] = \
                    parent_child[postid].get(tweet['parent'], [])
                if tweet['parent'] in echo_users:
                    # echo chamber parent: keep only echo chamber children
                    if tweet['user'] in echo_users:
                        parent_child[postid][tweet['parent']].append(tweet['user'])
                else:
                    # non-echo parent: keep only non-echo children
                    if tweet['user'] not in echo_users:
                        parent_child[postid][tweet['parent']].append(tweet['user'])
    # pass 2: mean edge homogeneity per parent (everything is cached now)
    for postid in files:
        tweets = retweet_cache[postid]
        echo_users = echo_chamber_users[postid]
        for tweet in tweets.values():
            if parent_child[postid].get(tweet['user'], None) is not None:
                # product of the parent's and each child's political score;
                # -999 marks users without a score
                p_score = get_polarity(tweet['user'])
                c_scores = [get_polarity(c_user)
                            for c_user in parent_child[postid][tweet['user']]]
                c_scores = list(filter(lambda x: x != -999, c_scores))
                if p_score == -999 or len(c_scores) == 0:
                    continue
                multiple = list(map(lambda x: x * p_score, c_scores))
                mean_edge_homogeneity = np.mean(multiple)
                if tweet['user'] in echo_users:
                    e_homogeneity.append(mean_edge_homogeneity)
                else:
                    ne_homogeneity.append(mean_edge_homogeneity)
    with open('Data/Figure/4_2_1.json', 'w') as f:
        json.dump({'e': e_homogeneity, 'ne': ne_homogeneity}, f)
    draw_cdf_plot([e_homogeneity, ne_homogeneity], '',
                  ['Echo chamber', 'Non-echo chamber'], '',
                  'echo_mean_edge_homogeneity')
    return e_homogeneity
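# Edge homogeneity, for reference: with polarity scores in [-1, 1], the
# homogeneity of the edge (u, v) is polarity(u) * polarity(v); positive
# values mean the two users lean the same way. A tiny worked sketch with
# made-up numbers:
def _sketch_edge_homogeneity():
    import numpy as np
    parent_polarity = 0.8                # hypothetical right-leaning parent
    child_polarities = [0.6, 0.9, -0.2]  # hypothetical children
    edge_h = [parent_polarity * c for c in child_polarities]  # [0.48, 0.72, -0.16]
    print('mean edge homogeneity:', np.mean(edge_h))          # ~0.3467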
def mean_edge_homogeneity(filename):
    # Mean edge homogeneity per parent, over all parent-child edges (no
    # within-group restriction, unlike mean_edge_homogeneity2 above).
    e_homogeneity = []
    ne_homogeneity = []
    retweet_cache = {}
    echo_chamber_users = e_util.get_echo_chamber_users(filename)
    parent_child = {}
    # pass 1: build the parent -> children map
    for postid in echo_chamber_users.keys():
        if retweet_cache.get(postid, None) is None:
            with open(dir_name + '%s' % postid, 'r') as f:
                tweets = json.load(f)
            retweet_cache[postid] = tweets
        else:
            tweets = retweet_cache[postid]
        parent_child[postid] = {}
        for tweet in tweets.values():
            if tweet['cascade'] == 1:  # skip singleton cascades
                continue
            if tweet['parent'] != tweet['user']:
                parent_child[postid][tweet['parent']] = \
                    parent_child[postid].get(tweet['parent'], [])
                parent_child[postid][tweet['parent']].append(tweet['user'])
    # pass 2: mean edge homogeneity per parent (everything is cached now)
    for postid in echo_chamber_users.keys():
        tweets = retweet_cache[postid]
        for tweet in tweets.values():
            if parent_child[postid].get(tweet['user'], None) is not None:
                # product of the parent's and each child's political score;
                # -999 marks users without a score
                p_score = get_polarity(tweet['user'])
                c_scores = [get_polarity(c_user)
                            for c_user in parent_child[postid][tweet['user']]]
                c_scores = list(filter(lambda x: x != -999, c_scores))
                if p_score == -999 or len(c_scores) == 0:
                    continue
                multiple = list(map(lambda x: x * p_score, c_scores))
                mean_edge_homogeneity = np.mean(multiple)
                if tweet['user'] in echo_chamber_users[postid]:
                    e_homogeneity.append(mean_edge_homogeneity)
                else:
                    ne_homogeneity.append(mean_edge_homogeneity)
    pdf.draw_pdf({'e': e_homogeneity, 'ne': ne_homogeneity}, '',
                 ['Echo chamber', 'Non-echo chamber'],
                 'Image/%s/mean_edge_homogeneity.png' % foldername)
    draw_cdf_plot([e_homogeneity, ne_homogeneity], '',
                  ['Echo chamber', 'Non-echo chamber'], '',
                  'mean_edge_homogeneity')
    return e_homogeneity
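# Added sketch: a quick distribution-level comparison of the echo and
# non-echo homogeneity samples produced above, using Welch's t-test; this is
# not part of the original plotting output.
def _sketch_compare_homogeneity(e_homogeneity, ne_homogeneity):
    from scipy import stats as sps
    # equal_var=False gives Welch's t-test, which does not assume equal
    # variances in the two groups
    print(sps.ttest_ind(e_homogeneity, ne_homogeneity, equal_var=False))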