# Beispiel #1
# 0
def get_depth(politic=None, veracity=None, echo_chamber=False):
    """Collect the maximum depth reached by each retweet cascade.

    Parameters
    ----------
    politic : bool or None
        True -> keep only political rumors, False -> only non-political,
        None -> no topic filtering.
    veracity : str or None
        Veracity label filter passed to get_veracity(); None disables it.
    echo_chamber : bool
        When True, additionally split cascades into those containing at
        least one echo-chamber user (e_d) and those without (ne_d).

    Returns
    -------
    (unique_d, e_d, ne_d)
        Dicts mapping cascade root tweet id -> depth. e_d and ne_d stay
        empty unless echo_chamber is True.
    """
    print(politic, veracity, echo_chamber)
    dir_name = "RetweetNew/"
    files = os.listdir(dir_name)
    unique_d = {}
    # BUG FIX: e_d / ne_d were previously created only when echo_chamber
    # was True, so the unconditional `return unique_d, e_d, ne_d` raised
    # NameError for the default call. Initialize them always.
    e_d = {}
    ne_d = {}
    echo_chamber_users = {}
    if echo_chamber:
        echo_chamber_users = e_util.get_echo_chamber_users('Data/echo_chamber2.json')

    breadth, depth, unique_users = e_util.get_cascade_max_breadth()
    for postid in files:
        # Optional veracity filter.
        if veracity is not None and not get_veracity(postid, veracity):
            continue

        # Optional politics filter (tri-state: True / False / None).
        if politic is True and not util.is_politics(postid):
            continue
        if politic is False and not util.is_non_politics(postid):
            continue

        echo_chamber_cascade_root = {}
        unique_root = {}
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)

        for tweet in tweets.values():
            # BUG FIX: only consult echo_chamber_users when it was loaded;
            # the old code dereferenced it even with echo_chamber=False.
            if echo_chamber and tweet['user'] in echo_chamber_users.get(postid, {}):
                echo_chamber_cascade_root[tweet['origin_tweet']] = 1

            unique_root[tweet['origin_tweet']] = 1

        print(len(unique_root), len(echo_chamber_cascade_root))
        for key in unique_root:
            unique_d[key] = depth[key]

            if echo_chamber:
                if key in echo_chamber_cascade_root:
                    e_d[key] = depth[key]
                else:
                    ne_d[key] = depth[key]

    return unique_d, e_d, ne_d
def political_alignment_pearson():
    """Correlate users' profile political polarity with the polarity of
    the content they share, for echo-chamber vs non-echo-chamber users.

    Reads 'Data/user_content_polarity.json' and every file in 'RetweetNew/';
    prints the two Pearson correlations and writes the collected scores to
    'Data/user_polarity_content_polarity.json'.
    """
    with open('Data/user_content_polarity.json', 'r') as f:
        content_polarity = json.load(f)

    echo_chamber_users = e_util.get_echo_chamber_users('Data/echo_chamber2.json')

    files = os.listdir('RetweetNew')
    e_user = {}     # echo-chamber user -> profile polarity
    ne_user = {}    # non-echo user -> profile polarity
    e_source = {}   # echo-chamber user -> shared-content polarity
    ne_source = {}  # non-echo user -> shared-content polarity
    Bot = bot.load_bot()
    for ccc, postid in enumerate(files):
        with open('RetweetNew/' + postid, 'r') as f:
            tweets = json.load(f)
        print(ccc, postid, len(tweets))
        echo_users = echo_chamber_users[postid]
        for tweet in tweets.values():
            user = tweet['user']

            # Skip bot accounts.
            if bot.check_bot(Bot, user) == 1:
                continue

            # Score each user only once, on first sighting across posts.
            if user in e_user or user in ne_user:
                continue

            # BUG FIX: get_polarity() may return None (the None check below
            # expects it); the old code called round(None, 4) and raised
            # TypeError before reaching that check.
            polarity = get_polarity(user)
            user_politic_score = round(polarity, 4) if polarity is not None else None
            content_politic_score = content_polarity.get(user, None)

            if user_politic_score is not None and content_politic_score is not None:
                if user in echo_users:
                    e_user[user] = user_politic_score
                    e_source[user] = content_politic_score
                else:
                    ne_user[user] = user_politic_score
                    ne_source[user] = content_politic_score

    e_keys = e_user.keys()
    ne_keys = ne_user.keys()
    # Align user/source value lists by key before correlating.
    print('echo', stats.pearsonr([e_user[key] for key in e_keys], [e_source[key] for key in e_keys]))
    print('necho', stats.pearsonr([ne_user[key] for key in ne_keys], [ne_source[key] for key in ne_keys]))

    with open('Data/user_polarity_content_polarity.json', 'w') as f:
        json.dump({'e_user':e_user, 'ne_user' : ne_user, 'e_source' : e_source, 'ne_source' : ne_source},f)
# Beispiel #3
# 0
def propagation_to_depth_politic(filename):
    """Compare cascade propagation speed for political vs non-political rumors.

    For each depth level 1..19, collects per cascade the minutes taken
    ('time_depth') and unique users involved ('user_depth') to reach that
    depth, then buckets cascades by topic (political / other) and by
    whether an echo-chamber user (resp. a degree-ranked echo-chamber user)
    participated in them.

    filename -- path to the echo-chamber users JSON (postid -> {user: ...}).

    Side effects: reads 'RetweetNew/' and 'Data/degree_ranked_users.json';
    saves several line plots via draw_time_to_depth_echo_chamber().
    """
    #get echo chamber cascade only 
    echo_chamber_values = {}
    non_echo_chamber_values = {} 
    echo_politics = {}; echo_non_politics = {}; non_echo_politics = {}; non_echo_non_politics = {};
    ranked_echo_politics = {}; ranked_echo_non_politics = {}
    politics = {}
    non_politics = {}
    # Pre-build {metric: {depth: [values, ...]}} buckets for depths 1..19.
    for item in ['time_depth', 'user_depth']:
        echo_chamber_values[item] = {}
        non_echo_chamber_values[item] = {}
        politics[item] = {}
        non_politics[item] = {}
        echo_politics[item] = {}
        echo_non_politics[item] = {}
        non_echo_politics[item] = {}
        non_echo_non_politics[item] = {}
        ranked_echo_politics[item] = {}
        ranked_echo_non_politics[item] = {}


        for i in range(1,20):
            echo_chamber_values[item][i] = []
            non_echo_chamber_values[item][i] = []
            politics[item][i] = []
            non_politics[item][i] = []
            echo_politics[item][i] = []
            echo_non_politics[item][i] = []
            non_echo_politics[item][i] = []
            non_echo_non_politics[item][i] = []
            ranked_echo_politics[item][i] = []
            ranked_echo_non_politics[item][i] = []

   
    echo_chamber_cascade_root = {} #cascade which echo chamber users participated in 
    ranked_echo_chamber_cascade_root = {} #cascade which echo chamber users participated in 
    cascade_veracity = {}  # root tweet id -> veracity label (collected but not read below)
    echo_chamber_users = {}
    politic_cascade = {} #contain cascade root of political rumors
    non_politic_cascade = {}
   
    echo_chamber_users = e_util.get_echo_chamber_users(filename)
    with open('Data/degree_ranked_users.json', 'r') as f:
        ranked_echo_chamber_users = json.load(f)
    print(ranked_echo_chamber_users.keys())

    files = os.listdir('RetweetNew')
    #for postid in echo_chamber_users.keys():
    for postid in files:
        
        #if not get_veracity(postid, 'Mixture,Mostly True,Mostly False'):
        #if not get_veracity(postid, 'False'):
        #    continue

        v = veracity_type(postid).title()
        #get origin tweet of echo chamber user 

        # Topic flag: 0 = unclassified, 1 = political, 2 = non-political.
        politic_num = 0 
        if util.is_politics(postid):
            politic_num = 1
        elif util.is_non_politics(postid):
            politic_num = 2

        with open('RetweetNew/%s'%postid, 'r') as f:
            tweets = json.load(f)

            for tweet in tweets.values():
                # Mark the cascade root when this tweet's author is an
                # echo-chamber user for this post. KeyError: post missing
                # from the echo-chamber mapping -> treated as no match.
                try:
                    if tweet['user'] in echo_chamber_users[postid].keys():
                        echo_chamber_cascade_root[tweet['origin_tweet']] = 1
                except KeyError :
                    pass

                # Same marking for degree-ranked echo-chamber users.
                try:
                    if tweet['user'] in ranked_echo_chamber_users[postid].keys():
                        ranked_echo_chamber_cascade_root[tweet['origin_tweet']] = 1
                except KeyError :
                    pass

                cascade_veracity[tweet['origin_tweet']] = v
                if politic_num == 1:
                    politic_cascade[tweet['origin_tweet']] = 1
                elif politic_num ==2 :
                    non_politic_cascade[tweet['origin_tweet']] = 1

    #print(set(cascade_veracity.values()))
    print("echo chamber cascade extraction done")

    # time_depth / user_depth: cascade root id -> {depth: measurement}.
    _, _, time_depth, _, user_depth = get_depth_time_series('False') 

    print("time series data load done ")
    echo_chamber_cascades = echo_chamber_cascade_root.keys()
    ranked_echo_chamber_cascades = ranked_echo_chamber_cascade_root.keys()

    print(len(ranked_echo_chamber_cascades))
    political_cascades = politic_cascade.keys()
    non_political_cascades = non_politic_cascade.keys()
    #print('echo chamber cascades')
    #print(echo_chamber_cascades)

    for key in time_depth.keys():

        if key in political_cascades:
            #political rumors
            if key in echo_chamber_cascades:
                echo = 1
            else : 
                echo = 0 

            if key in ranked_echo_chamber_cascades:
                ranked = 1
            else:
                ranked = 0
            # NOTE(review): range(1, max(...)) skips the deepest level;
            # time_to_depth_echo_chamber() uses max(...)+1 for the same
            # structure -- confirm whether this off-by-one is intentional.
            for i in range(1, max(time_depth[key].keys())):
                try:
                    politics['time_depth'][i].append(time_depth[key][i])
                    politics['user_depth'][i].append(user_depth[key][i])
                    if echo == 1:
                        #echo political
                        echo_politics['time_depth'][i].append(time_depth[key][i])
                        echo_politics['user_depth'][i].append(user_depth[key][i])
                    else: 
                        #non echo political
                        non_echo_politics['time_depth'][i].append(time_depth[key][i])
                        non_echo_politics['user_depth'][i].append(user_depth[key][i])

                    if ranked == 1:
                        #echo political
                        ranked_echo_politics['time_depth'][i].append(time_depth[key][i])
                        ranked_echo_politics['user_depth'][i].append(user_depth[key][i])


                except KeyError :
                    pass

        if key in non_political_cascades:
            if key in echo_chamber_cascades:
                echo = 1
            else : 
                echo = 0 
            if key in ranked_echo_chamber_cascades:
                ranked = 1
                print(222)
            else:
                ranked = 0

            for i in range(1, max(time_depth[key].keys())):
                try:
                    non_politics['time_depth'][i].append(time_depth[key][i])
                    non_politics['user_depth'][i].append(user_depth[key][i])
                   
                    if echo == 1:
                        #echo political
                        echo_non_politics['time_depth'][i].append(time_depth[key][i])
                        echo_non_politics['user_depth'][i].append(user_depth[key][i])
                    else: 
                        #non echo political
                        non_echo_non_politics['time_depth'][i].append(time_depth[key][i])
                        non_echo_non_politics['user_depth'][i].append(user_depth[key][i])

                    if ranked == 1:
                        #echo non political
                        print(222)
                        ranked_echo_non_politics['time_depth'][i].append(time_depth[key][i])
                        ranked_echo_non_politics['user_depth'][i].append(user_depth[key][i])

                except KeyError :
                    pass
       
       
    #draw time to depth, user to depth of cascade for echo chamber users participated or non echo chamer users participated 
    draw_time_to_depth_echo_chamber([politics['time_depth'], non_politics['time_depth']], ['Politics', 'Other'], 'Median Minutes', 'time_depth_politics_line')
    draw_time_to_depth_echo_chamber([politics['user_depth'], non_politics['user_depth']], ['Politics', 'Other'], 'Median Unique Users', 'user_depth_politics_line')
    
    #compare echo chamber users participated cascades and non echo chamber users participated cascades for politics and others 
    draw_time_to_depth_echo_chamber([echo_politics['time_depth'], non_echo_politics['time_depth']], ['Echo Chamber', 'Non Echo Chamber'], 'Median Minutes', 'time_depth_politics_echo_line')
    draw_time_to_depth_echo_chamber([echo_politics['user_depth'], non_echo_politics['user_depth']], ['Echo Chamber', 'Non Echo Chamber'], 'Median Unique Users', 'user_depth_politics_echo_line')
    draw_time_to_depth_echo_chamber([echo_non_politics['time_depth'], non_echo_non_politics['time_depth']], ['Echo Chamber', 'Non Echo Chamber'], 'Median Minutes', 'time_depth_non_politics_echo_line')
    draw_time_to_depth_echo_chamber([echo_non_politics['user_depth'], non_echo_non_politics['user_depth']], ['Echo Chamber', 'Non Echo Chamber'], 'Median Unique Users', 'user_depth_non_politics_echo_line')

    draw_time_to_depth_echo_chamber([echo_politics['time_depth'], non_echo_politics['time_depth'], ranked_echo_politics['time_depth']], ['Echo Chamber', 'Non Echo Chamber', 'Ranked Echo Chamber'], 'Median Minutes', 'time_depth_politics_echo_line_ranked')
    draw_time_to_depth_echo_chamber([echo_non_politics['time_depth'], non_echo_non_politics['time_depth'], ranked_echo_non_politics['time_depth']], ['Echo Chamber', 'Non Echo Chamber', 'Ranked Echo Chamber'], 'Median Minutes', 'time_depth_non_politics_echo_line_ranked')
# Beispiel #4
# 0
def time_to_depth_echo_chamber(filename):
    """Plot the time / unique users needed to reach each cascade depth,
    comparing cascades with vs without echo-chamber participants, also
    bucketed by veracity ('True' / 'False' / 'Mixed').

    filename -- path to the echo-chamber users JSON (postid -> {user: ...}).

    Side effects: reads 'RetweetNew/'; writes box/line/CDF plots and two
    JSON snapshots under 'Data/Figure/'.

    NOTE(review): `foldername` (used in the box-plot path below) is not
    defined in this function -- presumably a module-level global; confirm.
    """
    
    _, _, time_depth, _, user_depth = get_depth_time_series(None)    
    print(len(time_depth))
    #with open('Data/time_series_data.json', 'w') as f:
    #    json.dump({'time_depth' : time_depth, 'user_depth' : user_depth}, f)
    #with open('Data/time_series_data.json', 'r') as f:
    #    data = json.load(f)

    #time_depth = data['time_depth']
    #user_depth = data['user_depth']

    print("time series data load done ")
    echo_chamber_values = {}
    non_echo_chamber_values = {} 
   
    # Pre-build {metric: {depth: [values]}} buckets for depths 1..19.
    for item in ['time_depth', 'user_depth']:
        echo_chamber_values[item] = {}
        non_echo_chamber_values[item] = {}

        for i in range(1,20):
            echo_chamber_values[item][i] = []
            non_echo_chamber_values[item][i] = []
    Bot = bot.load_bot()  # NOTE(review): loaded but never used in this function
    echo_chamber_cascade_root = {}
    cascade_veracity = {}
    echo_chamber_users = e_util.get_echo_chamber_users(filename)
   
    files = os.listdir('RetweetNew')
    #collect echo chamber user participate cascade 
    #for postid in echo_chamber_users.keys():
    for postid in files:
        v = veracity_type(postid).title()
        
        #get origin tweet of echo chamber user 
        with open('RetweetNew/%s'%postid, 'r') as f:
            tweets = json.load(f)

            for tweet in tweets.values():
                try:
                    #if tweet['user'] in echo_chamber_users[postid].keys():
                    origin = tweet['origin']  # NOTE(review): unused local
                    otid = tweet['origin_tweet']  # NOTE(review): unused local
                    #if origin in echo_chamber_users[postid].keys():
                    if tweet['user'] in echo_chamber_users[postid].keys():
                        echo_chamber_cascade_root[tweet['origin_tweet']] = 1
                except KeyError :
                    pass

                cascade_veracity[tweet['origin_tweet']] = v
    
    print("echo chamber cascade extraction done")

    echo_chamber_cascades = echo_chamber_cascade_root.keys()

    print('echo chamber cascades')
    #print(echo_chamber_cascades)

    # e / n / r: veracity -> metric -> depth -> [values].
    e = {};  n = {}; r = {}; #echo, non echo, ranked echo 
    for item in ['True', 'False', 'Mixed']:
        e[item] = {}
        n[item] = {}
        r[item] = {}
        
        for d_type in ['user_depth', 'time_depth']:
            e[item][d_type] = {}
            n[item][d_type] = {}
            r[item][d_type] = {}

            for i in range(1, 20):
                e[item][d_type][i] = []
                n[item][d_type][i] = []
                r[item][d_type][i] = []  # NOTE(review): r is initialized but never filled or read

    for key in time_depth.keys():
        v = cascade_veracity[key]
        # Collapse every label other than exactly True/False into 'Mixed'.
        if v !='True' and  v != 'False':
            v = 'Mixed'

        if key in echo_chamber_cascades:
            #for i in range(1, max(time_depth[key].keys())+1):
            for i in range(1, max(time_depth[key].keys())+1):
                try:
                    echo_chamber_values['time_depth'][i].append(time_depth[key][i])
                    echo_chamber_values['user_depth'][i].append(user_depth[key][i])
                    e[v]['time_depth'][i].append(time_depth[key][i])
                    e[v]['user_depth'][i].append(user_depth[key][i])

                except KeyError:
                    pass
        else:
            for i in range(1, max(time_depth[key].keys())+1):
                try :
                    non_echo_chamber_values['time_depth'][i].append(time_depth[key][i])
                    non_echo_chamber_values['user_depth'][i].append(user_depth[key][i])
                    n[v]['time_depth'][i].append(time_depth[key][i])
                    n[v]['user_depth'][i].append(user_depth[key][i])

                except KeyError:
                    pass

    box = BoxPlot(1)
    box.set_multiple_data([echo_chamber_values['time_depth'], non_echo_chamber_values['time_depth']])
    box.set_ylog()
    box.set_label('Depth', 'Minutes to Depth')
    box.save_image('%s/time_depth_echo_chamber_box.png'%foldername)
    print(echo_chamber_values['time_depth'])    

    #draw time to depth, user to depth of cascade for echo chamber users participated or non echo chamer users participated 
    with open('Data/Figure/5_2_1.json', 'w') as f:
        json.dump([echo_chamber_values['time_depth'], non_echo_chamber_values['time_depth']], f)

    
    draw_time_to_depth_echo_chamber([echo_chamber_values['time_depth'], non_echo_chamber_values['time_depth']], ['echo chamber', 'no echo chamber'], 'median minutes', 'time_depth_echo_chamber_line')
    draw_time_to_depth_echo_chamber([echo_chamber_values['user_depth'], non_echo_chamber_values['user_depth']], ['echo chamber', 'no echo chamber'], 'median unique users', 'user_depth_echo_chamber_line')
    
    with open('Data/Figure/5_2_time.json', 'w') as f:
        json.dump({'e':echo_chamber_values['time_depth'][1], 'ne':non_echo_chamber_values['time_depth'][1]}, f)

    #draw cdf with top retweet 
    cdf = CDFPlot()
    cdf.set_label('Propagation Time', 'CDF')
    cdf.set_log(True)
    #cdf.set_ylog()
    cdf.set_data(echo_chamber_values['time_depth'][1], '')
    cdf.set_data(non_echo_chamber_values['time_depth'][1], '')
    cdf.save_image('Image/20181105/depth_propagation_time_cdf.png')

    """
def echo_chamber_diversity(filename):
    """Compare retweet-origin and source-news diversity of echo-chamber
    users against non-echo-chamber users, as CDF and box plots.

    filename -- echo-chamber users JSON path, or None to treat every post
    as having no echo-chamber users.

    Side effects: reads 'Retweet/' and per-user timeline files under
    '../Timeline/'; writes four plot images under 'Image/20181002/'.
    """
    Bot = bot.load_bot()  # NOTE(review): loaded but never used in this function
    dirname = 'Retweet/'
    files = os.listdir(dirname)
    
    if filename == None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)


    
    echo_tweet_diversity = []; echo_source_diversity = [];
    necho_tweet_diversity = []; necho_source_diversity = [];
    for postid in files:

        with open(dirname + postid) as f:
            tweets = json.load(f)

        # Users of this post that are NOT echo-chamber users.
        non_echo_users = {}
        for tweet in tweets.values():
            user = tweet['user']

            #non echo chamber collect
            if not user in echo_chamber_users[postid]:
                non_echo_users[user] = 1

        print(len(echo_chamber_users[postid]), len(non_echo_users))

        timeline_dir = '../Timeline/'
        #collect echo chamber users' source diversity
        # err/nerr count users whose timeline file is missing/unreadable.
        err = 0; nerr = 0
        for user in echo_chamber_users[postid]:
            try:
                with open(timeline_dir + user, 'r') as f:
                    user_tweets = json.load(f)
            except IOError as e:
                #print(e)
                err +=1
                continue

            tweet_diversity, source_diversity = get_diversity(user_tweets)

            if tweet_diversity != None:
                echo_tweet_diversity.append(tweet_diversity)
            if source_diversity != None:
                echo_source_diversity.append(source_diversity)

        # Same collection for non-echo-chamber users.
        for user in non_echo_users:
            try:
                with open(timeline_dir + user, 'r') as f:
                    user_tweets = json.load(f)
            except IOError as e:
                #print(e)
                nerr += 1
                continue

            tweet_diversity, source_diversity = get_diversity(user_tweets)
            if tweet_diversity != None:
                necho_tweet_diversity.append(tweet_diversity)
            if source_diversity != None:
                necho_source_diversity.append(source_diversity)

        #print(err, nerr)
        #break
                
    #CDF
    cdf = CDFPlot()
    cdf.set_label('Retweet Origin Diversity', 'CDF')
    #cdf.set_log(True)
    cdf.set_data(echo_tweet_diversity, 'Echo Chamber')
    cdf.set_data(necho_tweet_diversity, 'Non Echo Chamber')
    cdf.set_legends(['Echo CHamber', 'Non Echo CHamber'], 'User Type')
    cdf.save_image('Image/20181002/source_diversity_retweet_cdf.png')

    cdf = CDFPlot()
    cdf.set_label('Source News Diversity', 'CDF')
    #cdf.set_log(True)
    cdf.set_data(echo_source_diversity, 'Echo Chamber')
    cdf.set_data(necho_source_diversity, 'Non Echo Chamber')
    cdf.set_legends(['Echo CHamber', 'Non Echo CHamber'], 'User Type')
    cdf.save_image('Image/20181002/source_diversity_news_cdf.png')

    #BoxPlot
    box = BoxPlot(1)
    box.set_data([echo_tweet_diversity, necho_tweet_diversity],'')
    box.set_xticks(['Echo Chamber', 'Non Echo Chamber', 'All'])
    box.set_label('', 'Retweet Origin Diversity')
    box.save_image('Image/20181002/source_diversity_retweet.png')

    box = BoxPlot(1)
    box.set_data([echo_source_diversity, necho_source_diversity],'')
    box.set_xticks(['Echo Chamber', 'Non Echo Chamber', 'All'])
    box.set_label('', 'Source News Diversity')
    box.save_image('Image/20181002/source_diversity_news.png')
def political_alignment():
    """Correlate users' profile political alignment with the alignment of
    the news sources in their timelines, per post and overall.

    NOTE(review): `filename`, `dir_name` and `folder` are used here but are
    neither parameters nor locals -- presumably module-level globals set
    elsewhere; confirm before calling this in isolation.

    Side effects: reads 'Data/top500.tab', post files under `dir_name`, and
    per-user timelines under '../Timeline/'; writes per-post polarity files
    under 'Data/SelectiveExposure/' plus two summary JSONs under 'Data/'.
    """
    source_politic_score = {}
    echo_chamber_users = e_util.get_echo_chamber_users(filename)

    #load source information file
    # top500.tab: tab-separated rows of (domain, score in [-1, 1]); the
    # first row is a header. Scores are rescaled to [0, 1].
    with open('Data/top500.tab', 'r') as f:
        i = 0
        for row in f:
            if i != 0:
                items = row.split('\t')
                items[0] = items[0].replace("www.", "")
                source_politic_score[items[0]] = (float(items[1]) + 1) / 2
            i += 1

    #check users political score in the rumor propagation
    files = os.listdir(dir_name)
    timeline_dir = '../Timeline/'

    user_content_polarity = {} #user - polarity
    postid_content_polarity = {}
    all_echo_user_score = []; all_non_echo_user_score = []
    all_echo_source_score = []; all_non_echo_source_score = []

    for postid in files:
        user_score = []; echo_user_score = []; non_echo_user_score = []
        source_score = []; echo_source_score = []; non_echo_source_score = []

        path = '%s/selective_exposure_%s'%(folder, postid)
        # Restricted to a single post id (debug filter kept as-is).
        if postid != '142256':
            continue
        postid_content_polarity[postid] = {}
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)

        print(postid, len(tweets))
        users = list(set([tweet['user'] for tweet in tweets.values()]))
        #collect echo chamber users' source diversity
        err = 0; nerr = 0
        count_zero_users = 0
        for user in users:

            # Reuse the cached polarity if this user was already scored.
            if user_content_polarity.get(user) is not None:
                postid_content_polarity[postid][user] = user_content_polarity.get(user)
                continue

            try:
                with open(timeline_dir + user, 'r') as f:
                    user_tweets = json.load(f)
            except (IOError, ValueError):
                # Missing or malformed timeline file.
                err += 1
                continue

            urls, expanded_urls = timeline_urls(user_tweets)
            if len(urls) == 0:
                continue

            # Average the politic score over the user's known sources.
            # BUG FIX: the old code called .index() on / subscripted
            # dict.keys(), which fails on Python 3 (dict views have no
            # index and no subscripting); look up the dict directly --
            # same result, and O(1) per URL.
            count = 0
            p_sum = 0
            for url in urls:
                if url in source_politic_score:
                    count += 1
                    p_sum += source_politic_score[url]
            if count == 0:
                count_zero_users += 1
                continue
            p_mean = round(p_sum / count, 4)
            # BUG FIX: get_polarity() may return None (the check below
            # expects it); round(None, 4) raised TypeError.
            polarity = get_polarity(user)
            user_politic_score = round(polarity, 4) if polarity is not None else None
            if user_politic_score is not None:
                user_score.append(user_politic_score)
                source_score.append(p_mean)
                if user in echo_chamber_users[postid]:
                    echo_user_score.append(user_politic_score)
                    echo_source_score.append(p_mean)
                    all_echo_user_score.append(user_politic_score)
                    all_echo_source_score.append(p_mean)

                else:
                    non_echo_user_score.append(user_politic_score)
                    non_echo_source_score.append(p_mean)
                    all_non_echo_user_score.append(user_politic_score)
                    all_non_echo_source_score.append(p_mean)

            user_content_polarity[user] = p_mean
            postid_content_polarity[postid][user] = p_mean

        print('count zero users : %s'%count_zero_users)
        print('save selective exposure file')
        filefolder = 'Data/SelectiveExposure/'
        if not os.path.exists(filefolder):
            os.makedirs(filefolder)

        print('echo', stats.pearsonr(echo_user_score, echo_source_score)[0])
        print('necho', stats.pearsonr(non_echo_user_score, non_echo_source_score)[0])
        datapath = filefolder + postid + '_polarity'
        with open(datapath, 'w') as f :
            json.dump({'necho_user' : non_echo_user_score, 'necho_source' : non_echo_source_score, 'echo_user' : echo_user_score, 'echo_source' : echo_source_score}, f)

        snsplot.draw_echo_plot(non_echo_user_score, non_echo_source_score, echo_user_score, echo_source_score, path)

        with open(filefolder + postid, 'w') as f:
            json.dump(postid_content_polarity[postid], f)

    print('echo', stats.pearsonr(all_echo_user_score, all_echo_source_score))
    print('necho', stats.pearsonr(all_non_echo_user_score, all_non_echo_source_score))

    with open('Data/user_content_polarity.json', 'w') as f:
        json.dump(user_content_polarity, f)
    with open('Data/user_content_polarity_postid.json', 'w') as f:
        json.dump(postid_content_polarity, f)
def propagation_parent_to_child():
    """Compare, per depth, child counts and parent-to-child propagation
    delay (minutes) between echo-chamber and non-echo-chamber users.

    Side effects: reads 'RetweetNew/'; writes two box plots (path uses
    `folder` -- NOTE(review): not defined in this function, presumably a
    module-level global; confirm) and 'Data/Figure/5_3_1.json'.
    """
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)

    # NOTE(review): filename is hard-coded just above, so this None branch
    # is currently dead code.
    filename = 'Data/echo_chamber2.json'
    if filename == None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    echo_chamber_cascades = {}
    tweet_cache = {}
    '''
    for postid in echo_chamber_users.keys():
        
        users = echo_chamber_users[postid] #echo chamber users 

        with open('RetweetNew/' + postid, 'r') as f:
            tweets = json.load(f)
            tweet_cache[postid] = tweets
            
            for tweet in tweets.values():
                if tweet['user'] in users:
                    root_id = tweet['origin_tweet'] #root tweet id 
                    echo_chamber_cascades[root_id] = 1
        
    echo_chamber_cascades_ids = echo_chamber_cascades.keys()
    '''
    #print(echo_chamber_cascades_ids)
    # Per-depth buckets (depths 1..19):
    #   e_child / ne_child: child counts of echo / non-echo users
    #   e_time / ne_time: per-parent-tweet first-child delay in minutes
    e_child = {}
    ne_child = {}
    e_time = {}
    ne_time = {}
    ne_time2 = {}  # NOTE(review): initialized and converted below but never populated
    for i in range(1, 20):
        e_child[i] = []
        ne_child[i] = []
        e_time[i] = {}
        ne_time[i] = {}
        ne_time2[i] = {}

    print(len(echo_chamber_users.keys()))
    for ccc, postid in enumerate(files):
        #if postid != '150232' and  postid != '29947':
        #    continue
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)
        #tweets = tweet_cache[postid]

        #if not util.is_politics(postid):
        #if not util.is_non_politics(postid):
        #if not util.is_veracity(postid, 'False'):
        #if not util.is_veracity(postid, 'Mixture,Mostly False,Mostly True'):
        #    continue

        #order by timeline
        sort = {}
        for key in tweets.keys():
            tweet = tweets[key]
            sort[key] = parser.parse(tweet['time'])

        #sort by time
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]
        e_users = echo_chamber_users[postid]
        #e_users = echo_chamber_users.get(postid, [])
        print(len(e_users))
        for i, tid in enumerate(sorted_ids):
            tweet = tweets[tid]['tweet']
            parent = tweets[tid]['parent']
            origin = tweets[tid]['origin']
            root = tweets[tid]['origin_tweet']
            cascade = tweets[tid]['cascade']
            userid = tweets[tid]['user']
            ptid = tweets[tid]['parent_tweet']
            # Skip degenerate cascades (fewer than 2 tweets).
            if cascade < 2:
                continue

            #bot filter
            if bot.check_bot(Bot, userid) != 0:
                continue

            if userid in e_users:
                e_child[tweets[tid]['depth']].append(tweets[tid]['child'])
            else:
                ne_child[tweets[tid]['depth']].append(tweets[tid]['child'])

            if tweets[tid]['depth'] > 1:
                # Minutes between this tweet and its parent tweet.
                diff = (parser.parse(tweets[tid]['time']) - parser.parse(
                    tweets[ptid]['time'])).total_seconds() / 60
                # Debug check: tweets are time-sorted, so a stored delay
                # larger than the current diff would be unexpected.
                if e_time[tweets[ptid]['depth']].get(ptid, -1) > diff:
                    print(e_time[tweets[ptid]['depth']][ptid], diff)

                # Keep only the FIRST (earliest) child's delay per parent.
                if parent in e_users:
                    #                if origin in e_users:
                    if e_time[tweets[ptid]['depth']].get(ptid, -1) == -1:
                        e_time[tweets[ptid]['depth']][ptid] = diff
                else:
                    if ne_time[tweets[ptid]['depth']].get(ptid, -1) == -1:
                        ne_time[tweets[ptid]['depth']][ptid] = diff

        #if ccc == 5:
        #    break

    #remove child 0 count
    for i in range(1, 20):
        e_child[i] = [x for x in e_child[i] if x != 0]
        ne_child[i] = [x for x in ne_child[i] if x != 0]

    box = BoxPlot(1)
    box.set_multiple_data([e_child, ne_child])
    box.set_ylog()
    box.set_label('Depth', 'Child Count')
    box.save_image('Image/%s/child_num_wo_propagation.png' % folder)

    # Flatten per-parent dicts into plain value collections for plotting.
    for i in range(1, 20):
        e_time[i] = e_time[i].values()
        ne_time[i] = ne_time[i].values()
        ne_time2[i] = ne_time2[i].values()

    #print(e_time)
    #print(ne_time)
    box = BoxPlot(1)
    box.set_multiple_data([e_time, ne_time])
    box.set_ylog()
    box.set_label('Depth', 'Propagation Time')
    box.save_image('Image/%s/child_time_propagation.png' % folder)

    with open('Data/Figure/5_3_1.json', 'w') as f:
        json.dump(
            {
                'e_time': e_time,
                'ne_time': ne_time,
                'e_child': e_child,
                'ne_child': ne_child
            }, f)
def propagation_time_to_group(filename):
    """Collect retweet propagation delays, split by echo-chamber origin.

    For every non-root tweet in every cascade (bots excluded), measure the
    delay in minutes from (a) its parent tweet and (b) its cascade root, and
    bucket each delay by whether the parent (resp. root) tweet was posted by
    an echo-chamber user.

    Args:
        filename: path to the echo-chamber users JSON (as consumed by
            e_util.get_echo_chamber_users), or None to treat every post as
            having no echo-chamber users.

    Returns:
        Tuple (echo_p, necho_p, echo_r, necho_r) of lists of delays in
        minutes: parent-delays with echo-chamber parent, parent-delays
        without, root-delays with echo-chamber root, root-delays without.
    """
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)

    # Without a filename, fall back to an empty echo-chamber set per post.
    if filename is None:
        echo_chamber_users = {postid: {} for postid in files}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    echo_p = []
    echo_r = []
    necho_p = []
    necho_r = []
    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)

        # Tweet ids authored by echo-chamber users, restricted to cascades
        # of size >= 2 and with bot accounts filtered out.
        echo_chamber_tweet = {}
        for tid, tweet in tweets.items():
            if tweet['cascade'] < 2:
                continue
            # bot filter
            if bot.check_bot(Bot, tweet['user']) != 0:
                continue
            if tweet['user'] in echo_chamber_users[postid]:
                echo_chamber_tweet[tid] = 1

        for tweet in tweets.values():
            tid = tweet['tweet']
            pid = tweet['parent_tweet']
            rid = tweet['origin_tweet']

            # bot filter
            if bot.check_bot(Bot, tweet['user']) != 0:
                continue

            if tid != pid:
                # Non-root tweet: delay in minutes from root and parent.
                r_time = (parser.parse(tweet['time']) - parser.parse(
                    tweets[rid]['time'])).total_seconds() / 60
                p_time = (parser.parse(tweet['time']) - parser.parse(
                    tweets[pid]['time'])).total_seconds() / 60

                if pid in echo_chamber_tweet:
                    echo_p.append(p_time)
                else:
                    necho_p.append(p_time)

                if rid in echo_chamber_tweet:
                    echo_r.append(r_time)
                else:
                    necho_r.append(r_time)

        # progress indicator
        if ccc % 10 == 0:
            print(ccc)
    return echo_p, necho_p, echo_r, necho_r
def rumor_propagation_velocity(filename):
    """Measure how fast each parent tweet accumulates its children.

    For every parent tweet (bots and cascades of size < 2 excluded), record
    the total spread (first-to-last timestamp, minutes) and the median
    child delay, partitioned by whether the parent's author is an
    echo-chamber user.

    Args:
        filename: path to the echo-chamber users JSON, or None to treat
            every post as having no echo-chamber users.

    Returns:
        Tuple (echo_v, necho_v, echo_p, necho_p): flat lists of total
        spread minutes, and dicts keyed by depth 1..19 mapping to lists of
        per-parent median child delays.
    """
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)

    # Without a filename, fall back to an empty echo-chamber set per post.
    if filename is None:
        echo_chamber_users = {postid: {} for postid in files}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    echo_v = []
    necho_v = []

    # Per-depth median propagation times (parent -> its children).
    echo_p = {i: [] for i in range(1, 20)}
    necho_p = {i: [] for i in range(1, 20)}

    tweet_depth = {}

    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)

        # order tweet ids by posting time
        sort = {key: parser.parse(tweet['time'])
                for key, tweet in tweets.items()}
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]

        # parent tweet id -> [parent time, child time, child time, ...]
        parent_child = {}
        echo_chamber_parent = {}
        for i, tid in enumerate(sorted_ids):
            tweet_id = tweets[tid]['tweet']
            parent = tweets[tid]['parent_tweet']
            if tweets[tid]['cascade'] < 2:
                continue

            # bot filter
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:
                continue

            if tweet_id != parent:
                parent_child[parent] = parent_child.get(parent, [])
                # Parent always precedes its children in time order, so
                # seed index 0 with the parent's own timestamp.
                if len(parent_child[parent]) == 0:
                    parent_child[parent].append(
                        parser.parse(tweets[parent]['time']))
                parent_child[parent].append(new_list[i][1])
                tweet_depth[parent] = tweets[parent]['depth']
            else:
                # root tweet of a cascade: start its own entry
                parent_child[parent] = [new_list[i][1]]

            # NOTE(review): len(tweets[parent]) is non-zero for any
            # well-formed tweet dict; guard kept for behavior parity.
            if len(tweets[parent]) != 0:
                # mark parents authored by echo-chamber users
                if tweets[tid]['parent'] in echo_chamber_users[postid]:
                    echo_chamber_parent[parent] = 1

        # Per parent: total spread and per-child offsets from first time.
        parent_child_diff = {}
        parent_child_median_diff = {}
        for key, times in parent_child.items():
            parent_child_diff[key] = (
                (max(times) - min(times)).total_seconds() / 60)
            start_time = times[0]
            parent_child_median_diff[key] = [
                (t - start_time).total_seconds() / 60 for t in times[1:]]

        for key in parent_child_diff:
            # Zero spread (e.g. childless root) carries no velocity info.
            if parent_child_diff[key] == 0:
                continue
            if key in echo_chamber_parent:
                echo_p[tweet_depth[key]].append(
                    np.median(parent_child_median_diff[key]))
                echo_v.append(parent_child_diff[key])
            else:
                necho_p[tweet_depth[key]].append(
                    np.median(parent_child_median_diff[key]))
                necho_v.append(parent_child_diff[key])

    return echo_v, necho_v, echo_p, necho_p
def mean_edge_homogeneity2(filename):
    """Compare mean edge homogeneity of echo-chamber vs non-echo-chamber
    parents, counting only same-group edges.

    Builds, per post, a map from parent user to child users that share the
    parent's group (echo-chamber parents keep only echo-chamber children,
    and vice versa), then averages the product of parent and child polarity
    scores per parent. Results are dumped to 'Data/Figure/4_2_1.json' and
    plotted via draw_cdf_plot.

    Args:
        filename: path to the echo-chamber users JSON.

    Returns:
        List of mean edge homogeneity values for echo-chamber parents.
    """
    e_homogeneity = []
    ne_homogeneity = []
    retweet_cache = {}
    echo_chamber_users = e_util.get_echo_chamber_users(filename)

    parent_child = {}
    files = os.listdir('RetweetNew')  # for all cascades
    for postid in files:
        parent_child[postid] = {}
        if retweet_cache.get(postid) is None:
            with open(dir_name + '%s' % postid, 'r') as f:
                tweets = json.load(f)
                retweet_cache[postid] = tweets
        else:
            tweets = retweet_cache[postid]

        echo_users = echo_chamber_users[postid]
        # parent user -> list of same-group child users
        for tweet in tweets.values():
            # size-1 cascades have no edges
            if tweet['cascade'] == 1:
                continue

            if tweet['parent'] != tweet['user']:
                parent_child[postid][tweet['parent']] = parent_child[postid].get(tweet['parent'], [])
                if tweet['parent'] in echo_users:
                    # echo-chamber parent: keep only echo-chamber children
                    if tweet['user'] in echo_users:
                        parent_child[postid][tweet['parent']].append(tweet['user'])
                else:
                    # non-echo parent: keep only non-echo children
                    if tweet['user'] not in echo_users:
                        parent_child[postid][tweet['parent']].append(tweet['user'])

    for postid in files:
        if retweet_cache.get(postid) is None:
            # BUG FIX: the original read from the cache on a cache MISS
            # (tweets = retweet_cache[postid]); load from disk instead.
            with open(dir_name + '%s' % postid, 'r') as f:
                tweets = json.load(f)
                retweet_cache[postid] = tweets
        else:
            tweets = retweet_cache[postid]

        echo_users = echo_chamber_users[postid]

        for tweet in tweets.values():
            # only users that appear as parents have edges to score
            if parent_child[postid].get(tweet['user']) is None:
                continue

            # polarity of the parent and of each retained child
            p_score = get_polarity(tweet['user'])
            c_scores = [get_polarity(c_user)
                        for c_user in parent_child[postid][tweet['user']]]

            # -999 marks users with no known polarity
            c_scores = [s for s in c_scores if s != -999]
            if p_score == -999 or len(c_scores) == 0:
                continue

            # edge homogeneity = product of endpoint polarities
            mean_edge_homogeneity = np.mean([s * p_score for s in c_scores])

            if tweet['user'] in echo_users:
                e_homogeneity.append(mean_edge_homogeneity)
            else:
                ne_homogeneity.append(mean_edge_homogeneity)

    with open('Data/Figure/4_2_1.json', 'w') as f:
        json.dump({'e': e_homogeneity, 'ne': ne_homogeneity}, f)

    draw_cdf_plot([e_homogeneity, ne_homogeneity], '', ['Echo chamber', 'Non-echo chamber'], '', 'echo_mean_edge_homogeneity')
    return e_homogeneity
def mean_edge_homogeneity(filename):
    """Compare mean edge homogeneity of echo-chamber vs non-echo-chamber
    users over all parent-child retweet edges.

    Builds, per post, a map from parent user to all child users, then
    averages the product of parent and child polarity scores per parent.
    Plots a PDF and a CDF of the two distributions.

    Args:
        filename: path to the echo-chamber users JSON.

    Returns:
        List of mean edge homogeneity values for echo-chamber users.
    """
    e_homogeneity = []
    ne_homogeneity = []
    retweet_cache = {}
    echo_chamber_users = e_util.get_echo_chamber_users(filename)

    parent_child = {}
    for postid in echo_chamber_users.keys():
        if retweet_cache.get(postid) is None:
            with open(dir_name + '%s' % postid, 'r') as f:
                tweets = json.load(f)
                retweet_cache[postid] = tweets
        else:
            tweets = retweet_cache[postid]

        # parent user -> list of child users
        parent_child[postid] = {}
        for tweet in tweets.values():
            # size-1 cascades have no edges
            if tweet['cascade'] == 1:
                continue

            if tweet['parent'] != tweet['user']:
                parent_child[postid][tweet['parent']] = parent_child[postid].get(tweet['parent'], [])
                parent_child[postid][tweet['parent']].append(tweet['user'])

    for postid in echo_chamber_users.keys():
        if retweet_cache.get(postid) is None:
            # BUG FIX: the original read from the cache on a cache MISS
            # (tweets = retweet_cache[postid]); load from disk instead.
            with open(dir_name + '%s' % postid, 'r') as f:
                tweets = json.load(f)
                retweet_cache[postid] = tweets
        else:
            tweets = retweet_cache[postid]

        for tweet in tweets.values():
            # only users that appear as parents have edges to score
            if parent_child[postid].get(tweet['user']) is None:
                continue

            # polarity of the parent and of each child
            p_score = get_polarity(tweet['user'])
            c_scores = [get_polarity(c_user)
                        for c_user in parent_child[postid][tweet['user']]]

            # -999 marks users with no known polarity
            c_scores = [s for s in c_scores if s != -999]
            if p_score == -999 or len(c_scores) == 0:
                continue

            # edge homogeneity = product of endpoint polarities
            mean_edge_homogeneity = np.mean([s * p_score for s in c_scores])

            if tweet['user'] in echo_chamber_users[postid]:
                e_homogeneity.append(mean_edge_homogeneity)
            else:
                ne_homogeneity.append(mean_edge_homogeneity)

    pdf.draw_pdf({'e': e_homogeneity, 'ne': ne_homogeneity}, '', ['Echo chamber', 'Non-echo chamber'], 'Image/%s/mean_edge_homogeneity.png' % foldername)
    draw_cdf_plot([e_homogeneity, ne_homogeneity], '', ['Echo chamber', 'Non-echo chamber'], '', 'mean_edge_homogeneity')
    return e_homogeneity