# Shared imports for the examples below. `bot`, `e_util`, `BoxPlot`, `Tweet`,
# and `get_polarity` are project-local helpers; the module names and import
# paths used here are assumptions for illustration, not confirmed fixtures.
import os
import json

import numpy as np
from dateutil import parser
from scipy import stats

import bot                            # project-local bot-detection helpers
import echo_chamber_util as e_util    # assumed module name
from plot import BoxPlot              # assumed path of the box-plot wrapper
from tweet_parser.tweet import Tweet  # assumed source of the Tweet wrapper

folder = 'EchoChamber'  # assumed Image/ sub-directory used by the plot calls


def top_retweeted_users():
    """Count, for each non-bot user, how many retweets their tweets attracted."""
    Bot = bot.load_bot()
    dir_name = "RetweetNew/"
    files = os.listdir(dir_name)
    tweet_num = 0
    users = {}
    cascade = {}
    one_cascade = {}
    all_retweet = {}
    all_retweet_num = 0
    for postid in files:
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        for tweet in tweets.values():
            user = tweet['user']      
            origin = tweet['origin_tweet']
            cascade[origin] = 1
            if tweet['cascade'] == 1:
                one_cascade[origin] = 1
            if bot.check_bot(Bot, user) == 0:
                users[user] = users.get(user, 0) + tweet['child']
                tweet_num += 1
                all_retweet_num += tweet['child']

    print('all users', len(users))
    print('all tweets', tweet_num)
    print('all cascades', len(cascade))
    print('one cascade', len(one_cascade))
    print('all retweet num', all_retweet_num)
    with open('Data/top_retweeted_users', 'w') as f:
        json.dump(users, f)
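# Hedged usage sketch: rank the non-bot users that top_retweeted_users() just
# wrote to disk by the number of retweets they attracted. The file name and
# JSON layout come straight from the function above.
def print_top_retweeted(n=10):
    with open('Data/top_retweeted_users', 'r') as f:
        users = json.load(f)
    for user, num in sorted(users.items(), key=lambda kv: kv[1], reverse=True)[:n]:
        print(user, num)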
def top_participated_users(user_participation):
    """Collect the most active non-bot users, ranked by how many stories
    (postids) each one participated in.

    user_participation maps userid -> collection of postids; one way to
    build it is sketched after this function.
    """
    Bot = bot.load_bot()
    sort = sorted(user_participation,
                  key=lambda k: len(user_participation[k]),
                  reverse=True)

    top_100 = []  # top 100 users
    top_0_1 = []  # roughly the top 0.1% (first 200 ranked users)
    top_1 = []    # roughly the top 1% (first 2000 ranked users)
    for i, item in enumerate(sort):
        if bot.check_bot(Bot, item) == 0:
            if i < 100:
                top_100.append(item)
            if i < 200:
                top_0_1.append(item)
            if i < 2000:
                top_1.append(item)
            else:
                print('top 1% ', len(user_participation[item]))
                break
    return top_100, top_0_1, top_1
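# Hedged sketch: build the userid -> {postid, ...} participation map that
# top_participated_users() expects, from the same RetweetNew/ files used
# elsewhere in this module.
def build_user_participation(dir_name="RetweetNew/"):
    user_participation = {}
    for postid in os.listdir(dir_name):
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        for tweet in tweets.values():
            user_participation.setdefault(tweet['user'], set()).add(postid)
    return user_participation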
def political_alignment_pearson():
    with open('Data/user_content_polarity.json', 'r') as f:
        content_polarity = json.load(f)

    echo_chamber_users = e_util.get_echo_chamber_users('Data/echo_chamber2.json')

    files = os.listdir('RetweetNew')
    e_user = {}
    ne_user = {}
    e_source = {}
    ne_source = {}
    Bot = bot.load_bot()
    for ccc, postid in enumerate(files):
        with open('RetweetNew/' + postid, 'r') as f:
            tweets = json.load(f)
        print(ccc, postid, len(tweets))
        echo_users = echo_chamber_users[postid]
        for tweet in tweets.values():
            user = tweet['user']

            
            if bot.check_bot(Bot, user) == 1:
                continue

            # count each user only once across all stories
            if user in e_user or user in ne_user:
                continue

            user_politic_score = get_polarity(user)
            if user_politic_score is not None:
                user_politic_score = round(user_politic_score, 4)
            content_politic_score = content_polarity.get(user, None)

            if user_politic_score is not None and content_politic_score is not None:

                if user in echo_users:
                    e_user[user] = user_politic_score
                    e_source[user] = content_politic_score
                else:
                    ne_user[user] = user_politic_score
                    ne_source[user] = content_politic_score
    
    e_keys = list(e_user.keys())
    ne_keys = list(ne_user.keys())
    print('echo', stats.pearsonr([e_user[key] for key in e_keys],
                                 [e_source[key] for key in e_keys]))
    print('necho', stats.pearsonr([ne_user[key] for key in ne_keys],
                                  [ne_source[key] for key in ne_keys]))

    with open('Data/user_polarity_content_polarity.json', 'w') as f:
        json.dump({'e_user': e_user, 'ne_user': ne_user,
                   'e_source': e_source, 'ne_source': ne_source}, f)
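# Hedged usage sketch: the dump above makes the correlations reproducible
# without re-reading RetweetNew/. scipy.stats.pearsonr returns a
# (correlation, p-value) pair.
def recheck_alignment_pearson():
    with open('Data/user_polarity_content_polarity.json', 'r') as f:
        d = json.load(f)
    for label, users, sources in (('echo', 'e_user', 'e_source'),
                                  ('necho', 'ne_user', 'ne_source')):
        keys = list(d[users].keys())
        print(label, stats.pearsonr([d[users][k] for k in keys],
                                    [d[sources][k] for k in keys]))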
def bot_participation():
    Bot = bot.load_bot()

    dir_name = "RetweetNew/"
    files = os.listdir(dir_name)
    bot_list = []
    for postid in files:
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        users = [tweet['user'] for tweet in tweets.values()]
        bots = [bot.check_bot(Bot, user) for user in users]
        if bots.count(0) > 0:  # guard: a story with only bots would divide by zero
            bot_list.append(bots.count(1) / bots.count(0))

    box = BoxPlot(1)
    box.set_data(bot_list, '')
    box.set_xticks('bot_ratio')
    box.save_image('Image/bot_ratio_box.png')
def update():
    """Update the retweet graphs in RetweetNew/ with cascade size, bot flag,
    and child-count information for every tweet.
    """
    # `dir_name` and `files` were presumably module globals in the original;
    # they are set here the same way as in the other functions above.
    dir_name = "RetweetNew/"
    files = os.listdir(dir_name)

    #cascade calculation
    cascade = {}
    child = {}
    for postid in files:
        cascade[postid] = {}
        child[postid] = {}
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)

            for tweet in tweets.values():
                origin = tweet['origin_tweet']
                cascade[postid][origin] = cascade[postid].get(origin, 0) + 1
                parent_tweet = tweet['parent_tweet']
                if parent_tweet != tweet['tweet']:
                    child[postid][parent_tweet] = child[postid].get(parent_tweet, 0) + 1 

    #update
    Bot = bot.load_bot()
    for postid in files:
        print(postid)
        unique_origin = {}
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
            for tweet in tweets.values():
                tweet['cascade'] = cascade[postid][tweet['origin_tweet']]
                tweet['bot'] = bot.check_bot(Bot, tweet['user'])
                tweet['child'] = child[postid].get(tweet['tweet'], 0)
                unique_origin[tweet['origin_tweet']] = 1
        print('unique root', len(unique_origin))
        with open(dir_name+postid, 'w') as f:
            json.dump(tweets, f)
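# Usage note (sketch): update() rewrites every file in RetweetNew/ in place,
# adding the 'cascade', 'bot', and 'child' fields that the analysis functions
# above read, so it must run once before them, e.g.:
#
#   update()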
def get_echo_chamber_users(file_name):
    """Load (or build and cache) the postid -> {userid: 1} mapping of
    non-bot echo chamber members."""
    print(file_name)
    if 'echo_chamber2.json' in file_name:
        if os.path.exists('Data/echo_chamber_users2.json'):
            with open('Data/echo_chamber_users2.json', 'r') as f:
                echo_chamber_users = json.load(f)
            print('echo chamber size %s' % len(echo_chamber_users))
            return echo_chamber_users

    with open(file_name) as f:
        echo_chambers = json.load(f)

    Bot = bot.load_bot()

    echo_chamber_users = {}
    count = 0
    for key in echo_chambers:
        users = echo_chambers[key]

        # each key names a group of stories joined by '_'
        postids = key.split('_')

        #bot check
        for postid in postids:
            for user in users:
                if bot.check_bot(Bot, user) == 0:
                    echo_chamber_users.setdefault(postid, {})[user] = 1
        count += 1

    print('echo chamber size %s' % count)
    with open('Data/echo_chamber_users2.json', 'w') as f:
        json.dump(echo_chamber_users, f)

    return echo_chamber_users
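# Hedged usage sketch: the returned mapping is postid -> {userid: 1}, so
# membership tests are O(1) dict lookups, which is how the propagation
# functions below consume it.
def is_echo_chamber_member(echo_chamber_users, postid, userid):
    return userid in echo_chamber_users.get(postid, {})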
def propagation_parent_to_child():
    """Compare child counts and parent-to-child delays by depth for
    echo chamber vs. other users."""
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)

    filename = 'Data/echo_chamber2.json'
    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    e_child = {}
    ne_child = {}
    e_time = {}
    ne_time = {}
    for i in range(1, 20):
        e_child[i] = []
        ne_child[i] = []
        e_time[i] = {}
        ne_time[i] = {}

    print(len(echo_chamber_users.keys()))
    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)

        #order by timeline
        sort = {}
        for key, tweet in tweets.items():
            sort[key] = parser.parse(tweet['time'])

        #sort by time
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]
        e_users = echo_chamber_users[postid]
        print(len(e_users))
        for i, tid in enumerate(sorted_ids):
            parent = tweets[tid]['parent']
            cascade = tweets[tid]['cascade']
            userid = tweets[tid]['user']
            ptid = tweets[tid]['parent_tweet']
            if cascade < 2:
                continue

            #bot filter
            if bot.check_bot(Bot, userid) != 0:
                continue

            if userid in e_users:
                e_child[tweets[tid]['depth']].append(tweets[tid]['child'])
            else:
                ne_child[tweets[tid]['depth']].append(tweets[tid]['child'])

            if tweets[tid]['depth'] > 1:
                diff = (parser.parse(tweets[tid]['time']) - parser.parse(
                    tweets[ptid]['time'])).total_seconds() / 60
                # sanity check: the first delay recorded per parent should be
                # the smallest, since tweets are processed in time order
                if e_time[tweets[ptid]['depth']].get(ptid, -1) > diff:
                    print(e_time[tweets[ptid]['depth']][ptid], diff)

                if parent in e_users:
                    # keep only the first (earliest) child delay per parent
                    if e_time[tweets[ptid]['depth']].get(ptid, -1) == -1:
                        e_time[tweets[ptid]['depth']][ptid] = diff
                else:
                    if ne_time[tweets[ptid]['depth']].get(ptid, -1) == -1:
                        ne_time[tweets[ptid]['depth']][ptid] = diff


    #remove child 0 count
    for i in range(1, 20):
        e_child[i] = [x for x in e_child[i] if x != 0]
        ne_child[i] = [x for x in ne_child[i] if x != 0]

    box = BoxPlot(1)
    box.set_multiple_data([e_child, ne_child])
    box.set_ylog()
    box.set_label('Depth', 'Child Count')
    box.save_image('Image/%s/child_num_wo_propagation.png' % folder)

    for i in range(1, 20):
        e_time[i] = list(e_time[i].values())   # list() so json.dump below works
        ne_time[i] = list(ne_time[i].values())
    box = BoxPlot(1)
    box.set_multiple_data([e_time, ne_time])
    box.set_ylog()
    box.set_label('Depth', 'Propagation Time')
    box.save_image('Image/%s/child_time_propagation.png' % folder)

    with open('Data/Figure/5_3_1.json', 'w') as f:
        json.dump(
            {
                'e_time': e_time,
                'ne_time': ne_time,
                'e_child': e_child,
                'ne_child': ne_child
            }, f)
def propagation_time_to_group(filename):
    """Measure parent-to-child and root-to-child delays (minutes), split by
    whether the parent/root tweet came from an echo chamber member."""
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)

    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    echo_p = []
    echo_r = []
    necho_p = []
    necho_r = []
    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)

        #order by timeline
        sort = {}
        for key, tweet in tweets.items():
            sort[key] = parser.parse(tweet['time'])

        #sort by time
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]

        parent_start = {}
        root_start = {}
        echo_chamber_tweet = {}  # tweet ids posted by echo chamber members
        for i, tid in enumerate(sorted_ids):
            parent = tweets[tid]['parent_tweet']
            root = tweets[tid]['origin_tweet']
            cascade = tweets[tid]['cascade']
            if cascade < 2:
                continue

            #bot filter
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:
                continue

            #save all the parent, root start time

            if root_start.get(root, None) == None:
                root_start[root] = new_list[i][1]

            if parent_start.get(parent, None) == None:
                parent_start[parent] = new_list[i][1]

            if tweets[tid]['user'] in echo_chamber_users[postid]:
                echo_chamber_tweet[tid] = 1

        for tweet in tweets.values():
            tid = tweet['tweet']
            pid = tweet['parent_tweet']
            rid = tweet['origin_tweet']

            #bot filter
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:
                continue

            if tid != pid:
                #not root
                r_time = (parser.parse(tweets[tid]['time']) - parser.parse(
                    tweets[rid]['time'])).total_seconds() / 60
                p_time = (parser.parse(tweets[tid]['time']) - parser.parse(
                    tweets[pid]['time'])).total_seconds() / 60

                if tweet['parent_tweet'] in echo_chamber_tweet:
                    echo_p.append(p_time)
                else:
                    necho_p.append(p_time)

                if tweet['origin_tweet'] in echo_chamber_tweet:
                    echo_r.append(r_time)
                else:
                    necho_r.append(r_time)

        if ccc % 10 == 0:
            print(ccc)
    return echo_p, necho_p, echo_r, necho_r
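# Hedged usage sketch: summarize the four delay distributions returned above;
# np.median is used only for a quick numeric comparison of the groups.
def summarize_propagation_times(filename='Data/echo_chamber2.json'):
    echo_p, necho_p, echo_r, necho_r = propagation_time_to_group(filename)
    print('parent->child minutes: echo %.1f vs non-echo %.1f'
          % (np.median(echo_p), np.median(necho_p)))
    print('root->child minutes: echo %.1f vs non-echo %.1f'
          % (np.median(echo_r), np.median(necho_r)))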
def rumor_propagation_velocity(filename):
    """Delay from each parent tweet to its children (minutes), used as a
    proxy for propagation velocity, split by echo chamber parents."""
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)

    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    echo_v = []
    necho_v = []

    #propagation time to all node's children
    #parent --> last child
    echo_p = {}
    necho_p = {}
    for i in range(1, 20):
        echo_p[i] = []
        necho_p[i] = []

    tweet_depth = {}

    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)

        #order by timeline
        sort = {}
        for key, tweet in tweets.items():
            sort[key] = parser.parse(tweet['time'])

        #sort by time
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]

        #make one dictionary parent - children
        parent_child = {}
        echo_chamber_parent = {}
        for i, tid in enumerate(sorted_ids):
            tweet = tweets[tid]['tweet']
            parent = tweets[tid]['parent_tweet']
            cascade = tweets[tid]['cascade']
            if cascade < 2:
                continue

            #bot filter
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:
                continue

            if tweet != parent:

                parent_child[parent] = parent_child.get(parent, [])
                #the parent always comes earlier than its child in time order
                if len(parent_child[parent]) == 0:
                    #add the parent's own time at index 0
                    parent_child[parent].append(
                        parser.parse(tweets[parent]['time']))

                parent_child[parent].append(new_list[i][1])
                tweet_depth[parent] = tweets[parent]['depth']
            else:
                #root tweet of cascade
                parent_child[parent] = [new_list[i][1]]

            # is the parent an echo chamber member?
            if tweets[tid]['parent'] in echo_chamber_users[postid]:
                echo_chamber_parent[parent] = 1

        #insert time diff from start time
        parent_child_diff = {}
        parent_child_median_diff = {}
        for key in parent_child.keys():
            times = parent_child[key]
            parent_child_diff[key] = (
                (max(times) - min(times)).total_seconds() / 60)

            parent_child_median_diff[key] = []
            for i, time in enumerate(times):
                if i == 0:
                    start_time = time
                    continue
                parent_child_median_diff[key].append(
                    (time - start_time).total_seconds() / 60)

        for key in parent_child_diff:
            if key in echo_chamber_parent:
                if parent_child_diff[key] == 0:
                    continue  # parent with no observed spread time
                echo_p[tweet_depth[key]].append(
                    np.median(parent_child_median_diff[key]))
                echo_v.append(parent_child_diff[key])
            else:
                if parent_child_diff[key] == 0:
                    continue
                necho_p[tweet_depth[key]].append(
                    np.median(parent_child_median_diff[key]))
                necho_v.append(parent_child_diff[key])

    return echo_v, necho_v, echo_p, necho_p
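# Hedged usage sketch: compare overall spreading speed for echo chamber vs.
# other parents; the echo_p / necho_p returns keep the per-depth breakdown
# that propagation_parent_to_child() plots.
def summarize_velocity(filename='Data/echo_chamber2.json'):
    echo_v, necho_v, echo_p, necho_p = rumor_propagation_velocity(filename)
    print('parent-to-last-child minutes: echo %.1f vs non-echo %.1f'
          % (np.median(echo_v), np.median(necho_v)))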
def get_tweet(path):
    """Parse a file of raw tweet JSON lines into a retweet-network dict.

    Returns (status, t): status 1 if both followers and friends were fully
    collected for every unique user, 2 if only followers, 3 if only friends,
    0 otherwise (with None instead of t when there are 100 tweets or fewer).
    """
    t = {}
    unique_u = {}
    Bot = bot.load_bot()

    # read the raw lines directly; the original opened the file and then
    # created an unrelated fileinput.FileInput over the same path
    with open(path, 'r') as f:
        lines = f.readlines()

    for line in lines:
        tweet_dict = json.loads(line)
        tweet = Tweet(tweet_dict)
        t_id1 = tweet['id_str']
        u_id1 = tweet['user']['id_str']
        tweet1 = tweet['text']
        screen_name = tweet['user']['screen_name']
        time1 = tweet['created_at']
        unique_u[u_id1] = 1

        # the original passed the tweet id here; every other call site passes
        # a user id, so u_id1 is assumed to be the intended argument
        if bot.check_bot(Bot, u_id1) == 1:
            continue
        #is this a retweet (or a quote)?
        try:
            retweet = tweet.get('retweeted_status', None)
            if retweet is None:
                retweet = tweet.get('quoted_status', None)

            if retweet is None:
                t[t_id1] = {
                    'user': u_id1,
                    'parent': u_id1,
                    'origin': u_id1,
                    'confirm': True,
                    'text': tweet1,
                    'origin_tweet': t_id1,
                    'parent_tweet': t_id1,
                    'tweet': t_id1,
                    'screen_name': screen_name,
                    'origin_name': screen_name,
                    'time': time1,
                    'depth': 1
                }
            else:
                tweet2 = retweet['text']
                t_id2 = retweet['id_str']
                u_id2 = retweet['user']['id_str']
                origin_name = retweet['user']['screen_name']
                time2 = retweet['created_at']
                t[t_id1] = {
                    'user': u_id1,
                    'parent': u_id2,
                    'origin': u_id2,
                    'confirm': False,
                    'text': tweet1,
                    'origin_tweet': t_id2,
                    'parent_tweet': t_id2,
                    'tweet': t_id1,
                    'screen_name': screen_name,
                    'origin_name': origin_name,
                    'time': time1,
                    'depth': 2
                }
                t[t_id2] = {
                    'user': u_id2,
                    'parent': u_id2,
                    'origin': u_id2,
                    'confirm': True,
                    'text': tweet2,
                    'origin_tweet': t_id2,
                    'parent_tweet': t_id2,
                    'tweet': t_id2,
                    'screen_name': origin_name,
                    'origin_name': origin_name,
                    'time': time2,
                    'depth': 1
                }
                unique_u[u_id2] = 1
        except KeyError as e:
            #malformed retweet fields: fall back to treating it as an original
            print("KeyError while parsing retweet fields:", e)
            t[t_id1] = {
                'user': u_id1,
                'parent': u_id1,
                'origin': u_id1,
                'confirm': True,
                'text': tweet1,
                'origin_tweet': t_id1,
                'parent_tweet': t_id1,
                'tweet': t_id1,
                'screen_name': screen_name,
                'origin_name': screen_name,
                'time': time1,
                'depth': 1
            }
    # If the collected follower and friend counts match the number of unique
    # users, the retweet network can be reconstructed; also require more than
    # 100 tweets, otherwise return None for the network.
    f_count = 0
    fr_count = 0
    for uid in unique_u.keys():
        if os.path.exists('../Data/followers/followers/' + uid):
            f_count += 1
        if os.path.exists('../Data/friends/friends/' + uid):
            fr_count += 1

    if len(t) <= 100:
        return 0, None
    print(path)
    print(
        'unique_users : %s , collected followers : %s, collected friends : %s'
        % (len(unique_u), f_count, fr_count))
    if f_count == len(unique_u) and fr_count == len(unique_u):
        print('%s : %s tweets' % (path, len(t)))
        return 1, t
    elif f_count == len(unique_u):
        print('%s : %s tweets' % (path, len(t)))
        return 2, t
    elif fr_count == len(unique_u):
        return 3, t
    else:
        return 0, t
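# Hedged usage sketch: build RetweetNew/-style files from a directory of raw
# tweet-JSON-line files. The '../Data/timelines/' input path and the choice
# to keep only statuses 1 and 2 are assumptions for illustration; the status
# codes themselves come from get_tweet() above.
def build_retweet_files(in_dir='../Data/timelines/', out_dir='RetweetNew/'):
    for name in os.listdir(in_dir):
        status, t = get_tweet(in_dir + name)
        if status in (1, 2) and t is not None:
            with open(out_dir + name, 'w') as f:
                json.dump(t, f)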