Exemple #1
0
def fix_tweets_with_empty_flags():
    """Rebuild the flags of relevant tweets whose ``flag.keyword`` list is empty.

    The keyword metadata comes from the ``config.json`` file located one
    directory above this script.  For every matching tweet a fresh flag is
    created, filled in from the tweet entities and written back to the
    ``tweets`` collection.
    """
    dbm = DBManager('tweets')
    config_path = pathlib.Path(__file__).parents[1].joinpath('config.json')
    # only the per-keyword metadata is needed here, the keyword list is ignored
    _, k_metadata = parse_metadata(get_config(config_path)['metadata'])
    flagless_query = {'flag.keyword': {'$size': 0}, 'relevante': 1}
    for record in dbm.search(flagless_query):
        tweet_obj = record['tweet_obj']
        tweet_id = tweet_obj['id_str']
        logging.info('Updating flags of tweet {0}'.format(tweet_id))
        # create_flag also returns headers, which are not needed for the update
        fresh_flag, _ = create_flag(k_metadata)
        tweet_entities = get_entities_tweet(tweet_obj)
        fresh_flag = add_values_to_flags(fresh_flag, tweet_entities, k_metadata)
        dbm.update_record({'tweet_obj.id_str': tweet_id}, fresh_flag)


#if __name__ == '__main__':
#    fix_tweets_with_empty_flags()
def compute_tweets_local_date(force_computation=False, include_hour=False):
    """Store formatted publication date fields on the tweets collection.

    :param force_computation: when true every tweet is processed; otherwise
        only tweets that still lack ``tweet_py_datetime``.
    :param include_hour: when true the two-digit hour is stored as well, under
        ``tweet_py_hour``.

    The publication datetime is obtained from ``get_py_date`` (presumably a
    local-time conversion, judging by the function name -- confirm there).
    """
    dbm = DBManager('tweets')
    query = {} if force_computation else {'tweet_py_datetime': {'$exists': 0}}
    for record in dbm.search(query, only_relevant_tws=False):
        tweet = record['tweet_obj']
        published = get_py_date(tweet)
        date_fields = {}
        date_fields['tweet_py_datetime'] = datetime.strftime(
            published, '%m/%d/%y %H:%M:%S')
        date_fields['tweet_py_date'] = datetime.strftime(published, '%m/%d/%y')
        if include_hour:
            date_fields['tweet_py_hour'] = datetime.strftime(published, '%H')
        dbm.update_record({'tweet_obj.id_str': tweet['id_str']}, date_fields)
Exemple #3
0
class NetworkAnalyzer:
    __dbm_tweets = None
    __dbm_users = None
    __dbm_networks = None
    __network = None
    __graph = None
    __nodes = set()
    __unknown_users = set()
    __node_sizes = None

    def __init__(self):
        """Open the three collections and start from an empty network.

        The node set and the unknown-user set are created here, per instance.
        Relying on the class-level ``set()`` defaults would make every
        analyzer add to (and read from) the very same two sets.
        """
        self.__dbm_tweets = DBManager('tweets')
        self.__dbm_users = DBManager('users')
        self.__dbm_networks = DBManager('networks')
        self.__network = []
        # rebind the mutable containers so state is not shared across instances
        self.__nodes = set()
        self.__unknown_users = set()

    def __computer_ff_ratio(self, friends, followers):
        if followers > 0 and friends > 0:
            return friends / followers
        else:
            return 0

    # Get the incoming interactions of a given user
    def get_in_interactions(self, user_screen_name):
        # compute in interactions, meaning, interactions in which the user
        # was mentioned, retweeted, quoted, replied
        in_inter_query = {'interactions.' + user_screen_name: {'$exists': 1},
                          'screen_name': {'$ne': user_screen_name}}
        n_users = self.__dbm_users.search(in_inter_query)
        in_interactions_dict, in_rts, in_rps = {}, {}, {}
        in_qts, in_mts = {}, {}
        total_in_interactions = 0
        total_in_retweets, total_in_replies = 0, 0
        total_in_mentions, total_in_quotes = 0, 0
        for n_user in n_users:
            n_user_interactions = n_user['interactions']
            for i_user, interactions in n_user_interactions.items():
                if i_user == user_screen_name:
                    in_interactions_dict[n_user['screen_name']] = interactions['total']
                    total_in_interactions += interactions['total']
                    if 'retweets' in interactions.keys():
                        total_in_retweets += interactions['retweets']
                        in_rts[n_user['screen_name']] = interactions['retweets']
                    if 'replies' in interactions.keys():
                        total_in_replies += interactions['replies']
                        in_rps[n_user['screen_name']] = interactions['replies']
                    if 'mentions' in interactions.keys():
                        total_in_mentions += interactions['mentions']
                        in_mts[n_user['screen_name']] = interactions['mentions']
                    if 'quotes' in interactions.keys():
                        total_in_quotes += interactions['quotes']
                        in_qts[n_user['screen_name']] = interactions['quotes']
        in_interactions_obj = {
            'total': {
                'count': total_in_interactions,
                'details': in_interactions_dict
            },
            'replies': {
                'count': total_in_replies,
                'details': in_rps
            },
            'retweets': {
                'count': total_in_retweets,
                'details': in_rts
            },
            'mentions': {
                'count': total_in_mentions,
                'details': in_mts
            },
            'quotes': {
                'count': total_in_quotes,
                'details': in_qts
            }
        }
        user_dict = {
            'in_interactions': in_interactions_obj
        }
        return user_dict

    # Get the outgoing interactions of a given user
    def get_out_interactions(self, user_screen_name):
        user = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        # compute out interactions, meaning, interactions originated by
        # the user
        user_interactions = user['interactions']
        out_interactions_dict, out_rts = {}, {}
        out_rps, out_qts, out_mts = {}, {}, {}
        total_out_interactions, total_out_retweets = 0, 0
        total_out_mentions, total_out_replies = 0, 0
        total_out_quotes = 0
        for recipient, interactions in user_interactions.items():
            out_interactions_dict[recipient] = interactions['total']
            total_out_interactions += interactions['total']
            if 'retweets' in interactions:
                total_out_retweets += interactions['retweets']
                out_rts[recipient] = interactions['retweets']
            if 'replies' in interactions:
                total_out_replies += interactions['replies']
                out_rps[recipient] = interactions['replies']
            if 'mentions' in interactions:
                total_out_mentions += interactions['mentions']
                out_mts[recipient] = interactions['mentions']
            if 'quotes' in interactions:
                total_out_quotes += interactions['quotes']
                out_qts[recipient] = interactions['quotes']
        out_interactions_obj = {
            'total': {
                'count': total_out_interactions,
                'details': out_interactions_dict
            },
            'replies': {
                'count': total_out_replies,
                'details': out_rps
            },
            'retweets': {
                'count': total_out_retweets,
                'details': out_rts
            },
            'mentions': {
                'count': total_out_mentions,
                'details': out_mts
            },
            'quotes': {
                'count': total_out_quotes,
                'details': out_qts
            }
        }
        # compile all information in a dictionary
        user_dict = {
            'out_interactions': out_interactions_obj
        }
        return user_dict

    def create_users_db(self, clear_collection=False):
        """Create or refresh one document per unique tweet author in ``users``.

        :param clear_collection: when true the users collection is emptied
            before it is repopulated.

        Each document carries the user's counters, the friends/followers
        ratio, the most interacted party/movement (first entry returned by the
        tweets collection, or ``''`` when there is none) and the party and
        movement reported by ``UserPoliticalPreference``.
        """
        logging.info('::. Network Analyzer: Creating database of users, it can take several minutes, please wait_')
        if clear_collection:
            self.__dbm_users.clear_collection()
        users = self.__dbm_tweets.get_unique_users()
        users_count = len(users)
        logging.info('::. Network Analyzer: Extracted {0} unique users from the database...'.format(users_count))
        # Built once, on first use, instead of once per user: its constructor
        # opens database handles and re-reads the configuration file.
        upp = None
        for progress, user in enumerate(users, start=1):
            screen_name = user['screen_name']
            db_user = {
                'screen_name': screen_name,
                'friends': user['friends'],
                'followers': user['followers'],
                'ff_ratio': self.__computer_ff_ratio(user['friends'], user['followers']),
                'interactions': user['interactions'],
                'tweets': user['tweets_count'],
                'original_tweets': user['original_count'],
                'rts': user['retweets_count'],
                'qts': user['quotes_count'],
                'rps': user['replies_count'],
                'verified': user['verified']
            }
            # Most interacted party and movement.  Both keys are always
            # written; the movement is only looked up for users that have at
            # least one associated party.
            most_party, most_movement = '', ''
            user_parties = self.__dbm_tweets.get_party_user(screen_name)
            user_parties_count = len(user_parties)
            logging.debug('::. Network Analyzer: User {0} has {1} associated parties...'
                          .format(screen_name, user_parties_count))
            if user_parties_count > 0:
                most_party = user_parties[0]['partido']
                user_movements = self.__dbm_tweets.get_movement_user(screen_name)
                user_movements_count = len(user_movements)
                logging.debug('::. Network Analyzer: User {0} has {1} associated movements...'
                              .format(screen_name, user_movements_count))
                if user_movements_count > 0:
                    most_movement = user_movements[0]['movimiento']
            db_user.update({'most_interacted_party': most_party,
                            'most_interacted_movement': most_movement})

            # Party and movement as reported by UserPoliticalPreference
            if upp is None:
                upp = UserPoliticalPreference()
            user_party = upp.get_user_political_party(screen_name)
            user_movement = upp.get_user_political_movement(screen_name)
            db_user.update({'party': user_party, 'movement': user_movement})

            filter_query = {'screen_name': screen_name}
            logging.debug('::. Network Analyzer: Updating/creating user {0} ({1}/{2})...'
                          .format(screen_name, progress, users_count))
            self.__dbm_users.update_record(filter_query, db_user, create_if_doesnt_exist=True)

    def generate_network(self, subnet_query={}, depth=1, file_name='network', override_net=False):
        net_query = subnet_query.copy()
        net_query.update({'depth': depth})
        ret_net = self.__dbm_networks.search(net_query)
        # the net doesn't exist yet, let's create it
        if ret_net.count() == 0 or override_net:
            logging.info('Generating the network, it can take several minutes, please wait_')
            users = self.__dbm_users.search(subnet_query)
            # for each user generate his/her edges
            for user in users:
                if 'ff_ratio' in user.keys():
                    u_ff_ratio = user['ff_ratio']
                else:
                    u_ff_ratio = self.__computer_ff_ratio(user['friends'], user['followers'])
                pbb_score = user['bot_analysis']['pbb'] if 'bot_analysis' in user.keys() else ''
                self.__nodes.add(tuple({'screen_name': user['screen_name'], 'party': user['party'],
                                        'movement': user['movement'], 'ff_ratio': u_ff_ratio,
                                        'pbb': pbb_score}.items()))
                for interacted_user, interactions in user['interactions'].items():
                    iuser = self.__dbm_users.find_record({'screen_name': interacted_user})
                    if not iuser:
                        if depth > 1:
                            iuser_ffratio = self.__get_ffratio(interacted_user)
                            if not iuser_ffratio:
                                self.__unknown_users.add(interacted_user)
                                continue
                        else:
                            self.__unknown_users.add(interacted_user)
                            continue
                    else:
                        if 'ff_ratio' in iuser.keys():
                            i_ff_ratio = iuser['ff_ratio']
                        else:
                            i_ff_ratio = self.__computer_ff_ratio(iuser['friends'], iuser['followers'])

                    pbb_iuser_score = user['bot_analysis']['pbb'] if 'bot_analysis' in iuser.keys() else ''
                    self.__nodes.add(tuple({'screen_name': iuser['screen_name'], 'party': iuser['party'],
                                            'movement': iuser['movement'], 'ff_ratio': i_ff_ratio,

                                            'pbb': pbb_iuser_score}.items()))
                    edge = {
                        'nodeA': {'screen_name': user['screen_name'], 'ff_ratio': u_ff_ratio,
                                  'party': user['party'], 'movement': user['movement'],
                                  'pbb': pbb_score},
                        'nodeB': {'screen_name': interacted_user, 'ff_ratio': i_ff_ratio,
                                  'party': iuser['party'], 'movement': iuser['movement'],
                                  'pbb': pbb_iuser_score},
                        'weight': interactions['total']
                    }
                    self.__network.append(edge)
            logging.info('Created a network of {0} nodes and {1} edges'.format(len(self.__nodes), len(self.__network)))
            logging.info('Unknown users {0}'.format(len(self.__unknown_users)))
            # save the net in a gefx file for posterior usage
            f_name = self.save_network_in_gexf_format(file_name)
            logging.info('Saved the network in the file {0}'.format(f_name))
            db_net = {'file_name': str(f_name)}
            db_net.update(net_query)
            self.__dbm_networks.save_record(db_net)
        else:
            f_net = ret_net[0]
            logging.info('The network was already generated, please find it at {0}'.format(f_net['file_name']))

    def create_graph(self):
        """Build a directed graph out of the edges collected so far.

        Every stored edge becomes an arc from ``nodeA`` to ``nodeB`` whose
        ``weight`` is the integer number of interactions.  The graph is kept
        on the instance; nothing is returned.
        """
        logging.info('Creating the graph, please wait_')
        self.__graph = net.DiGraph()
        # friends/followers ratio of each source user; it is filled in here
        # but not used any further inside this method
        ratio_by_user = defaultdict(lambda: 0.0)
        for link in self.__network:
            source = link['nodeA']
            origin = source['screen_name']
            destination = link['nodeB']['screen_name']
            interactions_count = link['weight']
            origin_ratio = source['ff_ratio']
            self.__graph.add_edge(origin, destination, weight=int(interactions_count))
            ratio_by_user[origin] = float(origin_ratio)
        # NOTE: centring the graph around its highest-degree node (an ego
        # graph) was sketched here at some point but is not performed.

    def get_graph_nodes(self):
        return len(self.__nodes)

    def get_graph_edges(self):
        return len(self.__network)

    def get_graph(self):
        return self.__graph

    def get_node_sizes(self):
        return self.__node_sizes

    def __get_ffratio(self, screen_name):
        query = {
            '$or': [
                {'tweet_obj.user.screen_name': screen_name},
                {'tweet_obj.retweeted_status.user.screen_name': screen_name},
                {'tweet_obj.quoted_status.user.screen_name': screen_name}
            ]
        }
        tweet_obj = self.__dbm_tweets.find_record(query)
        if tweet_obj:
            tweet = tweet_obj['tweet_obj']
            if 'retweeted_status' in tweet.keys():
                return self.__computer_ff_ratio(tweet['retweeted_status']['user']['friends_count'],
                                                tweet['retweeted_status']['user']['followers_count'])
            elif 'quoted_status' in tweet.keys():
                return self.__computer_ff_ratio(tweet['quoted_status']['user']['friends_count'],
                                                tweet['quoted_status']['user']['followers_count'])
            else:
                return self.__computer_ff_ratio(tweet['user']['friends_count'],
                                                tweet['user']['followers_count'])
        else:
            return None

    def save_network_in_gexf_format(self, file_name):
        """Write the collected nodes and edges to ``<file_name>.gexf``.

        The file is created in the ``sna/gefx`` directory two levels above
        this module.  Node attributes are party, movement, friends/followers
        ratio and pbb score; every edge carries its interaction weight.

        :param file_name: base name of the file, also used as description.
        :return: path of the written file.
        :raises ValueError: if an edge refers to a screen name that is not in
            the node set.
        """
        from xml.sax.saxutils import escape

        def xml_attr(value):
            # values end up inside double-quoted attributes, so the quote
            # character must be escaped in addition to &, < and >
            return escape(str(value), {'"': '&quot;'})

        today = datetime.strftime(datetime.now(), '%m/%d/%y')
        f_name = pathlib.Path(__file__).parents[2].joinpath('sna', 'gefx', file_name+'.gexf')
        with open(str(f_name), 'w', encoding='utf-8') as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write('<gexf xmlns="http://www.gexf.net/1.2draft" xmlns:viz="http://www.gexf.net/1.1draft/viz" '
                    'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                    'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd" '
                    'version="1.2">\n')
            f.write('<meta lastmodifieddate="{0}">\n'.format(today))
            f.write('<creator>PoliticBots</creator>\n')
            f.write('<description>{0}</description>\n'.format(escape(str(file_name))))
            f.write('</meta>\n')
            f.write('<graph mode="static" defaultedgetype="directed">\n')
            # add data attributes
            f.write('<attributes class="node">\n')
            f.write('<attribute id="0" title="party" type="string"/>\n')
            f.write('<attribute id="1" title="movement" type="string"/>\n')
            f.write('<attribute id="2" title="ff_ratio" type="float"/>\n')
            f.write('<attribute id="3" title="pbb" type="float"/>\n')
            f.write('</attributes>\n')
            # add nodes, remembering the id given to each screen name
            f.write('<nodes>\n')
            node_ids = {}
            for node_id, node_tup in enumerate(self.__nodes):
                node = dict(node_tup)
                f.write('<node id="{0}" label="{1}">\n'.format(node_id, xml_attr(node['screen_name'])))
                f.write('<attvalues>\n')
                f.write('<attvalue for="0" value="{0}"/>\n'.format(xml_attr(node['party'])))
                f.write('<attvalue for="1" value="{0}"/>\n'.format(xml_attr(node['movement'])))
                f.write('<attvalue for="2" value="{0}"/>\n'.format(xml_attr(node['ff_ratio'])))
                f.write('<attvalue for="3" value="{0}"/>\n'.format(xml_attr(node['pbb'])))
                f.write('</attvalues>\n')
                f.write('</node>\n')
                # the same screen name may appear in more than one node tuple;
                # edges are attached to its first occurrence
                node_ids.setdefault(node['screen_name'], node_id)
            f.write('</nodes>\n')
            # add edges
            f.write('<edges>\n')
            for edge_id, edge in enumerate(self.__network):
                try:
                    id_vertexA = node_ids[edge['nodeA']['screen_name']]
                    id_vertexB = node_ids[edge['nodeB']['screen_name']]
                except KeyError as err:
                    raise ValueError('{0} is not in the node set'.format(err)) from err
                f.write('<edge id="{0}" source="{1}" target="{2}" weight="{3}"/>\n'.format(
                    edge_id, id_vertexA, id_vertexB, xml_attr(edge['weight'])))
            f.write('</edges>\n')
            f.write('</graph>\n')
            f.write('</gexf>\n')
        return f_name
def add_video_property(use_video_config_api=False, user_bearer=None):
    """Record, for every plain tweet, whether it appears to contain a video.

    :param use_video_config_api: when true the video-config endpoint is
        queried through ``get_video_config_with_user_bearer``; otherwise the
        tweet's video page is opened in a Chrome browser driven by Selenium.
    :param user_bearer: credential forwarded to the video-config helper; only
        used when ``use_video_config_api`` is true.

    The outcome is stored on the tweet under ``video_config_api`` or
    ``video_embed_url`` (depending on the method used) as
    ``{'is_video': 0 | 1, 'is_video_response': <raw response text>}``.
    With the API method the function sleeps until the rate-limit window
    resets whenever no requests remain.
    """
    import http.client

    db = DBManager('tweets')
    plain_tweets = db.get_plain_tweets()
    tot_plain_tweets = len(plain_tweets)
    logging.info('Plain tweets {0}'.format(tot_plain_tweets))

    # the browser is only needed for the embed-page method
    driver = None if use_video_config_api else webdriver.Chrome()
    try:
        for tweet_counter, plain_tweet in enumerate(plain_tweets, start=1):
            logging.info('Remaining tweets: {0}'.format(tot_plain_tweets -
                                                        tweet_counter))
            id_tweet = plain_tweet['tweet_obj']['id_str']
            result_status = None
            result_headers = None

            if not use_video_config_api:
                method = "video_embed_url"
                driver.get('https://twitter.com/i/videos/' + id_tweet)
                # wait before reading the page so its content can load
                time.sleep(10)
                spans = driver.find_elements_by_tag_name('span')
                span_texts = [span.text for span in spans]
                result_value = str(span_texts)
                found_message = 'The media could not be played.' in span_texts
            else:
                method = "video_config_api"
                response = get_video_config_with_user_bearer(user_bearer, id_tweet)
                remaining_header = response.headers['x-rate-limit-remaining']
                curr_rate_limit_remaining = int(remaining_header) if remaining_header else 0
                curr_time = calendar.timegm(time.gmtime())
                reset_header = response.headers['x-rate-limit-reset']
                curr_rate_limit_expiration = int(reset_header) if reset_header else curr_time

                result_value = str(response.read())
                result_headers = str(response.headers)
                result_status = str(response.status)
                # any non-200 answer is taken as "no video"
                found_message = response.status != http.client.OK

                if curr_rate_limit_remaining == 0:
                    # the reset time may already lie in the past; never hand a
                    # negative duration to sleep()
                    wait_seconds = max(curr_rate_limit_expiration - curr_time, 0) + 1
                    logging.info(
                        '\n\nProcessed {0} tweets Twitter API rate limit exceeded. Waiting for {1} seconds'
                        .format(tweet_counter, wait_seconds))
                    time.sleep(wait_seconds)

            if found_message:
                logging.info(
                    '\n\nThe tweet {0} DOES NOT have a video! Response STATUS = \n{1}, HEADERS = \n{2}, \nBODY = {3} \n'
                    .format(id_tweet, result_status, result_headers, result_value))
            else:
                logging.info(
                    '\n\nThe tweet {0} HAS a video! Response STATUS = {1}, HEADERS = {2} \n'
                    .format(id_tweet, result_status, result_headers))
            update_object = {
                method: {
                    'is_video': 0 if found_message else 1,
                    'is_video_response': result_value
                }
            }
            db.update_record({'tweet_obj.id_str': id_tweet}, update_object)
    finally:
        # always release the browser, even if a tweet fails half-way
        if driver is not None:
            driver.quit()
class TweetEvaluator:
    special_chars = r'[=\+/&<>;:\'\"\?%$!¡\,\. \t\r\n]+'
    hashtags, user_handlers = [], []
    __dbm = None
    BATCH_SIZE = 1000

    def __init__(self):
        """Load the tracked user handles and hashtags and open the tweets collection."""
        handlers, tags = get_user_handlers_and_hashtags()
        self.user_handlers = handlers
        self.hashtags = tags
        self.__dbm = DBManager('tweets')

    def __is_relevant(self, users_counter, hashtags_counter):
        # a tweet is considered relevant if fulfills one of two
        # conditions; candidates are mentioned or if candidates are
        # are not mentioned but there are at least more than one
        # campaign hashtag
        if users_counter > 0 or hashtags_counter > 1:
            return True
        else:
            return False

    def __assess_tweet_by_text(self, tweet_text):
        tweet_text = re.sub(u'\u2026', '',
                            tweet_text)  # remove ellipsis unicode char
        users_counter, hashtags_counter = 0, 0
        for token in tweet_text.split():
            token = re.sub(self.special_chars, '',
                           token)  # remove special chars
            if token.lower() in self.user_handlers:
                users_counter += 1
            if token.lower() in self.hashtags:
                hashtags_counter += 1
        return self.__is_relevant(users_counter, hashtags_counter)

    def __assess_tweet_by_entities(self, tweet_hashtags, tweet_mentions):
        users_counter, hashtags_counter = 0, 0
        for tweet_hashtag in tweet_hashtags:
            tweet_hashtag_txt = '#' + tweet_hashtag['text'].lower()
            if tweet_hashtag_txt in self.hashtags:
                hashtags_counter += 1
        for tweet_mention in tweet_mentions:
            screen_name = '@' + tweet_mention['screen_name'].lower()
            if screen_name in self.user_handlers:
                users_counter += 1
        return self.__is_relevant(users_counter, hashtags_counter)

    def is_tweet_relevant(self, tweet):
        tweet_author = tweet['user']['screen_name']
        tweet_handler = '@{0}'.format(tweet_author.lower())
        if tweet_handler in self.user_handlers:
            return True
        else:
            if 'retweeted_status' in tweet.keys():
                original_tweet = tweet['retweeted_status']
            else:
                original_tweet = tweet
            if 'entities' in original_tweet.keys():
                t_user_mentions = original_tweet['entities']['user_mentions']
                t_hashtags = original_tweet['entities']['hashtags']
                return self.__assess_tweet_by_entities(t_hashtags,
                                                       t_user_mentions)
            else:
                if 'full_text' in original_tweet.keys():
                    return self.__assess_tweet_by_text(tweet['full_text'])
                else:
                    return self.__assess_tweet_by_text(tweet['text'])

    def __mark_relevance_rt(self, tweet_reg):
        logging.info('Marking RTS...')
        query = {
            'tweet_obj.retweeted_status': {
                '$exists': 1
            },
            'tweet_obj.retweeted_status.id_str': {
                '$eq': tweet_reg['tweet_obj']['id_str']
            },
            'relevante': {
                '$ne': tweet_reg['relevante']
            }
        }
        update = {'$set': {'relevante': tweet_reg['relevante']}}
        update_res = self.__dbm.update_record_many(query, update)
        logging.info('Marked {0} RTS...'.format(update_res.matched_count))

    def identify_relevant_tweets(self):
        # select only original tweets that are not marked as relevant
        query = {
            'relevante': {
                '$exists': 0
            },
            'tweet_obj.retweeted_status': {
                '$exists': 0
            }
        }
        logging.info('Relevant Tweets: Running query to count...')
        # processing by batch as workaround cursor not found error
        total_tweets = self.__dbm.search(query,
                                         only_relevant_tws=False).count()
        total_batches = ceil(total_tweets / self.BATCH_SIZE)
        batch = 1
        moreToProcess = batch <= total_batches

        while moreToProcess:
            logging.info(
                'Querying records in batches of {0} records...'.format(
                    self.BATCH_SIZE))
            search_res = self.__dbm.search(
                query, only_relevant_tws=False).limit(self.BATCH_SIZE)
            logging.info('Loading batch {0}/{1} into memory...'.format(
                batch, total_batches))
            tweets = [doc for doc in search_res]
            total_tweets_batch = self.BATCH_SIZE
            if batch == total_batches:
                total_tweets_batch = len(tweets)
            logging.info(
                'Identifying relevant tweets in batch {0}/{1} out of {2} tweets...'
                .format(batch, total_batches, total_tweets_batch))
            tweet_counter = 0
            try:
                for tweet_reg in tweets:
                    tweet_counter += 1
                    tweet = tweet_reg['tweet_obj']
                    if self.is_tweet_relevant(tweet):
                        tweet_reg['relevante'] = 1
                        logging.info(
                            'Identifying {0}/{1} tweets (relevant)'.format(
                                tweet_counter, total_tweets))
                    else:
                        tweet_reg['relevante'] = 0
                        logging.info(
                            'Identifying {0}/{1} tweets (irrelevant)'.format(
                                tweet_counter, total_tweets))
                    self.__dbm.update_record(
                        {'tweet_obj.id_str': tweet['id_str']}, tweet_reg)
                    # copy the relevance flag to rts
                    self.__mark_relevance_rt(tweet_reg)

                logging.info(
                    'Finished identifying relevant tweets in batch {0}/{1} out of {2} tweets...'
                    .format(batch, total_batches, total_tweets_batch))
                batch += 1
                moreToProcess = batch <= total_batches
            except Exception as e:
                logging.info("Exception occurred...")
                logging.info("Exception message {0}".format(e))

        logging.info('Finished identifying relevant tweets...')
        return True

    # set to 'user' the type of tweets which keyword contains @
    def fix_tweet_type(self):
        query = {'type': 'hashtag', 'keyword': {'$regex': '@'}}
        objs = self.__dbm.search(query)
        num_fixed_tweets = objs.count()
        for obj in objs:
            obj['type'] = 'user'
            self.__dbm.save_record(obj)
        return num_fixed_tweets

    def __get_hashtags(self, hashtags_list):
        hts = []
        for ht in hashtags_list:
            hts.append(ht['text'])
        return hts

    def __get_screen_names(self, screen_names_list):
        scs = []
        for sc in screen_names_list:
            scs.append('@' + sc['screen_name'])
        return scs

    # fix value of candidatura if hashtags related to a candidacy
    # are present in the text of the tweet
    def fix_value_of_candidatura(self):
        """Fill in the empty ``candidatura`` of tweets that point at a candidacy.

        Keyword metadata with a non-empty ``candidatura`` is read from the
        ``config.json`` one directory above this script.  For every tweet
        whose ``candidatura`` is empty, the metadata of the tweet's party (and
        movement, when the tweet has one) is matched against the mentions and
        hashtags of the tweet or, when the tweet has no entities, against its
        text.  For retweets the retweeted status is inspected.

        :return: number of tweets whose ``candidatura`` was fixed.
        """
        script_parent_dir = pathlib.Path(__file__).parents[1]
        config_fn = script_parent_dir.joinpath('config.json')
        configuration = get_config(config_fn)
        keyword, k_metadata = parse_metadata(configuration['metadata'])
        # keep metadata that refer to candidacies, tagged with their keyword
        interested_data = []
        for kword, kmetada in zip(keyword, k_metadata):
            if kmetada['candidatura'] != '':
                kmetada.update({'keyword': kword})
                interested_data.append(kmetada)
        # select tweets without candidacy
        s_objs = self.__dbm.search({'candidatura': ''})
        num_fixed_tweets = 0
        for s_obj in s_objs:
            party = s_obj['partido_politico']
            movement = s_obj['movimiento']
            tweet = s_obj['tweet_obj']
            # keep metadata related to the political party (and movement) of
            # the tweet
            relevant_data = [
                ida for ida in interested_data
                if ida['partido_politico'] == party
                and (movement == '' or ida['movimiento'] == movement)
            ]
            if not relevant_data:
                continue
            if 'retweeted_status' in tweet:
                original_tweet = tweet['retweeted_status']
            else:
                original_tweet = tweet
            candidacy = ''
            if 'entities' in original_tweet:
                t_user_mentions = self.__get_screen_names(
                    original_tweet['entities']['user_mentions'])
                t_hashtags = self.__get_hashtags(
                    original_tweet['entities']['hashtags'])
                # see if a keyword is among the tweet mentions or hashtags
                for rd in relevant_data:
                    if rd['keyword'] in t_user_mentions or rd['keyword'] in t_hashtags:
                        candidacy = rd['candidatura']
                        break
            else:
                # Read the text from the same object that was inspected: the
                # outer retweet may lack 'full_text' or hold a truncated text.
                if 'full_text' in original_tweet:
                    t_text = original_tweet['full_text']
                else:
                    t_text = original_tweet['text']
                # see if a keyword is present in the text
                for rd in relevant_data:
                    if rd['keyword'] in t_text:
                        candidacy = rd['candidatura']
                        break
            # fix candidacy key
            if candidacy:
                s_obj['candidatura'] = candidacy
                num_fixed_tweets += 1
                self.__dbm.save_record(s_obj)
        return num_fixed_tweets
Exemple #6
0
class UserPoliticalPreference:
    """
    Infer the political party and movement that users support from the
    hashtags of their relevant tweets, and copy the result onto user and
    tweet records.
    """
    db_tweets, db_users = None, None

    def __init__(self):
        self.db_tweets = DBManager('tweets')
        self.db_users = DBManager('users')
        self.hashtags, self.metadata = self.__get_hashtags_and_metadata()

    def __get_hashtags_and_metadata(self):
        """
        Load the keywords declared in config.json and keep the ones that are
        hashtags.

        :return: tuple (lower-cased hashtags, metadata as returned by
        parse_metadata)
        """
        script_parent_dir = pathlib.Path(__file__).parents[1]
        config_fn = script_parent_dir.joinpath('config.json')
        configuration = get_config(config_fn)
        keywords, metadata = parse_metadata(configuration['metadata'])
        # The following hashtags are excluded because they are proper names of
        # movements and people
        excluded_keywords = ('HonorColorado', 'ColoradoAñetete', 'tuma')
        hashtags = [
            keyword.lower() for keyword in keywords
            # keywords containing '@' are accounts, not hashtags
            if '@' not in keyword and keyword not in excluded_keywords
        ]
        return hashtags, metadata

    def __get_tweet_hashtags(self, tweet_obj):
        """
        :param tweet_obj: tweet dictionary
        :return: list with the text of the hashtags of the tweet, empty if the
        tweet has no 'entities' key
        """
        if 'entities' not in tweet_obj:
            return []
        return [hashtag['text'] for hashtag in tweet_obj['entities']['hashtags']]

    def __get_hashtag_metadata(self, hashtag):
        """
        :param hashtag: text of the hashtag to look for (case-insensitive)
        :return: the first metadata entry whose keyword matches the hashtag or
        None if there is no such entry
        """
        for metadata in self.metadata:
            if metadata['keyword'].lower() == hashtag.lower():
                return metadata
        return None

    def __get_top_preference(self, user_screen_name, metadata_key):
        """
        Count, over the relevant tweets of a user, how many times each value
        of the given metadata key is referenced through a known hashtag.

        :param user_screen_name: screen name of the author of the tweets
        :param metadata_key: key of the hashtag metadata to count
        ('movimiento' or 'partido_politico')
        :return: the most referenced value (the first one reaching the highest
        count on ties) or None if nothing was counted
        """
        preference_counts = defaultdict(int)
        query = {
            'relevante': {
                '$eq': 1
            },
            'tweet_obj.user.screen_name': {
                '$eq': user_screen_name
            }
        }
        for tweet in self.db_tweets.search(query):
            tweet_obj = tweet['tweet_obj']
            # for retweets the hashtags are read from the retweeted tweet
            source_tweet = tweet_obj.get('retweeted_status', tweet_obj)
            for hashtag in self.__get_tweet_hashtags(source_tweet):
                if hashtag.lower() not in self.hashtags:
                    continue
                hashtag_metadata = self.__get_hashtag_metadata(hashtag)
                # a hashtag can be known but have no metadata entry; skip it
                # instead of failing on None
                if hashtag_metadata and hashtag_metadata[metadata_key]:
                    preference_counts[hashtag_metadata[metadata_key]] += 1
        if not preference_counts:
            return None
        return max(preference_counts.items(), key=lambda k_v: k_v[1])[0]

    def get_user_political_movement(self, user_screen_name):
        """
        :param user_screen_name: screen name of the user
        :return: movement most referenced by the hashtags of the user's
        relevant tweets or None
        """
        return self.__get_top_preference(user_screen_name, 'movimiento')

    def get_user_political_party(self, user_screen_name):
        """
        :param user_screen_name: screen name of the user
        :return: party most referenced by the hashtags of the user's relevant
        tweets or None
        """
        return self.__get_top_preference(user_screen_name, 'partido_politico')

    def update_users_political_preference(self, include_movement=True):
        """
        Save in every user record the fields 'party' and 'movement'.

        :param include_movement: when False the movement is not computed and
        'movement' is saved as None
        """
        users = self.db_users.search({})
        total_users = users.count()
        for users_counter, user in enumerate(users, start=1):
            screen_name = user['screen_name']
            user_movement = None
            logging.info('Processing {0}/{1} users'.format(
                users_counter, total_users))
            if include_movement:
                user_movement = self.get_user_political_movement(screen_name)
            user_party = self.get_user_political_party(screen_name)
            logging.info('User {0} demonstrates to support {1}, {2}'.format(
                screen_name, user_party, user_movement))
            self.db_users.update_record({'screen_name': screen_name}, {
                'party': user_party,
                'movement': user_movement
            })

    def update_user_most_interacted_party_movement(self,
                                                   include_movement=True):
        """
        Save in every user record the fields 'most_interacted_party' and
        'most_interacted_movement', taken from the first element returned by
        get_party_user and get_movement_user respectively.

        :param include_movement: when False the movement is not looked up and
        'most_interacted_movement' is saved as None
        """
        users = self.db_users.search({})
        total_users = users.count()
        for users_counter, user in enumerate(users, start=1):
            screen_name = user['screen_name']
            most_interacted_movement, most_interacted_party = None, None
            logging.info('Processing {0}/{1} users'.format(
                users_counter, total_users))
            if include_movement:
                interacted_movements = self.db_tweets.get_movement_user(
                    screen_name)
                if len(interacted_movements) > 0:
                    most_interacted_movement = interacted_movements[0][
                        'movimiento']
            interacted_parties = self.db_tweets.get_party_user(screen_name)
            if len(interacted_parties) > 0:
                most_interacted_party = interacted_parties[0]['partido']
            self.db_users.update_record(
                {'screen_name': screen_name}, {
                    'most_interacted_party': most_interacted_party,
                    'most_interacted_movement': most_interacted_movement
                })

    def update_tweet_user_political_preference(self, include_movement=True):
        """
        Copy the party (and optionally the movement) of the author into every
        tweet record as 'author_party' and 'author_movement'. None is saved
        when the author is not found in the users collection.

        :param include_movement: when False 'author_movement' is not saved
        """
        tweets = self.db_tweets.search({})
        # fields already computed per author, to query each user only once
        fields_by_author = {}
        total_tweets = tweets.count()
        for tweet_counter, tweet in enumerate(tweets, start=1):
            logging.info('Processing {0}/{1} tweets'.format(
                tweet_counter, total_tweets))
            tweet_obj = tweet['tweet_obj']
            screen_name = tweet_obj['user']['screen_name']
            if screen_name not in fields_by_author:
                user = self.db_users.search({'screen_name': screen_name})
                new_fields = {}
                try:
                    new_fields['author_party'] = user[0]['party']
                except IndexError:
                    new_fields['author_party'] = None
                if include_movement:
                    try:
                        new_fields['author_movement'] = user[0]['movement']
                    except IndexError:
                        new_fields['author_movement'] = None
                fields_by_author[screen_name] = new_fields
            self.db_tweets.update_record(
                {'tweet_obj.id_str': tweet_obj['id_str']},
                fields_by_author[screen_name])

    def update_tweet_user_pbb(self):
        """
        Copy the 'pbb' value of the author's bot analysis into every tweet
        record as 'author_pbb'. -1 is saved when the author is not found in
        the users collection.
        """
        tweets = self.db_tweets.search({})
        # fields already computed per author, to query each user only once
        fields_by_author = {}
        total_tweets = tweets.count()
        for tweet_counter, tweet in enumerate(tweets, start=1):
            logging.info('Processing {0}/{1} tweets'.format(
                tweet_counter, total_tweets))
            tweet_obj = tweet['tweet_obj']
            screen_name = tweet_obj['user']['screen_name']
            if screen_name not in fields_by_author:
                user = self.db_users.search({'screen_name': screen_name})
                new_fields = {}
                try:
                    new_fields['author_pbb'] = user[0]['bot_analysis']['pbb']
                except IndexError:
                    # unknown author: the sentinel goes in the same field,
                    # not in 'author_party'
                    new_fields['author_pbb'] = -1
                fields_by_author[screen_name] = new_fields
            self.db_tweets.update_record(
                {'tweet_obj.id_str': tweet_obj['id_str']},
                fields_by_author[screen_name])
Exemple #7
0
class LinkAnalyzer:
    """
    Extract the domains of the links shared in tweets, save them in each
    tweet record and count how often every domain appears.
    """
    tweets_with_links = None
    db_tweets = None
    accepted_codes = [200, 201, 202]
    # abbreviated domain names and the name they are recorded under
    _domain_aliases = {'fb': 'facebook', 'youtu': 'youtube'}

    def __init__(self):
        self.db_tweets = DBManager('tweets')

    def _normalize_domain(self, domain_name):
        """
        :param domain_name: domain as extracted from a url
        :return: the recorded name if the domain is a known abbreviation,
        otherwise the domain unchanged
        """
        return self._domain_aliases.get(domain_name, domain_name)

    def _resolve_domain(self, tweet_url):
        """
        Request the url and extract the domain of the url that answered.

        :param tweet_url: url found in a tweet
        :return: domain of the final url when the response code is accepted,
        otherwise the domain of tweet_url itself
        """
        try:
            resp = requests.get(tweet_url)
            if resp.status_code in self.accepted_codes:
                return tldextract.extract(resp.url).domain
        except Exception as error:
            # Best effort: any failure falls back to the url as written.
            # Exception (not a bare except) so that Ctrl-C and SystemExit
            # still interrupt a long extraction.
            logging.info('Could not resolve the url {0}: {1}'.format(
                tweet_url, error))
        return tldextract.extract(tweet_url).domain

    def get_domains_and_freq(self, save_to_file=False, **kwargs):
        """
        Collect the domains of the urls of the tweets with links and save,
        for each tweet with entities, the list of its domains in the field
        'domains'.

        :param save_to_file: when True the urls grouped by domain are written
        to reports/tweet_domains.json
        :param kwargs: arguments forwarded to get_tweets_with_links
        :return: tuple (dictionary domain -> list of urls, list of
        (domain, frequency) sorted by decreasing frequency)
        """
        self.tweets_with_links = self.db_tweets.get_tweets_with_links(**kwargs)
        total_tweets = len(self.tweets_with_links)
        domains_url = defaultdict(list)
        domains = defaultdict(int)
        logging.info(
            'Extracting the links of {0} tweets...'.format(total_tweets))
        for tweet_counter, tweet_obj in enumerate(self.tweets_with_links,
                                                  start=1):
            tweet = tweet_obj['tweet_obj']
            logging.info('Tweet {0} out of {1}'.format(tweet_counter,
                                                       total_tweets))
            if 'entities' not in tweet:
                logging.info('Tweet without entities {0}'.format(tweet))
                continue
            current_tweet_domains = set()
            for url in tweet['entities']['urls']:
                tweet_url = url['expanded_url']
                logging.info('Analyzing the url {0}'.format(tweet_url))
                domain_name = self._normalize_domain(
                    tldextract.extract(tweet_url).domain)
                if domain_name not in domains_url:
                    # Domain not seen yet: request the url to find out where
                    # it really points to. The result is normalized again so
                    # abbreviations are not recorded under two names.
                    domain_name = self._normalize_domain(
                        self._resolve_domain(tweet_url))
                domains_url[domain_name].append(tweet_url)
                domains[domain_name] += 1
                current_tweet_domains.add(domain_name)
            self.db_tweets.update_record(
                {'tweet_obj.id_str': tweet['id_str']},
                {'domains': list(current_tweet_domains)})
        if save_to_file:
            # Save results into a json file
            file_name = pathlib.Path(__file__).parents[2].joinpath(
                'reports', 'tweet_domains.json')
            with open(file_name, 'w') as fp:
                json.dump(domains_url, fp, indent=4)
        return domains_url, sorted(domains.items(),
                                   key=lambda k_v: k_v[1],
                                   reverse=True)
Exemple #8
0
class SentimentAnalysis:
    """
    Compute the sentiment of the relevant tweets of a collection and save it
    in the field 'sentimiento' of each tweet record.
    """
    config_file_name = pathlib.Path(__file__).parents[1].joinpath(
        'config.json')
    config = None
    language = ''
    method = ''
    __dbm = None

    def __init__(self, collection='tweets', language='spanish'):
        self.config = get_config(self.config_file_name)
        self.language = language
        self.__dbm = DBManager(collection)

    def __get_analyzed_tweet(self, analyzed_tweets, id_tweet_to_search):
        """
        :return: the first analyzed tweet whose 'id' equals
        id_tweet_to_search or None
        """
        for analyzed_tweet in analyzed_tweets:
            if id_tweet_to_search == analyzed_tweet['id']:
                return analyzed_tweet
        return None

    @staticmethod
    def __get_tweet_text(tweet):
        """
        :return: 'full_text' of the tweet if present, otherwise its 'text'
        """
        return tweet['full_text'] if 'full_text' in tweet else tweet['text']

    def update_sentiment_of_non_original_tweets(self,
                                                query=None,
                                                update_sentiment=False):
        """
        Set the sentiment of retweets and replies. Retweets take the
        sentiment of the original tweet if it is in the database; retweets
        whose original is missing and replies are analyzed by their own.

        :param query: dictionary of <key, value> terms to be used in querying
        the db, it is not modified
        :param update_sentiment: when False only tweets without the field
        'sentimiento' are processed
        :raises Exception: if an original tweet is found without sentiment
        """
        # work on a copy: a shared default or the caller's dictionary must
        # not accumulate the filters added here
        query = dict(query) if query else {}
        query['relevante'] = 1
        if not update_sentiment:
            query['sentimiento'] = {'$exists': 0}
        tweet_regs = self.__dbm.search(query)
        rts_wo_tw = []
        for tweet_reg in tweet_regs:
            tweet_obj = tweet_reg['tweet_obj']
            if 'retweeted_status' in tweet_obj:
                id_original_tweet = tweet_obj['retweeted_status']['id_str']
                original_tweet_reg = self.__dbm.find_record(
                    {'tweet_obj.id_str': id_original_tweet})
                if not original_tweet_reg:
                    rts_wo_tw.append(tweet_obj)
                    continue
                # .get() so that a missing field reaches the explicit error
                # below instead of a bare KeyError
                sentiment_ot = original_tweet_reg.get('sentimiento')
                if not sentiment_ot:
                    raise Exception(
                        'Error, found an original tweet without sentiment')
                self.__dbm.update_record(
                    {'tweet_obj.id_str': tweet_obj['id_str']},
                    {'sentimiento': sentiment_ot})
            elif tweet_obj['in_reply_to_status_id_str']:
                rts_wo_tw.append(tweet_obj)
                logging.info('Tweet not RT {0}'.format(tweet_obj['id_str']))
        self.__analyze_sentiment_of_rt_wo_tws(rts_wo_tw)

    def __update_sentimient_rts(self, analyzed_tweets):
        """
        Copy the sentiment of every analyzed tweet to its retweets.
        """
        for analyzed_tweet in analyzed_tweets:
            # search rts of the analyzed tweet
            rts = self.__dbm.search(
                {'tweet_obj.retweeted_status.id_str': analyzed_tweet['id']})
            for rt in rts:
                self.__dbm.update_record(
                    {'tweet_obj.id_str': rt['tweet_obj']['id_str']},
                    {'sentimiento': analyzed_tweet['sentimiento']})

    def __analyze_sentiment_of_rt_wo_tws(self, tweets):
        """
        Analyze, in batches of 5, the sentiment of tweets whose original
        tweet is not available and save it in their records. For retweets
        the analyzed text is the text of the retweeted tweet.

        :param tweets: list of tweet dictionaries
        """
        batch_size = 5
        last_index = len(tweets) - 1
        tweets_to_analyze = []
        for index, tweet in enumerate(tweets):
            source_tweet = tweet.get('retweeted_status', tweet)
            tweets_to_analyze.append({
                'id': tweet['id_str'],
                'text': self.__get_tweet_text(source_tweet)
            })
            # keep filling the batch unless it is full or this is the last
            # tweet
            if len(tweets_to_analyze) < batch_size and index < last_index:
                continue
            sentiment_results = self.do_sentiment_analysis(tweets_to_analyze)
            tweets_to_analyze = []
            for sentiment_result in sentiment_results:
                sentiment_info = sentiment_result['sentimiento']
                tweet_id = sentiment_result['id']
                tweet_text = sentiment_result['text']
                self.__dbm.update_record({'tweet_obj.id_str': tweet_id},
                                         {'sentimiento': sentiment_info})
                logging.debug('Tweet text: {0}, Sentimiento: {1} ({2})'.format(
                    tweet_text.encode('utf-8'), sentiment_info['tono'],
                    sentiment_info['score']))

    def __analyze_batch(self, tweets_to_analyze, batch, total_batches,
                        tot_reg, analyzed_tweets):
        """
        Analyze one batch of tweets, save the sentiment of each one in the
        database and append the results to analyzed_tweets.
        """
        logging.info(
            'Analyzing the sentiment of {0} tweets in batch {1}/{2} '
            'out of {3} tweets...'.format(len(tweets_to_analyze), batch,
                                          total_batches, tot_reg))
        sentiment_results = self.do_sentiment_analysis(tweets_to_analyze)
        logging.info(
            'Finished analyzing the sentiment of {0} tweets in batch {1}/{2} '
            'out of {3} tweets...'.format(len(tweets_to_analyze), batch,
                                          total_batches, tot_reg))
        logging.info('Updating sentiment scores in database...')
        for sentiment_result in sentiment_results:
            sentiment_info = sentiment_result['sentimiento']
            tweet_id = sentiment_result['id']
            tweet_text = sentiment_result['text']
            self.__dbm.update_record({'tweet_obj.id_str': tweet_id},
                                     {'sentimiento': sentiment_info})
            analyzed_tweets.append({
                'id': tweet_id,
                'texto': tweet_text,
                'sentimiento': sentiment_info
            })
            logging.debug('Tweet text: {0}, Sentimiento: {1} ({2})'.format(
                tweet_text.encode('utf-8'), sentiment_info['tono'],
                sentiment_info['score']))

    def analyze_sentiments(self, query=None, update_sentiment=False):
        """
        Analyze, in batches of 100, the sentiment of the relevant tweets that
        are not retweets and propagate the result to their retweets. Errors
        raised during the analysis are logged and the tweets analyzed so far
        are returned.

        :param query: dictionary of <key, value> terms to be used in querying
        the db, it is not modified
        :param update_sentiment: when False only tweets without the field
        'sentimiento' are analyzed
        :return: list of dictionaries with the keys 'id', 'texto' and
        'sentimiento'
        """
        # work on a copy: a shared default or the caller's dictionary must
        # not accumulate the filters added here
        query = dict(query) if query else {}
        query.update({
            'relevante': 1,
            'tweet_obj.retweeted_status': {
                '$exists': 0
            }
        })
        if not update_sentiment:
            query['sentimiento'] = {'$exists': 0}
        tweet_regs = self.__dbm.search(query)
        analyzed_tweets = []
        tot_reg = tweet_regs.count()
        logging.info(
            'Going to analyze the sentiment of {0} tweets, '
            'it can take a lot of time, be patient...'.format(tot_reg))
        batch_size = 100
        total_batches = ceil(tot_reg / batch_size)
        batch = 0
        tweets_to_analyze = []
        try:
            # Iterate the result once instead of indexing it by position.
            # NOTE(review): presumably search() returns a database cursor,
            # where positional access re-runs the query -- confirm.
            for tweet_reg in tweet_regs:
                tweet = tweet_reg['tweet_obj']
                tweets_to_analyze.append({
                    'id': tweet['id_str'],
                    'text': self.__get_tweet_text(tweet)
                })
                if len(tweets_to_analyze) < batch_size:
                    continue
                batch += 1
                self.__analyze_batch(tweets_to_analyze, batch, total_batches,
                                     tot_reg, analyzed_tweets)
                tweets_to_analyze = []
            if tweets_to_analyze:
                # last, partially filled batch
                batch += 1
                self.__analyze_batch(tweets_to_analyze, batch, total_batches,
                                     tot_reg, analyzed_tweets)
        except Exception as e:
            logging.error(e)
        finally:
            self.__update_sentimient_rts(analyzed_tweets)

        return analyzed_tweets

    def do_sentiment_analysis(self, tweets):
        """
        Analyze the sentiment of the tweets with SentimentAnalyzer.

        :param tweets: list of dictionaries with the keys 'id' and 'text'
        :return: list of dictionaries with the keys 'id', 'text' and
        'sentimiento'
        """
        # NOTE(review): the language is fixed to spanish, self.language is
        # not used here -- confirm whether that is intended
        sa = SentimentAnalyzer(language='spanish')
        # the id travels appended to the text so it can be recovered from
        # the results
        tweet_texts = [
            tweet['text'] + ' -$%#$&- {0}'.format(tweet['id'])
            for tweet in tweets
        ]
        sa.analyze_docs(tweet_texts)
        results = sa.tagged_docs
        logging.info(
            'Finished the sentiment analysis, now {0} results are going to '
            'be processed...'.format(len(results)))
        ret = self.__process_results(results)
        logging.info('Computed correctly the sentiment of {0} tweets'.format(
            len(tweet_texts)))
        return ret

    def remote_sentiment_analysis(self, tweets):
        """
        Analyze the sentiment of the tweets through the remote api. The api
        token is renewed, saved in the configuration file and the request
        repeated if the api answers with an authentication error.

        :param tweets: list of dictionaries with the keys 'id' and 'text'
        :return: list of dictionaries with the keys 'id', 'text' and
        'sentimiento', empty if the analysis could not be requested
        :raises Exception: if the token cannot be renewed or the results
        cannot be retrieved
        """
        accepted_codes = [200, 201, 202]
        error_codes = [400, 401]
        url_base = 'http://159.203.77.35:8080/api'
        url_sentiment = url_base + '/analysis/sentiment-analysis/'
        url_auth = url_base + '/auth/'
        headers = {'Authorization': 'JWT ' + self.config['inhouse']['api_key']}
        tweet_texts = [
            tweet['text'] + ' -$%#$&- {0}'.format(tweet['id'])
            for tweet in tweets
        ]
        parameters = {
            'neu_inf_lim': -0.3,
            'neu_sup_lim': 0.3,
            'language': 'spanish'
        }
        data = {
            'name': (None, 'politic-bots'),
            'parameters': (None, json.dumps(parameters), 'application/json'),
            'data_object': (None, json.dumps(tweet_texts), 'application/json')
        }
        ret = []
        logging.info('Computing the sentiment of {0} tweets'.format(
            len(tweet_texts)))
        resp = requests.post(url_sentiment, headers=headers, files=data)
        if resp.status_code in error_codes:
            # have to renew the api token
            body_auth = {
                'username': self.config['inhouse']['username'],
                'password': self.config['inhouse']['password']
            }
            resp = requests.post(url_auth, data=body_auth)
            if resp.status_code not in accepted_codes:
                raise Exception(
                    'Error {0} when trying to renew the token of the api'.
                    format(resp.status_code))
            self.config['inhouse']['api_key'] = resp.json()['token']
            update_config(self.config_file_name, self.config)
            # the retry and the polling below must carry the new token
            headers = {
                'Authorization': 'JWT ' + self.config['inhouse']['api_key']
            }
            resp = requests.post(url_sentiment, headers=headers, files=data)
        if resp.status_code not in accepted_codes:
            logging.error(
                'Error {0} when trying to compute the sentiment of the tweets'.
                format(resp.status_code))
            return ret
        get_url = url_sentiment + str(resp.json()['id']) + '/'
        results = []
        # wait some time before trying to get
        # the results
        time.sleep(60)
        # NOTE(review): polls until a non-empty result arrives, there is no
        # upper bound on the number of attempts
        while len(results) == 0:
            # wait some time before trying to
            # get the results
            time.sleep(30)
            resp = requests.get(get_url, headers=headers)
            if resp.status_code not in accepted_codes:
                raise Exception(
                    'Got an unexpected response, code: {0}'.format(
                        resp.status_code))
            results = json.loads(resp.json()['result'])
        logging.info(
            'Obtained the results of sentiment analysis, now the results are going to be processed...'
        )
        ret = self.__process_results(results)
        logging.info('Computed correctly the sentiment of {0} tweets'.format(
            len(tweet_texts)))
        return ret

    def __process_results(self, results):
        """
        :param results: iterable of (text, tone, score) where text ends with
        the separator ' -$%#$&- ' followed by the id of the tweet
        :return: list of dictionaries with the keys 'id', 'text' and
        'sentimiento' ('tono' and 'score')
        """
        tones = {'neg': 'negative', 'pos': 'positive'}
        ret = []
        for text, tone, score in results:
            # split on the last separator: the id is always appended at the
            # end, the text of the tweet itself is free-form
            tw_text_id = text.rsplit('-$%#$&-', 1)
            ret.append({
                'id': tw_text_id[1].strip(),
                'text': tw_text_id[0].strip(),
                'sentimiento': {
                    'tono': tones.get(tone, 'neutral'),
                    'score': score
                }
            })
        return ret
Exemple #9
0
class BotDetector:
    __dbm_tweets = None
    __dbm_users = None
    __api = None

    def __init__(self):
        """
        Create the managers of the tweets and users collections and the
        client of the Twitter API, authenticated with the consumer key and
        secret found in config.json.
        """
        self.__dbm_tweets = DBManager('tweets')
        self.__dbm_users = DBManager('users')
        config_path = pathlib.Path(__file__).parents[1].joinpath('config.json')
        twitter_conf = get_config(config_path)['twitter']
        app_auth = tweepy.AppAuthHandler(twitter_conf['consumer_key'],
                                         twitter_conf['consumer_secret'])
        self.__api = tweepy.API(app_auth,
                                wait_on_rate_limit=True,
                                wait_on_rate_limit_notify=True)

    def __save_user_pbb(self, user_screen_name, pbb, bot_score, user_bot_features, num_heuristics,
                        sum_weights, exist_user):
        new_fields = {
            'exists': int(exist_user),
            'bot_analysis': {'features': user_bot_features,
                             'pbb': pbb,
                             'raw_score': bot_score,
                             'num_evaluated_heuristics': num_heuristics,
                             'sum_weights': sum_weights}
        }
        self.__dbm_users.update_record({'screen_name': user_screen_name}, new_fields)

    def __check_if_user_exists(self, user_screen_name):
        user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        if 'exists' in user_obj.keys():
            return int(user_obj['exists'])
        else:
            try:
                self.__api.get_user(user_screen_name)
                return True
            except tweepy.TweepError:
                return False

    def __compute_bot_formula(self, user_bot_features, exists_user):
        """
        Combine the values of the heuristics with the weights read from
        heuristic_weights.json. The weight of 'exists' is always part of the
        sum of weights and contributes to the score only when the user does
        not exist.

        :param user_bot_features: dictionary heuristic name -> {'value': ...}
        :param exists_user: flag that tells whether the user exists
        :return: tuple (weighted sum of values, sum of weights, quotient of
        both)
        """
        weights_path = pathlib.Path(__file__).parents[0].joinpath('heuristic_weights.json')
        weights = get_config(weights_path)
        weighted_total = 0
        weights_total = 0
        for name, feature in user_bot_features.items():
            weight = weights[name]
            weighted_total += weight * feature['value']
            weights_total += weight
        exists_weight = weights['exists']
        weighted_total += exists_weight * (1 - int(exists_user))
        weights_total += exists_weight
        return weighted_total, weights_total, weighted_total / weights_total

    def __get_timeline(self, user_screen_name, user_tweets):
        """
        Get the last 100 tweets in the timeline of a given user
        :param user: user from whom her timeline should be obtained from
        :return: user's timeline
        """
        user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        if 'timeline' in user_obj.keys():
            return user_obj['timeline']
        logging.info('Get the last 100 tweets from Twitter')
        timeline = []
        try:
            for status in tweepy.Cursor(self.__api.user_timeline, screen_name=user_screen_name).items(100):
                timeline.append(status._json)
            # save the not electoral tweets of the user's timeline
            id_electoral_tweets = [tweet['id_str'] for tweet in user_tweets]
            timeline_tweets_to_save = [tweet for tweet in timeline
                                       if tweet['id_str'] not in id_electoral_tweets]
            logging.info('To save {0} not electoral tweets of {1}'.format(len(timeline_tweets_to_save),
                                                                          user_screen_name))
            new_field = {
                'timeline': timeline_tweets_to_save
            }
            self.__dbm_users.update_record({'screen_name': user_screen_name}, new_field)
        except tweepy.TweepError:
            pass
        return timeline

    def __get_tweets_user(self, user_screen_name):
        user_tweets_obj = self.__dbm_tweets.search({'tweet_obj.user.screen_name': user_screen_name})
        user_tweets = [user_tweet_obj['tweet_obj'] for user_tweet_obj in user_tweets_obj]
        return user_tweets

    def __get_user_info_from_twitter(self, user_screen_name):
        user_twitter_obj = None
        try:
            user_twitter_obj = self.__api.get_user(user_screen_name)
        except tweepy.TweepError:
            pass
        return user_twitter_obj._json

    def __get_computed_heuristics(self, user_screen_name):
        user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        if 'bot_analysis' in user_obj.keys():
            return user_obj['bot_analysis']['features']
        else:
            return {}

    def __compute_heuristics(self, user_screen_name, recompute_heuristics=False):
        logging.info('\n\nComputing the probability of being bot of the user: {0}\n\n'.format(user_screen_name))

        # Get tweets of the user
        user_tweets = self.__get_tweets_user(user_screen_name)

        # Check if the user still exists on Twitter
        exist_user = self.__check_if_user_exists(user_screen_name)
        user_timeline = None
        if exist_user:
            # If the user still exists on Twitter, get her timeline
            user_timeline = self.__get_timeline(user_screen_name, user_tweets)

        # Get the information about the user and her tweets
        user_obj = get_user(self.__dbm_tweets, user_screen_name)
        if not user_obj:
            user_obj = self.__get_user_info_from_twitter(user_screen_name)
            if not user_obj:
                raise Exception('Error!, Cannot fetch information about the user {0}'.format(user_screen_name))

        if user_obj['verified']:
            # It is a verified account, it cannot be bot
            logging.info('The user {0} is an account verified by Twitter, it cannot be a bot'.format(user_screen_name))
            self.__save_user_pbb(user_screen_name, 0, 0, None, 0, 0, exist_user)
            return

        # Get the computed heuristics
        user_bot_features = self.__get_computed_heuristics(user_screen_name)
        if user_bot_features:
            user_computed_heuristics = user_bot_features.keys()
        else:
            user_computed_heuristics = []

        if recompute_heuristics or 'retweet_electoral' not in user_computed_heuristics:
            if user_tweets:
                # Compute the percentage of retweets in the electoral tweets
                per_rt = is_retweet_bot(user_tweets)
                user_bot_features['retweet_electoral'] = {
                    'value': per_rt
                }

        if recompute_heuristics or 'reply_electoral' not in user_computed_heuristics:
            if user_tweets:
                # Compute the percentage of replies in the electoral tweets
                per_rp = reply_percentage(user_tweets)
                user_bot_features['reply_electoral'] = {
                    'value': per_rp
                }

        if recompute_heuristics or 'retweet_timeline' not in user_computed_heuristics:
            # Compute the percentage of retweets in the user's timeline
                if user_timeline:
                    per_rt = is_retweet_bot(user_timeline)
                    user_bot_features['retweet_timeline'] = {
                        'value': per_rt
                    }

        if recompute_heuristics or 'reply_timeline' not in user_computed_heuristics:
            if user_timeline:
                per_rp = reply_percentage(user_timeline)
                user_bot_features['reply_timeline'] = {
                    'value': per_rp
                }

        if recompute_heuristics or 'creation_date' not in user_computed_heuristics:
            # Check the user's creation year
            extraction_date = self.__dbm_tweets.find_record({})['extraction_date']
            electoral_year = int('20' + extraction_date.split('/')[2])
            user_bot_features['creation_date'] = {
                'value': creation_date(parse_date(user_obj['created_at']), electoral_year)
            }

        if recompute_heuristics or 'default_profile' not in user_computed_heuristics:
            # Check if the user has default profile.
            user_bot_features['default_profile'] = {
                'value': default_profile(user_obj)
            }

        if recompute_heuristics or 'default_profile_picture' not in user_computed_heuristics:
            # Check if the user has default profile picture
            user_bot_features['default_profile_picture'] = {
                'value': default_profile_picture(user_obj)
            }

        if recompute_heuristics or 'default_background' not in user_computed_heuristics:
            # Check if the user has default background picture
            user_bot_features['default_background'] = {
                'value': default_background(user_obj)
            }

        if recompute_heuristics or 'empty_description' not in user_computed_heuristics:
            # Check if the user has a biography description
            user_bot_features['empty_description'] = {
                'value': default_description(user_obj)
            }

        if recompute_heuristics or 'location' not in user_computed_heuristics:
            # Check if the user has location
            user_bot_features['location'] = {
                'value': location(user_obj)
            }

        if recompute_heuristics or 'ff_ratio' not in user_computed_heuristics:
            # Check the user's following followers ratio
            ratio = followers_ratio(user_obj)
            user_bot_features['ff_ratio'] = {
                'value': ratio
            }

        if recompute_heuristics or 'random_letters' not in user_computed_heuristics:
            rl_value = random_account_letter(user_obj)
            user_bot_features['random_letters'] = {
                'value': rl_value
            }

        if recompute_heuristics or 'random_numbers' not in user_computed_heuristics:
            rn_value = random_account_number(user_obj)
            user_bot_features['random_numbers'] = {
                'value': rn_value
            }

        if recompute_heuristics or 'similar_account' not in user_computed_heuristics:
            similarity_score = similar_account_name(user_obj, self.__dbm_users, self.__dbm_tweets)
            user_bot_features['similar_account'] = {
                'value': similarity_score
            }

        # Compute the user's probability of being bot
        num_computed_heuristics = len(user_bot_features.keys())
        bot_score, sum_weights, pbb = self.__compute_bot_formula(user_bot_features, exist_user)

        self.__save_user_pbb(user_screen_name, pbb, bot_score, user_bot_features,
                             num_computed_heuristics, sum_weights, exist_user)
        logging.info('\n\nThe bot score of {0} is {1}\n\n'.format(user_screen_name, bot_score))
        return

    def compute_fake_promoter_heuristic(self, users):
        """Add the fake-promoter heuristic to already analyzed users.

        For every user the fake-promoter value is computed, stored under
        ``features['fake_promoter']`` and folded into the stored raw score,
        sum of weights and number of evaluated heuristics; the updated
        analysis is then saved through ``__save_user_pbb``.

        Args:
            users: Users to process. May be a list (of user documents or of
                plain screen names) or a cursor-like object exposing
                ``count()``. When falsy, every non-verified user whose
                analysis lacks the ``fake_promoter`` feature is fetched from
                the users collection.
        """
        name_weights_file = pathlib.Path(__file__).parents[0].joinpath('heuristic_weights.json')
        weights_file = get_config(name_weights_file)
        fp_weight = weights_file['fake_promoter']

        if not users:
            users = self.__dbm_users.search({'bot_analysis.features.fake_promoter': {'$exists': 0},
                                             'verified': {'$ne': True}})

        # A plain list has no zero-argument count(); mirror the handling used
        # by compute_bot_probability.
        tot_user = len(users) if isinstance(users, list) else users.count()
        for idx_user, user in enumerate(users, start=1):
            logging.info('Remaining users: {0}'.format(tot_user - idx_user))
            user_screen_name = user if isinstance(user, str) else user['screen_name']
            user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]

            # The default query also matches users that were never analyzed
            # (the nested field is missing because 'bot_analysis' is missing).
            # There is no previous score to update for them, so skip.
            bot_analysis = user_obj.get('bot_analysis')
            if not bot_analysis:
                logging.warning('User {0} has no bot analysis yet, skipping the fake promoter '
                                'heuristic'.format(user_screen_name))
                continue

            # Features may be stored empty/None (e.g. verified accounts).
            user_bot_features = bot_analysis.get('features') or {}

            # Check if the user interacts with bot accounts
            fp = fake_promoter(user_screen_name, self.__dbm_users)
            logging.info('User: {0}, fake promoter score: {1}'.format(user_screen_name, fp))
            user_bot_features['fake_promoter'] = {
                'value': fp
            }

            bot_score = bot_analysis['raw_score'] + fp * fp_weight
            heuristics = bot_analysis['num_evaluated_heuristics'] + 1
            sum_weights = bot_analysis['sum_weights'] + fp_weight
            # Guard against a zero denominator (no weight accumulated so far
            # and a zero fake-promoter weight).
            pbb = bot_score / sum_weights if sum_weights else 0
            exist_user = user_obj['exists']
            self.__save_user_pbb(user_screen_name, pbb, bot_score, user_bot_features, heuristics, sum_weights,
                                 exist_user)

    def compute_bot_probability(self, users, source_users_collection="", source_users_db=""):
        """Compute (or reuse) the bot analysis of the given users.

        Args:
            users: Either a list of screen names or a cursor-like object of
                user documents exposing ``count()``. When falsy, all users
                without a ``bot_analysis`` field are fetched from the users
                collection.
            source_users_collection: Name of a collection holding previously
                computed analyses. Only used together with
                ``source_users_db``.
            source_users_db: Name of the database holding
                ``source_users_collection``. When both are given, an analysis
                found there for a screen name is copied instead of being
                recomputed.
        """
        reusers_db = None
        if source_users_db and source_users_collection:
            reusers_db = DBManager(source_users_collection, source_users_db)

        if not users:
            # Get all users who don't have the analysis of bot in current user
            users = self.__dbm_users.search({'bot_analysis': {'$exists': 0}})

        tot_user = len(users) if isinstance(users, list) else users.count()
        # enumerate keeps the progress counter correct even when an iteration
        # ends early through the reuse branch below.
        for idx_user, user in enumerate(users, start=1):
            logging.info('Remaining users: {0}'.format(tot_user - idx_user))

            # Resolve the screen name before any lookup: list entries are
            # plain screen names and cannot be indexed by 'screen_name'.
            user_screen_name = user if isinstance(user, str) else user['screen_name']

            if reusers_db:
                reuser_cursor = reusers_db.search({'screen_name': user_screen_name})

                if reuser_cursor.count() > 0:
                    logging.info('Reusing bot analysis from another DB for {0}'.format(user_screen_name))
                    reuser = reuser_cursor[0]
                    bot_analysis = reuser['bot_analysis']
                    self.__save_user_pbb(reuser['screen_name'], bot_analysis['pbb'], bot_analysis['raw_score'],
                                         bot_analysis['features'], bot_analysis['num_evaluated_heuristics'],
                                         bot_analysis['sum_weights'], reuser['exists'])
                    continue

            self.__compute_heuristics(user_screen_name)

    def to_csv(self, output_file_name, include_verified_accounts=True):
        """Export the stored bot analysis of the users to a CSV file.

        One row is written per user that has a ``bot_analysis`` field. The
        columns are the user information fields followed by the heuristic
        values and the aggregated scores; analysis columns that are missing
        for a user are left empty.

        Args:
            output_file_name: Name of the CSV file, created inside the
                ``reports`` directory resolved from this file's location.
            include_verified_accounts: When False, users whose ``verified``
                field is True are left out of the export.
        """
        query = {'bot_analysis': {'$exists': 1}}
        if not include_verified_accounts:
            query['verified'] = {'$ne': True}
        users = self.__dbm_users.search(query)
        f_name = str(pathlib.Path(__file__).parents[2].joinpath('reports', output_file_name))
        logging.info('Saving bot analysis into the csv file {0}'.format(f_name))

        user_info_fields = ['screen_name', 'profile_url', 'party', 'movement', 'exists', 'followers',
                            'friends', 'tweets', 'rts', 'rps', 'verified']
        bot_analysis_fields = ['location', 'default_profile_picture', 'retweet_electoral',
                               'default_background', 'similar_account', 'random_numbers', 'ff_ratio',
                               'random_letters', 'default_profile', 'creation_date', 'empty_description',
                               'retweet_timeline', 'reply_electoral', 'reply_timeline', 'fake_promoter',
                               'raw_score', 'sum_weights', 'pbb']

        # newline='' lets the csv module control line endings; without it
        # extra blank rows are produced on platforms that translate '\n'.
        with open(f_name, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=user_info_fields + bot_analysis_fields)
            writer.writeheader()
            tot_users = users.count()
            logging.info('Going to save the information of the bot analysis of {0} users'.format(tot_users))
            for idx_user, user in enumerate(users, start=1):
                logging.info('Remaining users: {0}'.format(tot_users - idx_user))
                bot_analysis = user['bot_analysis']
                # Features may be stored empty/None (e.g. verified accounts
                # are saved without computed heuristics).
                features = bot_analysis.get('features') or {}

                row_dict = {}
                for field_name in bot_analysis_fields:
                    if field_name in features:
                        # Heuristic values live under features[<name>]['value']
                        row_dict[field_name] = features[field_name]['value']
                    elif field_name in bot_analysis:
                        # Aggregates (raw_score, sum_weights, pbb) are stored
                        # directly in the analysis document
                        row_dict[field_name] = bot_analysis[field_name]

                for field_name in user_info_fields:
                    # profile_url is derived below, it is not a stored field
                    if field_name != 'profile_url':
                        row_dict[field_name] = user[field_name]

                if user['exists']:
                    row_dict['profile_url'] = 'https://twitter.com/' + user['screen_name']
                else:
                    row_dict['profile_url'] = ' '
                writer.writerow(row_dict)
        logging.info('The saving process has finished, please check the file {0}'.format(f_name))