Ejemplo n.º 1
0
def __db_trustworthy_users(db_users, db_tweets, config):
    """
    Generate a database of trustworthy users. We trust in the user if she
    has a verified account or has more than X number of followers
    
    :param db_users: database of user
    :param config: dictionary with the configuration parameters of the heuristic

    :return: database of trustworthy users
    """
    trustworthy_users_db = DBManager('trustworthy_users')
    if trustworthy_users_db.num_records_collection() == 0:
        logging.info('The trustworthy_users collection is being created...')
        for doc in db_users.find_all():
            data = get_user(db_tweets, doc['screen_name'])
            if data['verified'] or int(
                    data['followers_count']) > config['min_num_followers']:
                if not trustworthy_users_db.find_record(
                    {'screen_name': data['screen_name']}):
                    trustworthy_users_db.save_record({
                        'screen_name':
                        doc['screen_name'],
                        'name':
                        data['name'],
                        'created_at':
                        data['created_at'],
                        'followers_count':
                        data['followers_count'],
                        'verified':
                        data['verified']
                    })
    return trustworthy_users_db
Ejemplo n.º 2
0
class NetworkAnalyzer:
    __dbm_tweets = None
    __dbm_users = None
    __dbm_networks = None
    __network = None
    __graph = None
    __nodes = set()
    __unknown_users = set()
    __node_sizes = None

    def __init__(self):
        self.__dbm_tweets = DBManager('tweets')
        self.__dbm_users = DBManager('users')
        self.__dbm_networks = DBManager('networks')
        self.__network = []

    def __computer_ff_ratio(self, friends, followers):
        if followers > 0 and friends > 0:
            return friends / followers
        else:
            return 0

    # Get interactions in of a given users
    def get_in_interactions(self, user_screen_name):
        # compute in interactions, meaning, interactions in which the user
        # was mentioned, retweeted, quoted, replied
        in_inter_query = {'interactions.' + user_screen_name: {'$exists': 1},
                          'screen_name': {'$ne': user_screen_name}}
        n_users = self.__dbm_users.search(in_inter_query)
        in_interactions_dict, in_rts, in_rps = {}, {}, {}
        in_qts, in_mts = {}, {}
        total_in_interactions = 0
        total_in_retweets, total_in_replies = 0, 0
        total_in_mentions, total_in_quotes = 0, 0
        for n_user in n_users:
            n_user_interactions = n_user['interactions']
            for i_user, interactions in n_user_interactions.items():
                if i_user == user_screen_name:
                    in_interactions_dict[n_user['screen_name']] = interactions['total']
                    total_in_interactions += interactions['total']
                    if 'retweets' in interactions.keys():
                        total_in_retweets += interactions['retweets']
                        in_rts[n_user['screen_name']] = interactions['retweets']
                    if 'replies' in interactions.keys():
                        total_in_replies += interactions['replies']
                        in_rps[n_user['screen_name']] = interactions['replies']
                    if 'mentions' in interactions.keys():
                        total_in_mentions += interactions['mentions']
                        in_mts[n_user['screen_name']] = interactions['mentions']
                    if 'quotes' in interactions.keys():
                        total_in_quotes += interactions['quotes']
                        in_qts[n_user['screen_name']] = interactions['quotes']
        in_interactions_obj = {
            'total': {
                'count': total_in_interactions,
                'details': in_interactions_dict
            },
            'replies': {
                'count': total_in_replies,
                'details': in_rps
            },
            'retweets': {
                'count': total_in_retweets,
                'details': in_rts
            },
            'mentions': {
                'count': total_in_mentions,
                'details': in_mts
            },
            'quotes': {
                'count': total_in_quotes,
                'details': in_qts
            }
        }
        user_dict = {
            'in_interactions': in_interactions_obj
        }
        return user_dict

    # Get interactions out of a given users
    def get_out_interactions(self, user_screen_name):
        user = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        # compute out interactions, meaning, interactions originated by
        # the user
        user_interactions = user['interactions']
        out_interactions_dict, out_rts = {}, {}
        out_rps, out_qts, out_mts = {}, {}, {}
        total_out_interactions, total_out_retweets = 0, 0
        total_out_mentions, total_out_replies = 0, 0
        total_out_quotes = 0
        for recipient, interactions in user_interactions.items():
            out_interactions_dict[recipient] = interactions['total']
            total_out_interactions += interactions['total']
            if 'retweets' in interactions:
                total_out_retweets += interactions['retweets']
                out_rts[recipient] = interactions['retweets']
            if 'replies' in interactions:
                total_out_replies += interactions['replies']
                out_rps[recipient] = interactions['replies']
            if 'mentions' in interactions:
                total_out_mentions += interactions['mentions']
                out_mts[recipient] = interactions['mentions']
            if 'quotes' in interactions:
                total_out_quotes += interactions['quotes']
                out_qts[recipient] = interactions['quotes']
        out_interactions_obj = {
            'total': {
                'count': total_out_interactions,
                'details': out_interactions_dict
            },
            'replies': {
                'count': total_out_replies,
                'details': out_rps
            },
            'retweets': {
                'count': total_out_retweets,
                'details': out_rts
            },
            'mentions': {
                'count': total_out_mentions,
                'details': out_mts
            },
            'quotes': {
                'count': total_out_quotes,
                'details': out_qts
            }
        }
        # compile all information in a dictionary
        user_dict = {
            'out_interactions': out_interactions_obj
        }
        return user_dict

    def create_users_db(self, clear_collection=False):
        logging.info('::. Network Analyzer: Creating database of users, it can take several minutes, please wait_')
        if clear_collection:
            self.__dbm_users.clear_collection()
        users = self.__dbm_tweets.get_unique_users()
        users_count = len(users)
        logging.info('::. Network Analyzer: Extracted {0} unique users from the database...'.format(users_count))
        progress = 1
        for user in users:
            db_user = {
                'screen_name': user['screen_name'],
                'friends': user['friends'],
                'followers': user['followers'],
                'ff_ratio': self.__computer_ff_ratio(user['friends'], user['followers']),
                'interactions': user['interactions'],
                'tweets': user['tweets_count'],
                'original_tweets': user['original_count'],
                'rts': user['retweets_count'],
                'qts': user['quotes_count'],
                'rps': user['replies_count'],
                'verified': user['verified']

            }
            # Assign the party and movement to the party and movement that are more related to the user
            # counting both Hashtags and Mentions by the user
            user_parties = self.__dbm_tweets.get_party_user(user['screen_name'])
            user_parties_count = len(user_parties) or 0
            logging.debug('::. Network Analyzer: User {0} has {1} associated parties...'
                          .format(user['screen_name'],user_parties_count))

            if user_parties_count > 0:
                user_party = user_parties[0]
                db_user.update({'most_interacted_party': user_party['partido']})
                user_movements = self.__dbm_tweets.get_movement_user(user['screen_name'])
                user_movements_count = len(user_movements) or 0
                logging.debug('::. Network Analyzer: User {0} has {1} associated movements...'
                              .format(user['screen_name'], user_movements_count))
                if user_movements_count > 0:
                    user_movement = user_movements[0]
                    db_user.update({'most_interacted_movement': user_movement['movimiento']})
                else:
                    db_user.update({'most_interacted_movement': ''})
            else:
                db_user.update({'most_interacted_party': '', 'movement': ''})


            # Assign the party and movement to the party and movement that are more related to the user
            # counting both Hashtags and Mentions by the user
            upp = UserPoliticalPreference()
            user_party = upp.get_user_political_party(user['screen_name'])
            user_movement = upp.get_user_political_movement(user['screen_name'])
            db_user.update({'party': user_party, 'movement': user_movement})

            filter_query = {'screen_name': user['screen_name']}
            logging.debug('::. Network Analyzer: Updating/creating user {0} ({1}/{2})...'
                          .format(user['screen_name'], progress, users_count))
            progress += 1
            self.__dbm_users.update_record(filter_query, db_user, create_if_doesnt_exist=True)

    def generate_network(self, subnet_query={}, depth=1, file_name='network', override_net=False):
        net_query = subnet_query.copy()
        net_query.update({'depth': depth})
        ret_net = self.__dbm_networks.search(net_query)
        # the net doesn't exist yet, let's create it
        if ret_net.count() == 0 or override_net:
            logging.info('Generating the network, it can take several minutes, please wait_')
            users = self.__dbm_users.search(subnet_query)
            # for each user generate his/her edges
            for user in users:
                if 'ff_ratio' in user.keys():
                    u_ff_ratio = user['ff_ratio']
                else:
                    u_ff_ratio = self.__computer_ff_ratio(user['friends'], user['followers'])
                pbb_score = user['bot_analysis']['pbb'] if 'bot_analysis' in user.keys() else ''
                self.__nodes.add(tuple({'screen_name': user['screen_name'], 'party': user['party'],
                                        'movement': user['movement'], 'ff_ratio': u_ff_ratio,
                                        'pbb': pbb_score}.items()))
                for interacted_user, interactions in user['interactions'].items():
                    iuser = self.__dbm_users.find_record({'screen_name': interacted_user})
                    if not iuser:
                        if depth > 1:
                            iuser_ffratio = self.__get_ffratio(interacted_user)
                            if not iuser_ffratio:
                                self.__unknown_users.add(interacted_user)
                                continue
                        else:
                            self.__unknown_users.add(interacted_user)
                            continue
                    else:
                        if 'ff_ratio' in iuser.keys():
                            i_ff_ratio = iuser['ff_ratio']
                        else:
                            i_ff_ratio = self.__computer_ff_ratio(iuser['friends'], iuser['followers'])

                    pbb_iuser_score = user['bot_analysis']['pbb'] if 'bot_analysis' in iuser.keys() else ''
                    self.__nodes.add(tuple({'screen_name': iuser['screen_name'], 'party': iuser['party'],
                                            'movement': iuser['movement'], 'ff_ratio': i_ff_ratio,

                                            'pbb': pbb_iuser_score}.items()))
                    edge = {
                        'nodeA': {'screen_name': user['screen_name'], 'ff_ratio': u_ff_ratio,
                                  'party': user['party'], 'movement': user['movement'],
                                  'pbb': pbb_score},
                        'nodeB': {'screen_name': interacted_user, 'ff_ratio': i_ff_ratio,
                                  'party': iuser['party'], 'movement': iuser['movement'],
                                  'pbb': pbb_iuser_score},
                        'weight': interactions['total']
                    }
                    self.__network.append(edge)
            logging.info('Created a network of {0} nodes and {1} edges'.format(len(self.__nodes), len(self.__network)))
            logging.info('Unknown users {0}'.format(len(self.__unknown_users)))
            # save the net in a gefx file for posterior usage
            f_name = self.save_network_in_gexf_format(file_name)
            logging.info('Saved the network in the file {0}'.format(f_name))
            db_net = {'file_name': str(f_name)}
            db_net.update(net_query)
            self.__dbm_networks.save_record(db_net)
        else:
            f_net = ret_net[0]
            logging.info('The network was already generated, please find it at {0}'.format(f_net['file_name']))

    def create_graph(self):
        logging.info('Creating the graph, please wait_')
        self.__graph = net.DiGraph()
        ff_ratio = defaultdict(lambda: 0.0)
        # create a directed graph from the edge data and populate a dictionary
        # with the friends/followers ratio
        for edge in self.__network:
            user = edge['nodeA']['screen_name']
            interacted_with = edge['nodeB']['screen_name']
            num_interactions = edge['weight']
            u_ff_ratio = edge['nodeA']['ff_ratio']
            self.__graph.add_edge(user, interacted_with, weight=int(num_interactions))
            ff_ratio[user] = float(u_ff_ratio)
        # obtain central node
        # degrees = net.degree(self.__graph)
        # central_node, max_degree = sorted(degrees, key=itemgetter(1))[-1]
        # center the graph around the central node
        # ego_graph = net.DiGraph(net.ego_graph(self.__graph, central_node))
        return

    def get_graph_nodes(self):
        return len(self.__nodes)

    def get_graph_edges(self):
        return len(self.__network)

    def get_graph(self):
        return self.__graph

    def get_node_sizes(self):
        return self.__node_sizes

    def __get_ffratio(self, screen_name):
        query = {
            '$or': [
                {'tweet_obj.user.screen_name': screen_name},
                {'tweet_obj.retweeted_status.user.screen_name': screen_name},
                {'tweet_obj.quoted_status.user.screen_name': screen_name}
            ]
        }
        tweet_obj = self.__dbm_tweets.find_record(query)
        if tweet_obj:
            tweet = tweet_obj['tweet_obj']
            if 'retweeted_status' in tweet.keys():
                return self.__computer_ff_ratio(tweet['retweeted_status']['user']['friends_count'],
                                                tweet['retweeted_status']['user']['followers_count'])
            elif 'quoted_status' in tweet.keys():
                return self.__computer_ff_ratio(tweet['quoted_status']['user']['friends_count'],
                                                tweet['quoted_status']['user']['followers_count'])
            else:
                return self.__computer_ff_ratio(tweet['user']['friends_count'],
                                                tweet['user']['followers_count'])
        else:
            return None

    def save_network_in_gexf_format(self, file_name):
        today = datetime.strftime(datetime.now(), '%m/%d/%y')
        f_name = pathlib.Path(__file__).parents[2].joinpath('sna', 'gefx', file_name+'.gexf')
        with open(str(f_name), 'w', encoding='utf-8') as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write('<gexf xmlns="http://www.gexf.net/1.2draft" xmlns:viz="http://www.gexf.net/1.1draft/viz" '
                    'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                    'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd" '
                    'version="1.2">\n')
            f.write('<meta lastmodifieddate="{0}">\n'.format(today))
            f.write('<creator>PoliticBots</creator>\n')
            f.write('<description>{0}</description>\n'.format(file_name))
            f.write('</meta>\n')
            f.write('<graph mode="static" defaultedgetype="directed">\n')
            # add data attributes
            f.write('<attributes class="node">\n')
            f.write('<attribute id="0" title="party" type="string"/>\n')
            f.write('<attribute id="1" title="movement" type="string"/>\n')
            f.write('<attribute id="2" title="ff_ratio" type="float"/>\n')
            f.write('<attribute id="3" title="pbb" type="float"/>\n')
            f.write('</attributes>\n')
            # add nodes
            f.write('<nodes>\n')
            node_id = 0
            list_nodes = []
            for node_tup in self.__nodes:
                node = dict(node_tup)
                f.write('<node id="{0}" label="{1}">\n'.format(node_id, node['screen_name']))
                f.write('<attvalues>\n')
                f.write('<attvalue for="0" value="{0}"/>\n'.format(node['party']))
                f.write('<attvalue for="1" value="{0}"/>\n'.format(node['movement']))
                f.write('<attvalue for="2" value="{0}"/>\n'.format(node['ff_ratio']))
                f.write('<attvalue for="3" value="{0}"/>\n'.format(node['pbb']))
                f.write('</attvalues>\n')
                #f.write('<viz:size value="{0}"/>\n'.format(node['ff_ratio']))
                f.write('</node>\n')
                node_id += 1
                list_nodes.append(node['screen_name'])
            f.write('</nodes>\n')
            # add edges
            f.write('<edges>\n')
            edge_id = 0
            for edge in list(self.__network):
                id_vertexA = list_nodes.index(edge['nodeA']['screen_name'])
                id_vertexB = list_nodes.index(edge['nodeB']['screen_name'])
                weight = edge['weight']
                f.write('<edge id="{0}" source="{1}" target="{2}" weight="{3}"/>\n'.format(edge_id, id_vertexA,
                                                                                           id_vertexB, weight))
                edge_id += 1
            f.write('</edges>\n')
            f.write('</graph>\n')
            f.write('</gexf>\n')
        return f_name
Ejemplo n.º 3
0
class TweetEvaluator:
    special_chars = r'[=\+/&<>;:\'\"\?%$!¡\,\. \t\r\n]+'
    hashtags, user_handlers = [], []
    __dbm = None
    BATCH_SIZE = 1000

    def __init__(self):
        self.user_handlers, self.hashtags = get_user_handlers_and_hashtags()
        self.__dbm = DBManager('tweets')

    def __is_relevant(self, users_counter, hashtags_counter):
        # a tweet is considered relevant if fulfills one of two
        # conditions; candidates are mentioned or if candidates are
        # are not mentioned but there are at least more than one
        # campaign hashtag
        if users_counter > 0 or hashtags_counter > 1:
            return True
        else:
            return False

    def __assess_tweet_by_text(self, tweet_text):
        tweet_text = re.sub(u'\u2026', '',
                            tweet_text)  # remove ellipsis unicode char
        users_counter, hashtags_counter = 0, 0
        for token in tweet_text.split():
            token = re.sub(self.special_chars, '',
                           token)  # remove special chars
            if token.lower() in self.user_handlers:
                users_counter += 1
            if token.lower() in self.hashtags:
                hashtags_counter += 1
        return self.__is_relevant(users_counter, hashtags_counter)

    def __assess_tweet_by_entities(self, tweet_hashtags, tweet_mentions):
        users_counter, hashtags_counter = 0, 0
        for tweet_hashtag in tweet_hashtags:
            tweet_hashtag_txt = '#' + tweet_hashtag['text'].lower()
            if tweet_hashtag_txt in self.hashtags:
                hashtags_counter += 1
        for tweet_mention in tweet_mentions:
            screen_name = '@' + tweet_mention['screen_name'].lower()
            if screen_name in self.user_handlers:
                users_counter += 1
        return self.__is_relevant(users_counter, hashtags_counter)

    def is_tweet_relevant(self, tweet):
        tweet_author = tweet['user']['screen_name']
        tweet_handler = '@{0}'.format(tweet_author.lower())
        if tweet_handler in self.user_handlers:
            return True
        else:
            if 'retweeted_status' in tweet.keys():
                original_tweet = tweet['retweeted_status']
            else:
                original_tweet = tweet
            if 'entities' in original_tweet.keys():
                t_user_mentions = original_tweet['entities']['user_mentions']
                t_hashtags = original_tweet['entities']['hashtags']
                return self.__assess_tweet_by_entities(t_hashtags,
                                                       t_user_mentions)
            else:
                if 'full_text' in original_tweet.keys():
                    return self.__assess_tweet_by_text(tweet['full_text'])
                else:
                    return self.__assess_tweet_by_text(tweet['text'])

    def __mark_relevance_rt(self, tweet_reg):
        logging.info('Marking RTS...')
        query = {
            'tweet_obj.retweeted_status': {
                '$exists': 1
            },
            'tweet_obj.retweeted_status.id_str': {
                '$eq': tweet_reg['tweet_obj']['id_str']
            },
            'relevante': {
                '$ne': tweet_reg['relevante']
            }
        }
        update = {'$set': {'relevante': tweet_reg['relevante']}}
        update_res = self.__dbm.update_record_many(query, update)
        logging.info('Marked {0} RTS...'.format(update_res.matched_count))

    def identify_relevant_tweets(self):
        # select only original tweets that are not marked as relevant
        query = {
            'relevante': {
                '$exists': 0
            },
            'tweet_obj.retweeted_status': {
                '$exists': 0
            }
        }
        logging.info('Relevant Tweets: Running query to count...')
        # processing by batch as workaround cursor not found error
        total_tweets = self.__dbm.search(query,
                                         only_relevant_tws=False).count()
        total_batches = ceil(total_tweets / self.BATCH_SIZE)
        batch = 1
        moreToProcess = batch <= total_batches

        while moreToProcess:
            logging.info(
                'Querying records in batches of {0} records...'.format(
                    self.BATCH_SIZE))
            search_res = self.__dbm.search(
                query, only_relevant_tws=False).limit(self.BATCH_SIZE)
            logging.info('Loading batch {0}/{1} into memory...'.format(
                batch, total_batches))
            tweets = [doc for doc in search_res]
            total_tweets_batch = self.BATCH_SIZE
            if batch == total_batches:
                total_tweets_batch = len(tweets)
            logging.info(
                'Identifying relevant tweets in batch {0}/{1} out of {2} tweets...'
                .format(batch, total_batches, total_tweets_batch))
            tweet_counter = 0
            try:
                for tweet_reg in tweets:
                    tweet_counter += 1
                    tweet = tweet_reg['tweet_obj']
                    if self.is_tweet_relevant(tweet):
                        tweet_reg['relevante'] = 1
                        logging.info(
                            'Identifying {0}/{1} tweets (relevant)'.format(
                                tweet_counter, total_tweets))
                    else:
                        tweet_reg['relevante'] = 0
                        logging.info(
                            'Identifying {0}/{1} tweets (irrelevant)'.format(
                                tweet_counter, total_tweets))
                    self.__dbm.update_record(
                        {'tweet_obj.id_str': tweet['id_str']}, tweet_reg)
                    # copy the relevance flag to rts
                    self.__mark_relevance_rt(tweet_reg)

                logging.info(
                    'Finished identifying relevant tweets in batch {0}/{1} out of {2} tweets...'
                    .format(batch, total_batches, total_tweets_batch))
                batch += 1
                moreToProcess = batch <= total_batches
            except Exception as e:
                logging.info("Exception occurred...")
                logging.info("Exception message {0}".format(e))

        logging.info('Finished identifying relevant tweets...')
        return True

    # set to 'user' the type of tweets which keyword contains @
    def fix_tweet_type(self):
        query = {'type': 'hashtag', 'keyword': {'$regex': '@'}}
        objs = self.__dbm.search(query)
        num_fixed_tweets = objs.count()
        for obj in objs:
            obj['type'] = 'user'
            self.__dbm.save_record(obj)
        return num_fixed_tweets

    def __get_hashtags(self, hashtags_list):
        hts = []
        for ht in hashtags_list:
            hts.append(ht['text'])
        return hts

    def __get_screen_names(self, screen_names_list):
        scs = []
        for sc in screen_names_list:
            scs.append('@' + sc['screen_name'])
        return scs

    # fix value of candidatura if hashtags related to a candidacy
    # are present in the text of the tweet
    def fix_value_of_candidatura(self):
        script_parent_dir = pathlib.Path(__file__).parents[1]
        config_fn = script_parent_dir.joinpath('config.json')
        configuration = get_config(config_fn)
        keyword, k_metadata = parse_metadata(configuration['metadata'])
        interested_data = []
        # keep metadata that refer to candidacies
        for kword, kmetada in zip(keyword, k_metadata):
            if kmetada['candidatura'] != '':
                kmetada.update({'keyword': kword})
                interested_data.append(kmetada)
        query = {'candidatura': ''}
        # select tweets without candidacy
        s_objs = self.__dbm.search(query)
        num_fixed_tweets = 0
        # iterate over tweets without candidacy and fix those
        # whose text mention a candidate or have hashtags
        # related to a candidacy
        for s_obj in s_objs:
            party = s_obj['partido_politico']
            movement = s_obj['movimiento']
            tweet = s_obj['tweet_obj']
            relevant_data = []
            candidacy = ''
            # keep metadata related to the political party
            # (and movement) of the tweet (s_obj)
            for ida in interested_data:
                if ida['partido_politico'] == party:
                    if movement != '':
                        if ida['movimiento'] == movement:
                            relevant_data.append(ida)
                    else:
                        relevant_data.append(ida)
            if len(relevant_data) > 0:
                # extract relevant information of the tweet. hashtags and mentions if
                # the tweet obj has these entities otherwise the text of the tweet
                if 'retweeted_status' in tweet.keys():
                    original_tweet = tweet['retweeted_status']
                else:
                    original_tweet = tweet
                if 'entities' in original_tweet.keys():
                    t_user_mentions = self.__get_screen_names(
                        original_tweet['entities']['user_mentions'])
                    t_hashtags = self.__get_hashtags(
                        original_tweet['entities']['hashtags'])
                    # see if the interested keywords are part of the tweet hashtags or mentions
                    for rd in relevant_data:
                        if rd['keyword'] in t_user_mentions:
                            candidacy = rd['candidatura']
                            break
                        else:
                            if rd['keyword'] in t_hashtags:
                                candidacy = rd['candidatura']
                                break
                else:
                    if 'full_text' in original_tweet.keys():
                        t_text = tweet['full_text']
                    else:
                        t_text = tweet['text']
                    # see if the interested keywords are present in the text
                    for rd in relevant_data:
                        if rd['keyword'] in t_text:
                            candidacy = rd['candidatura']
                            break
                # fix candidacy key
                if candidacy:
                    s_obj['candidatura'] = candidacy
                    num_fixed_tweets += 1
                    self.__dbm.save_record(s_obj)
        return num_fixed_tweets