def save_original_tweets_file():
    """Export relevant original tweets with sentiment info to a CSV file.

    Selects tweets that are NOT retweets and that already have a
    'sentimiento' analysis, and writes id/text/tone/score rows to
    'tweet_sentiments.csv' in the current working directory.
    """
    dbm = DBManager('tweets')
    query = {
        'tweet_obj.retweeted_status': {'$exists': 0},
        'sentimiento': {'$exists': 1},
    }
    s_objs = dbm.search(query)
    # newline='' is required by the csv module so it can control line
    # endings itself (otherwise blank rows appear on Windows)
    with open('tweet_sentiments.csv', 'w', newline='', encoding='utf-8') as f_csv:
        fieldnames = ['id', 'text', 'tone', 'score']
        writer = csv.DictWriter(f_csv, fieldnames=fieldnames)
        writer.writeheader()
        for s_obj in s_objs:
            tweet = s_obj['tweet_obj']
            # extended tweets carry the untruncated text in 'full_text'
            if 'full_text' in tweet.keys():
                tweet_text = tweet['full_text']
            else:
                tweet_text = tweet['text']
            tweet_text = clean_emojis(tweet_text)
            # strip line breaks and commas so each tweet stays on one CSV row
            tweet_text = tweet_text.replace('\r', '')
            tweet_text = tweet_text.replace('\n', '')
            tweet_text = tweet_text.replace(',', '')
            tweet_dict = {
                'id': tweet['id_str'],
                'text': tweet_text,
                'tone': s_obj['sentimiento']['tono'],
                'score': s_obj['sentimiento']['score']
            }
            writer.writerow(tweet_dict)
def compute_bot_probability(self, users, source_users_collection="", source_users_db=""):
    """Compute (or reuse) the bot probability of the given users.

    :param users: list of screen names (str) OR a cursor of user documents;
                  falsy to process every user lacking a 'bot_analysis' field
    :param source_users_collection: optional collection to reuse analyses from
    :param source_users_db: optional database to reuse analyses from
    """
    reusers_db = None
    if source_users_db and source_users_collection:
        reusers_db = DBManager(source_users_collection, source_users_db)
    if not users:
        # Get all users who don't have the analysis of bot in current user
        users = self.__dbm_users.search({'bot_analysis': {'$exists': 0}})
    tot_user = len(users) if type(users) == list else users.count()
    idx_user = 1
    for user in users:
        logging.info('Remaining users: {0}'.format(tot_user - idx_user))
        # BUGFIX: normalize to the screen name up-front. `users` may be a
        # list of plain screen-name strings, in which case the old code
        # crashed on user['screen_name'] inside the reuse branch.
        if type(users) == list:
            user_screen_name = user
        else:
            user_screen_name = user['screen_name']
        if reusers_db:
            reuser_cursor = reusers_db.search({'screen_name': user_screen_name})
            if reuser_cursor.count() > 0:
                logging.info('Reusing bot analysis from another DB for {0}'.format(user_screen_name))
                reuser = reuser_cursor[0]
                bot_analysis = reuser['bot_analysis']
                self.__save_user_pbb(reuser['screen_name'], bot_analysis['pbb'],
                                     bot_analysis['raw_score'], bot_analysis['features'],
                                     bot_analysis['num_evaluated_heuristics'],
                                     bot_analysis['sum_weights'], reuser['exists'])
                # BUGFIX: count this user too; the old code skipped the
                # increment on `continue`, skewing the remaining-users log
                idx_user += 1
                continue
        self.__compute_heuristics(user_screen_name)
        idx_user += 1
def fix_tweets_with_empty_flags():
    """Recompute the keyword flags of relevant tweets whose flag array is empty."""
    dbm = DBManager('tweets')
    conf_file = pathlib.Path(__file__).parents[1].joinpath('config.json')
    configuration = get_config(conf_file)
    keyword, k_metadata = parse_metadata(configuration['metadata'])
    for tweet in dbm.search({'flag.keyword': {'$size': 0}, 'relevante': 1}):
        tweet_id = tweet['tweet_obj']['id_str']
        logging.info('Updating flags of tweet {0}'.format(tweet_id))
        flag, headers = create_flag(k_metadata)
        entities = get_entities_tweet(tweet['tweet_obj'])
        flag = add_values_to_flags(flag, entities, k_metadata)
        dbm.update_record({'tweet_obj.id_str': tweet_id}, flag)


# if __name__ == '__main__':
#     fix_tweets_with_empty_flags()
def compute_tweets_local_date(force_computation=False, include_hour=False):
    """Store each tweet's publication date/time as pre-formatted string fields.

    :param force_computation: recompute even for tweets that already have
                              a 'tweet_py_datetime' field
    :param include_hour: additionally store the hour in 'tweet_py_hour'
    """
    dbm = DBManager('tweets')
    query = {} if force_computation else {'tweet_py_datetime': {'$exists': 0}}
    for s_obj in dbm.search(query, only_relevant_tws=False):
        tweet = s_obj['tweet_obj']
        py_pub_dt = get_py_date(tweet)
        fields = {
            'tweet_py_datetime': datetime.strftime(py_pub_dt, '%m/%d/%y %H:%M:%S'),
            'tweet_py_date': datetime.strftime(py_pub_dt, '%m/%d/%y'),
        }
        if include_hour:
            fields['tweet_py_hour'] = datetime.strftime(py_pub_dt, '%H')
        dbm.update_record({'tweet_obj.id_str': tweet['id_str']}, fields)
    return
class NetworkAnalyzer:
    """Builds the directed interaction network of users and exports it.

    Reads tweets/users from MongoDB (via DBManager), computes per-user
    metrics (friends/followers ratio, interactions in/out, political
    affinity), assembles a network of interaction edges and saves it in
    GEXF format for SNA tools.
    """

    __dbm_tweets = None
    __dbm_users = None
    __dbm_networks = None
    __network = None
    __graph = None
    __nodes = set()
    __unknown_users = set()
    __node_sizes = None

    def __init__(self):
        self.__dbm_tweets = DBManager('tweets')
        self.__dbm_users = DBManager('users')
        self.__dbm_networks = DBManager('networks')
        self.__network = []

    def __computer_ff_ratio(self, friends, followers):
        # friends/followers ratio; 0 when either count is non-positive
        if followers > 0 and friends > 0:
            return friends / followers
        else:
            return 0

    # Get interactions in of a given users
    def get_in_interactions(self, user_screen_name):
        """Return the interactions RECEIVED by the given user.

        Scans every other user's 'interactions' map and aggregates the
        entries that target `user_screen_name`, broken down by type.
        """
        # compute in interactions, meaning, interactions in which the user
        # was mentioned, retweeted, quoted, replied
        in_inter_query = {'interactions.' + user_screen_name: {'$exists': 1},
                          'screen_name': {'$ne': user_screen_name}}
        n_users = self.__dbm_users.search(in_inter_query)
        in_interactions_dict, in_rts, in_rps = {}, {}, {}
        in_qts, in_mts = {}, {}
        total_in_interactions = 0
        total_in_retweets, total_in_replies = 0, 0
        total_in_mentions, total_in_quotes = 0, 0
        for n_user in n_users:
            n_user_interactions = n_user['interactions']
            for i_user, interactions in n_user_interactions.items():
                if i_user == user_screen_name:
                    in_interactions_dict[n_user['screen_name']] = interactions['total']
                    total_in_interactions += interactions['total']
                    if 'retweets' in interactions.keys():
                        total_in_retweets += interactions['retweets']
                        in_rts[n_user['screen_name']] = interactions['retweets']
                    if 'replies' in interactions.keys():
                        total_in_replies += interactions['replies']
                        in_rps[n_user['screen_name']] = interactions['replies']
                    if 'mentions' in interactions.keys():
                        total_in_mentions += interactions['mentions']
                        in_mts[n_user['screen_name']] = interactions['mentions']
                    if 'quotes' in interactions.keys():
                        total_in_quotes += interactions['quotes']
                        in_qts[n_user['screen_name']] = interactions['quotes']
        in_interactions_obj = {
            'total': {'count': total_in_interactions, 'details': in_interactions_dict},
            'replies': {'count': total_in_replies, 'details': in_rps},
            'retweets': {'count': total_in_retweets, 'details': in_rts},
            'mentions': {'count': total_in_mentions, 'details': in_mts},
            'quotes': {'count': total_in_quotes, 'details': in_qts}
        }
        user_dict = {'in_interactions': in_interactions_obj}
        return user_dict

    # Get interactions out of a given users
    def get_out_interactions(self, user_screen_name):
        """Return the interactions ORIGINATED by the given user, by type."""
        user = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        # compute out interactions, meaning, interactions originated by
        # the user
        user_interactions = user['interactions']
        out_interactions_dict, out_rts = {}, {}
        out_rps, out_qts, out_mts = {}, {}, {}
        total_out_interactions, total_out_retweets = 0, 0
        total_out_mentions, total_out_replies = 0, 0
        total_out_quotes = 0
        for recipient, interactions in user_interactions.items():
            out_interactions_dict[recipient] = interactions['total']
            total_out_interactions += interactions['total']
            if 'retweets' in interactions:
                total_out_retweets += interactions['retweets']
                out_rts[recipient] = interactions['retweets']
            if 'replies' in interactions:
                total_out_replies += interactions['replies']
                out_rps[recipient] = interactions['replies']
            if 'mentions' in interactions:
                total_out_mentions += interactions['mentions']
                out_mts[recipient] = interactions['mentions']
            if 'quotes' in interactions:
                total_out_quotes += interactions['quotes']
                out_qts[recipient] = interactions['quotes']
        out_interactions_obj = {
            'total': {'count': total_out_interactions, 'details': out_interactions_dict},
            'replies': {'count': total_out_replies, 'details': out_rps},
            'retweets': {'count': total_out_retweets, 'details': out_rts},
            'mentions': {'count': total_out_mentions, 'details': out_mts},
            'quotes': {'count': total_out_quotes, 'details': out_qts}
        }
        # compile all information in a dictionary
        user_dict = {'out_interactions': out_interactions_obj}
        return user_dict

    def create_users_db(self, clear_collection=False):
        """Populate the 'users' collection from the unique tweet authors.

        :param clear_collection: drop existing user documents first
        """
        logging.info('::. Network Analyzer: Creating database of users, '
                     'it can take several minutes, please wait_')
        if clear_collection:
            self.__dbm_users.clear_collection()
        users = self.__dbm_tweets.get_unique_users()
        users_count = len(users)
        logging.info('::. Network Analyzer: Extracted {0} unique users from the database...'.format(users_count))
        progress = 1
        for user in users:
            db_user = {
                'screen_name': user['screen_name'],
                'friends': user['friends'],
                'followers': user['followers'],
                'ff_ratio': self.__computer_ff_ratio(user['friends'], user['followers']),
                'interactions': user['interactions'],
                'tweets': user['tweets_count'],
                'original_tweets': user['original_count'],
                'rts': user['retweets_count'],
                'qts': user['quotes_count'],
                'rps': user['replies_count'],
                'verified': user['verified']
            }
            # Assign the party and movement to the party and movement that are more related to the user
            # counting both Hashtags and Mentions by the user
            user_parties = self.__dbm_tweets.get_party_user(user['screen_name'])
            user_parties_count = len(user_parties) or 0
            logging.debug('::. Network Analyzer: User {0} has {1} associated parties...'
                          .format(user['screen_name'], user_parties_count))
            if user_parties_count > 0:
                user_party = user_parties[0]
                db_user.update({'most_interacted_party': user_party['partido']})
                user_movements = self.__dbm_tweets.get_movement_user(user['screen_name'])
                user_movements_count = len(user_movements) or 0
                logging.debug('::. Network Analyzer: User {0} has {1} associated movements...'
                              .format(user['screen_name'], user_movements_count))
                if user_movements_count > 0:
                    user_movement = user_movements[0]
                    db_user.update({'most_interacted_movement': user_movement['movimiento']})
                else:
                    db_user.update({'most_interacted_movement': ''})
            else:
                # BUGFIX: previously stored the empty default under the key
                # 'movement' (clashing with the political-preference field
                # set below) instead of 'most_interacted_movement' as the
                # sibling branches do
                db_user.update({'most_interacted_party': '', 'most_interacted_movement': ''})
            # Assign the party and movement to the party and movement that are more related to the user
            # counting both Hashtags and Mentions by the user
            upp = UserPoliticalPreference()
            user_party = upp.get_user_political_party(user['screen_name'])
            user_movement = upp.get_user_political_movement(user['screen_name'])
            db_user.update({'party': user_party, 'movement': user_movement})
            filter_query = {'screen_name': user['screen_name']}
            logging.debug('::. Network Analyzer: Updating/creating user {0} ({1}/{2})...'
                          .format(user['screen_name'], progress, users_count))
            progress += 1
            self.__dbm_users.update_record(filter_query, db_user, create_if_doesnt_exist=True)

    def generate_network(self, subnet_query=None, depth=1, file_name='network', override_net=False):
        """Generate (or reuse) the interaction network and persist it as GEXF.

        :param subnet_query: extra user filter (defaults to all users)
        :param depth: when > 1, try to resolve unknown interacted users
                      from raw tweets
        :param file_name: base name of the output .gexf file
        :param override_net: regenerate even if this net was saved before
        """
        # avoid the mutable-default-argument pitfall; {} stays the
        # effective default so callers are unaffected
        subnet_query = {} if subnet_query is None else subnet_query
        net_query = subnet_query.copy()
        net_query.update({'depth': depth})
        ret_net = self.__dbm_networks.search(net_query)
        # the net doesn't exist yet, let's create it
        if ret_net.count() == 0 or override_net:
            logging.info('Generating the network, it can take several minutes, please wait_')
            users = self.__dbm_users.search(subnet_query)
            # for each user generate his/her edges
            for user in users:
                if 'ff_ratio' in user.keys():
                    u_ff_ratio = user['ff_ratio']
                else:
                    u_ff_ratio = self.__computer_ff_ratio(user['friends'], user['followers'])
                pbb_score = user['bot_analysis']['pbb'] if 'bot_analysis' in user.keys() else ''
                self.__nodes.add(tuple({'screen_name': user['screen_name'],
                                        'party': user['party'],
                                        'movement': user['movement'],
                                        'ff_ratio': u_ff_ratio,
                                        'pbb': pbb_score}.items()))
                for interacted_user, interactions in user['interactions'].items():
                    iuser = self.__dbm_users.find_record({'screen_name': interacted_user})
                    if not iuser:
                        if depth > 1:
                            # NOTE(review): when depth > 1 and a ratio IS
                            # found, no node/edge is created for this user;
                            # only unresolved users are recorded as unknown
                            iuser_ffratio = self.__get_ffratio(interacted_user)
                            if not iuser_ffratio:
                                self.__unknown_users.add(interacted_user)
                                continue
                        else:
                            self.__unknown_users.add(interacted_user)
                            continue
                    else:
                        if 'ff_ratio' in iuser.keys():
                            i_ff_ratio = iuser['ff_ratio']
                        else:
                            i_ff_ratio = self.__computer_ff_ratio(iuser['friends'], iuser['followers'])
                        # BUGFIX: the interacted user's pbb was read from
                        # `user` although the key was checked on `iuser`
                        pbb_iuser_score = iuser['bot_analysis']['pbb'] if 'bot_analysis' in iuser.keys() else ''
                        self.__nodes.add(tuple({'screen_name': iuser['screen_name'],
                                                'party': iuser['party'],
                                                'movement': iuser['movement'],
                                                'ff_ratio': i_ff_ratio,
                                                'pbb': pbb_iuser_score}.items()))
                        edge = {
                            'nodeA': {'screen_name': user['screen_name'], 'ff_ratio': u_ff_ratio,
                                      'party': user['party'], 'movement': user['movement'],
                                      'pbb': pbb_score},
                            'nodeB': {'screen_name': interacted_user, 'ff_ratio': i_ff_ratio,
                                      'party': iuser['party'], 'movement': iuser['movement'],
                                      'pbb': pbb_iuser_score},
                            'weight': interactions['total']
                        }
                        self.__network.append(edge)
            logging.info('Created a network of {0} nodes and {1} edges'.format(len(self.__nodes),
                                                                               len(self.__network)))
            logging.info('Unknown users {0}'.format(len(self.__unknown_users)))
            # save the net in a gefx file for posterior usage
            f_name = self.save_network_in_gexf_format(file_name)
            logging.info('Saved the network in the file {0}'.format(f_name))
            db_net = {'file_name': str(f_name)}
            db_net.update(net_query)
            self.__dbm_networks.save_record(db_net)
        else:
            f_net = ret_net[0]
            logging.info('The network was already generated, please find it at {0}'.format(f_net['file_name']))

    def create_graph(self):
        """Build a networkx DiGraph from the previously generated edges."""
        logging.info('Creating the graph, please wait_')
        self.__graph = net.DiGraph()
        ff_ratio = defaultdict(lambda: 0.0)
        # create a directed graph from the edge data and populate a dictionary
        # with the friends/followers ratio
        for edge in self.__network:
            user = edge['nodeA']['screen_name']
            interacted_with = edge['nodeB']['screen_name']
            num_interactions = edge['weight']
            u_ff_ratio = edge['nodeA']['ff_ratio']
            self.__graph.add_edge(user, interacted_with, weight=int(num_interactions))
            ff_ratio[user] = float(u_ff_ratio)
        # obtain central node
        # degrees = net.degree(self.__graph)
        # central_node, max_degree = sorted(degrees, key=itemgetter(1))[-1]
        # center the graph around the central node
        # ego_graph = net.DiGraph(net.ego_graph(self.__graph, central_node))
        return

    def get_graph_nodes(self):
        # number of distinct nodes collected by generate_network
        return len(self.__nodes)

    def get_graph_edges(self):
        # number of edges collected by generate_network
        return len(self.__network)

    def get_graph(self):
        return self.__graph

    def get_node_sizes(self):
        return self.__node_sizes

    def __get_ffratio(self, screen_name):
        """Compute the ff-ratio of a user from any tweet that embeds their
        profile; None when no such tweet exists."""
        query = {
            '$or': [
                {'tweet_obj.user.screen_name': screen_name},
                {'tweet_obj.retweeted_status.user.screen_name': screen_name},
                {'tweet_obj.quoted_status.user.screen_name': screen_name}
            ]
        }
        tweet_obj = self.__dbm_tweets.find_record(query)
        if tweet_obj:
            tweet = tweet_obj['tweet_obj']
            if 'retweeted_status' in tweet.keys():
                return self.__computer_ff_ratio(tweet['retweeted_status']['user']['friends_count'],
                                                tweet['retweeted_status']['user']['followers_count'])
            elif 'quoted_status' in tweet.keys():
                return self.__computer_ff_ratio(tweet['quoted_status']['user']['friends_count'],
                                                tweet['quoted_status']['user']['followers_count'])
            else:
                return self.__computer_ff_ratio(tweet['user']['friends_count'],
                                                tweet['user']['followers_count'])
        else:
            return None

    def save_network_in_gexf_format(self, file_name):
        """Write the collected nodes and edges to sna/gefx/<file_name>.gexf
        and return the path."""
        today = datetime.strftime(datetime.now(), '%m/%d/%y')
        f_name = pathlib.Path(__file__).parents[2].joinpath('sna', 'gefx', file_name + '.gexf')
        with open(str(f_name), 'w', encoding='utf-8') as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write('<gexf xmlns="http://www.gexf.net/1.2draft" xmlns:viz="http://www.gexf.net/1.1draft/viz" '
                    'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                    'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd" '
                    'version="1.2">\n')
            f.write('<meta lastmodifieddate="{0}">\n'.format(today))
            f.write('<creator>PoliticBots</creator>\n')
            f.write('<description>{0}</description>\n'.format(file_name))
            f.write('</meta>\n')
            f.write('<graph mode="static" defaultedgetype="directed">\n')
            # add data attributes
            f.write('<attributes class="node">\n')
            f.write('<attribute id="0" title="party" type="string"/>\n')
            f.write('<attribute id="1" title="movement" type="string"/>\n')
            f.write('<attribute id="2" title="ff_ratio" type="float"/>\n')
            f.write('<attribute id="3" title="pbb" type="float"/>\n')
            f.write('</attributes>\n')
            # add nodes
            f.write('<nodes>\n')
            node_id = 0
            list_nodes = []
            for node_tup in self.__nodes:
                node = dict(node_tup)
                f.write('<node id="{0}" label="{1}">\n'.format(node_id, node['screen_name']))
                f.write('<attvalues>\n')
                f.write('<attvalue for="0" value="{0}"/>\n'.format(node['party']))
                f.write('<attvalue for="1" value="{0}"/>\n'.format(node['movement']))
                f.write('<attvalue for="2" value="{0}"/>\n'.format(node['ff_ratio']))
                f.write('<attvalue for="3" value="{0}"/>\n'.format(node['pbb']))
                f.write('</attvalues>\n')
                #f.write('<viz:size value="{0}"/>\n'.format(node['ff_ratio']))
                f.write('</node>\n')
                node_id += 1
                list_nodes.append(node['screen_name'])
            f.write('</nodes>\n')
            # add edges
            f.write('<edges>\n')
            edge_id = 0
            for edge in list(self.__network):
                id_vertexA = list_nodes.index(edge['nodeA']['screen_name'])
                id_vertexB = list_nodes.index(edge['nodeB']['screen_name'])
                weight = edge['weight']
                f.write('<edge id="{0}" source="{1}" target="{2}" weight="{3}"/>\n'.format(edge_id, id_vertexA,
                                                                                           id_vertexB, weight))
                edge_id += 1
            f.write('</edges>\n')
            f.write('</graph>\n')
            f.write('</gexf>\n')
        return f_name
class HashtagDiscoverer:
    """Finds hashtags in stored tweets that are new (not in the configured
    campaign hashtag list) and computes co-occurrences of known hashtags."""

    user_handlers, hashtags = [], []
    __dbm = None

    def __init__(self):
        self.user_handlers, self.hashtags = get_user_handlers_and_hashtags()
        self.__dbm = DBManager('tweets')

    def discover_hashtags_by_text(self, tweet_text):
        """Return the set of hashtags in `tweet_text` not in the known list."""
        new_hashtags = set()
        for token in tweet_text.split():
            # tokens with the ellipsis char come from truncated text -> skip
            if u'\u2026' in token:
                continue
            if '#' in token:
                if token.lower() not in self.hashtags:
                    new_hashtags.add(token)
        return new_hashtags

    def discover_hashtags_by_entities(self, tweet_hashtags):
        """Return unknown hashtags from a tweet's 'hashtags' entities list."""
        new_hashtags = set()
        for tweet_hashtag in tweet_hashtags:
            tweet_hashtag_txt = '#' + tweet_hashtag['text'].lower()
            if u'\u2026' in tweet_hashtag_txt:
                continue
            if tweet_hashtag_txt not in self.hashtags:
                new_hashtags.add('#' + tweet_hashtag['text'])
        return new_hashtags

    def discover_new_hashtags(self, query=None, sorted_results=True):
        """Count unknown hashtags over all tweets matching `query`.

        :returns: list of (hashtag, count) sorted by count desc, or the
                  raw defaultdict when sorted_results is False
        """
        # avoid the mutable-default pitfall; {} remains the effective default
        query = {} if query is None else query
        tweet_regs = self.__dbm.search(query)
        new_hashtags = defaultdict(int)
        for tweet_reg in tweet_regs:
            tweet = tweet_reg['tweet_obj']
            # for retweets analyze the original (retweeted) tweet
            if 'retweeted_status' in tweet.keys():
                original_tweet = tweet['retweeted_status']
            else:
                original_tweet = tweet
            if 'entities' in original_tweet.keys():
                t_hashtags = original_tweet['entities']['hashtags']
                discovered_hashtags = self.discover_hashtags_by_entities(t_hashtags)
            else:
                # BUGFIX: read the text from original_tweet (the dict whose
                # keys were checked), not from the wrapping tweet
                if 'full_text' in original_tweet.keys():
                    tweet_text = original_tweet['full_text']
                else:
                    tweet_text = original_tweet['text']
                discovered_hashtags = self.discover_hashtags_by_text(tweet_text)
            for discovered_hashtag in discovered_hashtags:
                new_hashtags[discovered_hashtag] += 1
        if sorted_results:
            return [(k, new_hashtags[k]) for k in sorted(new_hashtags, key=new_hashtags.get, reverse=True)]
        else:
            return new_hashtags

    def discover_coccurrence_hashtags_by_text(self, tweet_text):
        """Return known hashtags in `tweet_text` joined by spaces, or None."""
        known_hashtags = set()
        for token in tweet_text.split():
            if u'\u2026' in token:
                continue
            if '#' in token:
                if token.lower() in self.hashtags:
                    known_hashtags.add(token)
        if len(known_hashtags) > 0:
            return ' '.join(known_hashtags)
        else:
            return None

    def discover_coccurrence_hashtags_by_entities(self, tweet_hashtags):
        """Return known hashtags from entities joined by spaces, or None."""
        known_hashtags = set()
        for tweet_hashtag in tweet_hashtags:
            tweet_hashtag_txt = '#' + tweet_hashtag['text'].lower()
            if u'\u2026' in tweet_hashtag_txt:
                continue
            if tweet_hashtag_txt in self.hashtags:
                known_hashtags.add('#' + tweet_hashtag['text'])
        if len(known_hashtags) > 0:
            return ' '.join(known_hashtags)
        else:
            return None

    def coccurence_hashtags(self, query=None, sorted_results=True):
        """Count co-occurring sets of known hashtags over matching tweets."""
        query = {} if query is None else query
        coccurence_hashtags_dict = defaultdict(int)
        tweet_regs = self.__dbm.search(query)
        for tweet_reg in tweet_regs:
            tweet = tweet_reg['tweet_obj']
            if 'retweeted_status' in tweet.keys():
                original_tweet = tweet['retweeted_status']
            else:
                original_tweet = tweet
            if 'entities' in original_tweet.keys():
                t_hashtags = original_tweet['entities']['hashtags']
                coccurrence_hashtag_str = self.discover_coccurrence_hashtags_by_entities(t_hashtags)
            else:
                # BUGFIX: read the text from original_tweet, not the wrapper
                if 'full_text' in original_tweet.keys():
                    tweet_text = original_tweet['full_text']
                else:
                    tweet_text = original_tweet['text']
                coccurrence_hashtag_str = self.discover_coccurrence_hashtags_by_text(tweet_text)
            if coccurrence_hashtag_str:
                coccurence_hashtags_dict[coccurrence_hashtag_str] += 1
        if sorted_results:
            return [(k, coccurence_hashtags_dict[k])
                    for k in sorted(coccurence_hashtags_dict, key=coccurence_hashtags_dict.get, reverse=True)]
        else:
            return coccurence_hashtags_dict
class TweetEvaluator:
    """Decides whether stored tweets are relevant to the monitored campaign
    (mention a candidate handle or contain enough campaign hashtags) and
    fixes related bookkeeping fields in the database."""

    special_chars = r'[=\+/&<>;:\'\"\?%$!¡\,\. \t\r\n]+'
    hashtags, user_handlers = [], []
    __dbm = None
    BATCH_SIZE = 1000

    def __init__(self):
        self.user_handlers, self.hashtags = get_user_handlers_and_hashtags()
        self.__dbm = DBManager('tweets')

    def __is_relevant(self, users_counter, hashtags_counter):
        # a tweet is considered relevant if fulfills one of two
        # conditions; candidates are mentioned or if candidates are
        # are not mentioned but there are at least more than one
        # campaign hashtag
        if users_counter > 0 or hashtags_counter > 1:
            return True
        else:
            return False

    def __assess_tweet_by_text(self, tweet_text):
        """Count known handles/hashtags in raw text and judge relevance."""
        tweet_text = re.sub(u'\u2026', '', tweet_text)  # remove ellipsis unicode char
        users_counter, hashtags_counter = 0, 0
        for token in tweet_text.split():
            token = re.sub(self.special_chars, '', token)  # remove special chars
            if token.lower() in self.user_handlers:
                users_counter += 1
            if token.lower() in self.hashtags:
                hashtags_counter += 1
        return self.__is_relevant(users_counter, hashtags_counter)

    def __assess_tweet_by_entities(self, tweet_hashtags, tweet_mentions):
        """Count known handles/hashtags in tweet entities and judge relevance."""
        users_counter, hashtags_counter = 0, 0
        for tweet_hashtag in tweet_hashtags:
            tweet_hashtag_txt = '#' + tweet_hashtag['text'].lower()
            if tweet_hashtag_txt in self.hashtags:
                hashtags_counter += 1
        for tweet_mention in tweet_mentions:
            screen_name = '@' + tweet_mention['screen_name'].lower()
            if screen_name in self.user_handlers:
                users_counter += 1
        return self.__is_relevant(users_counter, hashtags_counter)

    def is_tweet_relevant(self, tweet):
        """Return True when the tweet (or the tweet it retweets) is relevant."""
        tweet_author = tweet['user']['screen_name']
        tweet_handler = '@{0}'.format(tweet_author.lower())
        # tweets authored by a monitored handle are always relevant
        if tweet_handler in self.user_handlers:
            return True
        else:
            if 'retweeted_status' in tweet.keys():
                original_tweet = tweet['retweeted_status']
            else:
                original_tweet = tweet
            if 'entities' in original_tweet.keys():
                t_user_mentions = original_tweet['entities']['user_mentions']
                t_hashtags = original_tweet['entities']['hashtags']
                return self.__assess_tweet_by_entities(t_hashtags, t_user_mentions)
            else:
                # BUGFIX: the text was read from the wrapping tweet although
                # the 'full_text' key was checked on original_tweet
                if 'full_text' in original_tweet.keys():
                    return self.__assess_tweet_by_text(original_tweet['full_text'])
                else:
                    return self.__assess_tweet_by_text(original_tweet['text'])

    def __mark_relevance_rt(self, tweet_reg):
        """Propagate the relevance flag of an original tweet to its RTs."""
        logging.info('Marking RTS...')
        query = {
            'tweet_obj.retweeted_status': {'$exists': 1},
            'tweet_obj.retweeted_status.id_str': {'$eq': tweet_reg['tweet_obj']['id_str']},
            'relevante': {'$ne': tweet_reg['relevante']}
        }
        update = {'$set': {'relevante': tweet_reg['relevante']}}
        update_res = self.__dbm.update_record_many(query, update)
        logging.info('Marked {0} RTS...'.format(update_res.matched_count))

    def identify_relevant_tweets(self):
        """Flag every unevaluated original tweet as relevant/irrelevant,
        processing in batches, and propagate the flag to retweets."""
        # select only original tweets that are not marked as relevant
        query = {
            'relevante': {'$exists': 0},
            'tweet_obj.retweeted_status': {'$exists': 0}
        }
        logging.info('Relevant Tweets: Running query to count...')
        # processing by batch as workaround cursor not found error
        total_tweets = self.__dbm.search(query, only_relevant_tws=False).count()
        total_batches = ceil(total_tweets / self.BATCH_SIZE)
        batch = 1
        moreToProcess = batch <= total_batches
        while moreToProcess:
            logging.info('Querying records in batches of {0} records...'.format(self.BATCH_SIZE))
            # no skip() needed: processed tweets gain 'relevante' and drop
            # out of the query on the next iteration
            search_res = self.__dbm.search(query, only_relevant_tws=False).limit(self.BATCH_SIZE)
            logging.info('Loading batch {0}/{1} into memory...'.format(batch, total_batches))
            tweets = [doc for doc in search_res]
            total_tweets_batch = self.BATCH_SIZE
            if batch == total_batches:
                total_tweets_batch = len(tweets)
            logging.info('Identifying relevant tweets in batch {0}/{1} out of {2} tweets...'
                         .format(batch, total_batches, total_tweets_batch))
            tweet_counter = 0
            try:
                for tweet_reg in tweets:
                    tweet_counter += 1
                    tweet = tweet_reg['tweet_obj']
                    if self.is_tweet_relevant(tweet):
                        tweet_reg['relevante'] = 1
                        logging.info('Identifying {0}/{1} tweets (relevant)'.format(tweet_counter, total_tweets))
                    else:
                        tweet_reg['relevante'] = 0
                        logging.info('Identifying {0}/{1} tweets (irrelevant)'.format(tweet_counter, total_tweets))
                    self.__dbm.update_record({'tweet_obj.id_str': tweet['id_str']}, tweet_reg)
                    # copy the relevance flag to rts
                    self.__mark_relevance_rt(tweet_reg)
                logging.info('Finished identifying relevant tweets in batch {0}/{1} out of {2} tweets...'
                             .format(batch, total_batches, total_tweets_batch))
                batch += 1
                moreToProcess = batch <= total_batches
            except Exception as e:
                # NOTE(review): on error the same batch is retried (batch is
                # not incremented) -- a persistent failure would loop forever
                logging.info("Exception occurred...")
                logging.info("Exception message {0}".format(e))
        logging.info('Finished identifying relevant tweets...')
        return True

    # set to 'user' the type of tweets which keyword contains @
    def fix_tweet_type(self):
        query = {'type': 'hashtag', 'keyword': {'$regex': '@'}}
        objs = self.__dbm.search(query)
        num_fixed_tweets = objs.count()
        for obj in objs:
            obj['type'] = 'user'
            self.__dbm.save_record(obj)
        return num_fixed_tweets

    def __get_hashtags(self, hashtags_list):
        # extract the bare hashtag texts from a 'hashtags' entities list
        hts = []
        for ht in hashtags_list:
            hts.append(ht['text'])
        return hts

    def __get_screen_names(self, screen_names_list):
        # extract '@name' strings from a 'user_mentions' entities list
        scs = []
        for sc in screen_names_list:
            scs.append('@' + sc['screen_name'])
        return scs

    # fix value of candidatura if hashtags related to a candidacy
    # are present in the text of the tweet
    def fix_value_of_candidatura(self):
        script_parent_dir = pathlib.Path(__file__).parents[1]
        config_fn = script_parent_dir.joinpath('config.json')
        configuration = get_config(config_fn)
        keyword, k_metadata = parse_metadata(configuration['metadata'])
        interested_data = []
        # keep metadata that refer to candidacies
        for kword, kmetada in zip(keyword, k_metadata):
            if kmetada['candidatura'] != '':
                kmetada.update({'keyword': kword})
                interested_data.append(kmetada)
        query = {'candidatura': ''}  # select tweets without candidacy
        s_objs = self.__dbm.search(query)
        num_fixed_tweets = 0
        # iterate over tweets without candidacy and fix those
        # whose text mention a candidate or have hashtags
        # related to a candidacy
        for s_obj in s_objs:
            party = s_obj['partido_politico']
            movement = s_obj['movimiento']
            tweet = s_obj['tweet_obj']
            relevant_data = []
            candidacy = ''
            # keep metadata related to the political party
            # (and movement) of the tweet (s_obj)
            for ida in interested_data:
                if ida['partido_politico'] == party:
                    if movement != '':
                        if ida['movimiento'] == movement:
                            relevant_data.append(ida)
                    else:
                        relevant_data.append(ida)
            if len(relevant_data) > 0:
                # extract relevant information of the tweet. hashtags and mentions if
                # the tweet obj has these entities otherwise the text of the tweet
                if 'retweeted_status' in tweet.keys():
                    original_tweet = tweet['retweeted_status']
                else:
                    original_tweet = tweet
                if 'entities' in original_tweet.keys():
                    t_user_mentions = self.__get_screen_names(original_tweet['entities']['user_mentions'])
                    t_hashtags = self.__get_hashtags(original_tweet['entities']['hashtags'])
                    # see if the interested keywords are part of the tweet hashtags or mentions
                    for rd in relevant_data:
                        if rd['keyword'] in t_user_mentions:
                            candidacy = rd['candidatura']
                            break
                        else:
                            if rd['keyword'] in t_hashtags:
                                candidacy = rd['candidatura']
                                break
                else:
                    # BUGFIX: read the text from original_tweet (the dict
                    # whose keys were checked), not from the wrapper
                    if 'full_text' in original_tweet.keys():
                        t_text = original_tweet['full_text']
                    else:
                        t_text = original_tweet['text']
                    # see if the interested keywords are present in the text
                    for rd in relevant_data:
                        if rd['keyword'] in t_text:
                            candidacy = rd['candidatura']
                            break
            # fix candidacy key
            if candidacy:
                s_obj['candidatura'] = candidacy
                num_fixed_tweets += 1
                self.__dbm.save_record(s_obj)
        return num_fixed_tweets
class UserPoliticalPreference:
    """Infers the political party/movement a user supports from the campaign
    hashtags found in the user's relevant tweets, and stores the results in
    the 'users' and 'tweets' collections."""

    db_tweets, db_users = None, None

    def __init__(self):
        self.db_tweets = DBManager('tweets')
        self.db_users = DBManager('users')
        self.hashtags, self.metadata = self.__get_hashtags_and_metadata()

    def __get_hashtags_and_metadata(self):
        """Load campaign hashtags (lower-cased) and their metadata from
        config.json."""
        script_parent_dir = pathlib.Path(__file__).parents[1]
        config_fn = script_parent_dir.joinpath('config.json')
        configuration = get_config(config_fn)
        keywords, metadata = parse_metadata(configuration['metadata'])
        hashtags = []
        for keyword in keywords:
            if '@' not in keyword:
                # The following hashtags are excluded because they are proper names of
                # movements and people
                if keyword not in ['HonorColorado', 'ColoradoAñetete', 'tuma']:
                    hashtags.append(keyword.lower())
        return hashtags, metadata

    def __get_tweet_hashtags(self, tweet_obj):
        # collect the hashtag texts of a tweet (empty when no entities)
        tweet_hashtags = []
        if 'entities' in tweet_obj.keys():
            for hashtag in tweet_obj['entities']['hashtags']:
                tweet_hashtags.append(hashtag['text'])
        return tweet_hashtags

    def __get_hashtag_metadata(self, hashtag):
        # case-insensitive lookup of a hashtag's metadata; None when unknown
        for metadata in self.metadata:
            if metadata['keyword'].lower() == hashtag.lower():
                return metadata

    def get_user_political_movement(self, user_screen_name):
        """Return the movement most represented in the user's relevant
        tweets, or None."""
        user_movement = None
        user_political_preference = defaultdict(int)
        filter = {
            'relevante': {'$eq': 1},
            'tweet_obj.user.screen_name': {'$eq': user_screen_name}
        }
        results = self.db_tweets.search(filter)
        for tweet in results:
            tweet_obj = tweet['tweet_obj']
            # for retweets, count the hashtags of the original tweet
            if 'retweeted_status' in tweet_obj.keys():
                tweet_hashtags = self.__get_tweet_hashtags(tweet_obj['retweeted_status'])
            else:
                tweet_hashtags = self.__get_tweet_hashtags(tweet_obj)
            for hashtag in tweet_hashtags:
                if hashtag.lower() in self.hashtags:
                    hashtag_metadata = self.__get_hashtag_metadata(hashtag)
                    if hashtag_metadata['movimiento']:
                        user_political_preference[hashtag_metadata['movimiento']] += 1
        if user_political_preference:
            s_user_political_preference = [
                k for k in sorted(user_political_preference.items(),
                                  key=lambda k_v: k_v[1], reverse=True)
            ]
            user_movement = s_user_political_preference[0][0]
        return user_movement

    def get_user_political_party(self, user_screen_name):
        """Return the party most represented in the user's relevant tweets,
        or None."""
        user_party = None
        user_political_preference = defaultdict(int)
        filter = {
            'relevante': {'$eq': 1},
            'tweet_obj.user.screen_name': {'$eq': user_screen_name}
        }
        results = self.db_tweets.search(filter)
        for tweet in results:
            tweet_obj = tweet['tweet_obj']
            if 'retweeted_status' in tweet_obj.keys():
                tweet_hashtags = self.__get_tweet_hashtags(tweet_obj['retweeted_status'])
            else:
                tweet_hashtags = self.__get_tweet_hashtags(tweet_obj)
            for hashtag in tweet_hashtags:
                if hashtag.lower() in self.hashtags:
                    hashtag_metadata = self.__get_hashtag_metadata(hashtag)
                    if hashtag_metadata['partido_politico']:
                        user_political_preference[hashtag_metadata['partido_politico']] += 1
        if user_political_preference:
            s_user_political_preference = [
                k for k in sorted(user_political_preference.items(),
                                  key=lambda k_v: k_v[1], reverse=True)
            ]
            user_party = s_user_political_preference[0][0]
        return user_party

    def update_users_political_preference(self, include_movement=True):
        """Recompute and store party (and optionally movement) for all users."""
        users = self.db_users.search({})
        total_users = users.count()
        users_counter = 0
        for user in users:
            users_counter += 1
            user_movement, user_party = None, None
            logging.info('Processing {0}/{1} users'.format(users_counter, total_users))
            if include_movement:
                user_movement = self.get_user_political_movement(user['screen_name'])
            user_party = self.get_user_political_party(user['screen_name'])
            logging.info('User {0} demonstrates to support {1}, {2}'.format(
                user['screen_name'], user_party, user_movement))
            self.db_users.update_record({'screen_name': user['screen_name']}, {
                'party': user_party,
                'movement': user_movement
            })

    def update_user_most_interacted_party_movement(self, include_movement=True):
        """Store, per user, the party/movement they interact with the most."""
        users = self.db_users.search({})
        total_users = users.count()
        users_counter = 0
        for user in users:
            users_counter += 1
            user_most_interacted_movement, user_most_interacted_party = None, None
            logging.info('Processing {0}/{1} users'.format(users_counter, total_users))
            if include_movement:
                user_interacted_movements = self.db_tweets.get_movement_user(user['screen_name'])
                if len(user_interacted_movements) > 0:
                    user_most_interacted_movement = user_interacted_movements[0]['movimiento']
            user_interacted_parties = self.db_tweets.get_party_user(user['screen_name'])
            if len(user_interacted_parties) > 0:
                user_most_interacted_party = user_interacted_parties[0]['partido']
            self.db_users.update_record(
                {'screen_name': user['screen_name']}, {
                    'most_interacted_party': user_most_interacted_party,
                    'most_interacted_movement': user_most_interacted_movement
                })

    def update_tweet_user_political_preference(self, include_movement=True):
        """Copy each author's party/movement onto their tweets, caching the
        lookup per author."""
        tweets = self.db_tweets.search({})
        tweet_authors = defaultdict(dict)
        total_tweets = tweets.count()
        tweet_counter = 0
        for tweet in tweets:
            tweet_counter += 1
            logging.info('Processing {0}/{1} tweets'.format(tweet_counter, total_tweets))
            tweet_obj = tweet['tweet_obj']
            new_fields = {}
            if tweet_obj['user']['screen_name'] not in tweet_authors.keys():
                user = self.db_users.search({'screen_name': tweet_obj['user']['screen_name']})
                try:
                    new_fields['author_party'] = user[0]['party']
                except IndexError:
                    # author not present in the users collection
                    new_fields['author_party'] = None
                if include_movement:
                    try:
                        new_fields.update({'author_movement': user[0]['movement']})
                    except IndexError:
                        new_fields.update({'author_movement': None})
                tweet_authors[tweet_obj['user']['screen_name']] = new_fields
            else:
                new_fields = tweet_authors[tweet_obj['user']['screen_name']]
            self.db_tweets.update_record({'tweet_obj.id_str': tweet_obj['id_str']}, new_fields)

    def update_tweet_user_pbb(self):
        """Copy each author's bot probability (pbb) onto their tweets,
        caching the lookup per author; -1 when no analysis exists."""
        tweets = self.db_tweets.search({})
        tweet_authors = defaultdict(dict)
        total_tweets = tweets.count()
        tweet_counter = 0
        for tweet in tweets:
            tweet_counter += 1
            logging.info('Processing {0}/{1} tweets'.format(tweet_counter, total_tweets))
            tweet_obj = tweet['tweet_obj']
            new_fields = {}
            if tweet_obj['user']['screen_name'] not in tweet_authors.keys():
                user = self.db_users.search({'screen_name': tweet_obj['user']['screen_name']})
                try:
                    new_fields['author_pbb'] = user[0]['bot_analysis']['pbb']
                except (IndexError, KeyError):
                    # BUGFIX: the fallback previously wrote the wrong key
                    # ('author_party'); also catch KeyError for users that
                    # exist but have no 'bot_analysis' document
                    new_fields['author_pbb'] = -1
                tweet_authors[tweet_obj['user']['screen_name']] = new_fields
            else:
                new_fields = tweet_authors[tweet_obj['user']['screen_name']]
            self.db_tweets.update_record({'tweet_obj.id_str': tweet_obj['id_str']}, new_fields)
class UserInteractions:
    """Compute the interactions (replies, quotes, retweets, mentions)
    received by a given user, optionally filtered by the interacting
    users' party and movement."""
    db_tweets, db_users = None, None

    def __init__(self):
        self.db_tweets = DBManager('tweets')
        self.db_users = DBManager('users')

    def __get_user_party(self, user_screen_name):
        """Return the user's party, or 'desconocido' (unknown) if the user
        or her party is missing."""
        user = self.db_users.search({'screen_name': user_screen_name})
        try:
            party = user[0]['party']
            return party if party else 'desconocido'
        except IndexError:
            return 'desconocido'

    def __get_user_movement(self, user_screen_name):
        """Return the user's movement, or 'desconocido' (unknown) if the
        user or her movement is missing."""
        user = self.db_users.search({'screen_name': user_screen_name})
        try:
            movement = user[0]['movement']
            if movement:
                return movement
            else:
                return 'desconocido'
        except IndexError:
            return 'desconocido'

    def __user_belong_party_movement(self, party, movement, tweet_author,
                                     tweet_authors):
        """Decide whether ``tweet_author`` belongs to the given party and/or
        movement, using (and updating) the ``tweet_authors`` cache.

        :return: tuple (belongs: bool, updated tweet_authors cache)
        """
        if not party and not movement:
            return True, tweet_authors
        # BUG FIX: .lower() used to be called unconditionally, raising
        # AttributeError when only one of party/movement was provided
        # (the other defaults to None in get_inter_received_user)
        if party:
            party = party.lower()
        if movement:
            movement = movement.lower()
        if tweet_authors[tweet_author]:
            tweet_author_party, tweet_author_movement = \
                tweet_authors[tweet_author]['party'], \
                tweet_authors[tweet_author]['movement']
        else:
            tweet_author_party, tweet_author_movement = 'desconocido', 'desconocido'
        if party and tweet_author_party == 'desconocido':
            logging.info(
                'Need to get the party of the user {0}'.format(tweet_author))
            tweet_author_party = self.__get_user_party(tweet_author)
            if tweet_authors[tweet_author]:
                tweet_authors[tweet_author]['party'] = tweet_author_party
            else:
                tweet_authors[tweet_author] = {
                    'party': tweet_author_party,
                    'movement': 'desconocido'
                }
        if movement and tweet_author_movement == 'desconocido':
            logging.info('Need to get the movement of the user {0}'.format(
                tweet_author))
            tweet_author_movement = self.__get_user_movement(tweet_author)
            if tweet_authors[tweet_author]:
                tweet_authors[tweet_author]['movement'] = tweet_author_movement
            else:
                tweet_authors[tweet_author] = {
                    'party': 'desconocido',
                    'movement': tweet_author_movement
                }
        # both filters given -> both must match; otherwise match whichever
        # filter was provided
        if party and movement:
            if party == tweet_author_party and movement == tweet_author_movement:
                return True, tweet_authors
            else:
                return False, tweet_authors
        else:
            if party:
                if party == tweet_author_party:
                    return True, tweet_authors
                else:
                    return False, tweet_authors
            else:
                if movement:
                    if movement == tweet_author_movement:
                        return True, tweet_authors
                    else:
                        return False, tweet_authors
                else:
                    return False, tweet_authors

    def __get_mentions_in_tweet(self, tweet_obj):
        """Return the screen names mentioned in the tweet's entities."""
        user_mentions = []
        if 'entities' in tweet_obj.keys():
            for mention in tweet_obj['entities']['user_mentions']:
                user_mentions.append(mention['screen_name'])
        return user_mentions

    def __process_tweet(self, tweet, type_tweet, interactions):
        """Increment the per-date/per-type interaction counter for the tweet,
        appending a new counter entry if none exists yet."""
        for interaction in interactions:
            if interaction['date'] == tweet['tweet_py_date'] and interaction[
                    'type'] == type_tweet:
                interaction['count'] += 1
                return interactions
        interactions.append({
            'date': tweet['tweet_py_date'],
            'type': type_tweet,
            'count': 1
        })
        return interactions

    def get_inter_received_user(self,
                                user_screen_name,
                                party=None,
                                movement=None,
                                exclude_tweet=None):
        """Count, per date and type, the interactions received by
        ``user_screen_name``.

        :param party: only count interactions from users of this party
        :param movement: only count interactions from users of this movement
        :param exclude_tweet: id of a tweet whose interactions are ignored
        :return: list of dicts {date, type, count}
        """
        tweets = self.db_tweets.search({})
        interactions_user = []
        tweet_authors = defaultdict(dict)
        total_tweets = tweets.count()
        tweet_counter = 0
        for tweet in tweets:
            tweet_counter += 1
            logging.info('Processing {0}/{1} tweets'.format(
                tweet_counter, total_tweets))
            tweet_obj = tweet['tweet_obj']
            # discard tweets posted by the given user
            if tweet_obj['user']['screen_name'] == user_screen_name:
                continue
            # discard tweet users that do not belong to the given party and movement
            belong_party_movement, tweet_authors = self.__user_belong_party_movement(
                party, movement, tweet_obj['user']['screen_name'],
                tweet_authors)
            if not belong_party_movement:
                continue
            # process replies to the given user
            if tweet_obj['in_reply_to_screen_name'] == user_screen_name:
                if not exclude_tweet or (exclude_tweet and tweet_obj[
                        'in_reply_to_status_id'] != exclude_tweet):
                    interactions_user = self.__process_tweet(
                        tweet, 'reply', interactions_user)
            # process quotes to the given user
            elif 'quoted_status' in tweet_obj.keys() and tweet_obj[
                    'quoted_status']['user']['screen_name'] == user_screen_name:
                if not exclude_tweet or (exclude_tweet and tweet_obj[
                        'quoted_status']['id_str'] != exclude_tweet):
                    interactions_user = self.__process_tweet(
                        tweet, 'quote', interactions_user)
            # process retweets to the given user's tweets
            elif 'retweeted_status' in tweet_obj.keys() and tweet_obj[
                    'retweeted_status']['user']['screen_name'] == user_screen_name:
                if not exclude_tweet or (exclude_tweet and tweet_obj[
                        'retweeted_status']['id_str'] != exclude_tweet):
                    interactions_user = self.__process_tweet(
                        tweet, 'retweet', interactions_user)
            # process mentions to the given user if the tweet is not a reply. we are
            # interested in original tweets that include the mention to the given user
            # not in replies that by default include the screen name of the given user
            else:
                tweet_mentions = self.__get_mentions_in_tweet(tweet_obj)
                if user_screen_name in tweet_mentions:
                    interactions_user = self.__process_tweet(
                        tweet, 'mention', interactions_user)
        return interactions_user
class SentimentAnalysis:
    """Compute and persist the sentiment ('sentimiento') of the relevant
    tweets stored in the database, propagating results to retweets."""
    config_file_name = pathlib.Path(__file__).parents[1].joinpath(
        'config.json')
    config = None
    language = ''
    method = ''
    __dbm = None

    def __init__(self, collection='tweets', language='spanish'):
        self.config = get_config(self.config_file_name)
        self.language = language
        self.__dbm = DBManager(collection)

    def __get_analyzed_tweet(self, analyzed_tweets, id_tweet_to_search):
        """Return the analyzed tweet with the given id, or None."""
        for analyzed_tweet in analyzed_tweets:
            if id_tweet_to_search == analyzed_tweet['id']:
                return analyzed_tweet
        return None

    def update_sentiment_of_non_original_tweets(self,
                                                query=None,
                                                update_sentiment=False):
        """Copy the sentiment of original tweets to their retweets; replies
        (and retweets whose original tweet is missing) are analyzed directly.

        :param query: extra query terms; BUG FIX: the default used to be a
            mutable ``{}`` that was mutated via ``query.update``, so state
            leaked between successive no-argument calls
        :param update_sentiment: re-analyze tweets that already have sentiment
        """
        if query is None:
            query = {}
        if update_sentiment:
            query.update({
                'relevante': 1,
            })
        else:
            query.update({'relevante': 1, 'sentimiento': {'$exists': 0}})
        tweet_regs = self.__dbm.search(query)
        rts_wo_tw = []
        for tweet_reg in tweet_regs:
            if 'retweeted_status' in tweet_reg['tweet_obj'].keys():
                id_original_tweet = tweet_reg['tweet_obj']['retweeted_status'][
                    'id_str']
                original_tweet_reg = self.__dbm.find_record(
                    {'tweet_obj.id_str': id_original_tweet})
                if original_tweet_reg:
                    sentiment_ot = original_tweet_reg['sentimiento']
                    if sentiment_ot:
                        self.__dbm.update_record(
                            {
                                'tweet_obj.id_str':
                                tweet_reg['tweet_obj']['id_str']
                            }, {'sentimiento': sentiment_ot})
                    else:
                        raise Exception(
                            'Error, found an original tweet without sentiment')
                else:
                    # the original tweet is not in the db, analyze the RT itself
                    rts_wo_tw.append(tweet_reg['tweet_obj'])
            elif tweet_reg['tweet_obj']['in_reply_to_status_id_str']:
                rts_wo_tw.append(tweet_reg['tweet_obj'])
                logging.info('Tweet not RT {0}'.format(
                    tweet_reg['tweet_obj']['id_str']))
        self.__analyze_sentiment_of_rt_wo_tws(rts_wo_tw)

    def __update_sentimient_rts(self, analyzed_tweets):
        """Propagate the sentiment of the analyzed tweets to their retweets."""
        for analyzed_tweet in analyzed_tweets:
            # search rts of the analyzed tweet
            rts = self.__dbm.search(
                {'tweet_obj.retweeted_status.id_str': analyzed_tweet['id']})
            for rt in rts:
                self.__dbm.update_record(
                    {'tweet_obj.id_str': rt['tweet_obj']['id_str']},
                    {'sentimiento': analyzed_tweet['sentimiento']})

    def __analyze_sentiment_of_rt_wo_tws(self, tweets):
        """Analyze, in batches of 5, tweets whose sentiment cannot be copied
        from an original tweet, and store the results."""
        tot_tws = len(tweets)
        batch_size = 5
        tweets_to_analyze = []
        for current_tw in range(tot_tws):
            tweet_id = tweets[current_tw]['id_str']
            # prefer the original tweet's text when the record is a RT
            if 'retweeted_status' in tweets[current_tw].keys():
                tweet = tweets[current_tw]['retweeted_status']
            else:
                tweet = tweets[current_tw]
            if 'full_text' in tweet.keys():
                tweet_text = tweet['full_text']
            else:
                tweet_text = tweet['text']
            if len(tweets_to_analyze) < batch_size and current_tw < tot_tws:
                tweets_to_analyze.append({'id': tweet_id, 'text': tweet_text})
            # keep filling the batch unless it is full or we ran out of tweets
            if len(tweets_to_analyze) < batch_size and current_tw < (
                    tot_tws - 1):
                continue
            sentiment_results = self.do_sentiment_analysis(tweets_to_analyze)
            tweets_to_analyze = []
            for sentiment_result in sentiment_results:
                sentiment_info = sentiment_result['sentimiento']
                tweet_id = sentiment_result['id']
                tweet_text = sentiment_result['text']
                self.__dbm.update_record({'tweet_obj.id_str': tweet_id},
                                         {'sentimiento': sentiment_info})
                logging.debug('Tweet text: {0}, Sentimiento: {1} ({2})'.format(
                    tweet_text.encode('utf-8'), sentiment_info['tono'],
                    sentiment_info['score']))

    def analyze_sentiments(self, query=None, update_sentiment=False):
        """Analyze the sentiment of relevant original (non-RT) tweets in
        batches of 100 and store the results, then propagate to retweets.

        :param query: dictionary of <key, value> terms to be used in querying
            the db; BUG FIX: the default used to be a mutable ``{}`` that was
            mutated via ``query.update``, so state leaked between calls
        :param update_sentiment: re-analyze tweets that already have sentiment
        :return: list of analyzed tweets
        """
        if query is None:
            query = {}
        if update_sentiment:
            query.update({
                'relevante': 1,
                'tweet_obj.retweeted_status': {
                    '$exists': 0
                }
            })
        else:
            query.update({
                'relevante': 1,
                'tweet_obj.retweeted_status': {
                    '$exists': 0
                },
                'sentimiento': {
                    '$exists': 0
                }
            })
        tweet_regs = self.__dbm.search(query)
        analyzed_tweets = []
        tot_reg = tweet_regs.count()
        logging.info(
            'Going to analyze the sentiment of {0} tweets, '
            'it can take a lot of time, be patient...'.format(tot_reg))
        batch_size = 100
        total_batches = ceil(tot_reg / batch_size)
        batch = 0
        tweets_to_analyze = []
        try:
            for current_reg in range(tot_reg):
                tweet_reg = tweet_regs[current_reg]
                tweet = tweet_reg['tweet_obj']
                if 'full_text' in tweet.keys():
                    tweet_text = tweet['full_text']
                else:
                    tweet_text = tweet['text']
                if len(tweets_to_analyze
                       ) < batch_size and current_reg < tot_reg:
                    tweets_to_analyze.append({
                        'id': tweet['id_str'],
                        'text': tweet_text
                    })
                if len(tweets_to_analyze) < batch_size:
                    continue
                batch += 1
                logging.info(
                    'Analyzing the sentiment of {0} tweets in batch {1}/{2} '
                    'out of {3} tweets...'.format(len(tweets_to_analyze),
                                                  batch, total_batches,
                                                  tot_reg))
                sentiment_results = self.do_sentiment_analysis(
                    tweets_to_analyze)
                logging.info(
                    'Finished analyzing the sentiment of {0} tweets in batch {1}/{2} '
                    'out of {3} tweets...'.format(len(tweets_to_analyze),
                                                  batch, total_batches,
                                                  tot_reg))
                logging.info('Updating sentiment scores in database...')
                tweets_to_analyze = []
                for sentiment_result in sentiment_results:
                    sentiment_info = sentiment_result['sentimiento']
                    tweet_id = sentiment_result['id']
                    tweet_text = sentiment_result['text']
                    self.__dbm.update_record({'tweet_obj.id_str': tweet_id},
                                             {'sentimiento': sentiment_info})
                    analyzed_tweets.append({
                        'id': tweet_id,
                        'texto': tweet_text,
                        'sentimiento': sentiment_info
                    })
                    logging.debug(
                        'Tweet text: {0}, Sentimiento: {1} ({2})'.format(
                            tweet_text.encode('utf-8'),
                            sentiment_info['tono'], sentiment_info['score']))
        except Exception as e:
            # best-effort: keep whatever was analyzed so far and still
            # propagate it to the retweets in the finally block
            logging.error(e)
        finally:
            self.__update_sentimient_rts(analyzed_tweets)
        return analyzed_tweets

    def do_sentiment_analysis(self, tweets):
        """Run the local SentimentAnalyzer on the given tweets.

        The tweet id is appended to the text after the '-$%#$&-' marker so it
        can be recovered from the analyzer's output in __process_results.
        """
        sa = SentimentAnalyzer(language='spanish')
        tweet_texts = []
        for tweet in tweets:
            tweet_texts.append(tweet['text'] +
                               ' -$%#$&- {0}'.format(tweet['id']))
        sa.analyze_docs(tweet_texts)
        results = sa.tagged_docs
        logging.info(
            'Finished the sentiment analysis, now {0} results are going to '
            'be processed...'.format(len(results)))
        ret = self.__process_results(results)
        logging.info('Computed correctly the sentiment of {0} tweets'.format(
            len(tweet_texts)))
        return ret

    def remote_sentiment_analysis(self, tweets):
        """Run the sentiment analysis through the remote in-house API,
        renewing the JWT token when it has expired and polling until the
        results are available."""
        accepted_codes = [200, 201, 202]
        error_codes = [400, 401]
        url_base = 'http://159.203.77.35:8080/api'
        url_sentiment = url_base + '/analysis/sentiment-analysis/'
        url_auth = url_base + '/auth/'
        headers = {'Authorization': 'JWT ' + self.config['inhouse']['api_key']}
        tweet_texts = []
        for tweet in tweets:
            tweet_texts.append(tweet['text'] +
                               ' -$%#$&- {0}'.format(tweet['id']))
        parameters = {
            'neu_inf_lim': -0.3,
            'neu_sup_lim': 0.3,
            'language': 'spanish'
        }
        data = {
            'name': (None, 'politic-bots'),
            'parameters': (None, json.dumps(parameters), 'application/json'),
            'data_object': (None, json.dumps(tweet_texts), 'application/json')
        }
        ret = []
        logging.info('Computing the sentiment of {0} tweets'.format(
            len(tweet_texts)))
        resp = requests.post(url_sentiment, headers=headers, files=data)
        if resp.status_code in error_codes:
            # have to renew the api token
            body_auth = {
                'username': self.config['inhouse']['username'],
                'password': self.config['inhouse']['password']
            }
            resp = requests.post(url_auth, data=body_auth)
            if resp.status_code in accepted_codes:
                resp_json = resp.json()
                api_token = resp_json['token']
                self.config['inhouse']['api_key'] = api_token
                update_config(self.config_file_name, self.config)
                resp = requests.post(url_sentiment,
                                     headers=headers,
                                     files=data)
            else:
                raise Exception(
                    'Error {0} when trying to renew the token of the api'.
                    format(resp.status_code))
        if resp.status_code in accepted_codes:
            resp_json = resp.json()
            get_url = url_sentiment + str(resp_json['id']) + '/'
            results = []
            # wait some time before trying to get the results
            time.sleep(60)
            while len(results) == 0:
                # wait some time before trying to get the results
                time.sleep(30)
                resp = requests.get(get_url, headers=headers)
                if resp.status_code in accepted_codes:
                    resp_json = resp.json()
                    results = json.loads(resp_json['result'])
                else:
                    raise Exception(
                        'Got an unexpected response, code: {0}'.format(
                            resp.status_code))
            logging.info(
                'Obtained the results of sentiment analysis, now the results are going to be processed...'
            )
            ret = self.__process_results(results)
        else:
            logging.error(
                'Error {0} when trying to compute the sentiment of the tweets'.
                format(resp.status_code))
        logging.info('Computed correctly the sentiment of {0} tweets'.format(
            len(tweet_texts)))
        return ret

    def __process_results(self, results):
        """Turn (text, tone, score) triples from the analyzer into dicts,
        splitting the tweet id back out of the text at the '-$%#$&-' marker."""
        ret = []
        for result in results:
            text, tone, score = result
            if tone == 'neg':
                sentiment = 'negative'
            elif tone == 'pos':
                sentiment = 'positive'
            else:
                sentiment = 'neutral'
            tw_text_id = text.split('-$%#$&-')
            id_tweet = tw_text_id[1].strip()
            text_tweet = tw_text_id[0].strip()
            dic_ret = {
                'id': id_tweet,
                'text': text_tweet,
                'sentimiento': {
                    'tono': sentiment,
                    'score': score
                }
            }
            ret.append(dic_ret)
        return ret
class BotDetector:
    """Estimate the probability that the users stored in the database are
    bots, by evaluating a set of weighted heuristics over their electoral
    tweets, their Twitter timeline and their profile."""
    __dbm_tweets = None
    __dbm_users = None
    __api = None

    def __init__(self):
        self.__dbm_tweets = DBManager('tweets')
        self.__dbm_users = DBManager('users')
        name_config_file = pathlib.Path(__file__).parents[1].joinpath('config.json')
        conf = get_config(name_config_file)
        auth = tweepy.AppAuthHandler(conf['twitter']['consumer_key'],
                                     conf['twitter']['consumer_secret'])
        self.__api = tweepy.API(auth, wait_on_rate_limit=True,
                                wait_on_rate_limit_notify=True)

    def __save_user_pbb(self, user_screen_name, pbb, bot_score, user_bot_features,
                        num_heuristics, sum_weights, exist_user):
        """Persist the bot analysis of a user in the users collection."""
        new_fields = {
            'exists': int(exist_user),
            'bot_analysis': {'features': user_bot_features,
                             'pbb': pbb,
                             'raw_score': bot_score,
                             'num_evaluated_heuristics': num_heuristics,
                             'sum_weights': sum_weights}
        }
        self.__dbm_users.update_record({'screen_name': user_screen_name}, new_fields)

    def __check_if_user_exists(self, user_screen_name):
        """Return whether the user still exists on Twitter, using the cached
        'exists' flag in the db when available."""
        user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        if 'exists' in user_obj.keys():
            return int(user_obj['exists'])
        else:
            try:
                self.__api.get_user(user_screen_name)
                return True
            except tweepy.TweepError:
                return False

    def __compute_bot_formula(self, user_bot_features, exists_user):
        """Compute the weighted sum of the heuristic values.

        :return: tuple (raw score, sum of weights, score normalized by the
            sum of weights, i.e. the probability of being bot)
        """
        name_weights_file = pathlib.Path(__file__).parents[0].joinpath('heuristic_weights.json')
        weights_file = get_config(name_weights_file)
        sum_heuristic_values = 0
        sum_weights = 0
        for feature_name in user_bot_features.keys():
            feature_weight = weights_file[feature_name]
            feature_value = user_bot_features[feature_name]['value']
            sum_heuristic_values += feature_weight * feature_value
            sum_weights += feature_weight
        # a non-existing account contributes to the bot score
        sum_heuristic_values += weights_file['exists'] * (1 - int(exists_user))
        sum_weights += weights_file['exists']
        return sum_heuristic_values, sum_weights, sum_heuristic_values / sum_weights

    def __get_timeline(self, user_screen_name, user_tweets):
        """
        Get the last 100 tweets in the timeline of a given user

        :param user_screen_name: user from whom her timeline should be
            obtained from
        :param user_tweets: the user's electoral tweets, used to store only
            the not-electoral part of the timeline
        :return: user's timeline
        """
        user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        if 'timeline' in user_obj.keys():
            return user_obj['timeline']
        logging.info('Get the last 100 tweets from Twitter')
        timeline = []
        try:
            for status in tweepy.Cursor(self.__api.user_timeline,
                                        screen_name=user_screen_name).items(100):
                timeline.append(status._json)
            # save the not electoral tweets of the user's timeline
            id_electoral_tweets = [tweet['id_str'] for tweet in user_tweets]
            timeline_tweets_to_save = [tweet for tweet in timeline
                                       if tweet['id_str'] not in id_electoral_tweets]
            logging.info('To save {0} not electoral tweets of {1}'.format(
                len(timeline_tweets_to_save), user_screen_name))
            new_field = {
                'timeline': timeline_tweets_to_save
            }
            self.__dbm_users.update_record({'screen_name': user_screen_name}, new_field)
        except tweepy.TweepError:
            # best-effort: return whatever part of the timeline was fetched
            pass
        return timeline

    def __get_tweets_user(self, user_screen_name):
        """Return the electoral tweets authored by the given user."""
        user_tweets_obj = self.__dbm_tweets.search(
            {'tweet_obj.user.screen_name': user_screen_name})
        user_tweets = [user_tweet_obj['tweet_obj'] for user_tweet_obj in user_tweets_obj]
        return user_tweets

    def __get_user_info_from_twitter(self, user_screen_name):
        """Fetch the user's profile from Twitter.

        :return: the user object as a dict, or None if the request failed
        """
        user_twitter_obj = None
        try:
            user_twitter_obj = self.__api.get_user(user_screen_name)
        except tweepy.TweepError:
            pass
        # BUG FIX: the unconditional `return user_twitter_obj._json` raised
        # AttributeError on None when the API call failed; the caller
        # (__compute_heuristics) explicitly handles a None return
        return user_twitter_obj._json if user_twitter_obj else None

    def __get_computed_heuristics(self, user_screen_name):
        """Return the already-computed heuristic features of the user, or {}."""
        user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        if 'bot_analysis' in user_obj.keys():
            return user_obj['bot_analysis']['features']
        else:
            return {}

    def __compute_heuristics(self, user_screen_name, recompute_heuristics=False):
        """Evaluate every heuristic for the user and store her probability of
        being a bot. Already-computed heuristics are reused unless
        ``recompute_heuristics`` is set."""
        logging.info('\n\nComputing the probability of being bot of the user: {0}\n\n'.format(user_screen_name))
        # Get tweets of the user
        user_tweets = self.__get_tweets_user(user_screen_name)
        # Check if the user still exists on Twitter
        exist_user = self.__check_if_user_exists(user_screen_name)
        user_timeline = None
        if exist_user:
            # If the user still exists on Twitter, get her timeline
            user_timeline = self.__get_timeline(user_screen_name, user_tweets)
        # Get the information about the user and her tweets
        user_obj = get_user(self.__dbm_tweets, user_screen_name)
        if not user_obj:
            user_obj = self.__get_user_info_from_twitter(user_screen_name)
        if not user_obj:
            raise Exception('Error!, Cannot fetch information about the user {0}'.format(user_screen_name))
        if user_obj['verified']:
            # It is a verified account, it cannot be bot
            logging.info('The user {0} is an account verified by Twitter, it cannot be a bot'.format(user_screen_name))
            self.__save_user_pbb(user_screen_name, 0, 0, None, 0, 0, exist_user)
            return
        # Get the computed heuristics
        user_bot_features = self.__get_computed_heuristics(user_screen_name)
        if user_bot_features:
            user_computed_heuristics = user_bot_features.keys()
        else:
            user_computed_heuristics = []
        if recompute_heuristics or 'retweet_electoral' not in user_computed_heuristics:
            if user_tweets:
                # Compute the percentage of retweets in the electoral tweets
                per_rt = is_retweet_bot(user_tweets)
                user_bot_features['retweet_electoral'] = {
                    'value': per_rt
                }
        if recompute_heuristics or 'reply_electoral' not in user_computed_heuristics:
            if user_tweets:
                # Compute the percentage of replies in the electoral tweets
                per_rp = reply_percentage(user_tweets)
                user_bot_features['reply_electoral'] = {
                    'value': per_rp
                }
        if recompute_heuristics or 'retweet_timeline' not in user_computed_heuristics:
            # Compute the percentage of retweets in the user's timeline
            if user_timeline:
                per_rt = is_retweet_bot(user_timeline)
                user_bot_features['retweet_timeline'] = {
                    'value': per_rt
                }
        if recompute_heuristics or 'reply_timeline' not in user_computed_heuristics:
            # Compute the percentage of replies in the user's timeline
            if user_timeline:
                per_rp = reply_percentage(user_timeline)
                user_bot_features['reply_timeline'] = {
                    'value': per_rp
                }
        if recompute_heuristics or 'creation_date' not in user_computed_heuristics:
            # Check the user's creation year; the electoral year is derived
            # from the extraction date (format assumed dd/mm/yy)
            extraction_date = self.__dbm_tweets.find_record({})['extraction_date']
            electoral_year = int('20' + extraction_date.split('/')[2])
            user_bot_features['creation_date'] = {
                'value': creation_date(parse_date(user_obj['created_at']), electoral_year)
            }
        if recompute_heuristics or 'default_profile' not in user_computed_heuristics:
            # Check if the user has default profile.
            user_bot_features['default_profile'] = {
                'value': default_profile(user_obj)
            }
        if recompute_heuristics or 'default_profile_picture' not in user_computed_heuristics:
            # Check if the user has default profile picture
            user_bot_features['default_profile_picture'] = {
                'value': default_profile_picture(user_obj)
            }
        if recompute_heuristics or 'default_background' not in user_computed_heuristics:
            # Check if the user has default background picture
            user_bot_features['default_background'] = {
                'value': default_background(user_obj)
            }
        if recompute_heuristics or 'empty_description' not in user_computed_heuristics:
            # Check if the user has a biography description
            user_bot_features['empty_description'] = {
                'value': default_description(user_obj)
            }
        if recompute_heuristics or 'location' not in user_computed_heuristics:
            # Check if the user has location
            user_bot_features['location'] = {
                'value': location(user_obj)
            }
        if recompute_heuristics or 'ff_ratio' not in user_computed_heuristics:
            # Check the user's following followers ratio
            ratio = followers_ratio(user_obj)
            user_bot_features['ff_ratio'] = {
                'value': ratio
            }
        if recompute_heuristics or 'random_letters' not in user_computed_heuristics:
            # Check if the account name looks randomly generated (letters)
            rl_value = random_account_letter(user_obj)
            user_bot_features['random_letters'] = {
                'value': rl_value
            }
        if recompute_heuristics or 'random_numbers' not in user_computed_heuristics:
            # Check if the account name looks randomly generated (numbers)
            rn_value = random_account_number(user_obj)
            user_bot_features['random_numbers'] = {
                'value': rn_value
            }
        if recompute_heuristics or 'similar_account' not in user_computed_heuristics:
            # Check if there is a similarly-named account (possible clone)
            similarity_score = similar_account_name(user_obj, self.__dbm_users, self.__dbm_tweets)
            user_bot_features['similar_account'] = {
                'value': similarity_score
            }
        # Compute the user's probability of being bot
        num_computed_heuristics = len(user_bot_features.keys())
        bot_score, sum_weights, pbb = self.__compute_bot_formula(user_bot_features, exist_user)
        self.__save_user_pbb(user_screen_name, pbb, bot_score, user_bot_features,
                             num_computed_heuristics, sum_weights, exist_user)
        logging.info('\n\nThe bot score of {0} is {1}\n\n'.format(user_screen_name, bot_score))
        return

    def compute_fake_promoter_heuristic(self, users):
        """Evaluate the fake_promoter heuristic (interaction with bot
        accounts) for the given users and update their bot score.

        :param users: cursor of user records; when falsy, all unverified
            users without the heuristic are processed
        """
        name_weights_file = pathlib.Path(__file__).parents[0].joinpath('heuristic_weights.json')
        weights_file = get_config(name_weights_file)
        if not users:
            users = self.__dbm_users.search({'bot_analysis.features.fake_promoter': {'$exists': 0},
                                             'verified': {'$ne': True}})
        # NOTE(review): unlike compute_bot_probability, a plain list argument
        # would break here (`list.count()` needs an argument) — confirm
        # callers always pass a db cursor
        tot_user = users.count()
        idx_user = 1
        for user in users:
            logging.info('Remaining users: {0}'.format(tot_user - idx_user))
            user_screen_name = user['screen_name']
            user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
            user_bot_features = user_obj['bot_analysis']['features']
            # Check if the user interacts with bot accounts
            fp = fake_promoter(user_screen_name, self.__dbm_users)
            logging.info('User: {0}, fake promoter score: {1}'.format(user_screen_name, fp))
            if not user_bot_features:
                user_bot_features = {}
            user_bot_features['fake_promoter'] = {
                'value': fp
            }
            # fold the new heuristic into the existing raw score and weights
            bot_score = user_obj['bot_analysis']['raw_score']
            bot_score += user_bot_features['fake_promoter']['value'] * weights_file['fake_promoter']
            heuristics = user_obj['bot_analysis']['num_evaluated_heuristics'] + 1
            sum_weights = user_obj['bot_analysis']['sum_weights'] + weights_file['fake_promoter']
            pbb = bot_score / sum_weights
            exist_user = user_obj['exists']
            self.__save_user_pbb(user_screen_name, pbb, bot_score, user_bot_features,
                                 heuristics, sum_weights, exist_user)
            idx_user += 1

    def compute_bot_probability(self, users, source_users_collection="", source_users_db=""):
        """Compute the probability of being bot for the given users.

        :param users: list of screen names or cursor of user records; when
            falsy, all users without a bot_analysis are processed
        :param source_users_collection: optional collection to reuse bot
            analyses already computed in another database
        :param source_users_db: database of the reuse collection
        """
        reusers_db = None
        if source_users_db and source_users_collection:
            reusers_db = DBManager(source_users_collection, source_users_db)
        if not users:
            # Get all users who don't have the analysis of bot in current user
            users = self.__dbm_users.search({'bot_analysis': {'$exists': 0}})
        tot_user = len(users) if type(users) == list else users.count()
        idx_user = 1
        for user in users:
            logging.info('Remaining users: {0}'.format(tot_user - idx_user))
            if reusers_db:
                reuser_cursor = reusers_db.search({'screen_name': user['screen_name']})
                if reuser_cursor.count() > 0:
                    logging.info('Reusing bot analysis from another DB for {0}'.format(user['screen_name']))
                    reuser = reuser_cursor[0]
                    bot_analysis = reuser['bot_analysis']
                    self.__save_user_pbb(reuser['screen_name'], bot_analysis['pbb'],
                                         bot_analysis['raw_score'], bot_analysis['features'],
                                         bot_analysis['num_evaluated_heuristics'],
                                         bot_analysis['sum_weights'], reuser['exists'])
                    continue
            if type(users) == list:
                user_screen_name = user
            else:
                user_screen_name = user['screen_name']
            self.__compute_heuristics(user_screen_name)
            idx_user += 1

    def to_csv(self, output_file_name, include_verified_accounts=True):
        """Export the bot analysis of the users to a csv file in reports/.

        :param output_file_name: name of the csv file to create
        :param include_verified_accounts: include users verified by Twitter
        """
        if not include_verified_accounts:
            query = {'bot_analysis': {'$exists': 1}, 'verified': {'$ne': True}}
        else:
            query = {'bot_analysis': {'$exists': 1}}
        users = self.__dbm_users.search(query)
        f_name = str(pathlib.Path(__file__).parents[2].joinpath('reports', output_file_name))
        logging.info('Saving bot analysis into the csv file {0}'.format(f_name))
        with open(f_name, 'w', encoding='utf-8') as f:
            user_info_fields = ['screen_name', 'profile_url', 'party', 'movement', 'exists',
                                'followers', 'friends', 'tweets', 'rts', 'rps', 'verified']
            bot_analysis_fields = ['location', 'default_profile_picture', 'retweet_electoral',
                                   'default_background', 'similar_account', 'random_numbers',
                                   'ff_ratio', 'random_letters', 'default_profile',
                                   'creation_date', 'empty_description', 'retweet_timeline',
                                   'reply_electoral', 'reply_timeline', 'fake_promoter',
                                   'raw_score', 'sum_weights', 'pbb']
            writer = csv.DictWriter(f, fieldnames=user_info_fields + bot_analysis_fields)
            writer.writeheader()
            tot_users = users.count()
            logging.info('Going to save the information of the bot analysis of {0} users'.format(tot_users))
            idx_user = 1
            for user in users:
                logging.info('Remaining users: {0}'.format(tot_users - idx_user))
                row_dict = {}
                # heuristic values live under features; aggregate scores
                # (raw_score, sum_weights, pbb) live at the bot_analysis level
                for field_name in bot_analysis_fields:
                    if field_name in user['bot_analysis']['features'].keys():
                        row_dict[field_name] = user['bot_analysis']['features'][field_name]['value']
                    elif field_name in user['bot_analysis'].keys():
                        row_dict[field_name] = user['bot_analysis'][field_name]
                for field_name in user_info_fields:
                    if field_name == 'profile_url':
                        continue
                    row_dict[field_name] = user[field_name]
                if user['exists']:
                    row_dict['profile_url'] = 'https://twitter.com/' + user['screen_name']
                else:
                    row_dict['profile_url'] = ' '
                writer.writerow(row_dict)
                idx_user += 1
        logging.info('The saving process has finished, please check the file {0}'.format(f_name))