def __init__(self):
    """
    Set up the database managers and the Twitter API client.

    The Twitter credentials are read from the 'twitter' section of the
    config.json file located one directory above this module's directory.
    """
    self.__dbm_tweets = DBManager('tweets')
    self.__dbm_users = DBManager('users')
    config_path = pathlib.Path(__file__).parents[1] / 'config.json'
    twitter_conf = get_config(config_path)['twitter']
    # Application-only authentication built from the consumer key/secret pair
    app_auth = tweepy.AppAuthHandler(
        twitter_conf['consumer_key'],
        twitter_conf['consumer_secret'],
    )
    self.__api = tweepy.API(
        app_auth,
        wait_on_rate_limit=True,
        wait_on_rate_limit_notify=True,
    )
def save_original_tweets_file():
    """
    Export tweets that carry a sentiment analysis to 'tweet_sentiments.csv'.

    Only documents of the 'tweets' collection that have a 'sentimiento' field
    and no 'tweet_obj.retweeted_status' field are exported. Each row holds
    the tweet id, its cleaned text, and the tone and score of the sentiment.
    """
    dbm = DBManager('tweets')
    query = {
        'tweet_obj.retweeted_status': {'$exists': 0},
        'sentimiento': {'$exists': 1},
    }
    s_objs = dbm.search(query)
    fieldnames = ['id', 'text', 'tone', 'score']
    # newline='' hands line-ending control to the csv module; without it the
    # writer's '\r\n' terminator is translated again on Windows, which
    # produces an empty line after every row.
    with open('tweet_sentiments.csv', 'w', encoding='utf-8',
              newline='') as f_csv:
        writer = csv.DictWriter(f_csv, fieldnames=fieldnames)
        writer.writeheader()
        for s_obj in s_objs:
            tweet = s_obj['tweet_obj']
            # Prefer 'full_text' when the tweet has it, else fall back to 'text'
            if 'full_text' in tweet:
                tweet_text = tweet['full_text']
            else:
                tweet_text = tweet['text']
            tweet_text = clean_emojis(tweet_text)
            # Strip line breaks and commas so the text stays on one plain cell
            for unwanted in ('\r', '\n', ','):
                tweet_text = tweet_text.replace(unwanted, '')
            sentiment = s_obj['sentimiento']
            writer.writerow({
                'id': tweet['id_str'],
                'text': tweet_text,
                'tone': sentiment['tono'],
                'score': sentiment['score'],
            })
def __db_trustworthy_users(db_users, db_tweets, config):
    """
    Build (if still empty) and return the collection of trustworthy users.

    A user is trusted when her account is verified or when her number of
    followers is greater than config['min_num_followers'].

    :param db_users: database of users; every record is inspected
    :param db_tweets: database of tweets, handed to get_user to fetch the
    data of each user
    :param config: dictionary with the configuration parameters of the
    heuristic
    :return: database of trustworthy users
    """
    trusted_db = DBManager('trustworthy_users')
    if trusted_db.num_records_collection() != 0:
        # The collection was populated before, nothing to do
        return trusted_db

    logging.info('The trustworthy_users collection is being created...')
    for user_doc in db_users.find_all():
        profile = get_user(db_tweets, user_doc['screen_name'])
        is_trusted = (
            profile['verified']
            or int(profile['followers_count']) > config['min_num_followers']
        )
        if not is_trusted:
            continue
        if trusted_db.find_record({'screen_name': profile['screen_name']}):
            # Already stored, avoid duplicates
            continue
        trusted_db.save_record({
            'screen_name': user_doc['screen_name'],
            'name': profile['name'],
            'created_at': profile['created_at'],
            'followers_count': profile['followers_count'],
            'verified': profile['verified']
        })
    return trusted_db
def compute_bot_probability(self, users, source_users_collection = "", source_users_db = ""):
    """
    Compute the bot analysis of the given users.

    When both source_users_collection and source_users_db are given, an
    analysis already stored there for a user is copied instead of being
    recomputed.

    :param users: list of screen names; when empty, every record of the users
    database without a 'bot_analysis' field is processed instead
    :param source_users_collection: name of the collection that may hold
    reusable analyses
    :param source_users_db: name of the database that may hold reusable
    analyses
    """
    reusers_db = None
    if source_users_db and source_users_collection:
        reusers_db = DBManager(source_users_collection, source_users_db)

    if not users:
        # Get all users who don't have the analysis of bot in current user
        users = self.__dbm_users.search({'bot_analysis': {'$exists': 0}})

    # A list holds plain screen names, a search result holds user records
    users_are_names = isinstance(users, list)
    tot_user = len(users) if users_are_names else users.count()

    # enumerate keeps the progress counter right even when a user is skipped
    # through the reuse shortcut below
    for idx_user, user in enumerate(users, start=1):
        logging.info('Remaining users: {0}'.format(tot_user-idx_user))
        # Resolve the screen name up front so the reuse lookup also works
        # when users is a list of names (indexing a str would raise)
        user_screen_name = user if users_are_names else user['screen_name']
        if reusers_db:
            reuser_cursor = reusers_db.search({'screen_name': user_screen_name})
            if reuser_cursor.count() > 0:
                logging.info('Reusing bot analysis from another DB for {0}'.format(user_screen_name))
                reuser = reuser_cursor[0]
                bot_analysis = reuser['bot_analysis']
                self.__save_user_pbb(reuser['screen_name'], bot_analysis['pbb'],
                                     bot_analysis['raw_score'],
                                     bot_analysis['features'],
                                     bot_analysis['num_evaluated_heuristics'],
                                     bot_analysis['sum_weights'],
                                     reuser['exists'])
                continue
        self.__compute_heuristics(user_screen_name)
def create_week_view(name, source, start_date, end_date, config_file):
    """
    Create a view restricted to a range of 'created_at_date' values.

    :param name: name of the view to create
    :param source: source from which the view is created
    :param start_date: lower bound (inclusive) of the range; when empty,
    today's date formatted as YYYY-MM-DD is used
    :param end_date: upper bound (inclusive) of the range; no upper bound is
    applied when empty
    :param config_file: configuration file handed to DBManager
    """
    db_manager = DBManager(config_fn=config_file)
    lower_bound = start_date or datetime.today().strftime('%Y-%m-%d')
    date_range = {'$gte': lower_bound}
    if end_date:
        date_range['$lte'] = end_date
    pipeline = [{'$match': {'created_at_date': date_range}}]
    db_manager.create_view(name, source, pipeline)
def run_bot_detector(users, reusedb, fakepromoter):
    """
    Run the bot detection over the given users.

    :param users: users to analyze, forwarded to the bot detector
    :param reusedb: name of the database from which previous analyses may be
    reused (the 'users' collection of that database is consulted)
    :param fakepromoter: when true, the fake promoter heuristic is computed
    as well
    """
    detector = BotDetector()
    # create database of user if it doesn't exist
    if DBManager('users').num_records_collection() == 0:
        NetworkAnalyzer().create_users_db()
    detector.compute_bot_probability(users, 'users', reusedb)
    if not fakepromoter:
        return
    detector.compute_fake_promoter_heuristic(users)
def fix_tweets_with_empty_flags():
    """
    Recompute the flags of the relevant tweets whose 'flag.keyword' is empty.

    The keyword metadata comes from the 'metadata' entry of the config.json
    file located one directory above this module's directory.
    """
    tweets_db = DBManager('tweets')
    config_path = pathlib.Path(__file__).parents[1] / 'config.json'
    settings = get_config(config_path)
    _, k_metadata = parse_metadata(settings['metadata'])
    empty_flag_filter = {'flag.keyword': {'$size': 0}, 'relevante': 1}
    for record in tweets_db.search(empty_flag_filter):
        tweet_obj = record['tweet_obj']
        tweet_id = tweet_obj['id_str']
        logging.info('Updating flags of tweet {0}'.format(tweet_id))
        # Start from a fresh flag and fill it with the entities of the tweet
        fresh_flag, _ = create_flag(k_metadata)
        tweet_entities = get_entities_tweet(tweet_obj)
        filled_flag = add_values_to_flags(fresh_flag, tweet_entities, k_metadata)
        tweets_db.update_record({'tweet_obj.id_str': tweet_id}, filled_flag)


#if __name__ == '__main__':
#    fix_tweets_with_empty_flags()
def do_tweet_collection():
    """
    Search tweets for every configured keyword, then evaluate their relevance.

    Credentials, keyword metadata and the search query are read from the
    config.json file that lives next to this module. Keywords containing '@'
    are searched as users, the rest as hashtags.
    """
    settings = get_config(pathlib.Path(__file__).parents[0] / 'config.json')
    twitter_settings = settings['twitter']
    credentials = {
        'key': twitter_settings['consumer_key'],
        'secret': twitter_settings['consumer_secret'],
    }
    keyword, k_metadata = parse_metadata(settings['metadata'])
    api_manager = TwitterAPIManager(credentials, DBManager('tweets'))
    # zip bounds the iteration to the shorter of keywords and metadata rows
    for current_keyword, _ in zip(keyword, k_metadata):
        logging.info('Searching tweets for %s' % current_keyword)
        search_kind = 'user' if '@' in current_keyword else 'hashtag'
        api_manager.search_tweets(settings['tweets_qry'], current_keyword,
                                  search_kind, k_metadata)
    logging.info('Evaluating the relevance of the new tweets...')
    TweetEvaluator().identify_relevant_tweets()
def compute_tweets_local_date(force_computation=False, include_hour=False):
    """
    Store formatted publication date fields on the tweet records.

    The date of each tweet is obtained through get_py_date and saved as
    'tweet_py_datetime' (mm/dd/yy HH:MM:SS) and 'tweet_py_date' (mm/dd/yy).

    :param force_computation: when true every tweet is processed; otherwise
    only tweets without a 'tweet_py_datetime' field
    :param include_hour: when true the hour is also saved as 'tweet_py_hour'
    """
    tweets_db = DBManager('tweets')
    pending_filter = {} if force_computation else {'tweet_py_datetime': {'$exists': 0}}
    for record in tweets_db.search(pending_filter, only_relevant_tws=False):
        tweet = record['tweet_obj']
        published = get_py_date(tweet)
        date_fields = {
            'tweet_py_datetime': datetime.strftime(published, '%m/%d/%y %H:%M:%S'),
            'tweet_py_date': datetime.strftime(published, '%m/%d/%y'),
        }
        if include_hour:
            date_fields['tweet_py_hour'] = datetime.strftime(published, '%H')
        tweets_db.update_record({'tweet_obj.id_str': tweet['id_str']}, date_fields)
def __init__(self) -> None:
    """
    Create the managers of the 'tweets', 'users' and 'networks' collections
    and start with an empty network.
    """
    # One manager per collection used by this object
    self.__dbm_tweets = DBManager('tweets')
    self.__dbm_users = DBManager('users')
    self.__dbm_networks = DBManager('networks')
    # Filled later; empty until then
    self.__network = list()
def add_video_property(use_video_config_api=False, user_bearer=None):
    """
    Record, for every plain tweet, whether it appears to have a video.

    Two detection methods are available. By default the page
    https://twitter.com/i/videos/<tweet id> is loaded in a Chrome WebDriver
    and the tweet is considered to have no video when a span with the text
    'The media could not be played.' is present. Alternatively the response
    of get_video_config_with_user_bearer is inspected and any status other
    than OK is taken as "no video".

    The result is saved on the tweet record under the name of the method
    used ('video_embed_url' or 'video_config_api') as a dictionary with the
    keys 'is_video' (0 or 1) and 'is_video_response' (raw value inspected).

    :param use_video_config_api: when true, use the video config API instead
    of the browser
    :param user_bearer: credential forwarded to
    get_video_config_with_user_bearer; only used with the video config API
    """
    # Local import: only the OK status constant is needed, in API mode
    import http.client

    db = DBManager('tweets')
    plain_tweets = db.get_plain_tweets()
    tot_plain_tweets = len(plain_tweets)
    logging.info('Plain tweets {0}'.format(tot_plain_tweets))

    driver = None
    if not use_video_config_api:
        driver = webdriver.Chrome()

    try:
        for tweet_counter, plain_tweet in enumerate(plain_tweets, start=1):
            logging.info('Remaining tweets: {0}'.format(tot_plain_tweets -
                                                        tweet_counter))
            id_tweet = plain_tweet['tweet_obj']['id_str']
            found_message = False
            result_status = None
            result_headers = None

            if not use_video_config_api:
                method = "video_embed_url"
                driver.get('https://twitter.com/i/videos/' + id_tweet)
                # Give the page time to render before reading its spans
                time.sleep(10)
                spans = driver.find_elements_by_tag_name('span')
                span_texts = [span.text for span in spans]
                result_value = str(span_texts)
                found_message = 'The media could not be played.' in span_texts
            else:
                method = "video_config_api"
                response = get_video_config_with_user_bearer(user_bearer,
                                                             id_tweet)
                remaining_header = response.headers['x-rate-limit-remaining']
                # A missing header is treated as "no requests left"
                curr_rate_limit_remaining = \
                    int(remaining_header) if remaining_header else 0
                curr_time = calendar.timegm(time.gmtime())
                reset_header = response.headers['x-rate-limit-reset']
                curr_rate_limit_expiration = \
                    int(reset_header) if reset_header else curr_time
                seconds_until_expiration = \
                    curr_rate_limit_expiration - curr_time
                result_value = str(response.read())
                result_headers = str(response.headers)
                result_status = str(response.status)
                if response.status != http.client.OK:
                    found_message = True
                if curr_rate_limit_remaining == 0:
                    # Never hand a negative value to time.sleep (it raises
                    # ValueError) if the reset time is already in the past
                    wait_seconds = max(seconds_until_expiration + 1, 0)
                    logging.info(
                        '\n\nProcessed {0} tweets Twitter API rate limit exceeded. Waiting for {1} seconds'
                        .format(tweet_counter, wait_seconds))
                    time.sleep(wait_seconds)

            if found_message:
                logging.info(
                    '\n\nThe tweet {0} DOES NOT have a video! Response STATUS = \n{1}, HEADERS = \n{2}, \nBODY = {3} \n'
                    .format(id_tweet, result_status, result_headers,
                            result_value))
            else:
                logging.info(
                    '\n\nThe tweet {0} HAS a video! Response STATUS = {1}, HEADERS = {2} \n'
                    .format(id_tweet, result_status, result_headers))
            update_object = {
                method: {
                    'is_video': 0 if found_message else 1,
                    'is_video_response': result_value
                }
            }
            db.update_record({'tweet_obj.id_str': id_tweet}, update_object)
    finally:
        # Close the browser even when processing fails half-way
        if driver is not None:
            driver.quit()
def __init__(self):
    """
    Load the user handlers and hashtags and create the manager of the
    'tweets' collection.
    """
    handlers, hashtags = get_user_handlers_and_hashtags()
    self.user_handlers = handlers
    self.hashtags = hashtags
    self.__dbm = DBManager('tweets')
def remove_view(name, config_file):
    """
    Drop the view (collection) called name.

    :param name: name of the view to remove
    :param config_file: configuration file handed to DBManager
    """
    DBManager(config_fn=config_file).drop_collection(name)
def __init__(self):
    """
    Create the managers of the 'tweets' and 'users' collections and load
    the hashtags together with their metadata.
    """
    self.db_tweets = DBManager('tweets')
    self.db_users = DBManager('users')
    hashtags, metadata = self.__get_hashtags_and_metadata()
    self.hashtags = hashtags
    self.metadata = metadata
def __init__(self) -> None:
    """Create the managers of the 'tweets' and 'users' collections."""
    # Manager of the collection of tweets
    self.db_tweets = DBManager('tweets')
    # Manager of the collection of users
    self.db_users = DBManager('users')
def __init__(self, collection='tweets', language='spanish'):
    """
    Load the configuration and create the manager of the given collection.

    :param collection: name of the collection to work with
    :param language: language stored on the instance for later use
    """
    # The configuration is read from the file named by self.config_file_name
    loaded_config = get_config(self.config_file_name)
    self.config = loaded_config
    self.language = language
    self.__dbm = DBManager(collection)