Exemple #1
0
 def __get_hashtags_and_metadata(self):
     """Load the keyword configuration and derive the hashtags from it.

     Reads ``config.json`` from the directory one level above the directory
     that contains this script and parses its ``metadata`` entry with
     ``parse_metadata``. A keyword is kept as a hashtag when it contains no
     ``@`` and is not one of the excluded proper names.

     Returns:
         A ``(hashtags, metadata)`` tuple: the lower-cased hashtags and the
         metadata exactly as returned by ``parse_metadata``.
     """
     # These hashtags are left out because they are proper names of
     # movements and people.
     proper_names = ['HonorColorado', 'ColoradoAñetete', 'tuma']
     config_path = pathlib.Path(__file__).parents[1].joinpath('config.json')
     settings = get_config(config_path)
     keywords, metadata = parse_metadata(settings['metadata'])
     hashtags = [
         kw.lower()
         for kw in keywords
         if '@' not in kw and kw not in proper_names
     ]
     return hashtags, metadata
Exemple #2
0
def fix_tweets_with_empty_flags():
    """Rebuild the flag of stored tweets whose ``flag.keyword`` is empty.

    Searches the ``tweets`` store for records matching
    ``{'flag.keyword': {'$size': 0}, 'relevante': 1}``. For each match it
    creates a fresh flag from the configured keyword metadata, fills it with
    the entities of the tweet, and updates the record identified by the
    tweet's ``id_str``.
    """
    db = DBManager('tweets')
    config_path = pathlib.Path(__file__).parents[1].joinpath('config.json')
    settings = get_config(config_path)
    # Only the metadata half of the parsed configuration is needed here.
    _, keyword_metadata = parse_metadata(settings['metadata'])
    empty_flag_query = {'flag.keyword': {'$size': 0}, 'relevante': 1}
    for record in db.search(empty_flag_query):
        tweet_obj = record['tweet_obj']
        logging.info('Updating flags of tweet {0}'.format(tweet_obj['id_str']))
        # create_flag returns a pair; its second element is not used here.
        new_flag, _ = create_flag(keyword_metadata)
        tweet_entities = get_entities_tweet(tweet_obj)
        new_flag = add_values_to_flags(new_flag, tweet_entities, keyword_metadata)
        db.update_record({'tweet_obj.id_str': tweet_obj['id_str']}, new_flag)


#if __name__ == '__main__':
#    fix_tweets_with_empty_flags()
Exemple #3
0
def do_tweet_collection():
    """Search tweets for every configured keyword, then evaluate relevance.

    Reads ``config.json`` from the directory that contains this script,
    builds the Twitter credentials from its ``twitter`` section and runs one
    search per keyword: a ``'user'`` search when the keyword contains ``@``,
    a ``'hashtag'`` search otherwise. Once all searches are done,
    ``TweetEvaluator.identify_relevant_tweets`` is invoked.
    """
    config_path = pathlib.Path(__file__).parents[0].joinpath('config.json')
    settings = get_config(config_path)
    credentials = dict(
        key=settings['twitter']['consumer_key'],
        secret=settings['twitter']['consumer_secret'],
    )
    keywords, keyword_metadata = parse_metadata(settings['metadata'])
    db = DBManager('tweets')
    api_manager = TwitterAPIManager(credentials, db)
    # Keywords are paired with the metadata rows, so iteration stops at the
    # shorter of the two; the row itself is not used, the full metadata is
    # what gets passed to each search.
    for term, _ in zip(keywords, keyword_metadata):
        logging.info('Searching tweets for %s' % term)
        search_kind = 'user' if '@' in term else 'hashtag'
        api_manager.search_tweets(
            settings['tweets_qry'], term, search_kind, keyword_metadata)
    logging.info('Evaluating the relevance of the new tweets...')
    evaluator = TweetEvaluator()
    evaluator.identify_relevant_tweets()
 def fix_value_of_candidatura(self):
     """Fill in the empty ``candidatura`` of stored records when possible.

     Loads the keyword metadata from ``config.json`` (one level above the
     directory that contains this script) and keeps the entries with a
     non-empty ``candidatura``. Every stored record whose ``candidatura``
     is empty is then compared against the entries that share its
     ``partido_politico`` (and its ``movimiento``, when the record has
     one). The first entry whose keyword appears in the tweet's mentions
     or hashtags, or in its text when the tweet has no ``entities``,
     supplies the candidacy, and the record is saved.

     For retweets the entities and the text are both taken from
     ``retweeted_status``.

     Returns:
         The number of records whose ``candidatura`` was filled in and
         saved.
     """
     script_parent_dir = pathlib.Path(__file__).parents[1]
     config_fn = script_parent_dir.joinpath('config.json')
     configuration = get_config(config_fn)
     keyword, k_metadata = parse_metadata(configuration['metadata'])
     # keep metadata that refer to candidacies, tagging each entry with
     # the keyword it belongs to
     interested_data = []
     for kword, kmetada in zip(keyword, k_metadata):
         if kmetada['candidatura'] != '':
             kmetada.update({'keyword': kword})
             interested_data.append(kmetada)
     num_fixed_tweets = 0
     # iterate over tweets without candidacy and fix those whose text
     # mentions a candidate or that have hashtags related to a candidacy
     for s_obj in self.__dbm.search({'candidatura': ''}):
         party = s_obj['partido_politico']
         movement = s_obj['movimiento']
         tweet = s_obj['tweet_obj']
         # keep metadata related to the political party (and movement,
         # when the record has one) of the tweet
         relevant_data = [
             ida for ida in interested_data
             if ida['partido_politico'] == party
             and (movement == '' or ida['movimiento'] == movement)
         ]
         if not relevant_data:
             continue
         # for retweets, inspect the original tweet instead of the wrapper
         if 'retweeted_status' in tweet:
             original_tweet = tweet['retweeted_status']
         else:
             original_tweet = tweet
         candidacy = ''
         if 'entities' in original_tweet:
             # presumably both helpers return collections of strings --
             # confirm against their definitions
             t_user_mentions = self.__get_screen_names(
                 original_tweet['entities']['user_mentions'])
             t_hashtags = self.__get_hashtags(
                 original_tweet['entities']['hashtags'])
             # see if a keyword is part of the tweet mentions or hashtags
             for rd in relevant_data:
                 if (rd['keyword'] in t_user_mentions
                         or rd['keyword'] in t_hashtags):
                     candidacy = rd['candidatura']
                     break
         else:
             # Read the text from the same object whose keys are tested;
             # reading it from the retweet wrapper could raise KeyError
             # or match against the wrapper's text instead.
             if 'full_text' in original_tweet:
                 t_text = original_tweet['full_text']
             else:
                 t_text = original_tweet['text']
             # see if a keyword is present in the text
             for rd in relevant_data:
                 if rd['keyword'] in t_text:
                     candidacy = rd['candidatura']
                     break
         # fix candidacy key
         if candidacy:
             s_obj['candidatura'] = candidacy
             num_fixed_tweets += 1
             self.__dbm.save_record(s_obj)
     return num_fixed_tweets