Example #1
0
 def __init__(self):
     """
     Create the database managers and the Twitter API client.

     Twitter credentials are read from the ``config.json`` file that lives
     one directory above the directory containing this module. The API
     client uses application-only authentication and is configured to wait
     (and notify) when the rate limit is reached.
     """
     self.__dbm_tweets = DBManager('tweets')
     self.__dbm_users = DBManager('users')
     config_path = pathlib.Path(__file__).parents[1] / 'config.json'
     twitter_conf = get_config(config_path)['twitter']
     app_auth = tweepy.AppAuthHandler(twitter_conf['consumer_key'],
                                      twitter_conf['consumer_secret'])
     self.__api = tweepy.API(app_auth,
                             wait_on_rate_limit=True,
                             wait_on_rate_limit_notify=True)
Example #2
0
def save_original_tweets_file():
    """
    Export tweets that are not retweets and that have a sentiment analysis
    to the file ``tweet_sentiments.csv``.

    Records are selected from the ``tweets`` collection when they have no
    ``tweet_obj.retweeted_status`` field and do have a ``sentimiento`` field.
    Each row contains the tweet id, its text, and the tone and score of its
    sentiment. The text is passed through ``clean_emojis`` and stripped of
    carriage returns, line feeds and commas before being written.

    :return: None; the result is written to ``tweet_sentiments.csv`` in the
    current working directory
    """
    dbm = DBManager('tweets')
    query = {
        'tweet_obj.retweeted_status': {
            '$exists': 0
        },
        'sentimiento': {
            '$exists': 1
        },
    }
    s_objs = dbm.search(query)
    # newline='' hands line-ending control to the csv module; without it the
    # writer's '\r\n' terminator is translated again on platforms that use
    # '\r\n', producing an empty line after every row.
    with open('tweet_sentiments.csv', 'w', encoding='utf-8',
              newline='') as f_csv:
        fieldnames = ['id', 'text', 'tone', 'score']
        writer = csv.DictWriter(f_csv, fieldnames=fieldnames)
        writer.writeheader()
        for s_obj in s_objs:
            tweet = s_obj['tweet_obj']
            # Prefer 'full_text' when the tweet has it, otherwise use 'text'
            if 'full_text' in tweet:
                tweet_text = tweet['full_text']
            else:
                tweet_text = tweet['text']
            tweet_text = clean_emojis(tweet_text)
            # Flatten the text to a single comma-free line
            for unwanted in ('\r', '\n', ','):
                tweet_text = tweet_text.replace(unwanted, '')
            sentiment = s_obj['sentimiento']
            writer.writerow({
                'id': tweet['id_str'],
                'text': tweet_text,
                'tone': sentiment['tono'],
                'score': sentiment['score']
            })
Example #3
0
def __db_trustworthy_users(db_users, db_tweets, config):
    """
    Return the manager of the ``trustworthy_users`` collection, populating
    the collection first when it is empty.

    A user is trusted if she has a verified account or has more than
    ``config['min_num_followers']`` followers.

    :param db_users: database of users; every record returned by its
    ``find_all`` is evaluated
    :param db_tweets: database of tweets, handed to ``get_user`` together
    with the screen name to obtain the data of each user
    :param config: dictionary with the configuration parameters of the
    heuristic; the key ``min_num_followers`` is read

    :return: database of trustworthy users
    """
    trusted_db = DBManager('trustworthy_users')
    if trusted_db.num_records_collection() != 0:
        # The collection was already generated, reuse it as it is
        return trusted_db

    logging.info('The trustworthy_users collection is being created...')
    for user_doc in db_users.find_all():
        user_data = get_user(db_tweets, user_doc['screen_name'])
        is_trusted = (user_data['verified'] or
                      int(user_data['followers_count']) >
                      config['min_num_followers'])
        if not is_trusted:
            continue
        # NOTE(review): the existence check uses the screen name returned by
        # get_user while the saved record uses the one of the users
        # collection; confirm both always hold the same value.
        if trusted_db.find_record({'screen_name': user_data['screen_name']}):
            continue
        new_record = {
            'screen_name': user_doc['screen_name'],
            'name': user_data['name'],
            'created_at': user_data['created_at'],
            'followers_count': user_data['followers_count'],
            'verified': user_data['verified']
        }
        trusted_db.save_record(new_record)
    return trusted_db
Example #4
0
    def compute_bot_probability(self, users, source_users_collection="", source_users_db=""):
        """
        Compute the bot analysis of the given users, reusing analyses stored
        in another database when one is supplied.

        :param users: list of screen names to analyze. When empty, every
        record of the users collection that lacks a ``bot_analysis`` field
        is analyzed instead
        :param source_users_collection: name of the collection that holds
        already computed analyses; only used together with source_users_db
        :param source_users_db: name of the database that holds already
        computed analyses; only used together with source_users_collection

        :return: None; results are stored through ``__save_user_pbb`` (reused
        analyses) or ``__compute_heuristics`` (new analyses)
        """
        reusers_db = None
        if source_users_db and source_users_collection:
            reusers_db = DBManager(source_users_collection, source_users_db)

        if not users:
            # Get all users who don't have the analysis of bot in current user
            users = self.__dbm_users.search({'bot_analysis': {'$exists': 0}})

        # A list holds plain screen names; otherwise each item is a user
        # record with a 'screen_name' key
        users_are_names = isinstance(users, list)
        tot_user = len(users) if users_are_names else users.count()
        # enumerate keeps the progress counter advancing for reused users too
        for idx_user, user in enumerate(users, start=1):
            logging.info('Remaining users: {0}'.format(tot_user - idx_user))
            # Resolve the screen name before the reuse lookup so the lookup
            # also works when users is a list of screen names
            user_screen_name = user if users_are_names else user['screen_name']

            if reusers_db is not None:
                reuser_cursor = reusers_db.search({'screen_name': user_screen_name})

                if reuser_cursor.count() > 0:
                    logging.info('Reusing bot analysis from another DB for {0}'.format(user_screen_name))
                    reuser = reuser_cursor[0]
                    bot_analysis = reuser['bot_analysis']
                    self.__save_user_pbb(reuser['screen_name'], bot_analysis['pbb'], bot_analysis['raw_score'],
                                         bot_analysis['features'], bot_analysis['num_evaluated_heuristics'],
                                         bot_analysis['sum_weights'], reuser['exists'])
                    continue

            self.__compute_heuristics(user_screen_name)
Example #5
0
def create_week_view(name, source, start_date, end_date, config_file):
    """
    Create a view with the records whose ``created_at_date`` falls in the
    given range of dates.

    :param name: name of the view to create
    :param source: source of the view, passed as is to ``create_view``
    :param start_date: lower bound (inclusive) of ``created_at_date``; when
    empty, today's date formatted as ``%Y-%m-%d`` is used
    :param end_date: upper bound (inclusive) of ``created_at_date``; when
    empty, no upper bound is applied
    :param config_file: configuration file used to build the database manager
    """
    dbm = DBManager(config_fn=config_file)
    date_range = {
        '$gte': start_date or datetime.today().strftime('%Y-%m-%d')
    }
    if end_date:
        date_range['$lte'] = end_date
    pipeline = [{'$match': {'created_at_date': date_range}}]
    dbm.create_view(name, source, pipeline)
Example #6
0
def run_bot_detector(users, reusedb, fakepromoter):
    """
    Run the bot detector over the given users.

    :param users: users to analyze, passed as is to the bot detector
    :param reusedb: value passed to ``compute_bot_probability`` as the
    database from which previous analyses of the ``users`` collection can be
    reused
    :param fakepromoter: when truthy, the fake promoter heuristic is also
    computed for the users
    """
    detector = BotDetector()
    # The users collection is generated on demand the first time it is needed
    if DBManager('users').num_records_collection() == 0:
        NetworkAnalyzer().create_users_db()

    detector.compute_bot_probability(users, 'users', reusedb)

    if not fakepromoter:
        return
    detector.compute_fake_promoter_heuristic(users)
Example #7
0
def fix_tweets_with_empty_flags():
    """
    Recompute the flags of the relevant tweets whose ``flag.keyword`` list
    is empty.

    The metadata used to build the flags is read from the ``config.json``
    file located one directory above the directory containing this module.
    """
    dbm = DBManager('tweets')
    conf_file = pathlib.Path(__file__).parents[1] / 'config.json'
    _, k_metadata = parse_metadata(get_config(conf_file)['metadata'])
    empty_flags_query = {'flag.keyword': {'$size': 0}, 'relevante': 1}
    for record in dbm.search(empty_flags_query):
        tweet_obj = record['tweet_obj']
        id_tweet = tweet_obj['id_str']
        logging.info('Updating flags of tweet {0}'.format(id_tweet))
        # Start from a fresh flag and fill it with the entities of the tweet
        new_flag, _ = create_flag(k_metadata)
        new_flag = add_values_to_flags(new_flag,
                                       get_entities_tweet(tweet_obj),
                                       k_metadata)
        dbm.update_record({'tweet_obj.id_str': id_tweet}, new_flag)


#if __name__ == '__main__':
#    fix_tweets_with_empty_flags()
Example #8
0
def do_tweet_collection():
    """
    Search tweets for every configured keyword and then evaluate the
    relevance of the collected tweets.

    Credentials, the tweets query and the keyword metadata are read from the
    ``config.json`` file located in the directory of this module. Keywords
    that contain ``@`` are searched as users, the rest as hashtags.
    """
    conf_file = pathlib.Path(__file__).parent / 'config.json'
    configuration = get_config(conf_file)
    twitter_conf = configuration['twitter']
    credentials = {
        'key': twitter_conf['consumer_key'],
        'secret': twitter_conf['consumer_secret']
    }
    keyword, k_metadata = parse_metadata(configuration['metadata'])
    tm = TwitterAPIManager(credentials, DBManager('tweets'))
    # Keywords are walked in lockstep with the metadata rows, although only
    # the keyword itself is used on each step
    for current_keyword, _ in zip(keyword, k_metadata):
        logging.info('Searching tweets for %s' % current_keyword)
        keyword_type = 'user' if '@' in current_keyword else 'hashtag'
        tm.search_tweets(configuration['tweets_qry'], current_keyword,
                         keyword_type, k_metadata)
    logging.info('Evaluating the relevance of the new tweets...')
    TweetEvaluator().identify_relevant_tweets()
Example #9
0
def compute_tweets_local_date(force_computation=False, include_hour=False):
    """
    Store formatted publication date fields on the tweets.

    The date of each tweet is obtained with ``get_py_date`` and saved as
    ``tweet_py_datetime`` (``%m/%d/%y %H:%M:%S``) and ``tweet_py_date``
    (``%m/%d/%y``).

    :param force_computation: when true every tweet is processed; otherwise
    only the tweets without the ``tweet_py_datetime`` field
    :param include_hour: when true the hour (``%H``) is additionally saved
    as ``tweet_py_hour``
    """
    dbm = DBManager('tweets')
    query = {} if force_computation else {'tweet_py_datetime': {'$exists': 0}}
    for record in dbm.search(query, only_relevant_tws=False):
        tweet = record['tweet_obj']
        py_pub_dt = get_py_date(tweet)
        date_fields = {
            'tweet_py_datetime': datetime.strftime(py_pub_dt,
                                                   '%m/%d/%y %H:%M:%S'),
            'tweet_py_date': datetime.strftime(py_pub_dt, '%m/%d/%y')
        }
        if include_hour:
            date_fields['tweet_py_hour'] = datetime.strftime(py_pub_dt, '%H')
        dbm.update_record({'tweet_obj.id_str': tweet['id_str']}, date_fields)
Example #10
0
 def __init__(self):
     """
     Create the managers of the ``tweets``, ``users`` and ``networks``
     collections and start with an empty network.
     """
     self.__dbm_tweets = DBManager('tweets')
     self.__dbm_users = DBManager('users')
     self.__dbm_networks = DBManager('networks')
     # Starts empty; nothing in this constructor populates it
     self.__network = []
Example #11
0
def add_video_property(use_video_config_api=False, user_bearer=None):
    """
    Record on every plain tweet whether it appears to contain a video.

    Each record returned by ``get_plain_tweets`` is probed with one of two
    methods and the outcome is saved on the record under a key named after
    the method (``video_embed_url`` or ``video_config_api``) with the shape
    ``{'is_video': 0 or 1, 'is_video_response': <raw response as text>}``.

    Browser method (default): the ``https://twitter.com/i/videos/<id>`` page
    is loaded in Chrome through Selenium and, after waiting 10 seconds, the
    tweet is considered to have no video when a ``span`` with the exact text
    'The media could not be played.' is found.

    API method: ``get_video_config_with_user_bearer`` is called and any
    status other than 200 OK means the tweet has no video. When the response
    reports no remaining requests, the function sleeps until the reset time
    reported by the response.

    :param use_video_config_api: when true the API method is used instead of
    the browser method
    :param user_bearer: bearer passed to ``get_video_config_with_user_bearer``;
    only used by the API method
    """
    import http.client

    db = DBManager('tweets')
    plain_tweets = db.get_plain_tweets()
    tot_plain_tweets = len(plain_tweets)
    logging.info('Plain tweets {0}'.format(tot_plain_tweets))

    driver = None
    if not use_video_config_api:
        driver = webdriver.Chrome()

    try:
        for tweet_counter, plain_tweet in enumerate(plain_tweets, start=1):
            logging.info('Remaining tweets: {0}'.format(tot_plain_tweets -
                                                        tweet_counter))
            id_tweet = plain_tweet['tweet_obj']['id_str']
            found_message = False
            result_status = None
            result_headers = None

            if not use_video_config_api:
                method = "video_embed_url"
                driver.get('https://twitter.com/i/videos/' + id_tweet)
                # Fixed wait before reading the spans of the page
                time.sleep(10)
                span_texts = [
                    span.text
                    for span in driver.find_elements_by_tag_name('span')
                ]
                result_value = str(span_texts)
                found_message = 'The media could not be played.' in span_texts
            else:
                method = "video_config_api"
                response = get_video_config_with_user_bearer(
                    user_bearer, id_tweet)
                # A missing remaining-requests header is handled as an
                # exhausted limit
                remaining_header = response.headers['x-rate-limit-remaining']
                curr_rate_limit_remaining = (int(remaining_header)
                                             if remaining_header else 0)
                curr_time = calendar.timegm(time.gmtime())
                # The reset header is compared against the current epoch
                # time; when it is missing no extra waiting time is assumed
                reset_header = response.headers['x-rate-limit-reset']
                curr_rate_limit_expiration = (int(reset_header)
                                              if reset_header else curr_time)

                result_value = str(response.read())
                result_headers = str(response.headers)
                result_status = str(response.status)

                if response.status != http.client.OK:
                    found_message = True

                if curr_rate_limit_remaining == 0:
                    # time.sleep rejects negative values, which would happen
                    # if the reported reset time is already in the past
                    seconds_to_wait = max(
                        0, curr_rate_limit_expiration - curr_time + 1)
                    logging.info(
                        '\n\nProcessed {0} tweets Twitter API rate limit exceeded. Waiting for {1} seconds'
                        .format(tweet_counter, seconds_to_wait))
                    time.sleep(seconds_to_wait)

            if found_message:
                logging.info(
                    '\n\nThe tweet {0} DOES NOT have a video! Response STATUS = \n{1}, HEADERS = \n{2}, \nBODY = {3} \n'
                    .format(id_tweet, result_status, result_headers,
                            result_value))
            else:
                logging.info(
                    '\n\nThe tweet {0} HAS a video! Response STATUS = {1}, HEADERS = {2} \n'
                    .format(id_tweet, result_status, result_headers))
            update_object = {
                method: {
                    'is_video': 0 if found_message else 1,
                    'is_video_response': result_value
                }
            }
            db.update_record({'tweet_obj.id_str': id_tweet}, update_object)
    finally:
        # Close the browser even when processing a tweet fails
        if driver is not None:
            driver.quit()
Example #12
0
 def __init__(self):
     """
     Load the user handlers and hashtags and create the manager of the
     ``tweets`` collection.
     """
     handlers, hashtags = get_user_handlers_and_hashtags()
     self.user_handlers = handlers
     self.hashtags = hashtags
     self.__dbm = DBManager('tweets')
Example #13
0
def remove_view(name, config_file):
    """
    Drop the view (or collection) called ``name``.

    :param name: name passed to ``drop_collection``
    :param config_file: configuration file used to build the database manager
    """
    DBManager(config_fn=config_file).drop_collection(name)
Example #14
0
 def __init__(self):
     """
     Create the managers of the ``tweets`` and ``users`` collections and
     load the hashtags together with their metadata.
     """
     self.db_tweets = DBManager('tweets')
     self.db_users = DBManager('users')
     hashtags, metadata = self.__get_hashtags_and_metadata()
     self.hashtags = hashtags
     self.metadata = metadata
Example #15
0
 def __init__(self):
     """
     Create the managers of the ``tweets`` and ``users`` collections.
     """
     self.db_tweets = DBManager('tweets')
     self.db_users = DBManager('users')
Example #16
0
 def __init__(self, collection='tweets', language='spanish'):
     """
     Load the configuration and create the manager of the given collection.

     :param collection: name of the collection handled by the database
     manager
     :param language: language stored in ``self.language``
     """
     # config_file_name is not set here; it must already be available on the
     # instance or its class
     self.config = get_config(self.config_file_name)
     self.language = language
     self.__dbm = DBManager(collection)