Exemple #1
0
 def read_followers_for_candidate(cls, candidate):
     """ Read the candidate's .csv file and load its followers into the database.

     Nothing is read when RawFollowerDAO reports the candidate as already loaded. Otherwise each
     data row is stored as a RawFollower that follows `candidate.screen_name`, taking the id from
     column 0 and the download date from column 1 (parsed with CSVUtils.DATE_FORMAT). Once every
     row has been stored the candidate is marked as loaded.

     The path is built from cls.FOLLOWERS_PATH_FORMAT and `candidate.nickname`. Errors raised by
     open() or by a malformed row are not handled here.
     """
     screen_name = candidate.screen_name
     if RawFollowerDAO().candidate_was_loaded(screen_name):
         cls.get_logger().info(
             f'Candidate {screen_name} followers .csv file has already been loaded.'
         )
         return
     cls.get_logger().info(f'Loading .csv file for {screen_name}.')
     # Generate file path and open file
     path = cls.FOLLOWERS_PATH_FORMAT % candidate.nickname
     # newline='' hands newline handling over to the csv module, as its documentation asks for
     with open(path, 'r', newline='') as fd:
         reader = csv.reader(fd, delimiter=',')
         # The first row is the title
         title = next(reader)
         for row in reader:
             # csv.reader yields an empty list for a blank line, and some files repeat the title
             # row; neither of them describes a follower.
             if not row or row == title:
                 continue
             follower = RawFollower(
                 **{
                     'id': row[0],
                     'downloaded_on': datetime.strptime(row[1], CSVUtils.DATE_FORMAT),
                     'follows': screen_name
                 })
             RawFollowerDAO().put(follower)
     # Mark this candidate as already loaded.
     RawFollowerDAO().finish_candidate(screen_name)
     cls.get_logger().info(
         f'Finished loading {screen_name} raw followers from .csv file.'
     )
Exemple #2
0
 def update_dashboard_data():
     """ Recalculate non-counting dashboard data and store it.

     Stores, through DashboardDAO, the total and active (`has_tweets`) user counts, the same pair
     of counts for the followers of every candidate returned by CandidateService, and the number
     of CooccurrenceGraphDAO documents whose `topic_id` is not 'main'.

     Every proportion is reported as 0.0 when its denominator is 0 (for instance an empty
     database, or a candidate without stored followers) instead of raising ZeroDivisionError.
     """

     def proportion(part, whole):
         """ Return part / whole, or 0.0 when whole is 0. """
         return part / whole if whole else 0.0

     # Get total count of users
     users = RawFollowerDAO().get_count({})
     # Get count of active users
     active_users = RawFollowerDAO().get_count({'has_tweets': True})
     # Get count of followers for each candidate
     screen_names = [candidate.screen_name for candidate in CandidateService().get_all()]
     followers_by_candidate = {}
     for screen_name in screen_names:
         followers = RawFollowerDAO().get_count({'follows': screen_name})
         active_followers = RawFollowerDAO().get_count({
             'follows': screen_name,
             'has_tweets': True
         })
         followers_by_candidate[screen_name] = {
             'followers': followers,
             'active_followers': active_followers,
             'proportion': proportion(active_followers, followers)
         }
     # Get count of found topics
     topics = CooccurrenceGraphDAO().get_count(
         {'topic_id': {
             '$ne': 'main'
         }})
     DashboardDAO().store({
         'users': users,
         'active_users': active_users,
         'active_proportion': proportion(active_users, users),
         'followers_by_candidate': followers_by_candidate,
         'topics': topics
     })
Exemple #3
0
 def test_retrieve_users_by_party(self):
     # Three public users with tweets and the same support vector; they only differ in id,
     # supported party and friends_count. Columns: (_id, support, friends_count).
     fixtures = (
         ('123', 'juntosporelcambio', 2500),
         ('456', 'frentedetodos', 2500),
         ('789', 'frentedetodos', 5001),
     )
     for user_id, party, friends_count in fixtures:
         RawFollowerDAO().insert({
             '_id': user_id,
             'is_private': False,
             'has_tweets': True,
             'probability_vector_support': [0.8],
             'support': party,
             'friends_count': friends_count
         })
     result = UserNetworkRetrievalService.retrieve_users_by_party()
     # One user is expected per party; '789' (friends_count 5001) must not be returned.
     assert len(result['juntosporelcambio']) == 1
     assert result['juntosporelcambio'][0] == '123'
     assert len(result['frentedetodos']) == 1
     assert result['frentedetodos'][0] == '456'
Exemple #4
0
 def fix_followers_update(cls):
     """ Rewrite `follows` keeping only its string entries, for followers downloaded after 2019-05-29. """
     cutoff = datetime.datetime(2019, 5, 29, 0, 0, 0)
     recent_followers = RawFollowerDAO().get_all({'downloaded_on': {'$gt': cutoff}})
     for document in recent_followers:
         # Anything that is not a plain string is dropped from the list
         only_names = [entry for entry in document['follows'] if isinstance(entry, str)]
         RawFollowerDAO().update_follows(document['_id'], only_names)
Exemple #5
0
 def update_follower_with_no_tweets(cls, follower):
     """ Refresh the download date of a stored, non-private follower.

     A falsy `has_tweets` is normalized to False before the update. Private followers are left
     untouched, and an unknown follower id is only logged as an error.
     """
     try:
         stored = RawFollowerDAO().get(follower)
         if stored.is_private:
             return
         if not stored.has_tweets:
             stored.has_tweets = False
         RawFollowerDAO().update_follower_downloaded_on(stored)
     except NonExistentRawFollowerError:
         cls.get_logger().error(f'Follower {follower} does not exists')
Exemple #6
0
    def get_grouped_users(cls, users_index):
        """ Group active users by the position of their highest support probability.

        Returns a dict that maps each index of `probability_vector_support` to a list of
        `[users_index[user_id], 0, 1]` entries, one per user whose highest probability sits at
        that index. Users missing from `users_index` are skipped.
        """
        # Users with tweets, at least one probability >= 0.8 and no `important` field
        query = {
            "$and": [
                {"probability_vector_support": {"$elemMatch": {"$gte": 0.8}}},
                {"has_tweets": True},
                {"important": {'$exists': False}}
            ]}
        users_by_group = {}
        for user in RawFollowerDAO().get_all(query):
            support_vector = user['probability_vector_support']
            top_probability = max(support_vector)
            user_id = user['_id']

            # Discard users whose best probability does not exceed the limit, or unknown ones
            if top_probability <= 0.8 or user_id not in users_index:
                continue

            group = support_vector.index(top_probability)
            users_by_group.setdefault(group, []).append([users_index[user_id], 0, 1])

        return users_by_group
Exemple #7
0
    def update_complete_follower(cls, follower, tweet, last_tweet_date):
        """ Store fresh download data for a public follower that has tweets.

        The follower is saved with today's download date, `last_tweet_date`, `is_private=False`
        and `has_tweets=True`. When `tweet` carries a 'user' entry, its profile counters and
        location are copied too. An unknown follower id is only logged as an error.
        """
        profile_fields = ('location', 'followers_count', 'friends_count',
                          'listed_count', 'favourites_count', 'statuses_count')
        try:
            refreshed = RawFollower(
                id=follower,
                downloaded_on=datetime.datetime.today(),
                last_tweet_date=last_tweet_date,
                is_private=False,
                has_tweets=True)

            if 'user' in tweet:
                profile = tweet['user']
                for field in profile_fields:
                    setattr(refreshed, field, profile[field])

            RawFollowerDAO().update_follower_data_with_has_tweets(refreshed)

        except NonExistentRawFollowerError:
            cls.get_logger().error(f'Follower {follower} does not exists')
Exemple #8
0
def create_indexes():
    """ Ask every DAO listed below, in order, to create its collection indexes. """
    # RawTweetDAO is left out: its index creation is currently disabled.
    dao_classes = (CandidateDAO, RawFollowerDAO, UserHashtagDAO, CooccurrenceGraphDAO)
    for dao_class in dao_classes:
        dao_class().create_indexes()
Exemple #9
0
 def load_followers(cls):
     """ Store one RawFollower per row of a .csv file, all of them following 'prueba'.

     Column 0 is used as `_id` and column 1 is parsed with PreProcessingTweetsUtil.DATE_FORMAT.
     The file path is a placeholder that has to be edited before running this.
     """
     with open('PUT PATH HERE', 'r') as fd:
         for row in csv.reader(fd, delimiter=','):
             fields = {
                 '_id': row[0],
                 'downloaded_on': datetime.datetime.strptime(row[1], PreProcessingTweetsUtil.DATE_FORMAT),
                 'follows': 'prueba',
             }
             RawFollowerDAO().put(RawFollower(**fields))
Exemple #10
0
 def update_follower_with_first_tweet(cls, follower, tweet):
     """ Store a follower again with today's download date and the profile data found in `tweet`.

     The `follows` value of the stored follower is kept. Location and counters are read from
     `tweet['user']`. An unknown follower id is only logged as an error.
     """
     profile_fields = ('location', 'followers_count', 'friends_count',
                       'listed_count', 'favourites_count', 'statuses_count')
     try:
         stored = RawFollowerDAO().get(follower)
         downloaded_on = datetime.datetime.today()
         profile = tweet['user']
         fields = {'id': follower, 'follows': stored.follows, 'downloaded_on': downloaded_on}
         for name in profile_fields:
             fields[name] = profile[name]
         RawFollowerDAO().put(RawFollower(**fields))
     except NonExistentRawFollowerError:
         cls.get_logger().error(f'Follower {follower} does not exists')
Exemple #11
0
 def add_last_downloaded_followers(self):
     """ Add to the priority set the followers that have no `has_tweets` field and are not private. """
     self.logger.info('Adding last downloaded followers')
     never_checked_query = {
         '$and': [
             {'has_tweets': {'$exists': False}},
             {'is_private': {'$ne': True}}
         ]}
     pending = RawFollowerDAO().get_all(never_checked_query)
     added = self.add_followers(pending)
     self.priority_updating_followers.update(added)
     self.logger.info('Finishing insertion of last downloaded followers')
Exemple #12
0
 def add_followers_to_be_updated(self, timedelta=180):
     """ Add a random sample of followers to the set of followers whose tweets will be updated.

     The followers currently in `self.processing_followers` and `timedelta` are handed to
     RawFollowerDAO.get_random_followers_sample. Raises NoMoreFollowersToUpdateTweetsError when
     no new follower could be added.
     """
     current_size = len(self.updating_followers)
     self.logger.info(
         f'Adding new followers to update their tweets. Actual size: {current_size}')
     sample = RawFollowerDAO().get_random_followers_sample(list(self.processing_followers), timedelta)
     added = self.add_followers(sample)
     if len(added) == 0:
         # Nothing new came back, so there is nothing left to update
         self.logger.error("Can't retrieve followers to update their tweets. ")
         raise NoMoreFollowersToUpdateTweetsError()
     self.updating_followers.update(added)
Exemple #13
0
    def send_server_status(cls):
        """ Post to the channel the tweet count and the users updated since yesterday.

        Does nothing unless EnvironmentUtils reports the current environment as production.
        """
        if not EnvironmentUtils.is_prod(cls.__env):
            return
        since = datetime.datetime.today() - datetime.timedelta(days=1)
        followers_updated = RawFollowerDAO().get_users_updated_since_date(since)
        tweets_updated = RawTweetDAO().get_count()

        message = (
            f'Cantidad de tweets descargados hasta el momento: {tweets_updated} \n '
            f'Usuarios actualizados durante el día de ayer: {followers_updated} \n'
        )
        cls.post_message_to_channel(message)
Exemple #14
0
 def initialize_context(cls):
     """ Instantiate every environment service and component once, in a Spring-like fashion. """
     cls.LOGGER.info('Instantiating context services and components.')
     # The order of this tuple is the order of instantiation
     components = (
         ConfigurationManager,
         ConcurrencyUtils,
         Scheduler,
         CandidateDAO,
         RawFollowerDAO,
         CandidatesFollowersDAO,
         CredentialService,
         CandidateService,
         FollowerUpdateService,
         TweetUpdateService,
         FollowersQueueService,
     )
     for component in components:
         component()
Exemple #15
0
 def populate_users_by_party_dict(cls):
     """ Map each party to the set of ids of its supporters.

     A supporter is a document whose `support` is the party and whose
     `probability_vector_support` has at least one element >= 0.8.
     """
     users_by_party = {}
     for party in cls.__parties:
         query = {
             '$and': [
                 {'probability_vector_support': {'$elemMatch': {'$gte': 0.8}}},
                 {'support': party},
             ]
         }
         supporters = RawFollowerDAO().get_all(query)
         users_by_party[party] = set(supporter['_id'] for supporter in supporters)
     return users_by_party
Exemple #16
0
 def store_new_followers(cls, ids, candidate_name):
     """ Store a RawFollower for every received id and record how many were received.

     Every follower is stored as following `candidate_name` and downloaded right now. The same
     timestamp and `len(ids)` are then registered through CandidatesFollowersDAO.
     """
     downloaded_on = datetime.today()
     for follower_id in ids:
         new_follower = RawFollower(id=follower_id, follows=candidate_name, downloaded_on=downloaded_on)
         RawFollowerDAO().put(new_follower)
     # Register the size of this download for the candidate
     new_followers_count = len(ids)
     CandidatesFollowersDAO().put_increase_for_candidate(
         candidate_name, new_followers_count, downloaded_on)
Exemple #17
0
 def update_follower_as_private(cls, follower):
     """ Tag a follower as private, stamping today as its download date.

     When the follower does not exist, an explanatory message and the error itself are logged.
     """
     try:
         private_follower = RawFollower(
             id=follower,
             downloaded_on=datetime.datetime.today(),
             is_private=True)
         RawFollowerDAO().update_follower_data_without_has_tweets(private_follower)
     except NonExistentRawFollowerError as error:
         explanation = f'{follower} can not be tagged as private because does not exists.'
         for entry in (explanation, error):
             cls.get_logger().error(entry)
Exemple #18
0
 def update_followers_for_candidate(cls, twitter, candidate):
     """ Download and store the followers of `candidate` that are not stored yet.

     `twitter` is handed as is to cls.get_new_followers_ids, together with the candidate and the
     follower ids already stored for it.
     """
     screen_name = candidate.screen_name
     cls.get_logger().info(f'Follower updating started for candidate {screen_name}.')
     # Ids of the followers we already have for this candidate
     known_ids = RawFollowerDAO().get_candidate_followers_ids(screen_name)
     # Ids that still have to be stored
     new_ids = cls.get_new_followers_ids(twitter, candidate, known_ids)
     cls.get_logger().info(f'{len(new_ids)} new followers downloaded for candidate {screen_name}.')
     # Downloading is over, store what was found
     cls.store_new_followers(new_ids, screen_name)
     cls.get_logger().info(f'Finished updating followers for candidate {screen_name}.')
Exemple #19
0
 def __generate_supporters_map(cls):
     """ Create a map which relates each party with the list of ids of its supporters.

     A supporter is a document whose `support` is the party and whose
     `probability_vector_support` has at least one element >= 0.8.
     """
     supporters = {}
     for party in cls.__parties:
         query = {
             '$and': [
                 {'probability_vector_support': {'$elemMatch': {'$gte': 0.8}}},
                 {'support': party},
             ]
         }
         supporters[party] = [document['_id'] for document in RawFollowerDAO().get_all(query)]
     return supporters
Exemple #20
0
    def get_users_rt_vector(cls):
        """ Build the retweet vector of every user from the database.

        Returns `(rt_vectors, candidate_index, groups_quantity, candidate_group)`, where
        `rt_vectors` maps a user id to its vector. Users that have a stored `first_rt_vector` get
        it added element-wise to the vector computed here, or get it unchanged when nothing was
        computed for them.
        """
        (candidate_index, candidates_list, candidate_group,
         candidates_rt_cursor) = cls.get_necessary_data()
        cls.get_logger().info(
            "Candidates and theirs rt are retrieved correctly.")
        # Indexes start at 0, so the highest one plus one is the number of groups
        groups_quantity = max(candidate_index.values()) + 1
        rt_vectors = {}
        for tweet in candidates_rt_cursor:
            retweeter = tweet['user_id']
            vector = cls.get_user_vector_or_default(
                retweeter, groups_quantity, rt_vectors)

            # Count the retweet only when the original author is a candidate
            author = tweet['retweeted_status']['user']['screen_name']
            if author in candidates_list:
                vector[candidate_index[author]] += 1

            # Vectors are only kept once they hold at least one retweet
            if sum(vector) > 0:
                rt_vectors[retweeter] = vector

        with_first_vector = {'first_rt_vector': {'$exists': True}}
        for follower in RawFollowerDAO().get_all(with_first_vector):
            follower_id = follower['_id']
            computed = rt_vectors.get(follower_id)
            stored = follower['first_rt_vector']
            if computed:
                rt_vectors[follower_id] = [x + y for x, y in zip(computed, stored)]
            else:
                rt_vectors[follower_id] = stored

        cls.get_logger().info("RT vectors are created correctly.")
        return rt_vectors, candidate_index, groups_quantity, candidate_group
Exemple #21
0
    def update_support_follower(cls):
        """ Recalculate and save the support vectors of every follower that has tweets.

        For each follower, its retweet vector (all zeros when none was computed) and its follows
        vector go through cls.get_final_vectors; the element-wise sum of both results is saved
        as the probability vector.
        """
        cls.get_logger().info("Starting FollowerSupport updating.")
        (rt_vectors, candidate_index, groups_quantity,
         candidate_group) = cls.get_users_rt_vector()

        active_followers = RawFollowerDAO().get_all({'has_tweets': True})
        cls.get_logger().info("Calculating probability vector support.")
        for follower in active_followers:
            follower_id = follower['_id']
            rt_vector = rt_vectors.get(follower_id, [0] * groups_quantity)
            follows_vector = cls.get_follows_vector(
                follower, candidate_index, groups_quantity)

            final_rt, final_follows = cls.get_final_vectors(rt_vector, follows_vector)
            # Add both vectors position by position
            probability_vector = list(map(sum, zip(final_rt, final_follows)))
            cls.save_follower_vectors(
                follower_id, probability_vector, rt_vector, candidate_group)
        cls.get_logger().info("Finishing FollowerSupport updating.")
Exemple #22
0
 def export_counts_for_time_window(cls, start_date, end_date):
     """ Count the hashtag pairs found in the given time window and export them to two files.

     The 'weights' file gets one `<id> <id> <count>` line per pair counted more than twice,
     highest count first. The 'ids' file gets one `<id> <hashtag>` line per known hashtag.
     Cooccurrences of non-important users and pairs rejected by HashtagEntropyService are left
     out. Raises NoHashtagCooccurrenceError when nothing was counted.
     """
     cls.get_logger().info(
         f'Starting hashtag cooccurrence counting for window starting on {start_date}'
         f' and ending on {end_date}')
     counts = {}
     ids = {}
     # Cooccurrences of non-important users are ignored by the query
     ignored_users = RawFollowerDAO().find_non_important_users()
     documents = CooccurrenceDAO().find_in_window(start_date, end_date, ignored_users)
     entropy_service = HashtagEntropyService()
     for document in documents:
         # Only pairs accepted for graph construction are counted
         if entropy_service.should_use_pair(document['pair']):
             cls.__add_to_counts(counts, document['pair'])
             cls.__add_to_ids(ids, document['pair'])
     if not counts:
         raise NoHashtagCooccurrenceError(start_date, end_date)
     # Weights file: one line per pair of hashtags
     weights_file = cls.__make_file_name('weights', start_date, end_date)
     with open(f'{cls.DIR_PATH}/{weights_file}', 'w') as fd:
         ranked = sorted(counts.items(), key=lambda entry: entry[1], reverse=True)
         for pair, count in ranked:
             # Edges with weight less than 3 are not interesting
             if count > 2:
                 hashtags = pair.split('-')
                 fd.write(f'{ids[hashtags[0]]} {ids[hashtags[1]]} {count}\n')
     cls.get_logger().info(f'Counting result was written in file {weights_file}')
     # Id reference file: one line per hashtag
     ids_file = cls.__make_file_name('ids', start_date, end_date)
     with open(f'{cls.DIR_PATH}/{ids_file}', 'w') as fd:
         fd.writelines(f'{uuid} {hashtag}\n' for hashtag, uuid in ids.items())
     cls.get_logger().info(f'Hashtag ids were written in file {ids_file}')
Exemple #23
0
class TestRawFollowerDAO(CustomTestCase):
    # Tests for RawFollowerDAO, run against a mongomock database named 'elections'.

    def setUp(self) -> None:
        super().setUp()
        Mongo().db = mongomock.database.Database(
            mongomock.MongoClient(), 'elections', _store=None)
        self.target = RawFollowerDAO()

    def tearDown(self) -> None:
        # The DAO is a Singleton, so its instances are dropped between tests
        RawFollowerDAO._instances.clear()

    def test_put_new_raw_follower(self):
        date = datetime.strptime('1996-03-15', CSVUtils.DATE_FORMAT)
        self.target.put(
            RawFollower(id='test', downloaded_on=date, follows='bodart'))
        stored = self.target.get('test')
        assert stored is not None
        assert stored.follows == ['bodart']
        assert stored.downloaded_on == date
        assert not stored.is_private

    def test_update_raw_follower(self):
        date = datetime.strptime('1996-03-15', CSVUtils.DATE_FORMAT)
        # Same follower stored twice, each time following a different candidate
        for candidate in ('bodart', 'the_commander'):
            self.target.put(
                RawFollower(id='test', downloaded_on=date, follows=candidate))
        stored = self.target.get('test')
        assert stored is not None
        assert 'bodart' in stored.follows
        assert 'the_commander' in stored.follows
        assert stored.downloaded_on == date

    def test_get_non_existent_raw_follower(self):
        with self.assertRaises(NonExistentRawFollowerError) as context:
            self.target.get('test')
        assert context.exception is not None
        expected = "There is no raw follower with id 'test' in the database."
        assert context.exception.message == expected

    def test_finish_candidate_check_if_was_loaded(self):
        self.target.finish_candidate('test')
        assert self.target.candidate_was_loaded('test')

    def test_candidate_was_loaded_false(self):
        assert not self.target.candidate_was_loaded('test')

    def test_get_candidates_followers_ids(self):
        for follower_id in range(20):
            self.target.put(RawFollower(id=follower_id, follows='bodart'))
        result = self.target.get_candidate_followers_ids('bodart')
        assert len(result) == 20
        assert set(range(20)) == result

    def test_put_public_on_private_user_stays_private(self):
        self.target.put(RawFollower(id='test', is_private=True))
        # Storing the same id again without the flag must not make it public
        self.target.put(RawFollower(id='test'))
        stored = self.target.get('test')
        assert stored is not None
        assert stored.is_private

    def test_tag_as_private_ok(self):
        public_follower = RawFollower(id='test')
        self.target.put(public_follower)
        self.target.tag_as_private(public_follower)
        assert self.target.get('test').is_private

    def test_get_public_users(self):
        self.target.put(RawFollower(id='test_1', is_private=True))
        self.target.put(RawFollower(id='test_2'))
        stored = self.target.get_public_users()
        assert stored is not None
        assert stored == {'test_2'}

    def test_get_public_users_empty(self):
        # This should never happen anyway
        self.target.put(RawFollower(id='test_1', is_private=True))
        assert not self.target.get_public_users()

    def test_get_all_with_cursor(self):
        # Store followers with ids 0..19
        for follower_id in range(20):
            self.target.put(RawFollower(id=follower_id))
        # First page
        first_10 = self.target.get_all_with_cursor(0, 10)
        assert len(first_10) == 10
        for follower in first_10:
            assert follower['id'] < 10
        # Second page
        last_10 = self.target.get_all_with_cursor(10, 10)
        assert len(last_10) == 10
        for follower in last_10:
            assert 10 <= follower['id'] < 20
        # Both pages must not share any id
        last_ids = {follower['id'] for follower in last_10}
        first_ids = {follower['id'] for follower in first_10}
        assert last_ids.intersection(first_ids) == set()

    def test_get_following_with_cursor(self):
        # Even ids follow 'bodart', odd ids follow 'the_commander'
        for follower_id in range(20):
            candidate = 'bodart' if follower_id % 2 == 0 else 'the_commander'
            self.target.put(RawFollower(id=follower_id, follows=candidate))
        # A page big enough to hold every follower of 'bodart'
        first_10 = self.target.get_following_with_cursor('bodart', 0, 100)
        assert len(first_10) == 10
        assert {follower['id'] for follower in first_10} == set(range(0, 20, 2))
        # Nothing is left after those 10
        next_followers = self.target.get_following_with_cursor('bodart', 10, 10)
        assert len(next_followers) == 0

    def test_get_following_with_cursor_non_existent_candidate_raises_exception(
            self):
        with self.assertRaises(NoDocumentsFoundError) as context:
            self.target.get_following_with_cursor('bodart', 0, 100)
        assert context.exception is not None
        expected = 'No documents found on collection raw_followers with query screen_name=bodart.'
        assert context.exception.message == expected
Exemple #24
0
 def setUp(self) -> None:
     # Point Mongo at a fresh mongomock database before building the DAO under test
     super(TestRawFollowerDAO, self).setUp()
     mock_database = mongomock.database.Database(
         mongomock.MongoClient(), 'elections', _store=None)
     Mongo().db = mock_database
     self.target = RawFollowerDAO()
Exemple #25
0
 def update_followers_vector(cls, user, data):
     """ Hand `data` to RawFollowerDAO.update_first for the document whose `_id` is `user`. """
     selector = {'_id': user}
     RawFollowerDAO().update_first(selector, data)