def read_followers_for_candidate(cls, candidate):
    """Read a candidate's followers .csv file and load it into the database.

    Nothing is done if the candidate is already marked as loaded. Otherwise
    every data row is stored as a ``RawFollower`` following
    ``candidate.screen_name`` and the candidate is then marked as loaded.

    Args:
        candidate: object providing ``screen_name`` (stored as the
            ``follows`` value and used as the "already loaded" key) and
            ``nickname`` (interpolated into ``cls.FOLLOWERS_PATH_FORMAT``
            to build the file path).

    Raises:
        OSError: if the .csv file cannot be opened.
        IndexError: if a non-empty row has fewer than two columns.
        ValueError: if a date cell does not match ``CSVUtils.DATE_FORMAT``.
    """
    dao = RawFollowerDAO()
    if dao.candidate_was_loaded(candidate.screen_name):
        cls.get_logger().info(
            f'Candidate {candidate.screen_name} followers .csv file has already been loaded.'
        )
        return
    cls.get_logger().info(
        f'Loading .csv file for {candidate.screen_name}.')
    # Generate file path and open file
    path = cls.FOLLOWERS_PATH_FORMAT % candidate.nickname
    # The csv module requires newline='' so that it can handle line endings
    # (including newlines embedded in quoted fields) by itself.
    with open(path, 'r', newline='') as fd:
        reader = csv.reader(fd, delimiter=',')
        # The first row is the title row
        title = next(reader)
        for row in reader:
            # csv.reader yields [] for blank lines; indexing them would fail.
            # Some files also repeat the title row, which carries no data.
            if not row or row == title:
                continue
            follower = RawFollower(
                id=row[0],
                downloaded_on=datetime.strptime(row[1], CSVUtils.DATE_FORMAT),
                follows=candidate.screen_name,
            )
            dao.put(follower)
    # Mark this candidate as already loaded.
    dao.finish_candidate(candidate.screen_name)
    cls.get_logger().info(
        f'Finished loading {candidate.screen_name} raw followers from .csv file.'
    )
def update_dashboard_data():
    """Recalculate non-counting dashboard data and store it.

    Gathers the total and active (``has_tweets``) follower counts, the same
    two counts per candidate screen name, and the number of cooccurrence
    graph documents whose ``topic_id`` is not ``'main'``. The result is
    passed to ``DashboardDAO().store``.

    Proportions are reported as ``0.0`` when the denominator is zero
    instead of raising ``ZeroDivisionError``.
    """

    def proportion(part, total):
        # An empty collection, or a candidate without followers yet, must
        # not abort the whole dashboard refresh.
        return part / total if total else 0.0

    follower_dao = RawFollowerDAO()
    # Get total count of users
    users = follower_dao.get_count({})
    # Get count of active users
    active_users = follower_dao.get_count({'has_tweets': True})
    # Get count of followers for each candidate
    candidates = [c.screen_name for c in CandidateService().get_all()]
    followers_by_candidate = {}
    for candidate in candidates:
        followers = follower_dao.get_count({'follows': candidate})
        active_followers = follower_dao.get_count({
            'follows': candidate,
            'has_tweets': True
        })
        followers_by_candidate[candidate] = {
            'followers': followers,
            'active_followers': active_followers,
            'proportion': proportion(active_followers, followers)
        }
    # Get count of found topics
    topics = CooccurrenceGraphDAO().get_count(
        {'topic_id': {'$ne': 'main'}})
    DashboardDAO().store({
        'users': users,
        'active_users': active_users,
        'active_proportion': proportion(active_users, users),
        'followers_by_candidate': followers_by_candidate,
        'topics': topics
    })
def test_retrieve_users_by_party(self):
    """Only one user per party is expected back out of the three inserted.

    Follower '789' differs from '456' only in its ``friends_count`` (5001
    instead of 2500) and is expected to be left out of the result.
    """
    # (_id, support, friends_count) for each stored follower
    fixtures = (
        ('123', 'juntosporelcambio', 2500),
        ('456', 'frentedetodos', 2500),
        ('789', 'frentedetodos', 5001),
    )
    for follower_id, party, friends_count in fixtures:
        RawFollowerDAO().insert({
            '_id': follower_id,
            'is_private': False,
            'has_tweets': True,
            'probability_vector_support': [0.8],
            'support': party,
            'friends_count': friends_count
        })

    result = UserNetworkRetrievalService.retrieve_users_by_party()

    assert len(result['juntosporelcambio']) == 1
    assert result['juntosporelcambio'][0] == '123'
    assert len(result['frentedetodos']) == 1
    assert result['frentedetodos'][0] == '456'
def fix_followers_update(cls):
    """Drop non-string entries from the ``follows`` list of recent followers.

    Only followers whose ``downloaded_on`` is later than 2019-05-29 00:00:00
    are processed; each one gets its ``follows`` list rewritten with the
    string entries only.
    """
    cutoff = datetime.datetime(2019, 5, 29, 0, 0, 0)
    recent_followers = RawFollowerDAO().get_all({'downloaded_on': {'$gt': cutoff}})
    for follower in recent_followers:
        string_follows = [
            followed for followed in follower['follows']
            if isinstance(followed, str)
        ]
        RawFollowerDAO().update_follows(follower['_id'], string_follows)
def update_follower_with_no_tweets(cls, follower):
    """Update the download date of a follower for which no tweets were found.

    The follower is fetched from the database; private followers are left
    untouched. For public ones, a falsy ``has_tweets`` is normalised to
    ``False`` before the follower is handed to
    ``RawFollowerDAO.update_follower_downloaded_on``.

    Args:
        follower: identifier passed to ``RawFollowerDAO().get``.

    A ``NonExistentRawFollowerError`` is caught and logged, not propagated.
    """
    try:
        raw_follower = RawFollowerDAO().get(follower)
        if not raw_follower.is_private:
            # Turns an unset/None flag into an explicit False; a True flag
            # is left as it is.
            if not raw_follower.has_tweets:
                raw_follower.has_tweets = False
            RawFollowerDAO().update_follower_downloaded_on(raw_follower)
            # cls.get_logger().info(f'{follower} is updated with 0 tweets.')
    except NonExistentRawFollowerError:
        cls.get_logger().error(f'Follower {follower} does not exists')
def get_grouped_users(cls, users_index):
    """Group users by the index of their strongest support probability.

    Args:
        users_index: mapping from follower ``_id`` to the value stored for
            that user in the result.

    Returns:
        dict mapping a support-vector index to a list of
        ``[users_index[_id], 0, 1]`` entries, one per user whose highest
        probability is above 0.8 and whose id is present in ``users_index``.
    """
    # Followers with tweets, some support probability >= 0.8 and no
    # 'important' field.
    query = {
        "$and": [
            {"probability_vector_support": {"$elemMatch": {"$gte": 0.8}}},
            {"has_tweets": True},
            {"important": {'$exists': False}}
        ]}
    grouped = {}
    for follower in RawFollowerDAO().get_all(query):
        vector = follower['probability_vector_support']
        top_probability = max(vector)
        follower_id = follower['_id']
        # Discard users at or below the limit, and users that are not indexed
        if top_probability <= 0.8 or follower_id not in users_index:
            continue
        group = vector.index(top_probability)
        grouped.setdefault(group, []).append([users_index[follower_id], 0, 1])
    return grouped
def update_complete_follower(cls, follower, tweet, last_tweet_date):
    """Store refreshed data for a public follower that has tweets.

    Args:
        follower: follower id.
        tweet: tweet dictionary; when it has a ``'user'`` entry, the profile
            counters and location found there are copied to the follower.
        last_tweet_date: value stored as the follower's ``last_tweet_date``.

    A ``NonExistentRawFollowerError`` is caught and logged, not propagated.
    """
    profile_fields = (
        'location',
        'followers_count',
        'friends_count',
        'listed_count',
        'favourites_count',
        'statuses_count',
    )
    try:
        refreshed = RawFollower(
            id=follower,
            downloaded_on=datetime.datetime.today(),
            last_tweet_date=last_tweet_date,
            is_private=False,
            has_tweets=True,
        )
        if 'user' in tweet:
            profile = tweet['user']
            for field in profile_fields:
                setattr(refreshed, field, profile[field])
        RawFollowerDAO().update_follower_data_with_has_tweets(refreshed)
    except NonExistentRawFollowerError:
        cls.get_logger().error(f'Follower {follower} does not exists')
def create_indexes():
    """Create the indexes of every collection that needs them."""
    # RawTweetDAO is deliberately absent: its call was commented out in the
    # original implementation.
    daos_with_indexes = (
        CandidateDAO,
        RawFollowerDAO,
        UserHashtagDAO,
        CooccurrenceGraphDAO,
    )
    for dao_class in daos_with_indexes:
        dao_class().create_indexes()
def load_followers(cls):
    """Load followers from a .csv file, all of them following 'prueba'.

    Every row is expected to hold the follower id in column 0 and the
    download date (``PreProcessingTweetsUtil.DATE_FORMAT``) in column 1.
    The file path is a placeholder that has to be edited before running.
    """
    # NOTE(review): the follower id is passed as '_id' here; confirm that
    # RawFollower accepts that keyword.
    with open('PUT PATH HERE', 'r') as fd:
        for row in csv.reader(fd, delimiter=','):
            follower_id = row[0]
            downloaded_on = datetime.datetime.strptime(
                row[1], PreProcessingTweetsUtil.DATE_FORMAT)
            RawFollowerDAO().put(
                RawFollower(_id=follower_id,
                            downloaded_on=downloaded_on,
                            follows='prueba'))
def update_follower_with_first_tweet(cls, follower, tweet):
    """Store a follower again with the profile data found in one of its tweets.

    The stored ``follows`` value is kept, ``downloaded_on`` is set to the
    current date and the profile fields are read from ``tweet['user']``.

    Args:
        follower: follower id.
        tweet: tweet dictionary containing a ``'user'`` entry.

    A ``NonExistentRawFollowerError`` is caught and logged, not propagated.
    """
    profile_fields = (
        'location',
        'followers_count',
        'friends_count',
        'listed_count',
        'favourites_count',
        'statuses_count',
    )
    try:
        stored = RawFollowerDAO().get(follower)
        today = datetime.datetime.today()
        profile = tweet['user']
        attributes = {
            'id': follower,
            'follows': stored.follows,
            'downloaded_on': today,
        }
        for field in profile_fields:
            attributes[field] = profile[field]
        RawFollowerDAO().put(RawFollower(**attributes))
    except NonExistentRawFollowerError:
        cls.get_logger().error(f'Follower {follower} does not exists')
def add_last_downloaded_followers(self):
    """Queue with priority the followers that have no ``has_tweets`` field.

    Followers flagged as private are excluded. The selected followers go
    through ``self.add_followers`` and the result is merged into
    ``self.priority_updating_followers``.
    """
    self.logger.info('Adding last downloaded followers')
    never_checked = {
        '$and': [
            {'has_tweets': {'$exists': False}},
            {'is_private': {'$ne': True}}
        ]}
    pending = RawFollowerDAO().get_all(never_checked)
    self.priority_updating_followers.update(self.add_followers(pending))
    self.logger.info('Finishing insertion of last downloaded followers')
def add_followers_to_be_updated(self, timedelta=180):
    """Add a random sample of followers to the set being updated.

    Args:
        timedelta: forwarded to
            ``RawFollowerDAO.get_random_followers_sample``.
            NOTE(review): presumably a number of days; confirm in the DAO.

    Raises:
        NoMoreFollowersToUpdateTweetsError: if the sample yields no new
            followers.
    """
    self.logger.info(
        f'Adding new followers to update their tweets. Actual size: {str(len(self.updating_followers))}')
    # Followers currently being processed are handed to the DAO with the sample request
    sample = RawFollowerDAO().get_random_followers_sample(
        list(self.processing_followers), timedelta)
    fresh_followers = self.add_followers(sample)
    if len(fresh_followers) != 0:
        self.updating_followers.update(fresh_followers)
        return
    # There are no new results
    self.logger.error("Can't retrieve followers to update their tweets. ")
    raise NoMoreFollowersToUpdateTweetsError()
def send_server_status(cls):
    """Post a daily status message to the channel, in production only.

    The message reports the value returned by ``RawTweetDAO().get_count()``
    and the value returned by
    ``RawFollowerDAO().get_users_updated_since_date`` for the last 24 hours.
    """
    if not EnvironmentUtils.is_prod(cls.__env):
        return
    one_day_ago = datetime.datetime.today() - datetime.timedelta(days=1)
    followers_updated = RawFollowerDAO().get_users_updated_since_date(one_day_ago)
    tweets_updated = RawTweetDAO().get_count()
    # new_followers = CandidatesFollowersDAO().get()
    message = (
        f'Cantidad de tweets descargados hasta el momento: {tweets_updated} \n '
        f'Usuarios actualizados durante el día de ayer: {followers_updated} \n'
    )
    cls.post_message_to_channel(message)
def initialize_context(cls):
    """Instantiate every environment service and component, Spring-style."""
    cls.LOGGER.info('Instantiating context services and components.')
    # Instantiated in this exact order; the instances themselves are not kept here.
    components = (
        ConfigurationManager,
        ConcurrencyUtils,
        Scheduler,
        CandidateDAO,
        RawFollowerDAO,
        CandidatesFollowersDAO,
        CredentialService,
        CandidateService,
        FollowerUpdateService,
        TweetUpdateService,
        FollowersQueueService,
    )
    for component in components:
        component()
def populate_users_by_party_dict(cls):
    """Map every party to the set of ids of its supporters.

    A supporter is a follower whose ``support`` equals the party and whose
    ``probability_vector_support`` has at least one element >= 0.8.

    Returns:
        dict of party -> set of follower ``_id`` values.
    """

    def supporters_of(party):
        query = {
            '$and': [
                {'probability_vector_support': {'$elemMatch': {'$gte': 0.8}}},
                {'support': party}
            ]
        }
        return {document['_id'] for document in RawFollowerDAO().get_all(query)}

    return {party: supporters_of(party) for party in cls.__parties}
def store_new_followers(cls, ids, candidate_name):
    """Store newly downloaded followers and the size of today's increase.

    Args:
        ids: sized collection of follower ids to store.
        candidate_name: candidate the new followers follow.
    """
    today = datetime.today()
    # One raw follower per received id, all sharing the same download date
    for follower_id in ids:
        RawFollowerDAO().put(
            RawFollower(id=follower_id,
                        follows=candidate_name,
                        downloaded_on=today))
    # Record how many followers were retrieved in the current day
    CandidatesFollowersDAO().put_increase_for_candidate(
        candidate_name, len(ids), today)
def update_follower_as_private(cls, follower):
    """Tag a follower as private and refresh its download date.

    Args:
        follower: follower id.

    A ``NonExistentRawFollowerError`` is caught and logged, not propagated.
    """
    try:
        private_follower = RawFollower(
            id=follower,
            downloaded_on=datetime.datetime.today(),
            is_private=True,
        )
        RawFollowerDAO().update_follower_data_without_has_tweets(private_follower)
        # cls.get_logger().info(f'{follower} is tagged as private.')
    except NonExistentRawFollowerError as error:
        cls.get_logger().error(
            f'{follower} can not be tagged as private because does not exists.'
        )
        cls.get_logger().error(error)
def update_followers_for_candidate(cls, twitter, candidate):
    """Download and store the followers of a candidate that are not stored yet.

    Args:
        twitter: Twython instance used for the download.
        candidate: candidate whose ``screen_name`` identifies the followers.
    """
    logger = cls.get_logger()
    logger.info(
        f'Follower updating started for candidate {candidate.screen_name}.'
    )
    # Ids already stored for this candidate
    known_ids = RawFollowerDAO().get_candidate_followers_ids(candidate.screen_name)
    # Ids that still have to be stored
    to_store_ids = cls.get_new_followers_ids(twitter, candidate, known_ids)
    logger.info(
        f'{len(to_store_ids)} new followers downloaded for candidate {candidate.screen_name}.'
    )
    # Downloading is done, so the new followers can be stored
    cls.store_new_followers(to_store_ids, candidate.screen_name)
    logger.info(
        f'Finished updating followers for candidate {candidate.screen_name}.'
    )
def __generate_supporters_map(cls):
    """Build a map from each party to the list of ids of its supporters.

    A supporter is a follower whose ``support`` equals the party and whose
    ``probability_vector_support`` has at least one element >= 0.8.
    """
    supporters = {}
    for party in cls.__parties:
        party_query = {
            '$and': [
                {'probability_vector_support': {'$elemMatch': {'$gte': 0.8}}},
                {'support': party}
            ]
        }
        matching = RawFollowerDAO().get_all(party_query)
        supporters[party] = [follower['_id'] for follower in matching]
    return supporters
def get_users_rt_vector(cls): """ Get data from db and create users_rt_vectors. """ # {candidate: index}, [candidate_id] candidate_index, candidates_list, candidate_group, candidates_rt_cursor = cls.get_necessary_data( ) cls.get_logger().info( "Candidates and theirs rt are retrieved correctly.") groups_quantity = max(candidate_index.values()) + 1 rt_vectors = {} for tweet in candidates_rt_cursor: # Get user information user = tweet['user_id'] user_rt_vector = cls.get_user_vector_or_default( user, groups_quantity, rt_vectors) # If tweet creator is a candidate, plus one in user's vector user_tweet_creator = tweet['retweeted_status']['user'][ 'screen_name'] if user_tweet_creator in candidates_list: user_rt_vector[candidate_index[user_tweet_creator]] += 1 if sum(user_rt_vector) > 0: rt_vectors[user] = user_rt_vector users = RawFollowerDAO().get_all( {'first_rt_vector': { '$exists': True }}) for user in users: user_id = user['_id'] actual_rt_vector = rt_vectors.get(user_id, None) final_rt_vector = user['first_rt_vector'] if actual_rt_vector: final_rt_vector = [ x + y for x, y in zip(actual_rt_vector, final_rt_vector) ] rt_vectors[user_id] = final_rt_vector cls.get_logger().info("RT vectors are created correctly.") return rt_vectors, candidate_index, groups_quantity, candidate_group
def update_support_follower(cls):
    """Recalculate and save the support vector of every follower with tweets.

    For each follower, the retweet vector (zeros when none was computed) and
    the follows vector are passed through ``cls.get_final_vectors``; the
    element-wise sum of both results is saved as the probability vector.
    """
    logger = cls.get_logger()
    logger.info("Starting FollowerSupport updating.")
    rt_vectors, candidate_index, groups_quantity, candidate_group = cls.get_users_rt_vector()
    # Only followers which have tweets are updated
    active_followers = RawFollowerDAO().get_all({'has_tweets': True})
    logger.info("Calculating probability vector support.")
    for follower in active_followers:
        follower_id = follower['_id']
        retweets = rt_vectors.get(follower_id, [0] * groups_quantity)
        follows = cls.get_follows_vector(follower, candidate_index, groups_quantity)
        final_rt, final_follows = cls.get_final_vectors(retweets, follows)
        # Element-wise sum of both final vectors
        probabilities = [sum(pair) for pair in zip(final_rt, final_follows)]
        cls.save_follower_vectors(follower_id, probabilities, retweets, candidate_group)
    logger.info("Finishing FollowerSupport updating.")
def export_counts_for_time_window(cls, start_date, end_date):
    """Count hashtag pair appearances in a time window and export them.

    Two files are written under ``cls.DIR_PATH``: a weights file with one
    ``<id> <id> <count>`` line per pair appearing more than twice, sorted by
    descending count, and an ids file with one ``<id> <hashtag>`` line per
    hashtag.

    Args:
        start_date: start of the window.
        end_date: end of the window.

    Raises:
        NoHashtagCooccurrenceError: if no usable pair was found in the window.
    """
    cls.get_logger().info(f'Starting hashtag cooccurrence counting for window starting on {start_date}'
                          f' and ending on {end_date}')
    counts = {}
    ids = {}
    # Cooccurrences of non-important users are ignored by the query
    ignored_users = RawFollowerDAO().find_non_important_users()
    cooccurrences = CooccurrenceDAO().find_in_window(start_date, end_date, ignored_users)
    entropy_service = HashtagEntropyService()
    for document in cooccurrences:
        # Only edges joining two hashtags accepted for graph construction are counted
        if entropy_service.should_use_pair(document['pair']):
            cls.__add_to_counts(counts, document['pair'])
            cls.__add_to_ids(ids, document['pair'])
    if not counts:
        raise NoHashtagCooccurrenceError(start_date, end_date)

    # Weights file: one line per pair of hashtags
    file_name = cls.__make_file_name('weights', start_date, end_date)
    with open(f'{cls.DIR_PATH}/{file_name}', 'w') as fd:
        heaviest_first = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        for pair, count in heaviest_first:
            # Edges with weight less than 3 are left out
            if count <= 2:
                continue
            pair = pair.split('-')
            fd.write(f'{ids[pair[0]]} {ids[pair[1]]} {count}\n')
    cls.get_logger().info(f'Counting result was written in file {file_name}')

    # Id reference file: one line per hashtag
    file_name = cls.__make_file_name('ids', start_date, end_date)
    with open(f'{cls.DIR_PATH}/{file_name}', 'w') as fd:
        for hashtag, uuid in ids.items():
            fd.write(f'{uuid} {hashtag}\n')
    cls.get_logger().info(f'Hashtag ids were written in file {file_name}')
class TestRawFollowerDAO(CustomTestCase):
    """Tests for RawFollowerDAO backed by an in-memory mongomock database."""

    def setUp(self) -> None:
        super().setUp()
        Mongo().db = mongomock.database.Database(
            mongomock.MongoClient(), 'elections', _store=None)
        self.target = RawFollowerDAO()

    def tearDown(self) -> None:
        # The DAO under test is a Singleton, so its cached instances are
        # dropped after every test.
        RawFollowerDAO._instances.clear()

    def test_put_new_raw_follower(self):
        date = datetime.strptime('1996-03-15', CSVUtils.DATE_FORMAT)
        self.target.put(
            RawFollower(id='test', downloaded_on=date, follows='bodart'))
        stored = self.target.get('test')
        assert stored is not None
        assert stored.follows == ['bodart']
        assert stored.downloaded_on == date
        assert not stored.is_private

    def test_update_raw_follower(self):
        date = datetime.strptime('1996-03-15', CSVUtils.DATE_FORMAT)
        # Putting the same id twice must accumulate the followed candidates
        for candidate in ('bodart', 'the_commander'):
            self.target.put(
                RawFollower(id='test', downloaded_on=date, follows=candidate))
        stored = self.target.get('test')
        assert stored is not None
        assert 'bodart' in stored.follows
        assert 'the_commander' in stored.follows
        assert stored.downloaded_on == date

    def test_get_non_existent_raw_follower(self):
        with self.assertRaises(NonExistentRawFollowerError) as context:
            self.target.get('test')
        assert context.exception is not None
        assert context.exception.message == "There is no raw follower with id 'test' in the database."

    def test_finish_candidate_check_if_was_loaded(self):
        self.target.finish_candidate('test')
        assert self.target.candidate_was_loaded('test')

    def test_candidate_was_loaded_false(self):
        assert not self.target.candidate_was_loaded('test')

    def test_get_candidates_followers_ids(self):
        for follower_id in range(20):
            self.target.put(RawFollower(id=follower_id, follows='bodart'))
        result = self.target.get_candidate_followers_ids('bodart')
        assert len(result) == 20
        assert set(range(20)) == result

    def test_put_public_on_private_user_stays_private(self):
        self.target.put(RawFollower(id='test', is_private=True))
        self.target.put(RawFollower(id='test'))
        stored = self.target.get('test')
        assert stored is not None
        assert stored.is_private

    def test_tag_as_private_ok(self):
        public_follower = RawFollower(id='test')
        self.target.put(public_follower)
        self.target.tag_as_private(public_follower)
        stored = self.target.get('test')
        assert stored.is_private

    def test_get_public_users(self):
        self.target.put(RawFollower(id='test_1', is_private=True))
        self.target.put(RawFollower(id='test_2'))
        stored = self.target.get_public_users()
        assert stored is not None
        assert stored == {'test_2'}

    def test_get_public_users_empty(self):
        # This should never happen anyway
        self.target.put(RawFollower(id='test_1', is_private=True))
        stored = self.target.get_public_users()
        assert not stored

    def test_get_all_with_cursor(self):
        # Add many followers
        for follower_id in range(20):
            self.target.put(RawFollower(id=follower_id))
        # Get first 10
        first_10 = self.target.get_all_with_cursor(0, 10)
        assert len(first_10) == 10
        assert all(follower['id'] < 10 for follower in first_10)
        # Get last 10
        last_10 = self.target.get_all_with_cursor(10, 10)
        assert len(last_10) == 10
        assert all(10 <= follower['id'] < 20 for follower in last_10)
        # Check there are no overlaps
        last_ids = {follower['id'] for follower in last_10}
        first_ids = {follower['id'] for follower in first_10}
        assert last_ids.intersection(first_ids) == set()

    def test_get_following_with_cursor(self):
        # Add many followers: even ids follow 'bodart', odd ids 'the_commander'
        for follower_id in range(20):
            followed = 'bodart' if follower_id % 2 == 0 else 'the_commander'
            self.target.put(RawFollower(id=follower_id, follows=followed))
        # Get first 10
        first_10 = self.target.get_following_with_cursor('bodart', 0, 100)
        assert len(first_10) == 10
        assert {follower['id'] for follower in first_10} == set(range(0, 20, 2))
        # Check there are only 10
        next_followers = self.target.get_following_with_cursor('bodart', 10, 10)
        assert len(next_followers) == 0

    def test_get_following_with_cursor_non_existent_candidate_raises_exception(self):
        with self.assertRaises(NoDocumentsFoundError) as context:
            self.target.get_following_with_cursor('bodart', 0, 100)
        assert context.exception is not None
        message = 'No documents found on collection raw_followers with query screen_name=bodart.'
        assert context.exception.message == message
def setUp(self) -> None:
    """Point Mongo at a fresh mongomock database and create the DAO under test."""
    super(TestRawFollowerDAO, self).setUp()
    in_memory_db = mongomock.database.Database(
        mongomock.MongoClient(), 'elections', _store=None)
    Mongo().db = in_memory_db
    self.target = RawFollowerDAO()
def update_followers_vector(cls, user, data):
    """Pass ``data`` to ``RawFollowerDAO.update_first`` for the follower with ``_id == user``.

    Args:
        user: follower id used to select the document.
        data: update payload, forwarded unchanged to the DAO.
    """
    selector = {'_id': user}
    RawFollowerDAO().update_first(selector, data)