def do_tweets_replication(source_collection, target_collection, start_date,
                          end_date=None, config_fn=None):
    """Replicate tweets from one collection into another, in batches.

    Copies every tweet whose created_at_date is >= start_date (and
    <= end_date when end_date is given) from source_collection into
    target_collection, inserting at most BATCH_SIZE documents per call.

    :param source_collection: name of the collection to read tweets from
    :param target_collection: name of the collection to write tweets into
    :param start_date: inclusive lower bound on created_at_date
    :param end_date: optional inclusive upper bound on created_at_date
    :param config_fn: optional DB configuration file passed to DBManager
    """
    dbm_source = DBManager(collection=source_collection, config_fn=config_fn)
    dbm_target = DBManager(collection=target_collection, config_fn=config_fn)
    query = {'created_at_date': {'$gte': start_date}}
    if end_date:
        query['created_at_date'].update({'$lte': end_date})
    tweets_to_replicate = dbm_source.find_all(query)
    total_tweets = tweets_to_replicate.count()
    logging.info('Replicating {0:,} tweets'.format(total_tweets))
    max_batch = min(BATCH_SIZE, total_tweets)
    processing_counter = total_segs = 0
    tweets_to_insert = []
    for tweet in tweets_to_replicate:
        start_time = time.time()
        processing_counter += 1
        tweets_to_insert.append(tweet)
        if len(tweets_to_insert) >= max_batch:
            logging.info('Inserting tweets in the target collection...')
            dbm_target.insert_many(tweets_to_insert)
            tweets_to_insert = []
        total_segs = calculate_remaining_execution_time(
            start_time, total_segs, processing_counter, total_tweets)
    # BUG FIX: the original dropped the final partial batch whenever the
    # total number of tweets was not an exact multiple of max_batch.
    if tweets_to_insert:
        logging.info('Inserting tweets in the target collection...')
        dbm_target.insert_many(tweets_to_insert)
def export_tweets_to_json(collection, output_fn, config_fn=None, stemming=False,
                          lang=None, banned_accounts=None, exclude_rts=False):
    """Export (optionally processed) tweets from a collection to a JSON file.

    Tweets are fetched page by page, passed through process_tweets (which
    applies optional Spanish stemming and filters banned accounts), and the
    accumulated results are written as one JSON array to output_fn.

    :param collection: name of the tweet collection to read from
    :param output_fn: path of the JSON file to write
    :param config_fn: optional DB configuration file passed to DBManager
    :param stemming: when True, a Spanish SnowballStemmer is applied
    :param lang: if given, only tweets with this language are exported
    :param banned_accounts: screen names whose tweets are excluded
    :param exclude_rts: when True, retweets are excluded from the query
    """
    # BUG FIX: the original used a mutable default ([]) for banned_accounts,
    # which is shared across calls; None + local initialization is safe.
    if banned_accounts is None:
        banned_accounts = []
    if exclude_rts:
        query = {'type': {'$ne': 'retweet'}}
    else:
        query = {}
    if lang:
        query.update({'lang': {'$eq': lang}})
    projection = {
        '_id': 0,
        'id': 1,
        'complete_text': 1,
        'created_at_date': 1,
        'quote_count': 1,
        'reply_count': 1,
        'retweet_count': 1,
        'favorite_count': 1,
        'entities.hashtags': 1,
        'entities.user_mentions': 1,
        'sentiment.score': 1,
        'lang': 1,
        'user.screen_name': 1,
        'comunidad_autonoma': 1
    }
    stemmer = SnowballStemmer('spanish') if stemming else None
    PAGE_SIZE = 70000
    page_num = 0
    tweets_to_save = []
    # IMPROVEMENT: one DBManager per export; the original re-created it on
    # every page of the pagination loop.
    dbm = DBManager(collection=collection, config_fn=config_fn)
    while True:
        page_num += 1
        pagination = {'page_num': page_num, 'page_size': PAGE_SIZE}
        logging.info('Retrieving tweets...')
        tweets = list(dbm.find_all(query, projection, pagination=pagination))
        total_tweets = len(tweets)
        logging.info('Found {:,} tweets'.format(total_tweets))
        if total_tweets == 0:
            break
        tweets_to_save.extend(
            process_tweets(tweets, stemming, stemmer, banned_accounts))
    # NOTE(review): append mode ('a') produces invalid JSON when output_fn
    # already exists (two concatenated arrays) -- confirm whether 'w' was
    # intended before changing it.
    with open(output_fn, 'a', encoding='utf-8') as f:
        f.write('[')
        for idx, tweet in enumerate(tweets_to_save):
            if idx < len(tweets_to_save) - 1:
                f.write('{},\n'.format(json.dumps(tweet, ensure_ascii=False)))
            else:
                f.write('{}\n'.format(json.dumps(tweet, ensure_ascii=False)))
        f.write(']')
def retweet_impact_analysis(collection, config_fn):
    """Compute the retweet impact of every retweeted user in a collection.

    Retweet impact is RT * ln(F), where RT is the number of distinct tweets
    of a user that were retweeted and F is the number of distinct users who
    retweeted them; the final score is then log10-scaled.

    :param collection: name of the tweet collection to analyze
    :param config_fn: DB configuration file passed to DBManager
    :return: DataFrame with columns retweeted_user_screen_name,
             retweeted_tweets, retweeting_users, retweet_impact, sorted by
             impact descending, with duplicates and non-finite rows dropped.
    """
    dbm = DBManager(collection=collection, config_fn=config_fn)
    filter_query = {
        'retweeted_status': {'$exists': 1},          # it must be a retweet
        'in_reply_to_status_id_str': {'$eq': None},  # it must not be a reply
        'is_quote_status': False                     # it must not be a quote
    }
    fields_to_retrieve = {
        '_id': 0,
        'user.screen_name': 1,
        'retweeted_status.id': 1,
        'retweeted_status.user.screen_name': 1
    }
    tweets = list(dbm.find_all(filter_query, fields_to_retrieve))
    # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0, and
    # calling it inside a loop is quadratic; build the frame in one pass from
    # a list of records instead.
    df = pd.DataFrame([{
        'user_screen_name': tweet['user']['screen_name'],
        'retweeted_status_id': tweet['retweeted_status']['id'],
        'retweeted_status_user_screen_name':
            tweet['retweeted_status']['user']['screen_name'],
    } for tweet in tweets])
    # Distinct retweeted tweets (RT) and distinct retweeting users (F),
    # both keyed by the retweeted author's screen name.
    d_retweeted_tweets = df.groupby(
        ['retweeted_status_user_screen_name'])['retweeted_status_id'] \
        .nunique().to_dict()
    d_retweeting_users = df.groupby(
        ['retweeted_status_user_screen_name'])['user_screen_name'] \
        .nunique().to_dict()
    ri_df = pd.DataFrame()
    ri_df['retweeted_user_screen_name'] = df[
        'retweeted_status_user_screen_name']
    ri_df['retweeted_tweets'] = df.retweeted_status_user_screen_name.map(
        d_retweeted_tweets)
    ri_df['retweeting_users'] = df.retweeted_status_user_screen_name.map(
        d_retweeting_users)
    ri_df['retweet_impact'] = ri_df['retweeted_tweets'] * np.log(
        ri_df['retweeting_users'])
    ri_df = ri_df.sort_values(by=['retweet_impact'],
                              ascending=False).drop_duplicates()
    ri_df['retweet_impact'] = np.log10(ri_df['retweet_impact'])
    # Users retweeted by a single account yield ln(1)=0 -> log10(0)=-inf;
    # drop those rows along with any NaN.
    ri_df = ri_df.replace([np.inf, -np.inf], np.nan).dropna()
    return ri_df
def export_sentiment_sample(sample_size, collection, config_fn=None,
                            output_filename=None, lang=None):
    """Save a random sample of tweets with sentiment scores to a CSV file.

    Each tweet is kept with probability 0.5 (seeded, so runs are
    reproducible) and the sample is capped per creation date so it spreads
    over time; sampling stops once sample_size tweets have been saved.

    :param sample_size: number of tweets to save (coerced to int)
    :param collection: tweet collection to sample from
    :param config_fn: optional DB configuration file passed to DBManager
    :param output_filename: CSV file name, defaults to
        sentiment_analysis_sample.csv under <project>/data
    :param lang: if given, only tweets with this language are sampled
    """
    current_path = pathlib.Path(__file__).resolve()
    project_dir = current_path.parents[1]
    dbm = DBManager(collection=collection, config_fn=config_fn)
    query = {}
    projection = {
        '_id': 0,
        'id': 1,
        'user.screen_name': 1,
        'complete_text': 1,
        'sentiment.score': 1,
        'created_at_date': 1,
        'lang': 1
    }
    seed(1)  # deterministic sample across runs
    logging.info('Retrieving tweets...')
    tweets = dbm.find_all(query, projection)
    total_tweets = tweets.count()
    logging.info('Found {} tweets'.format(total_tweets))
    if not output_filename:
        output_filename = 'sentiment_analysis_sample.csv'
    output_file = os.path.join(project_dir, 'data', output_filename)
    logging.info('Processing and saving tweets into {}'.format(output_file))
    sample_size = int(sample_size)
    saved_tweets = 0
    tweets_by_date = defaultdict(int)
    MAX_TWEETS_BY_DATE = 6
    # BUG FIX: csv files must be opened with newline='' so the csv module
    # controls line endings (avoids blank rows on Windows).
    with open(output_file, 'w', newline='') as csv_file:
        csv_writer = csv.DictWriter(
            csv_file, fieldnames=['id', 'date', 'user', 'text', 'score'])
        csv_writer.writeheader()
        for tweet in tweets:
            if lang and tweet['lang'] != lang:
                continue
            if random() > 0.5:
                # NOTE(review): '<=' allows up to 7 tweets per date despite
                # the constant being 6 -- confirm whether '<' was intended.
                if tweets_by_date[
                        tweet['created_at_date']] <= MAX_TWEETS_BY_DATE:
                    saved_tweets += 1
                    tweets_by_date[tweet['created_at_date']] += 1
                    csv_writer.writerow({
                        'id': tweet['id'],
                        'date': tweet['created_at_date'],
                        'user': tweet['user']['screen_name'],
                        'text': tweet['complete_text'],
                        'score': tweet['sentiment']['score']
                    })
            if saved_tweets == sample_size:
                break
def export_tweets(collection, output_path, config_fn=None, start_date=None,
                  end_date=None):
    """Export tweets, optionally filtered by creation date, to a CSV file.

    :param collection: tweet collection to export
    :param output_path: prefix the generated file name is appended to
    :param config_fn: optional DB configuration file passed to DBManager
    :param start_date: with end_date, inclusive lower bound on
        created_at_date; alone, only tweets of exactly this date
    :param end_date: with start_date, inclusive upper bound on
        created_at_date; alone, only tweets of exactly this date
    """
    dbm = DBManager(collection=collection, config_fn=config_fn)
    query = {}
    if start_date and end_date:
        # IMPROVEMENT: equivalent to the original {'$and': [...]} -- both
        # bounds apply to the same field, so one range document suffices.
        query['created_at_date'] = {'$gte': start_date, '$lte': end_date}
    elif start_date:
        # NOTE(review): exact-date match, unlike the range branch above --
        # confirm whether {'$gte': start_date} was intended here.
        query['created_at_date'] = start_date
    elif end_date:
        query['created_at_date'] = end_date
    projection = {
        '_id': 0,
        'id': 1,
        'user.screen_name': 1,
        'complete_text': 1,
        'sentiment.score': 1,
        'created_at_date': 1,
        'lang': 1,
        'retweet_count': 1,
        'favorite_count': 1,
        'reply_count': 1,
        'type': 1,
        'quoted_status': 1
    }
    logging.info('Retrieving tweets...')
    tweets = dbm.find_all(query, projection)
    total_tweets = tweets.count()
    logging.info('Found {} tweets'.format(total_tweets))
    if start_date and end_date:
        output_fn = f'tweets_{start_date}_{end_date}.csv'
    elif start_date:
        output_fn = f'tweets_{start_date}.csv'
    elif end_date:
        output_fn = f'tweets_{end_date}.csv'
    else:
        output_fn = 'tweets.csv'
    # NOTE(review): plain concatenation assumes output_path already ends
    # with a path separator; os.path.join would be safer.
    output_fn = output_path + output_fn
    output_header = ['id', 'type', 'date', 'user', 'text', 'retweets',
                     'favorites', 'replies', 'original_tweet']
    save_tweets_in_csv_file(tweets, output_fn, output_header)
def do_export_users(collection, config_file=None, output_filename=None):
    """Export users with a usable profile image to a JSON-lines file.

    Users without a prediction whose local profile image exists and has an
    accepted raster extension are written, one JSON object per line, to
    <project>/data/<output_filename>.

    :param collection: user collection to export from
    :param config_file: optional DB configuration file passed to DBManager
    :param output_filename: output file name (defaults to users.jsonl)
    """
    project_dir = pathlib.Path(__file__).parents[1].resolve()
    if not output_filename:
        output_filename = 'users.jsonl'
    output = os.path.join(project_dir, 'data', output_filename)
    dbm = DBManager(collection=collection, config_fn=config_file)
    # NOTE(review): {'exists': 1} matches a literal field named 'exists' --
    # confirm whether a {'$exists': ...} operator was intended.
    query = {'$and': [{'prediction': {'$eq': None}}, {'exists': 1}]}
    projection = {
        '_id': 0,
        'id': 1,
        'name': 1,
        'screen_name': 1,
        'description': 1,
        'lang': 1,
        'img_path': 1
    }
    logging.info('Retrieving users...')
    users = list(dbm.find_all(query, projection))
    total_users = len(users)
    logging.info('Found {} users'.format(total_users))
    # IMPROVEMENT: case-insensitive check replaces the duplicated upper- and
    # lower-case tuple (also accepts mixed-case names such as .Jpg).
    accepted_extensions = ('.png', '.jpg', '.jpeg', '.bmp')
    with open(output, 'w') as f:
        for user in users:
            if 'prediction' in user:
                logging.info('Found field prediction, ignoring user {}'.format(
                    user['screen_name']))
                continue
            if 'img_path' not in user:
                logging.info('User {} does not have img_path field'.format(
                    user['screen_name']))
                continue
            if user['img_path'] == '[no_img]':
                logging.info('User {} has img_path=[no_img]'.format(
                    user['screen_name']))
                continue
            if not user['img_path'].lower().endswith(accepted_extensions):
                logging.info('User {} has image with extension {}'.format(
                    user['screen_name'], user['img_path']))
                continue
            # Joining paths cannot raise; keep only the risky calls in try.
            img_path = os.path.join(project_dir, user['img_path'])
            try:
                check_user_profile_image(img_path)
                logging.info('Exporting user: {}'.format(user['screen_name']))
                f.write("{}\n".format(json.dumps(user)))
            except Exception as e:
                logging.warning(
                    'Error when resizing {0}\nThe error message is: {1}\n'.
                    format(img_path, e))
    logging.info('Process finished, output was saved into {}'.format(output))
def export_user_sample_to_csv(query, sample_size, collection, randomize=True,
                              output_filename=None, config_fn=None):
    """Export a sample of users to a csv file.

    :param query: MongoDB filter selecting the candidate users
    :param sample_size: stop after this many users have been saved
    :param collection: user collection to sample from
    :param randomize: when True each user is kept with probability 0.5
        (seeded, so runs are reproducible); when False users are taken in
        retrieval order
    :param output_filename: output file name (defaults to user_sample.csv
        under ../data)
    :param config_fn: optional DB configuration file passed to DBManager
    """
    if not output_filename:
        output_filename = 'user_sample.csv'
    output = os.path.join('..', 'data', output_filename)
    dbm = DBManager(collection=collection, config_fn=config_fn)
    projection = {
        '_id': 0,
        'id_str': 1,
        'screen_name': 1,
        'description': 1,
        'location': 1,
        'comunidad_autonoma': 1
    }
    print('Getting sample of users, please wait...')
    users = list(dbm.find_all(query, projection))
    total_users = len(users)
    print('Found {} users'.format(total_users))
    seed(1)  # deterministic sample across runs
    saved_users = 0
    headers = [
        'id_str', 'screen_name', 'description', 'location',
        'comunidad_autonoma'
    ]
    with open(output, 'w') as f:
        csv_writer = csv.DictWriter(f, fieldnames=headers)
        csv_writer.writeheader()
        for user in users:
            # IMPROVEMENT: the original duplicated the save logic in both
            # branches; a guard clause keeps one copy (random() is only
            # drawn when randomizing, as before).
            if randomize and random() <= 0.5:
                continue
            saved_users += 1
            print('Saving user: {}'.format(user['screen_name']))
            csv_writer.writerow(user)
            if saved_users >= sample_size:
                break
def do_collection_merging(master_collection, collections_to_merge,
                          config_fn=None):
    """Merge the tweets of several collections into a master collection.

    Each collection in collections_to_merge is read in full and bulk
    inserted into master_collection (unordered, so duplicate-key failures
    do not abort the batch); errors are logged and the remaining
    collections are still processed.

    :param master_collection: name of the collection to merge into
    :param collections_to_merge: iterable of collection names to merge
    :param config_fn: optional DB configuration file passed to DBManager
    """
    dbm_master = DBManager(collection=master_collection, config_fn=config_fn)
    for collection in collections_to_merge:
        logging.info('Merging collection {0} into {1}'.format(
            collection, master_collection))
        dbm_collection_to_merge = DBManager(collection=collection,
                                            config_fn=config_fn)
        tweets = dbm_collection_to_merge.find_all()
        logging.info('Trying to insert {0:,} tweets'.format(tweets.count()))
        try:
            ret_insertions = dbm_master.insert_many_tweets(tweets,
                                                           ordered=False)
            # BUG FIX: inserted_ids is a list; formatting it with '{0:,}'
            # raised TypeError, so every successful merge was logged as an
            # error. Count the ids instead.
            insertion_counter = len(ret_insertions.inserted_ids)
            logging.info('{0:,} new tweets were inserted into the collection {1}'.
                         format(insertion_counter, master_collection))
        except Exception as e:
            logging.error('Error when merging {}'.format(e))