Example #1
0
def do_tweets_replication(source_collection,
                          target_collection,
                          start_date,
                          end_date=None,
                          config_fn=None):
    """
    Copy the tweets created within a date range from one collection to another

    Tweets are read from the source collection and inserted into the target
    collection in batches of at most BATCH_SIZE documents.

    :param source_collection: name of the collection tweets are read from
    :param target_collection: name of the collection tweets are inserted into
    :param start_date: inclusive lower bound for the field 'created_at_date'
    :param end_date: optional inclusive upper bound for 'created_at_date'
    :param config_fn: configuration file handed over to DBManager
    """
    dbm_source = DBManager(collection=source_collection, config_fn=config_fn)
    dbm_target = DBManager(collection=target_collection, config_fn=config_fn)
    query = {'created_at_date': {'$gte': start_date}}
    if end_date:
        query['created_at_date']['$lte'] = end_date
    tweets_to_replicate = dbm_source.find_all(query)
    total_tweets = tweets_to_replicate.count()
    logging.info('Replicating {0:,} tweets'.format(total_tweets))
    # Never wait for more tweets than the query is going to return
    max_batch = min(BATCH_SIZE, total_tweets)
    processing_counter = total_segs = 0
    tweets_to_insert = []
    for tweet in tweets_to_replicate:
        start_time = time.time()
        processing_counter += 1
        tweets_to_insert.append(tweet)
        if len(tweets_to_insert) >= max_batch:
            logging.info('Inserting tweets in the target collection...')
            dbm_target.insert_many(tweets_to_insert)
            tweets_to_insert = []
        total_segs = calculate_remaining_execution_time(
            start_time, total_segs, processing_counter, total_tweets)
    # Flush the last, partially filled batch; without this the remainder of
    # total_tweets / BATCH_SIZE would never reach the target collection
    if tweets_to_insert:
        logging.info('Inserting tweets in the target collection...')
        dbm_target.insert_many(tweets_to_insert)
Example #2
0
def export_tweets_to_json(collection,
                          output_fn,
                          config_fn=None,
                          stemming=False,
                          lang=None,
                          banned_accounts=None,
                          exclude_rts=False):
    """
    Export the tweets of a collection to a file containing one JSON array

    Tweets are fetched page by page, passed through process_tweets and
    finally written to output_fn, one tweet per line.

    :param collection: name of the collection to read tweets from
    :param output_fn: path of the file to write; it is overwritten
    :param config_fn: configuration file handed over to DBManager
    :param stemming: when truthy, a Spanish SnowballStemmer is created and
        passed to process_tweets
    :param lang: when given, only tweets whose 'lang' equals it are exported
    :param banned_accounts: accounts passed to process_tweets; defaults to an
        empty list
    :param exclude_rts: when truthy, tweets of type 'retweet' are left out
    """
    # A None default avoids sharing one mutable list between calls
    if banned_accounts is None:
        banned_accounts = []
    query = {'type': {'$ne': 'retweet'}} if exclude_rts else {}
    if lang:
        query['lang'] = {'$eq': lang}
    projection = {
        '_id': 0,
        'id': 1,
        'complete_text': 1,
        'created_at_date': 1,
        'quote_count': 1,
        'reply_count': 1,
        'retweet_count': 1,
        'favorite_count': 1,
        'entities.hashtags': 1,
        'entities.user_mentions': 1,
        'sentiment.score': 1,
        'lang': 1,
        'user.screen_name': 1,
        'comunidad_autonoma': 1
    }
    stemmer = SnowballStemmer('spanish') if stemming else None
    PAGE_SIZE = 70000
    # One manager serves every page instead of building one per iteration
    dbm = DBManager(collection=collection, config_fn=config_fn)
    tweets_to_save = []
    page_num = 0
    while True:
        page_num += 1
        pagination = {'page_num': page_num, 'page_size': PAGE_SIZE}
        logging.info('Retrieving tweets...')
        tweets = list(dbm.find_all(query, projection, pagination=pagination))
        total_tweets = len(tweets)
        logging.info('Found {:,} tweets'.format(total_tweets))
        if total_tweets == 0:
            # An empty page means every tweet has been read
            break
        tweets_to_save.extend(
            process_tweets(tweets, stemming, stemmer, banned_accounts))
    # The file holds a single JSON array, so it is written from scratch;
    # appending to a previous export would leave two arrays back to back,
    # which is not valid JSON
    with open(output_fn, 'w', encoding='utf-8') as f:
        f.write('[')
        last_idx = len(tweets_to_save) - 1
        for idx, tweet in enumerate(tweets_to_save):
            # Every element but the last one is followed by a comma
            separator = ',\n' if idx < last_idx else '\n'
            f.write(json.dumps(tweet, ensure_ascii=False) + separator)
        f.write(']')
Example #3
0
def retweet_impact_analysis(collection, config_fn):
    """
    Compute a retweet-impact score for every retweeted user

    Only plain retweets are considered (no replies, no quotes). For each
    retweeted user the impact is
    log10(distinct retweeted tweets * ln(distinct retweeting users)).
    Rows whose score is not a finite number (e.g. users retweeted by a single
    account, for which ln(1) = 0) are dropped.

    :param collection: name of the collection to read tweets from
    :param config_fn: configuration file handed over to DBManager
    :return: DataFrame with the columns 'retweeted_user_screen_name',
        'retweeted_tweets', 'retweeting_users' and 'retweet_impact', sorted
        by descending impact and without duplicated rows
    """
    dbm = DBManager(collection=collection, config_fn=config_fn)
    filter_query = {
        'retweeted_status': {
            '$exists': 1
        },  # it must be a retweet
        'in_reply_to_status_id_str': {
            '$eq': None
        },  # it must not be a reply
        'is_quote_status': False  # it must not be a quote
    }
    fields_to_retrieve = {
        '_id': 0,
        'user.screen_name': 1,
        'retweeted_status.id': 1,
        'retweeted_status.user.screen_name': 1
    }
    # Collect the rows first and build the frame once; DataFrame.append in a
    # loop copies the whole frame per row and no longer exists in pandas 2
    rows = [{
        'user_screen_name':
        tweet['user']['screen_name'],
        'retweeted_status_id':
        tweet['retweeted_status']['id'],
        'retweeted_status_user_screen_name':
        tweet['retweeted_status']['user']['screen_name'],
    } for tweet in dbm.find_all(filter_query, fields_to_retrieve)]
    result_columns = [
        'retweeted_user_screen_name', 'retweeted_tweets', 'retweeting_users',
        'retweet_impact'
    ]
    if not rows:
        # Nothing to analyze: return an empty frame with the usual columns
        return pd.DataFrame(columns=result_columns)
    df = pd.DataFrame(rows)

    by_retweeted_user = df.groupby(['retweeted_status_user_screen_name'])
    # retweeted user -> number of distinct tweets of theirs that were retweeted
    d_retweeted_tweets = by_retweeted_user['retweeted_status_id'].nunique(
    ).to_dict()
    # retweeted user -> number of distinct accounts that retweeted them
    d_retweeting_users = by_retweeted_user['user_screen_name'].nunique(
    ).to_dict()

    retweeted_users = df['retweeted_status_user_screen_name']
    ri_df = pd.DataFrame({
        'retweeted_user_screen_name': retweeted_users,
        'retweeted_tweets': retweeted_users.map(d_retweeted_tweets),
        'retweeting_users': retweeted_users.map(d_retweeting_users),
    })
    ri_df['retweet_impact'] = ri_df['retweeted_tweets'] * np.log(
        ri_df['retweeting_users'])
    # There is one row per retweet, hence many identical rows per user
    ri_df = ri_df.sort_values(by=['retweet_impact'],
                              ascending=False).drop_duplicates()
    ri_df['retweet_impact'] = np.log10(ri_df['retweet_impact'])
    # log10(0) yields -inf; such rows are discarded together with NaN ones
    ri_df = ri_df.replace([np.inf, -np.inf], np.nan).dropna()

    return ri_df
Example #4
0
def export_sentiment_sample(sample_size,
                            collection,
                            config_fn=None,
                            output_filename=None,
                            lang=None):
    """
    Export a pseudo-random sample of tweets with their sentiment score to csv

    The file is written into the 'data' directory located two levels above
    this module. The random generator is seeded with a fixed value. At most
    MAX_TWEETS_BY_DATE tweets are taken for each value of 'created_at_date'.

    :param sample_size: maximum number of tweets to export (converted to int)
    :param collection: name of the collection to read tweets from
    :param config_fn: configuration file handed over to DBManager
    :param output_filename: name of the csv file; defaults to
        'sentiment_analysis_sample.csv'
    :param lang: when given, only tweets whose 'lang' equals it are sampled
    """
    project_dir = pathlib.Path(__file__).resolve().parents[1]
    dbm = DBManager(collection=collection, config_fn=config_fn)
    query = {}
    projection = {
        '_id': 0,
        'id': 1,
        'user.screen_name': 1,
        'complete_text': 1,
        'sentiment.score': 1,
        'created_at_date': 1,
        'lang': 1
    }
    seed(1)
    logging.info('Retrieving tweets...')
    tweets = dbm.find_all(query, projection)
    total_tweets = tweets.count()
    logging.info('Found {} tweets'.format(total_tweets))
    if not output_filename:
        output_filename = 'sentiment_analysis_sample.csv'
    output_file = os.path.join(project_dir, 'data', output_filename)
    logging.info('Processing and saving tweets into {}'.format(output_file))
    sample_size = int(sample_size)
    saved_tweets = 0
    tweets_by_date = defaultdict(int)
    MAX_TWEETS_BY_DATE = 6
    # newline='' is what the csv module requires to avoid blank rows on
    # Windows; utf-8 keeps non-ASCII tweet text writable on every platform
    with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(
            csv_file, fieldnames=['id', 'date', 'user', 'text', 'score'])
        csv_writer.writeheader()
        for tweet in tweets:
            # Checked up front so that a sample size of zero exports nothing
            if saved_tweets >= sample_size:
                break
            if lang and tweet['lang'] != lang:
                continue
            # Keep roughly half of the candidate tweets
            if random() <= 0.5:
                continue
            tweet_date = tweet['created_at_date']
            # Strict comparison: '<=' would let a seventh tweet through
            if tweets_by_date[tweet_date] >= MAX_TWEETS_BY_DATE:
                continue
            saved_tweets += 1
            tweets_by_date[tweet_date] += 1
            csv_writer.writerow({
                'id': tweet['id'],
                'date': tweet_date,
                'user': tweet['user']['screen_name'],
                'text': tweet['complete_text'],
                'score': tweet['sentiment']['score']
            })
Example #5
0
def export_tweets(collection,
                  output_path,
                  config_fn=None,
                  start_date=None,
                  end_date=None):
    """
    Export tweets, optionally filtered by creation date, to a csv file

    The file is created inside output_path and is named 'tweets.csv',
    'tweets_<date>.csv' or 'tweets_<start_date>_<end_date>.csv' depending on
    which dates were given.

    :param collection: name of the collection to read tweets from
    :param output_path: directory in which the csv file is created
    :param config_fn: configuration file handed over to DBManager
    :param start_date: with end_date, inclusive lower bound of
        'created_at_date'; alone, the exact date to match
    :param end_date: with start_date, inclusive upper bound of
        'created_at_date'; alone, the exact date to match
    """
    import os  # local import: the path is joined instead of concatenated

    dbm = DBManager(collection=collection, config_fn=config_fn)
    query = {}
    if start_date and end_date:
        query['$and'] = [{
            'created_at_date': {
                '$gte': start_date
            }
        }, {
            'created_at_date': {
                '$lte': end_date
            }
        }]
    elif start_date:
        # NOTE(review): a lone date is matched exactly, not as an open-ended
        # range -- confirm this is the intended behavior
        query['created_at_date'] = start_date
    elif end_date:
        query['created_at_date'] = end_date
    projection = {
        '_id': 0,
        'id': 1,
        'user.screen_name': 1,
        'complete_text': 1,
        'sentiment.score': 1,
        'created_at_date': 1,
        'lang': 1,
        'retweet_count': 1,
        'favorite_count': 1,
        'reply_count': 1,
        'type': 1,
        'quoted_status': 1
    }
    logging.info('Retrieving tweets...')
    tweets = dbm.find_all(query, projection)
    total_tweets = tweets.count()
    logging.info('Found {} tweets'.format(total_tweets))
    # The dates that were supplied become part of the file name
    name_parts = ['tweets']
    name_parts.extend(f'{date}' for date in (start_date, end_date) if date)
    # os.path.join yields the same path as plain concatenation when
    # output_path ends with a separator and a correct one when it does not
    output_fn = os.path.join(output_path, '_'.join(name_parts) + '.csv')
    output_header = [
        'id', 'type', 'date', 'user', 'text', 'retweets', 'favorites',
        'replies', 'original_tweet'
    ]
    save_tweets_in_csv_file(tweets, output_fn, output_header)
Example #6
0
def do_export_users(collection, config_file=None, output_filename=None):
    """
    Export users without prediction to a file with one JSON document per line

    Users are skipped when they carry a 'prediction' field, have no
    'img_path', have 'img_path' set to '[no_img]', have an image whose
    extension is not accepted or whose image makes
    check_user_profile_image raise. The file is written into the 'data'
    directory located two levels above this module.

    :param collection: name of the collection to read users from
    :param config_file: configuration file handed over to DBManager
    :param output_filename: name of the output file; defaults to 'users.jsonl'
    """
    # Resolve before asking for parents: a relative __file__ may not have
    # enough path components for parents[1] to exist
    project_dir = pathlib.Path(__file__).resolve().parents[1]
    if not output_filename:
        output_filename = 'users.jsonl'
    output = os.path.join(project_dir, 'data', output_filename)
    dbm = DBManager(collection=collection, config_fn=config_file)
    query = {'$and': [{'prediction': {'$eq': None}}, {'exists': 1}]}
    projection = {
        '_id': 0,
        'id': 1,
        'name': 1,
        'screen_name': 1,
        'description': 1,
        'lang': 1,
        'img_path': 1
    }
    logging.info('Retrieving users...')
    users = list(dbm.find_all(query, projection))
    total_users = len(users)
    logging.info('Found {} users'.format(total_users))
    accepted_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.JPG', '.JPEG',
                           '.PNG', '.BMP')
    with open(output, 'w') as f:
        for user in users:
            screen_name = user['screen_name']
            if 'prediction' in user:
                logging.info('Found field prediction, ignoring user {}'.format(
                    screen_name))
                continue
            if 'img_path' not in user:
                logging.info('User {} does not have img_path field'.format(
                    screen_name))
                continue
            user_img = user['img_path']
            if user_img == '[no_img]':
                logging.info('User {} has img_path=[no_img]'.format(
                    screen_name))
                continue
            if not user_img.endswith(accepted_extensions):
                logging.info('User {} has image with extension {}'.format(
                    screen_name, user_img))
                continue
            # Built outside the try block so that the error handler below can
            # always refer to it
            img_path = os.path.join(project_dir, user_img)
            try:
                check_user_profile_image(img_path)
                logging.info('Exporting user: {}'.format(screen_name))
                f.write("{}\n".format(json.dumps(user)))
            except Exception as e:
                # Best effort: a failing user must not abort the whole export
                logging.warning(
                    'Error when resizing {0}\nThe error message is: {1}\n'.
                    format(img_path, e))
    logging.info('Process finished, output was saved into {}'.format(output))
Example #7
0
def export_user_sample_to_csv(query,
                              sample_size,
                              collection,
                              randomize=True,
                              output_filename=None,
                              config_fn=None):
    """
    Export a sample of users to a csv file

    The file is written to '../data/<output_filename>', relative to the
    current working directory.

    :param query: filter used to select the candidate users
    :param sample_size: maximum number of users to export (converted to int)
    :param collection: name of the collection to read users from
    :param randomize: when truthy, each candidate is kept with a probability
        of roughly one half, using a generator seeded with a fixed value;
        otherwise the first sample_size users are exported
    :param output_filename: name of the csv file; defaults to
        'user_sample.csv'
    :param config_fn: configuration file handed over to DBManager
    """
    if not output_filename:
        output_filename = 'user_sample.csv'
    output = os.path.join('..', 'data', output_filename)
    dbm = DBManager(collection=collection, config_fn=config_fn)
    projection = {
        '_id': 0,
        'id_str': 1,
        'screen_name': 1,
        'description': 1,
        'location': 1,
        'comunidad_autonoma': 1
    }
    print('Getting sample of users, please wait...')
    users = list(dbm.find_all(query, projection))
    total_users = len(users)
    print('Found {} users'.format(total_users))
    seed(1)
    # Accept sizes given as strings, e.g. straight from the command line
    sample_size = int(sample_size)
    saved_users = 0
    headers = [
        'id_str', 'screen_name', 'description', 'location',
        'comunidad_autonoma'
    ]
    # newline='' is what the csv module requires to avoid blank rows on
    # Windows; utf-8 keeps non-ASCII descriptions writable on every platform
    with open(output, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.DictWriter(f, fieldnames=headers)
        csv_writer.writeheader()
        for user in users:
            # Checked up front so that a sample size of zero exports nothing
            if saved_users >= sample_size:
                break
            # random() is consulted only when a randomized sample is wanted
            if randomize and random() <= 0.5:
                continue
            saved_users += 1
            print('Saving user: {}'.format(user['screen_name']))
            csv_writer.writerow(user)
Example #8
0
def do_collection_merging(master_collection,
                          collections_to_merge,
                          config_fn=None):
    """
    Insert the tweets of several collections into a master collection

    Collections are processed one after the other; an error while merging
    one of them is logged and the remaining collections are still processed.

    :param master_collection: name of the collection that receives the tweets
    :param collections_to_merge: iterable with the names of the collections
        whose tweets are inserted into the master collection
    :param config_fn: configuration file handed over to DBManager
    """
    dbm_master = DBManager(collection=master_collection, config_fn=config_fn)
    for collection in collections_to_merge:
        logging.info('Merging collection {0} into {1}'.format(
            collection, master_collection))
        dbm_collection_to_merge = DBManager(collection=collection,
                                            config_fn=config_fn)
        tweets = dbm_collection_to_merge.find_all()
        logging.info('Trying to insert {0:,} tweets'.format(tweets.count()))
        try:
            ret_insertions = dbm_master.insert_many_tweets(tweets,
                                                           ordered=False)
            # inserted_ids is presumably the list of ids exposed by pymongo's
            # InsertManyResult (confirm against DBManager.insert_many_tweets).
            # Its length is the number to report; formatting the list itself
            # with '{0:,}' raises TypeError and was logged as a merge error
            insertion_counter = len(ret_insertions.inserted_ids)
            logging.info('{0:,} new tweets were inserted into the collection {1}'.\
                         format(insertion_counter, master_collection))
        except Exception as e:
            logging.error('Error when merging {}'.format(e))