def prepare_hist_and_plot(self, n_tweets, users, n_bins, campaign_id):
        """Summarise a campaign's users as plain counts and histograms.

        Args:
            n_tweets: Number of tweets in the campaign; stored as-is and used
                for the size cut-off below.
            users: Sized collection of user records. Each record is read as
                ``rec['user']['created_at']``,
                ``rec['user']['statuses_count']``,
                ``rec['user']['default_profile_image']`` and
                ``rec['n_user_tweets']``.
            n_bins: Passed to ``numpy.histogram`` as ``bins``.
            campaign_id: Copied unchanged into the returned document.

        Returns:
            ``None`` when ``n_tweets`` is above 200000. Otherwise a dict with
            ``campaign_id``, ``histogram`` (the statistics, with histogram
            arrays converted to plain Python lists) and ``created_at`` (the
            value of ``now_in_drnj_time()``).
        """
        import numpy

        # Plotting is switched off; set to True to also write 1.pdf, 2.pdf
        # and 3.pdf into the current working directory.
        plot_graphs = False

        def percentage(count, total):
            # With no users there is nothing to divide by; report 0% instead
            # of raising ZeroDivisionError from a debug log line.
            if not total:
                return 0.0
            return 100 * (float(count) / total)

        def save_bar_chart(entry, title, filename, tick_label=None):
            # Draw one histogram entry as a bar chart and save it.
            # matplotlib is imported here so it is only required when
            # plotting is actually enabled.
            # NOTE(review): the figure is never cleared between charts, so
            # earlier bars may carry over into later files - confirm before
            # enabling plot_graphs.
            import matplotlib.pyplot as plot

            bins = entry['bins'][:-1]
            width = (entry['bins'][1] - entry['bins'][0])/2
            plot.bar(bins, entry['data'], width=width, align='center')

            if tick_label is None:
                xticklabels = bins
            else:
                xticklabels = [tick_label(x) for x in bins]

            plot.xticks(bins, xticklabels)
            plot.title(title)
            plot.savefig(filename, dpi=600)

        hist = {
            'user_creation': {
                'data': None,
                'bins': None,
            },
            'user_n_tweets': {
                'data': None,
                'bins': None,
            },
            'user_n_tweets_overall': {
                'data': None,
                'bins': None,
            },
            'n_tweets': None,
            'n_unique_users': None,
            'n_default_profile_image': None,
            'n_lower_than_threshold': None,
        }

        self.logger.debug("How many tweets? %d" % n_tweets)
        hist['n_tweets'] = n_tweets

        # Campaigns above this size are not analysed at all.
        if n_tweets > 200000:
            return None

        #
        # How many unique users?
        #
        n_unique_users = len(users)
        self.logger.debug("How many unique users? %d" % n_unique_users)
        hist['n_unique_users'] = n_unique_users

        ######
        sec_title = "Histogram of user creation dates?"
        tmp_dates = []
        for x in users:
            tmp_date = x['user']['created_at']
            # Floats are used as they are; every other representation goes
            # through py_utc_time2drnj_time first.
            if not isinstance(tmp_date, float):
                tmp_date = py_utc_time2drnj_time(tmp_date)
            tmp_dates.append(tmp_date)

        (hist['user_creation']['data'],
         hist['user_creation']['bins']) = numpy.histogram(tmp_dates, bins=n_bins)

        if plot_graphs:
            save_bar_chart(
                hist['user_creation'], sec_title, '1.pdf',
                lambda x: time.strftime('%d %b %Y',
                                        time.gmtime(drnj_time2py_time(x))))

        #####
        sec_title = "Histogram of number of tweets of each user in this campaign"
        tmp_counts = [int(x['n_user_tweets']) for x in users]
        (hist['user_n_tweets']['data'],
         hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts, bins=n_bins)

        if plot_graphs:
            save_bar_chart(hist['user_n_tweets'], sec_title, '2.pdf')

        #####
        sec_title = "What percentage of them used the default profile image?"
        n_default_profile_image = sum(
            1 for u in users if u['user']['default_profile_image'])
        hist['n_default_profile_image'] = n_default_profile_image
        self.logger.debug("%s: %0.2f%%" % (
            sec_title, percentage(n_default_profile_image, n_unique_users)))

        #####
        sec_title = "Histogram of tweet counts of unique users"
        tmp_counts = [int(x['user']['statuses_count']) for x in users]
        (hist['user_n_tweets_overall']['data'],
         hist['user_n_tweets_overall']['bins']) = numpy.histogram(tmp_counts, bins=n_bins)

        if plot_graphs:
            save_bar_chart(hist['user_n_tweets_overall'], sec_title, '3.pdf')

        #
        sec_title = "What percentage of them have lower than 5 tweets?"
        n_lower_than_threshold = sum(
            1 for u in users if u['user']['statuses_count'] < 5)
        hist['n_lower_than_threshold'] = n_lower_than_threshold
        self.logger.debug("%s: %0.2f%%" % (
            sec_title, percentage(n_lower_than_threshold, n_unique_users)))

        self.logger.debug(hist)

        # Convert numpy arrays to plain Python lists. tolist() also turns the
        # elements into native int/float, which list() would leave as numpy
        # scalar types.
        for section in hist.values():
            if isinstance(section, dict):
                for field, value in section.items():
                    if isinstance(value, numpy.ndarray):
                        section[field] = value.tolist()

        hist = {'campaign_id': campaign_id,
                'histogram': hist,
                'created_at': now_in_drnj_time()}
        return hist
    def insert_tweet(self, tweet_obj_array):
        """Store a batch of tweets and bump their frequency counters.

        This is a generator. It first yields the result of
        ``self.motor_column.tweets.insert(tweet_obj_array)`` and then, for
        every tweet, the result of one ``self.colls[<kind>].update(...)``
        upsert per counted key, where ``<kind>`` is ``'campaigns'``,
        ``'hashtags'``, ``'mentions'`` or ``'urls'``.

        Args:
            tweet_obj_array: Sequence of wrappers, each read as
                ``{'campaign_id': ..., 'tweet': {...}}``. Inside the tweet
                only ``entities`` and ``created_at`` are inspected.
        """

        def count_keys(entities, field, required, make_key):
            # Tally how often each key occurs among entities[field].
            # Entries lacking any `required` attribute are skipped; they are
            # not logged yet.
            counts = {}
            for entry in entities.get(field) or ():
                if all(name in entry for name in required):
                    item_key = make_key(entry)
                    counts[item_key] = counts.get(item_key, 0) + 1
            return counts

        # actual tweet insertion
        yield self.motor_column.tweets.insert(tweet_obj_array)

        for wrapped in tweet_obj_array:
            campaign_id = wrapped['campaign_id']

            # only the tweet payload is needed from here on
            tweet = wrapped['tweet']
            entities = tweet.get('entities') or {}

            # build the analytics
            freq = {
                'campaigns': {campaign_id: 1},
                'hashtags': count_keys(
                    entities, 'hashtags', ('text',),
                    lambda h: h['text']),
                # Both fields go into the key, so both must be present;
                # checking only 'id_str' raised KeyError on 'screen_name'.
                'mentions': count_keys(
                    entities, 'user_mentions', ('id_str', 'screen_name'),
                    lambda m: "|".join([m['id_str'], m['screen_name']])),
                'urls': count_keys(
                    entities, 'urls', ('expanded_url',),
                    lambda u: u['expanded_url']),
            }

            if 'created_at' in tweet:
                # turns out that we've already transformed into drnj_time
                t = drnj_time2py_time(tweet['created_at'])
            else:
                t = time.time()

            gm_t = time.gmtime(t)

            today_str = time.strftime('%Y-%m-%d', gm_t)
            hour = time.strftime('%H', gm_t)
            # minute of the day (0000-1439), zero padded to four digits
            minute = "%04d" % (gm_t.tm_hour * 60 + gm_t.tm_min)

            for key, counts in freq.items():
                for item, count in counts.items():
                    yield self.colls[key].update(
                        {'campaign_id': campaign_id, 'date': today_str, 'key': item},
                        {'$inc': {('hour.%s' % hour): count,
                                  ('minute.%s' % minute): count,
                                  'day_total': count},
                         '$set': {'last_updated_minute': minute}},
                        upsert=True)
Example #3
0
# Section: distribution of the users' account creation times.
# NOTE(review): `hist`, `users`, `numpy`, `plot`, `time`, `plotGraphs` and the
# drnj time helpers are all defined outside this excerpt.
sec_title = "Histogram of user creation dates?"
#

# Every 'created_at' value is passed through py_utc_time2drnj_time before binning.
tmp_dates = [py_utc_time2drnj_time(x['user']['created_at']) for x in users]

# numpy.histogram returns (counts, bin_edges); both are stored, using 100 bins.
(hist['user_creation']['data'],
 hist['user_creation']['bins']) = numpy.histogram(tmp_dates, bins=100)

if plotGraphs:
    # One x position per bar: every bin edge except the last one.
    bins = hist['user_creation']['bins'][:-1]
    # Bars are drawn half a bin wide.
    width = (hist['user_creation']['bins'][1] -
             hist['user_creation']['bins'][0]) / 2
    plot.bar(bins, hist['user_creation']['data'], width=width, align='center')

    # Tick labels: each edge goes through drnj_time2py_time and is then
    # formatted as a UTC date such as "01 Jan 2013".
    xticklabels = [
        time.strftime('%d %b %Y', time.gmtime(drnj_time2py_time(x)))
        for x in bins
    ]

    plot.xticks(bins, xticklabels)
    plot.title(sec_title)
    #plot.show()
    # Written relative to the current working directory.
    plot.savefig('1.pdf', dpi=600)

#####
# Section: distribution of each user's 'n_user_tweets' value.
sec_title = "Histogram of number of tweets of each user in this campaign"
# The per-user value is coerced to int before binning.
tmp_counts = [int(x['n_user_tweets']) for x in users]
#
(hist['user_n_tweets']['data'],
 hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts, bins=100)
Example #4
0
    def prepare_hist_and_plot(self, n_tweets, users, n_bins, campaign_id):
        """Summarise a campaign's users as plain counts and histograms.

        Args:
            n_tweets: Number of tweets in the campaign; stored as-is and used
                for the size cut-off below.
            users: Sized collection of user records. Each record is read as
                ``rec['user']['created_at']``,
                ``rec['user']['statuses_count']``,
                ``rec['user']['default_profile_image']`` and
                ``rec['n_user_tweets']``.
            n_bins: Passed to ``numpy.histogram`` as ``bins``.
            campaign_id: Copied unchanged into the returned document.

        Returns:
            ``None`` when ``n_tweets`` is above 200000. Otherwise a dict with
            ``campaign_id``, ``histogram`` (the statistics, with histogram
            arrays converted to plain Python lists) and ``created_at`` (the
            value of ``now_in_drnj_time()``).
        """
        import numpy

        # Plotting is switched off; set to True to also write 1.pdf, 2.pdf
        # and 3.pdf into the current working directory.
        plot_graphs = False

        def percentage(count, total):
            # With no users there is nothing to divide by; report 0% instead
            # of raising ZeroDivisionError from a debug log line.
            if not total:
                return 0.0
            return 100 * (float(count) / total)

        def save_bar_chart(entry, title, filename, tick_label=None):
            # Draw one histogram entry as a bar chart and save it.
            # matplotlib is imported here so it is only required when
            # plotting is actually enabled.
            # NOTE(review): the figure is never cleared between charts, so
            # earlier bars may carry over into later files - confirm before
            # enabling plot_graphs.
            import matplotlib.pyplot as plot

            bins = entry['bins'][:-1]
            width = (entry['bins'][1] - entry['bins'][0]) / 2
            plot.bar(bins, entry['data'], width=width, align='center')

            if tick_label is None:
                xticklabels = bins
            else:
                xticklabels = [tick_label(x) for x in bins]

            plot.xticks(bins, xticklabels)
            plot.title(title)
            plot.savefig(filename, dpi=600)

        hist = {
            'user_creation': {
                'data': None,
                'bins': None,
            },
            'user_n_tweets': {
                'data': None,
                'bins': None,
            },
            'user_n_tweets_overall': {
                'data': None,
                'bins': None,
            },
            'n_tweets': None,
            'n_unique_users': None,
            'n_default_profile_image': None,
            'n_lower_than_threshold': None,
        }

        self.logger.debug("How many tweets? %d" % n_tweets)
        hist['n_tweets'] = n_tweets

        # Campaigns above this size are not analysed at all.
        if n_tweets > 200000:
            return None

        #
        # How many unique users?
        #
        n_unique_users = len(users)
        self.logger.debug("How many unique users? %d" % n_unique_users)
        hist['n_unique_users'] = n_unique_users

        ######
        sec_title = "Histogram of user creation dates?"
        tmp_dates = []
        for x in users:
            tmp_date = x['user']['created_at']
            # Floats are used as they are; every other representation goes
            # through py_utc_time2drnj_time first.
            if not isinstance(tmp_date, float):
                tmp_date = py_utc_time2drnj_time(tmp_date)
            tmp_dates.append(tmp_date)

        (hist['user_creation']['data'],
         hist['user_creation']['bins']) = numpy.histogram(tmp_dates,
                                                          bins=n_bins)

        if plot_graphs:
            save_bar_chart(
                hist['user_creation'], sec_title, '1.pdf',
                lambda x: time.strftime('%d %b %Y',
                                        time.gmtime(drnj_time2py_time(x))))

        #####
        sec_title = "Histogram of number of tweets of each user in this campaign"
        tmp_counts = [int(x['n_user_tweets']) for x in users]
        (hist['user_n_tweets']['data'],
         hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts,
                                                          bins=n_bins)

        if plot_graphs:
            save_bar_chart(hist['user_n_tweets'], sec_title, '2.pdf')

        #####
        sec_title = "What percentage of them used the default profile image?"
        n_default_profile_image = sum(
            1 for u in users if u['user']['default_profile_image'])
        hist['n_default_profile_image'] = n_default_profile_image
        self.logger.debug("%s: %0.2f%%" %
                          (sec_title,
                           percentage(n_default_profile_image, n_unique_users)))

        #####
        sec_title = "Histogram of tweet counts of unique users"
        tmp_counts = [int(x['user']['statuses_count']) for x in users]
        (hist['user_n_tweets_overall']['data'],
         hist['user_n_tweets_overall']['bins']) = numpy.histogram(tmp_counts,
                                                                  bins=n_bins)

        if plot_graphs:
            save_bar_chart(hist['user_n_tweets_overall'], sec_title, '3.pdf')

        #
        sec_title = "What percentage of them have lower than 5 tweets?"
        n_lower_than_threshold = sum(
            1 for u in users if u['user']['statuses_count'] < 5)
        hist['n_lower_than_threshold'] = n_lower_than_threshold
        self.logger.debug("%s: %0.2f%%" %
                          (sec_title,
                           percentage(n_lower_than_threshold, n_unique_users)))

        self.logger.debug(hist)

        # Convert numpy arrays to plain Python lists. tolist() also turns the
        # elements into native int/float, which list() would leave as numpy
        # scalar types.
        for section in hist.values():
            if isinstance(section, dict):
                for field, value in section.items():
                    if isinstance(value, numpy.ndarray):
                        section[field] = value.tolist()

        hist = {
            'campaign_id': campaign_id,
            'histogram': hist,
            'created_at': now_in_drnj_time()
        }
        return hist
Example #5
0
    def insert_tweet(self, tweet_obj_array):
        """Store a batch of tweets and bump their frequency counters.

        This is a generator. It first yields the result of
        ``self.motor_column.tweets.insert(tweet_obj_array)`` and then, for
        every tweet, the result of one ``self.colls[<kind>].update(...)``
        upsert per counted key, where ``<kind>`` is ``'campaigns'``,
        ``'hashtags'``, ``'mentions'`` or ``'urls'``.

        Args:
            tweet_obj_array: Sequence of wrappers, each read as
                ``{'campaign_id': ..., 'tweet': {...}}``. Inside the tweet
                only ``entities`` and ``created_at`` are inspected.
        """

        def count_keys(entities, field, required, make_key):
            # Tally how often each key occurs among entities[field].
            # Entries lacking any `required` attribute are skipped; they are
            # not logged yet.
            counts = {}
            for entry in entities.get(field) or ():
                if all(name in entry for name in required):
                    item_key = make_key(entry)
                    counts[item_key] = counts.get(item_key, 0) + 1
            return counts

        # actual tweet insertion
        yield self.motor_column.tweets.insert(tweet_obj_array)

        for wrapped in tweet_obj_array:
            campaign_id = wrapped['campaign_id']

            # only the tweet payload is needed from here on
            tweet = wrapped['tweet']
            entities = tweet.get('entities') or {}

            # build the analytics
            freq = {
                'campaigns': {campaign_id: 1},
                'hashtags': count_keys(
                    entities, 'hashtags', ('text',),
                    lambda h: h['text']),
                # Both fields go into the key, so both must be present;
                # checking only 'id_str' raised KeyError on 'screen_name'.
                'mentions': count_keys(
                    entities, 'user_mentions', ('id_str', 'screen_name'),
                    lambda m: "|".join([m['id_str'], m['screen_name']])),
                'urls': count_keys(
                    entities, 'urls', ('expanded_url',),
                    lambda u: u['expanded_url']),
            }

            if 'created_at' in tweet:
                # turns out that we've already transformed into drnj_time
                t = drnj_time2py_time(tweet['created_at'])
            else:
                t = time.time()

            gm_t = time.gmtime(t)

            today_str = time.strftime('%Y-%m-%d', gm_t)
            hour = time.strftime('%H', gm_t)
            # minute of the day (0000-1439), zero padded to four digits
            minute = "%04d" % (gm_t.tm_hour * 60 + gm_t.tm_min)

            for key, counts in freq.items():
                for item, count in counts.items():
                    yield self.colls[key].update(
                        {
                            'campaign_id': campaign_id,
                            'date': today_str,
                            'key': item
                        }, {
                            '$inc': {
                                ('hour.%s' % hour): count,
                                ('minute.%s' % minute): count,
                                'day_total': count
                            },
                            '$set': {
                                'last_updated_minute': minute
                            }
                        },
                        upsert=True)
# Record the unique-user count; `n_unique_users` is computed outside this excerpt.
# NOTE(review): `hist`, `users`, `numpy`, `plot`, `time`, `plotGraphs` and the
# drnj time helpers are likewise defined outside this excerpt.
hist['n_unique_users'] = n_unique_users

######
# Section: distribution of the users' account creation times.
sec_title = "Histogram of user creation dates?"
#

# Every 'created_at' value is passed through py_utc_time2drnj_time before binning.
tmp_dates = [py_utc_time2drnj_time(x['user']['created_at']) for x in users]

# numpy.histogram returns (counts, bin_edges); both are stored, using 100 bins.
(hist['user_creation']['data'], hist['user_creation']['bins']) = numpy.histogram(tmp_dates, bins=100)

if plotGraphs:
    # One x position per bar: every bin edge except the last one.
    bins = hist['user_creation']['bins'][:-1]
    # Bars are drawn half a bin wide.
    width = (hist['user_creation']['bins'][1] - hist['user_creation']['bins'][0])/2
    plot.bar(bins, hist['user_creation']['data'], width=width, align='center')

    # Tick labels: each edge goes through drnj_time2py_time and is then
    # formatted as a UTC date such as "01 Jan 2013".
    xticklabels = [time.strftime('%d %b %Y', time.gmtime(drnj_time2py_time(x))) for x in bins]

    plot.xticks(bins, xticklabels)
    plot.title(sec_title)
    #plot.show()
    # Written relative to the current working directory.
    plot.savefig('1.pdf', dpi=600)

#####
# Section: distribution of each user's 'n_user_tweets' value.
sec_title = "Histogram of number of tweets of each user in this campaign"
# The per-user value is coerced to int before binning.
tmp_counts = [int(x['n_user_tweets']) for x in users]
#
(hist['user_n_tweets']['data'], hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts, bins=100)

if plotGraphs:
    bins = hist['user_n_tweets']['bins'][:-1]
    data = hist['user_n_tweets']['data']