class GeoTweetDaemon(StreamDaemon):
    
    def __init__(self):
        super(GeoTweetDaemon, self).__init__()
        self._credentials = settings.Twitter['accounts'][0]
        # a whole-globe bounding box: the stream delivers every geotagged tweet
        self._payload = {'locations': ["-180,-90", "180,90"]}
        self._countstore = CountStore()
    
    def _get_streaming_args(self):
        return self._credentials, self._payload
    
    def _payload_is_empty(self):
        # the locations payload is fixed at construction and never empty
        return False
    
    def _on_tweet_callback(self, tweet):
        # count every entity in the tweet, segmented by country; the "C"
        # prefix matches the segmentation queried by cache_top_tweets() and
        # KeywordsTweetDaemon below
        country_code = tweet.country_code
        if country_code:
            segmentation = "C" + country_code
            for entity_type, entities in tweet.entities.items():
                for entity_id in entities:
                    self._countstore.put(entity_id, entity_type, segmentation)
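
GeoTweetDaemon only needs a tweet object exposing country_code and an entities dict. A minimal, self-contained sketch of what the callback tallies, using a hypothetical in-memory stand-in for CountStore (the real store is not shown in these examples):

from collections import Counter, namedtuple

# Hypothetical stand-ins for illustration only
FakeTweet = namedtuple("FakeTweet", ["country_code", "entities"])

class InMemoryCountStore(object):
    """Tallies (entity_id, entity_type, segmentation) triples in memory."""
    def __init__(self):
        self.counts = Counter()

    def put(self, entity_id, entity_type, segmentation):
        self.counts[(entity_id, entity_type, segmentation)] += 1

store = InMemoryCountStore()
tweet = FakeTweet(country_code="GB",
                  entities={"hashtag": ["python"], "user_mention": ["guido"]})
if tweet.country_code:
    segmentation = "C" + tweet.country_code
    for entity_type, entities in tweet.entities.items():
        for entity_id in entities:
            store.put(entity_id, entity_type, segmentation)

assert store.counts[("python", "hashtag", "CGB")] == 1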
Example #3
    def test_put_with_total_count(self):
        countstore = CountStore()
        countstore.put("entity1", "test_entity", "test_seg", total_count = 10)
        docs = self.get_all_docs()

        self.assertEqual(9, len(docs))

        expected = self.build_timeslice_docs({
            "entity_id"   : "entity1",
            "entity_type" : "test_entity",
            "segmentation": "test_seg",
            "base_count"  : 10,
            "count"       : 1,
        })
        self.assertEqual(expected, docs)

        fixture.setup_mock_time(fixture.jan_1st_2013_midday + 60*15)

        countstore.put("entity1", "test_entity", "test_seg", total_count = 15)
        docs = self.get_all_docs()

        self.assertEqual(10, len(docs))

        expected = self.build_timeslice_docs({
            "entity_id"   : "entity1",
            "entity_type" : "test_entity",
            "segmentation": "test_seg",
            "count"       : 6,
	        "base_count"  : 10,
        })
        expected[0]["count"] = 1
        expected[0]["base_count"] = 10
        last_doc = copy(expected[-1])
        last_doc["count"] = 1
        last_doc["base_count"] = 15
        last_doc["timeslice"] = fixture.jan_1st_2013_midday + 60*15
        expected.append(last_doc)

        self.assertEqual(expected, docs)
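
The expected values above pin down the total_count bookkeeping: each timeslice doc remembers the running total at the moment the slice was opened (base_count) and derives its count as total_count - base_count + 1. A sketch of that arithmetic, inferred from this test rather than from the CountStore source:

def timeslice_count(total_count, base_count):
    # (increase since this slice opened) + the put that opened it
    return total_count - base_count + 1

assert timeslice_count(10, 10) == 1   # first put: total_count=10
assert timeslice_count(15, 10) == 6   # slices opened at total 10, now at 15
assert timeslice_count(15, 15) == 1   # fresh 15-minute slice opened at 15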
Example #4
def cache_top_tweets():
    # initialize the stores and take one timestamp for the whole run, so
    # every get_top() lookup sees the same set of timeslices
    ts = int(time.time())
    countstore = CountStore()
    tweetstore = TweetStore()
    cache = RedisCache(namespace=settings.TopTweetsCache["namespace"])
    countries = Geo.country_codes()
    top_tweets_cache = {}
    for country in countries:
        print "*************"
        print country
        print "*************"
        
        top_tweets = {}
        segmentation = "C" + country
        for entitytype in ["hashtag", "user_mention"]:
            top_tweets[entitytype] = []
            top_entities = countstore.get_top(entitytype, 
                                              segmentation,
                                              settings.Aggregation['top_entities'],
                                              ts)
            for entity, count in top_entities:
                data = {"text":entity, "count":count, "tweets":[]}
                tweets = top_tweets_cache.get((entitytype, entity, ts))
                if not tweets:
                    print "fetching tweets for " + str((entitytype, entity, ts))
                    segmentation = ":".join([entitytype, entity])
                    tweets = countstore.get_top("tweet", 
                                                segmentation,
                                                settings.Aggregation['top_tweets'], 
                                                ts)
                    tweets = [(tweetstore.get(tid), cnt) for tid, cnt in tweets]
                    top_tweets_cache[(entitytype, entity, ts)] = tweets
                for tweet, count in tweets:
                    data["tweets"].append({"tweet":tweet.data, "count": count})
                top_tweets[entitytype].append(data)
        cache.put(segmentation, top_tweets)
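
For reference, the value cached under each country segmentation has this shape (the numbers are illustrative, and the "tweet" entries hold tweet.data payloads):

top_tweets = {
    "hashtag": [
        {"text": "python", "count": 42,
         "tweets": [{"tweet": {"id": 1, "text": "..."}, "count": 7}]},
    ],
    "user_mention": [
        {"text": "guido", "count": 17, "tweets": []},
    ],
}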
class KeywordsTweetDaemon(object):
    
    def __init__(self, daemon_number):
        super(KeywordsTweetDaemon, self).__init__()
        # account 0 belongs to the geo streamer, so keyword daemons are
        # numbered from 1
        if daemon_number < 1:
            raise ValueError("daemon_number must be 1 or higher")
        self._daemon_number = daemon_number
        self._countstore = CountStore()
        self._tweetstore = TweetStore()
    
    def _get_streaming_args(self):
        self._credentials = settings.Twitter['accounts'][self._daemon_number]
        segs = ["C" + code for code in Geo.country_codes()]
        # Twitter's streaming API only allows 400 track keywords, so take at
        # most 200 hashtags and 200 user mentions, skipping any that a peer
        # daemon has already claimed
        lock = FileLock('/tmp/trackwords')
        with lock:
            all_hashtags = self._get_all_entities("hashtag", segs)
            all_usermentions = self._get_all_entities("user_mention", segs)
            used_hashtags, used_usermentions = map(set, self._get_used_trackwords())
            hashtags = [ht for ht in all_hashtags if ht not in used_hashtags][:200]
            usermentions = [um for um in all_usermentions if um not in used_usermentions][:200]
            self._set_used_trackwords(hashtags, usermentions)
            self._payload = {'track': hashtags + usermentions}
        return self._credentials, self._payload
    
    def _on_tweet_callback(self, tweet):
        # store the full tweet, then count it under each "<type>:<entity>"
        # segmentation so get_top("tweet", ...) can rank tweets per entity
        self._tweetstore.put(tweet)
        tweet_id = tweet.original_id
        for entity_type, entities in tweet.entities.items():
            for entity in entities:
                segmentation = ":".join([entity_type, entity])
                self._countstore.put(tweet_id, "tweet", segmentation)
    
    def _get_all_entities(self, entity_type, segmentations):
        # union the top entities across every country segmentation; get_top
        # returns (entity, count) pairs, as the CountStore tests below show
        fetch = settings.Aggregation["top_entities"]
        top_entities = set()
        for seg in segmentations:
            top = self._countstore.get_top(entity_type=entity_type,
                                           segmentation=seg,
                                           num_to_get=fetch)
            top_entities.update(entity for entity, _ in top)
        return list(top_entities)
         
    def _payload_is_empty(self):
        return not self._payload or not self._payload['track']

    def _get_used_trackwords(self):
        # gather the trackwords every other keyword daemon has already claimed
        used_hashtags, used_usermentions = [], []
        num_accounts = len(settings.Twitter["accounts"])
        cache = RedisCache(namespace=settings.TrackwordCache["namespace"])
        for idx in range(1, num_accounts):
            if idx == self._daemon_number:
                continue
            hashtag_key = "streamer%s:hashtags" % idx
            usermention_key = "streamer%s:usermentions" % idx
            hashtags = cache.get(hashtag_key) or []
            used_hashtags += hashtags
            usermentions = cache.get(usermention_key) or []
            used_usermentions += usermentions
        return used_hashtags, used_usermentions
    
    def _set_used_trackwords(self, hashtags, usermentions):
        hashtag_key = "streamer%s:hashtags" % self._daemon_number
        usermention_key = "streamer%s:usermentions" % self._daemon_number
        cache = RedisCache(namespace=settings.TrackwordCache["namespace"])
        cache.put(hashtag_key, hashtags)
        cache.put(usermention_key, usermentions)
        
    def _reset_used_trackwords(self):
        cache = RedisCache(namespace=settings.TrackwordCache["namespace"])
        num_accounts = len(settings.Twitter["accounts"])
        for idx in range(1, num_accounts):
            hashtag_key = "streamer%s:hashtags" % idx
            usermention_key = "streamer%s:usermentions" % idx
            cache.delete(hashtag_key)
            cache.delete(usermention_key)

    def _filter_track(self, words):
        # Twitter caps each track keyword at 60 bytes; drop anything longer
        # once UTF-8 encoded
        encoded = [utf8(w[0]) for w in words]
        return [w for w in encoded if len(w) <= 60]
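
The filter above depends on the project's utf8() helper; a self-contained sketch, assuming utf8() simply encodes text to UTF-8 bytes:

def utf8(s):
    # assumed behaviour of the project's utf8() helper
    return s.encode("utf-8")

def filter_track(words):
    # words are (text, count) pairs; keep only text that fits Twitter's
    # 60-byte track keyword cap
    encoded = [utf8(w[0]) for w in words]
    return [w for w in encoded if len(w) <= 60]

words = [(u"#short", 5), (u"#" + u"\u00e9" * 40, 3)]   # second is 81 bytes
assert filter_track(words) == [b"#short"]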
Example #7
    def test_get_top(self):
        countstore = CountStore()
        countstore.put("entity1", "test_entity", "test_seg")
        countstore.put("entity2", "test_entity", "test_seg")
        countstore.put("entity2", "test_entity", "test_seg")
        actual = countstore.get_top("test_entity", "test_seg", 10)
        expected = [("entity2", 2), ("entity1", 1)]

        self.assertEqual(expected, actual)

        fixture.setup_mock_time(fixture.jan_1st_2013_midday + 3600)

        countstore.put("entity1", "test_entity", "test_seg")
        countstore.put("entity1", "test_entity", "test_seg")
        countstore.put("entity3", "test_entity", "test_seg")
        actual = countstore.get_top("test_entity", "test_seg", 10)
        expected = [("entity1", 3), ("entity2", 2), ("entity3", 1)]

        self.assertEqual(expected, actual)

        fixture.setup_mock_time(fixture.jan_1st_2013_midday + 2*3600)

        countstore.put("entity3", "test_entity", "test_seg")
        countstore.put("entity3", "test_entity", "test_seg")
        actual = countstore.get_top("test_entity", "test_seg", 10)
        expected = [("entity3", 3), ("entity1", 2)]

        self.assertEqual(expected, actual)
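
The disappearance of entity2 in the final assertion implies get_top aggregates a sliding window of recent timeslices rather than all history. The expected values can be reproduced with hourly buckets and a two-hour window, both inferred from this test rather than from the CountStore source:

from collections import Counter

buckets = {
    0: Counter({"entity1": 1, "entity2": 2}),   # jan 1st 2013, midday
    1: Counter({"entity1": 2, "entity3": 1}),   # one hour later
    2: Counter({"entity3": 2}),                 # two hours later
}

def get_top(now_hour, num_to_get, window=2):
    total = Counter()
    for hour in range(now_hour - window + 1, now_hour + 1):
        total += buckets.get(hour, Counter())
    return total.most_common(num_to_get)

assert get_top(0, 10) == [("entity2", 2), ("entity1", 1)]
assert get_top(1, 10) == [("entity1", 3), ("entity2", 2), ("entity3", 1)]
assert get_top(2, 10) == [("entity3", 3), ("entity1", 2)]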