def categorize(stdout):
    """Assign a category and score to every tweet that has none yet.

    Uncategorized tweets are fetched in batches of at most ``_limit``.
    Each tweet is run through ``classifier.classify_tweet``; the first
    entry of the result supplies the category name and score, and a
    falsy result falls back to ``['other', 0]``. A progress line is
    printed before and after each batch, and the function returns as
    soon as a batch comes back empty.

    The ``stdout`` argument is accepted but not used by this function;
    all output goes through ``print``.
    """
    # Index categories by name so classifier labels map to model rows
    by_name = {category.name: category for category in Category.objects.all()}

    while True:
        batch = Tweet.objects.filter(category__isnull=True)[:_limit]
        n = len(batch)
        print('Categorizing %d tweets' % n)
        if n == 0:
            return

        began = datetime.now()
        for tweet in batch:
            payload = {
                'text': tweet.text,
                'entities': json.loads(tweet.entities)
            }
            ranked = classifier.classify_tweet(payload) or [['other', 0]]
            best = ranked[0]
            tweet.category = by_name[best[0]]
            tweet.score = best[1]
            tweet.save()

        elapsed = (datetime.now() - began).seconds
        print("Categorized %d tweets in %d seconds" % (n, elapsed))
def on_status(self, status):
    """Categorize and save tweets with coords in a state and community.

    A status is ignored when it has no coordinates, when a tweet with the
    same ``id_str`` is already stored, or when its point is not contained
    in at least one ``State`` and one ``Community`` geometry. Otherwise
    the tweet is classified, saved, and the matching ``Aggregate`` row
    for its date, city, community and category is created or incremented.
    """
    # Nothing to do without coordinates or for an already-stored tweet
    if not status.coordinates or Tweet.objects.filter(id_str=status.id_str):
        return

    position = status.coordinates['coordinates']
    coords = Point(position[0], position[1])

    states = State.objects.filter(geom__contains=coords)
    communities = Community.objects.filter(geom__contains=coords)
    if not (states and communities):
        return

    # Naive timestamps are tagged as UTC
    created_at = status.created_at
    if not created_at.tzinfo:
        created_at = created_at.replace(tzinfo=pytz.utc)

    # Default to 'en' when the status carries no lang attribute
    lang = status.lang if hasattr(status, 'lang') else 'en'

    ranked = classifier.classify_tweet({
        'text': status.text,
        'entities': status.entities
    }) or [['other', 0]]
    best = ranked[0]
    author = status.user

    tweet = Tweet(
        id_str=status.id_str,
        created_at=created_at,
        lang=lang,
        text=status.text,
        entities=json.dumps(status.entities),
        coords=coords,
        user_id_str=author.id_str,
        user_name=author.name,
        user_screen_name=author.screen_name,
        user_profile_image_url=author.profile_image_url,
        state=states[0],
        community=communities[0],
        category_id=self.category_ids[best[0]],
        score=best[1]
    )
    tweet.save()

    # Update aggregate stats: scores below the category's threshold are
    # counted under 'other' instead of the classified category
    other_id = self.category_ids['other']
    meets_threshold = (
        tweet.score >= self.category_thresholds[tweet.category_id]
    )
    bucket_id = tweet.category_id if meets_threshold else other_id

    aggregate, created = Aggregate.objects.get_or_create(
        date=tweet.created_at.date(),
        city=tweet.community.city,
        community=tweet.community,
        category_id=bucket_id,
        defaults={'count': 1})
    if created:
        return

    # Existing row: bump the counter through an F() expression
    aggregate.count = F('count') + 1
    aggregate.save()