# --- Spark Streaming pipeline: classify tweets, feed confirmed ones back ---
# NOTE(review): relies on names bound earlier in the file and not visible in
# this chunk (ssc, args, preprocess, pp, KafkaUtils, TopicAndPartition,
# RemoteTweetClassifier, KafkaTransceiver, json) -- confirm against the full
# file.  This is Python 2 code: the 0L long literal below is invalid in
# Python 3.
#
# Raw tweets arrive on the 'tweets' topic.
tweets = KafkaUtils.createDirectStream(ssc, [ 'tweets' ], { "metadata.broker.list": args.bk_endpt })
# With --reload, replay 'harassing-tweets' partition 0 from offset 0;
# otherwise offset stays None and the broker's default offsets are used.
offset = None if not args.reload else { TopicAndPartition('harassing-tweets', 0): 0L }
harassing_tweets = KafkaUtils.createDirectStream(ssc, [ 'harassing-tweets' ]
   , { "metadata.broker.list": args.bk_endpt }
   , fromOffsets = offset)
c = RemoteTweetClassifier(args.cf_endpt)   # remote classifier service client
k = KafkaTransceiver(args.bk_endpt)        # Kafka producer for downstream topics
tweets.count().pprint()                    # per-batch count, for monitoring
# Pipeline 1: flag harassing tweets and publish them to the 'harassers' topic.
# preprocess() appears to yield tuples where t[0] is the author, t[1] the
# text, and t[2] the classifier-ready form -- verify against its definition.
preprocess(tweets).filter(
   lambda t: c.isHarassingTweet(t[2])
).map(
   # (xmit(...), t)[1] evaluates xmit purely for its side effect (publish to
   # Kafka) and then passes the tuple through unchanged.
   lambda t: (k.xmit('harassers', json.dumps({ "author": t[0], "text": t[1] })), t)[1]
).pprint()
harassing_tweets.count().pprint()
# Pipeline 2: add confirmed harassing tweets to the classifier's corpus.
preprocess(harassing_tweets).map(
   # the classifier looks at tweets as bag of word documents,
   # and it doesn't like to update its corpus with documents
   # it already knows about; sorting the words lets it uniquify
   lambda t: (t[0], t[1], ' '.join(sorted(t[1].split())))
).foreachRDD(
   # rdd.foreach runs on the executors: print each tuple, then teach the
   # classifier its sorted-word form.
   lambda rdd: rdd.foreach(lambda t: (pp.pprint(t), c.addHarassingTweet(t[2])))
)
ssc.start()
ssc.awaitTermination()
# vim: expandtab shiftwidth=3 softtabstop=3 tabstop=3
class KafkaTweetStore(TweetStore):
   """ Store tweets in a Kafka log """

   def __init__(self, serializer = None, endpoint = None, topic = None, tweetsPerLine = None):
      """
      serializer    -- passed through to TweetStore.__init__
      endpoint      -- Kafka broker endpoint handed to KafkaTransceiver
      tweetsPerLine -- progress dots printed per stdout line; None
                       suppresses all progress output (see message())
      topic         -- Kafka topic every tweet is published to
      """
      # Initialize the counter and flag that writeTweet()/close() rely on
      # BEFORE the base-class call, so that a base class that also sets
      # them wins.  Without these, calling writeTweet() before close()
      # would raise AttributeError on self._closing / self.nTweets if the
      # base class does not define them.
      self.nTweets = 0
      self._closing = False
      self.kafka = KafkaTransceiver(endpoint)
      self.topic = topic
      self.tweetsPerLine = tweetsPerLine
      TweetStore.__init__(self, serializer)
      # %s, not %d: tweetsPerLine defaults to None, and "%d" % None raises
      # TypeError, crashing construction with default arguments.  For int
      # values the output is identical.
      print("created KTS, tweetsPerLine %s" % self.tweetsPerLine)

   def message(self, m):
      """ Write progress text to stdout, unless progress is disabled. """
      if self.tweetsPerLine is not None:
         sys.stdout.write(m)
         sys.stdout.flush()

   def close(self):
      """ Close the store.  Idempotent: a second call returns at once. """
      if self.kafka is None:
         return
      self._closing = True
      self._logEol()                # final tweet count, while kafka is still set
      self.serializer.closing()
      self.kafka.close()
      self.kafka = None
      self.nTweets = 0
      self._closing = False

   def _logEol(self):
      # End the progress line with the running tweet count; no-op once closed.
      if self.kafka is not None:
         self.message("%d tweets\n" % self.nTweets)

   def _logTweet(self):
      # One dot per tweet; start a new line every tweetsPerLine tweets.
      self.message('.')
      if self.tweetsPerLine is not None and self.nTweets % self.tweetsPerLine == 0:
         self._logEol()

   def write(self, s):
      """ write() makes no sense for Kafka, where messages are atomic units and so don't require bytes to mark tweets """
      pass

   def writeTweet(self, tweet):
      """ Write a tweet to the store. """
      # Writing during close() indicates a caller bug; log a stack trace
      # (best-effort warning, the write still proceeds).
      if self._closing:
         print("writing to closing tweet store:", ''.join(traceback.format_stack()))
      self.nTweets += 1
      self.totTweets += 1            # totTweets/totBytes presumably kept by TweetStore -- confirm
      self.totBytes += len(tweet)
      self.kafka.xmit(self.topic, tweet)
      self._logTweet()