import argparse
import json
import pprint
import signal
import tweepy

# KafkaTransceiver comes from elsewhere in this repo

p = argparse.ArgumentParser()
p.add_argument('--broker', dest='bk_endpt', metavar='ENDPOINT', default='localhost:9092',
               help='kafka broker endpoint')
p.add_argument('--topic', dest='topic', default='harassers',
               help='Kafka topic from which tweets are read')
args = p.parse_args()

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(args)

# creds.py defines consumer_key, consumer_secret, access_token, access_token_secret
execfile('./creds.py')
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth_handler=auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)
me = api.me().screen_name

k = KafkaTransceiver(args.bk_endpt)

# close the Kafka connection cleanly on SIGINT/SIGTERM
def quit(signum, frame):
    k.close()
    exit()

signal.signal(signal.SIGINT, quit)
signal.signal(signal.SIGTERM, quit)

# read harassing tweets off the topic and block their authors
while True:
    m = k.recv(args.topic)
    pp.pprint(m)
    m = json.loads(m)
    try:
        api.create_block(m['author'])
        print('@%s has blocked @%s for tweeting "%s"' % (me, m['author'], m['text']))
    except tweepy.TweepError as e:
        # report block failures rather than crashing the loop
        print(e)
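# The blocker script above loads its Twitter credentials with
# execfile('./creds.py'). A minimal sketch of creds.py, assuming only the
# four names the script uses (the placeholder values are not real; supply
# your own app's keys and tokens):

# creds.py
consumer_key = 'YOUR-CONSUMER-KEY'
consumer_secret = 'YOUR-CONSUMER-SECRET'
access_token = 'YOUR-ACCESS-TOKEN'
access_token_secret = 'YOUR-ACCESS-TOKEN-SECRET'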
        # add text w/ stop words removed
        lambda t: (t[0], t[1], remove_stopwords(t[1]))
    ).map(
        # pprint() can only handle ascii, it seems
        lambda t: [_.encode('ascii', 'ignore') for _ in t]
    )

tweets = KafkaUtils.createDirectStream(ssc, ['tweets'],
                                       {"metadata.broker.list": args.bk_endpt})

# with --reload, re-read the harassing-tweets topic from offset 0
offset = None if not args.reload else {TopicAndPartition('harassing-tweets', 0): 0L}
harassing_tweets = KafkaUtils.createDirectStream(ssc, ['harassing-tweets'],
                                                 {"metadata.broker.list": args.bk_endpt},
                                                 fromOffsets=offset)

c = RemoteTweetClassifier(args.cf_endpt)
k = KafkaTransceiver(args.bk_endpt)

tweets.count().pprint()
# forward tweets the classifier flags as harassing to the 'harassers' topic
preprocess(tweets).filter(
    lambda t: c.isHarassingTweet(t[2])
).map(
    lambda t: (k.xmit('harassers', json.dumps({"author": t[0], "text": t[1]})), t)[1]
).pprint()

harassing_tweets.count().pprint()
preprocess(harassing_tweets).map(
    # the classifier looks at tweets as bag of word documents,
    # and it doesn't like to update its corpus with documents
    # it already knows about; sorting the words lets it uniquify
    lambda t: (t[0], t[1], ' '.join(sorted(t[1].split())))
).foreachRDD(
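# remove_stopwords() is used by preprocess() above but not defined in this
# excerpt. A minimal sketch, assuming a small hard-coded stop-word set (a
# real implementation might draw on NLTK's stopwords corpus instead):

STOP_WORDS = set(['a', 'an', 'and', 'in', 'is', 'it', 'of', 'the', 'to'])

def remove_stopwords(text):
    # drop stop words, preserving the order of the remaining words
    return ' '.join(w for w in text.split() if w.lower() not in STOP_WORDS)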
import sys
import traceback


class KafkaTweetStore(TweetStore):
    """ Store tweets in a Kafka log """

    def __init__(self, serializer=None, endpoint=None, topic=None, tweetsPerLine=None):
        """ Publish tweets, serialized by `serializer`, to `topic` on the
            Kafka broker at `endpoint`; if `tweetsPerLine` is given, print
            a progress dot per tweet and a count line every that many. """
        self.kafka = KafkaTransceiver(endpoint)
        self.topic = topic
        self.tweetsPerLine = tweetsPerLine
        TweetStore.__init__(self, serializer)
        # %s, not %d: tweetsPerLine may be None
        print("created KTS, tweetsPerLine %s" % self.tweetsPerLine)

    def message(self, m):
        # progress output is wanted only when tweetsPerLine was given
        if self.tweetsPerLine is not None:
            sys.stdout.write(m)
            sys.stdout.flush()

    def close(self):
        """ Close the store. """
        if self.kafka is None:
            return
        self._closing = True
        self._logEol()
        self.serializer.closing()
        self.kafka.close()
        self.kafka = None
        self.nTweets = 0
        self._closing = False

    def _logEol(self):
        if self.kafka is not None:
            self.message("%d tweets\n" % self.nTweets)

    def _logTweet(self):
        self.message('.')
        if self.tweetsPerLine is not None and self.nTweets % self.tweetsPerLine == 0:
            self._logEol()

    def write(self, s):
        """ write() makes no sense for Kafka, where messages are atomic
            units and so don't require bytes to mark tweet boundaries. """
        pass

    def writeTweet(self, tweet):
        """ Write a tweet to the store. """
        if self._closing:
            print("writing to closing tweet store: %s" % ''.join(traceback.format_stack()))
        self.nTweets += 1
        self.totTweets += 1
        self.totBytes += len(tweet)
        self.kafka.xmit(self.topic, tweet)
        self._logTweet()
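# KafkaTransceiver is used throughout (xmit, recv, close) but its definition
# is not part of this excerpt. A minimal sketch of that interface on top of
# kafka-python, purely to illustrate the assumed semantics; the project's
# actual implementation may differ:

from kafka import KafkaConsumer, KafkaProducer

class KafkaTransceiver(object):
    """ Thin send/receive wrapper around a Kafka broker endpoint. """

    def __init__(self, endpoint):
        self.endpoint = endpoint
        self.producer = KafkaProducer(bootstrap_servers=endpoint)
        self.consumers = {}  # lazily created, one consumer per topic

    def xmit(self, topic, msg):
        # publish one message (bytes) to the topic
        self.producer.send(topic, msg)

    def recv(self, topic):
        # block until the next message on the topic arrives
        if topic not in self.consumers:
            self.consumers[topic] = KafkaConsumer(topic, bootstrap_servers=self.endpoint)
        return next(self.consumers[topic]).value

    def close(self):
        self.producer.close()
        for c in self.consumers.values():
            c.close()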