Ejemplo n.º 1
0
 def __init__(self, serializer = None, endpoint = None, topic = None, tweetsPerLine = None):
    """
    Create a tweet store that sends its tweets through a KafkaTransceiver.

    serializer    -- handed unchanged to TweetStore.__init__
    endpoint      -- handed unchanged to the KafkaTransceiver constructor
    topic         -- stored on the instance as self.topic
    tweetsPerLine -- stored on the instance as self.tweetsPerLine; may be None
    """
    self.kafka = KafkaTransceiver(endpoint)
    self.topic = topic
    self.tweetsPerLine = tweetsPerLine
    TweetStore.__init__(self, serializer)
    # %s instead of %d: tweetsPerLine defaults to None and "%d" % None
    # raises TypeError, which made the default construction fail.
    print("created KTS, tweetsPerLine %s" % self.tweetsPerLine)
Ejemplo n.º 2
0
   p.add_argument('--broker', dest='bk_endpt', metavar='ENDPOINT', default='localhost:9092'
                  , help='kafka broker endpoint')
   p.add_argument('--topic', dest='topic', default='harassers'
                  , help='Kafka topic from which tweets are read')

   args = p.parse_args()
   pp = pprint.PrettyPrinter(indent=4)
   pp.pprint(args)

   execfile('./creds.py')
   auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
   auth.set_access_token(access_token, access_token_secret)
   api = tweepy.API(auth_handler=auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
   me = api.me().screen_name

   k = KafkaTransceiver(args.bk_endpt)

   def quit(signum, frame):
      """
      Signal handler: close the Kafka transceiver, then end the process.

      signum, frame -- the standard signal-handler arguments; unused.
      """
      # exit() is a helper injected by the site module and is not
      # guaranteed to exist (e.g. python -S); sys.exit() raises the same
      # SystemExit and is always available.
      import sys
      k.close()
      sys.exit()
   signal.signal(signal.SIGINT, quit)
   signal.signal(signal.SIGTERM, quit)

   while True:
      m = k.recv(args.topic)
      pp.pprint(m)
      m = json.loads(m)
      try:
         api.create_block(m['author'])
         print('@%s has blocked @%s for tweeting "%s"' % (me, m['author'], m['text']))
      except tweepy.TweepError as e:
Ejemplo n.º 3
0
         # add text w/ stop words removed
         lambda t: (t[0], t[1], remove_stopwords(t[1]))
      ).map(
         # pprint() can only handle ascii, it seems
         lambda t: [ _.encode('ascii','ignore') for _ in t ]
      )

   tweets = KafkaUtils.createDirectStream(ssc, [ 'tweets' ], { "metadata.broker.list": args.bk_endpt })

   offset = None if not args.reload else { TopicAndPartition('harassing-tweets', 0): 0L }
   harassing_tweets = KafkaUtils.createDirectStream(ssc, [ 'harassing-tweets' ]
                                                    , { "metadata.broker.list": args.bk_endpt }
                                                    , fromOffsets = offset)

   c = RemoteTweetClassifier(args.cf_endpt)
   k = KafkaTransceiver(args.bk_endpt)

   tweets.count().pprint()
   preprocess(tweets).filter(
      lambda t: c.isHarassingTweet(t[2])
   ).map(
      lambda t: (k.xmit('harassers', json.dumps({ "author": t[0], "text": t[1] })), t)[1]
   ).pprint()

   harassing_tweets.count().pprint()
   preprocess(harassing_tweets).map(
         # the classifier looks at tweets as bag of word documents,
         # and it doesn't like to update its corpus with documents
         # it already knows about; sorting the words lets it uniquify
         lambda t: (t[0], t[1], ' '.join(sorted(t[1].split())))
   ).foreachRDD(
Ejemplo n.º 4
0
class KafkaTweetStore(TweetStore):
   """
   Store tweets in a Kafka log.

   Every tweet passed to writeTweet() is transmitted on self.topic through
   a KafkaTransceiver.  When tweetsPerLine is not None, progress is echoed
   to stdout: a '.' per tweet and a "<n> tweets" line every tweetsPerLine
   tweets.

   NOTE(review): nTweets, totTweets, totBytes, serializer and _closing are
   used here but not initialised in this class -- presumably they are set
   up by TweetStore.__init__; confirm against the base class.
   """

   def __init__(self, serializer = None, endpoint = None, topic = None, tweetsPerLine = None):
      """
      Create the store and its Kafka transceiver.

      serializer    -- handed unchanged to TweetStore.__init__
      endpoint      -- handed unchanged to the KafkaTransceiver constructor
      topic         -- topic given to xmit() for every tweet
      tweetsPerLine -- tweets between progress lines on stdout; None
                       disables progress output
      """
      self.kafka = KafkaTransceiver(endpoint)
      self.topic = topic
      self.tweetsPerLine = tweetsPerLine
      TweetStore.__init__(self, serializer)
      # %s instead of %d: tweetsPerLine defaults to None and "%d" % None
      # raises TypeError, which made the default construction fail.
      print("created KTS, tweetsPerLine %s" % self.tweetsPerLine)

   def message(self, m):
      """
      Write progress text m to stdout and flush it immediately.

      Does nothing when tweetsPerLine is None.
      """
      if self.tweetsPerLine is not None:
         sys.stdout.write(m)
         sys.stdout.flush()

   def close(self):
      """
      Close the store.

      Emits the final tweet count, notifies the serializer, closes the
      transceiver and resets the per-session tweet counter.  Calling
      close() again after the transceiver has been released is a no-op.
      """
      if self.kafka is None:
         return
      self._closing = True
      self._logEol()
      self.serializer.closing()
      self.kafka.close()
      self.kafka = None
      self.nTweets = 0
      self._closing = False

   def _logEol(self):
      """
      Finish the current progress line with the running tweet count.

      Skipped once the transceiver has been released by close().
      """
      if self.kafka is not None:
         self.message("%d tweets\n" % self.nTweets)

   def _logTweet(self):
      """
      Emit one progress dot, and the tweet count every tweetsPerLine tweets.
      """
      self.message('.')
      # Truthiness test covers both None (progress disabled) and 0, which
      # would otherwise raise ZeroDivisionError in the modulo below.
      if self.tweetsPerLine and self.nTweets % self.tweetsPerLine == 0:
         self._logEol()

   def write(self, s):
      """
      write() makes no sense for Kafka,
      where messages are atomic units and
      so don't require bytes to mark tweets
      """
      pass

   def writeTweet(self,  tweet):
      """
      Write a tweet to the store.

      Updates the tweet and byte counters, transmits the tweet on
      self.topic and logs progress.  A write that arrives while close()
      is in progress is still performed, but a stack trace is printed so
      the caller can be tracked down.
      """
      if self._closing:
         # Single pre-formatted string: prints the same text on Python 2
         # and 3 (a two-argument print is a tuple on Python 2).
         print("writing to closing tweet store: %s" % ''.join(traceback.format_stack()))
      self.nTweets += 1
      self.totTweets += 1
      self.totBytes += len(tweet)
      self.kafka.xmit(self.topic, tweet)
      self._logTweet()