def __init__(self, factory, destination):
    self.factory = factory
    self.destination = destination
    self.consumer = SimpleConsumer(self.factory, "test-group", self.destination)
    self.rate = PerfRate()
    threading.Thread.__init__(self)
def setup_kafka(self, settings):
    """Setup kafka connection and idle signal.

    This should be called after the spider has set its crawler object.

    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name

    hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = KafkaClient(hosts)
    # wait at most 1sec for more messages. Otherwise continue
    self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                   auto_commit=True, iter_timeout=1.0)

    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from kafka topic
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from kafka topic '%s'" % self.topic)
def __init__(self, host, port, schema_path, topic, nbmsg, consumer_timeout):
    self.topic = topic
    self.nbmsg = nbmsg
    self.sent_msg = 0
    self.host = host
    self.port = port
    self.sent = [-100] * self.nbmsg
    self.rcv = [-100] * self.nbmsg
    self.runtag = str(random.randint(10, 100000))
    try:
        self.broker = KafkaClient("%s:%d" % (self.host, self.port))
    except:
        raise ValueError(
            "KafkaClient (%s:%d) - init failed" % (self.host, self.port))
    try:
        self.producer = SimpleProducer(self.broker)
    except:
        raise ValueError(
            "SimpleProducer (%s:%d) - init failed" % (self.host, self.port))
    try:
        self.consumer = SimpleConsumer(
            self.broker, "testbot", topic, iter_timeout=consumer_timeout)
    except:
        raise ValueError(
            "SimpleConsumer (%s:%d) - init failed" % (self.host, self.port))
    try:
        self.schema = avro.schema.parse(open(schema_path).read())
    except:
        raise ValueError(
            "Prod2Cons load schema (%s) - init failed" % (schema_path))
def get_offsets(offsets_after_time_millis, conn_params=config.DEFAULT_CONN_PARAMS):
    curr_time = long(time.time() * 1000)
    for host in config.bagheera_nodes:
        for topic in config.topics:
            for partition in config.partitions:
                consumer = SimpleConsumer(host, conn_params['port'],
                                          conn_params['nrecs'], conn_params['bufsize'])
                offset = long(
                    consumer.getOffsetsBefore(topic, partition,
                                              offsets_after_time_millis, 1)[0])
                consumer.close()
                print json.dumps({
                    'time_millis': curr_time,
                    'hostname': host,
                    'topic': topic,
                    'partition': partition,
                    'offset': offset
                })
def kafka_pull(message_queue):
    global g_conf
    global g_master_logger

    ret = True
    while True:
        try:
            if is_quit():
                g_master_logger.info("thread quit: [%d]" % os.getpid())
                return True

            random_v = random.randint(0, len(g_conf["broker_list"]) - 1)
            broker = g_conf["broker_list"][random_v]
            g_master_logger.info("use broker is [%s]" % broker)
            partition_set = set([0])

            # client
            client = KafkaClient(broker)
            consumer = SimpleConsumer(
                client,
                g_conf["msg_group_name"],
                g_conf["msg_topic_name"],
                partitions=partition_set,
                auto_commit_every_n=g_conf["auto_commit_every_n"],
                auto_commit_every_t=g_conf["auto_commit_every_t"],
                fetch_size_bytes=g_conf["fetch_size_bytes"],
                buffer_size=g_conf["buffer_size"],
                max_buffer_size=g_conf["max_buffer_size"])

            cnt = 0
            for message in consumer:
                cnt += 1
                if cnt % 10000 == 0:
                    g_master_logger.info("msg consumer cnt is [%d] queue:%u" % (cnt, message_queue.qsize()))
                if is_quit():
                    consumer.stop()
                    g_master_logger.info("thread fetch msg quit: [%d]" % os.getpid())
                    break

                value = message.message.value
                if value == None:
                    g_master_logger.warning("value is none, msg is [%s]" % str(message))
                    continue
                if len(value) == 0:
                    g_master_logger.warning("value len is 0, msg is [%s]" % str(message))
                    continue
                if check_pkg(value) == False:
                    continue

                message_queue.put(message)
        except Exception, e:
            g_master_logger.error(
                "work error, exception is [%s], traceback is [%s]" % (e, traceback.format_exc()))
            time.sleep(5)
            continue
def __init__(self, addr, group, topic):
    self.client = KafkaClient(addr)
    self.consumer = SimpleConsumer(self.client, group, topic,
                                   max_buffer_size=1310720000)
    self.temp_file_path = None
    self.temp_file = None
    self.topic = topic
    self.group = group
    self.block_cnt = 0
def dataConsumer(topic, group='default', count=1, dateStr=''):
    kafka_consumer = SimpleConsumer(KafkaClient(MasterPublicIP + ":9092"),
                                    group, topic, max_buffer_size=MAX_BUFFER_SIZE)
    messages = kafka_consumer.get_messages(count=count)
    dataList = []
    for message in messages:
        dataList.append(message.message.value)
    if len(dataList) > 0:
        flush2HDFS(dataList, dateStr)
def main(): client = KafkaClient("localhost:9092") consumer = SimpleConsumer(client, "test-group", "twitter_raw") consumer.seek(0,2) num = 0 for message in consumer: print "redis publish:", num num+=1 try: data_depickled = pickle.loads(message.message.value.decode('utf-8')) except Exception, e: continue # print data_depickled # { # 'text':'@_LulaMoore me hamas perra', # 'created_at':datetime.datetime(2015, 10, 9, 23, 36, 49), # 'source':u'Twitter Web Client', # 'lang:':u'es', # 'place':{ # 'country_code':u'AR', # 'coordinates':[ # [ # -68.176283, # -38.984724 # ], # [ # -68.176283, # -38.921051 # ], # [ # -68.015162, # -38.921051 # ], # [ # -68.015162, # -38.984724 # ] # ] # }, # 'user':{ # 'statuses_count':15067, # 'name':u'Dama negra *\uffe6*', # 'friends_count':390, # 'created_at':datetime.datetime(2014, 3, 15,2,37, 10), # 'profile_image_url': u'http://pbs.twimg.com/profile_images/652333268256313344/x9K9Nlys_normal.jpg', # 'followers_count':384, # 'id':2390242428 # }, # 'id':652628813935980544 # } ### process data here ### # text = data_depickled['text'] filtered_data = data_filter(data_depickled) data_pickled = pickle.dumps(filtered_data) redis.publish('tweets_processed', data_pickled)
def run(self):
    client = KafkaClient(self.bootstrap_server, client_id='commandline')
    consumer = SimpleConsumer(client, self.group, self.topic,
                              auto_commit_every_n=1, buffer_size=160, auto_commit=True)
    for message in consumer:
        now = datetime.now()
        print("%s: %s" % (now, message))
        consumer.commit()
def __init__(self, conn_pool, topic, group):
    self.conn_pool = conn_pool
    self.topic = topic
    self.group = group
    self.kafka = KafkaClient(self.conn_pool)
    self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
    self.consumer.seek(0, 2)  # move to the tail of the queue
def __init__(self, addr, group, topic):
    self.client = KafkaClient(addr)
    self.consumer = SimpleConsumer(self.client, group, topic,
                                   max_buffer_size=1310720000, auto_commit=False)
    self.temp_file_path = None
    self.temp_file = None
    self.hadoop_path = "/user/AdReport/%s/history" % (topic)
    self.cached_path = "/user/AdReport/%s/cached" % (topic)
    self.topic = topic
    self.group = group
    self.block_cnt = 0
def run(self): client = KafkaClient("10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092") consumer = SimpleConsumer(client, "test-group", "jiketest",auto_commit=False,partitions=self.part) consumer.seek(0,0) while True: message = consumer.get_message(True,60) self.__offset = message.offset print message.message.value
def blocking_consumer(self, message_consume_function, parse_json, topic_group, topic_name):
    print "starting blocking consumer with topic group %s and topic name %s" % (topic_group, topic_name)
    consumer = SimpleConsumer(self.client, topic_group, topic_name)
    consumer.seek(0, 2)
    for message in consumer:
        message = parse_json(message)
        print "=============" + str(message) + "============"
        message_consume_function(message)
        print "called message consume function"
def main(): client = KafkaClient("localhost:9092") consumer = SimpleConsumer(client, "test-group", "twitter_raw") consumer.seek(0,2) for message in consumer: # data_deserialized = str.decode(message.message.value) data_depickled = pickle.loads(message.message.value.decode('utf-8')) # print str(data_depickled).decode('string_escape') print data_depickled
class Consumer(object): def __init__(self, addr, group, topic): """Initialize Consumer with kafka broker IP, group, and topic.""" self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/insight/artsy/geo" self.topic = topic self.group = group self.block_cnt = 0 def consume_topic(self, output_dir): """Consumes a stream of messages from the "post_geo_activity" topic. Code template from https://github.com/ajmssc/bitcoin-inspector.git """ timestamp = time.strftime('%Y%m%d%H%M%S') # open file for writing self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp) self.temp_file = open(self.temp_file_path,"w") while True: try: # get 1000 messages at a time, non blocking messages = self.consumer.get_messages(count=1000, block=False) for message in messages: self.temp_file.write(message.message.value + "\n") # file size > 20MB if self.temp_file.tell() > 20000000: self.flush_to_hdfs(output_dir) self.consumer.commit() except: # move to tail of kafka topic if consumer is referencing # unknown offset self.consumer.seek(0, 2) def flush_to_hdfs(self, output_dir): """Flushes the 20MB file into HDFS.""" self.temp_file.close() timestamp = time.strftime('%Y%m%d%H%M%S') hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,self.topic, timestamp) print "Block {}: Flushing data file to HDFS => {}".format(str(self.block_cnt),hadoop_fullpath) self.block_cnt += 1 os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # save from local to hdfs os.remove(self.temp_file_path) # remove temp local file timestamp = time.strftime('%Y%m%d%H%M%S') self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp) self.temp_file = open(self.temp_file_path, "w")
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-out-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            dict(
                id = input['id'],
                appid = input['appid'],
                url = url,
                status_code = response.getcode(),
                status_msg = 'Success',
                timestamp = response.info()['date'],
                links_found = links,
                raw_html = html,
                attrs = input['attrs']
            )
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        crawled = json.loads(message)
        safeurl = crawled['url'].encode('utf-8', 'ignore')
        self.log("Lookahead spout received id: " + crawled['id'] + " url: " + safeurl)
        context = {
            'source': 'datawake-lookahead',
            'userId': crawled['attrs']['userId'],
            'org': crawled['attrs']['org'],
            'domain': crawled['attrs']['domain'],
            'url': crawled['url']
        }
        self.emit([crawled['url'], crawled['status_code'], '', '', crawled['raw_html'],
                   crawled['timestamp'], context['source'], context])
def __init__(self, addr, group, topic): """Initialize Consumer with kafka broker IP, group, and topic.""" self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/insight/artsy/geo" self.topic = topic self.group = group self.block_cnt = 0
def spiderIdle(self, spider): consumer = SimpleConsumer(self.kafka_conn, "test", "commands") for msg in consumer.get_messages(): print msg.message.value if msg.message.value == spider.name + '_stop': print 'stop' spider.spider_pause() #spider.close(spider,'ok') #self.scrapy.engine.close_spider(spider, 'closespider_itemcount') if msg.message.value == spider.name + '_start': #self.scrapy.engine.scraper.open_spider(spider) spider.spider_resume()
def spiderIdle(self, spider): consumer = SimpleConsumer(self.kafka_conn, "test", "commands") for msg in consumer.get_messages(): print msg.message.value if msg.message.value == spider.name + "_stop": print "stop" spider.spider_pause() # spider.close(spider,'ok') # self.scrapy.engine.close_spider(spider, 'closespider_itemcount') if msg.message.value == spider.name + "_start": # self.scrapy.engine.scraper.open_spider(spider) spider.spider_resume()
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input: (timestamp, org, domain, user_id, url, html)
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                # offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                # the consumer iterator yields OffsetAndMessage objects,
                # so split the raw message value rather than the wrapper
                fields = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = fields
                context = {'source': 'datawake-visited', 'domain': domain}
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
def initialize(self, stormconf, context):
    try:
        settings = all_settings.get_settings(stormconf['topology.deployment'])
        self.topic = settings['crawler-in-topic'].encode()
        self.conn_pool = settings['conn_pool'].encode()
        self.log('CrawlerSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                       max_buffer_size=None, fetch_size_bytes=2000000)
        self.consumer.seek(0, 2)  # move to the tail of the queue
    except:
        self.log("CrawlerSpout initialize error", level='error')
        self.log(traceback.format_exc(), level='error')
        raise
def setup(self):
    self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                  port=self.settings.REDIS_PORT)

    self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
    self.consumer = SimpleConsumer(self.kafka_conn,
                                   self.settings.KAFKA_GROUP,
                                   self.settings.KAFKA_INCOMING_TOPIC,
                                   auto_commit=True,
                                   iter_timeout=1.0)

    self.result_method = self.get_method(self.settings.SCHEMA_METHOD)
    self.validator = self.extend_with_default(Draft4Validator)
def run(self): client = KafkaClient( "10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092") consumer = SimpleConsumer(client, "test-group", "guantest") for message in consumer: print(message.message.value)
def main(): kafka = KafkaClient("localhost:9092") print("Consumer established connection to kafka") consumer = SimpleConsumer(kafka, "my-group", "test") for message in consumer: # This will wait and print messages as they become available print(message)
def __init__(self, topic, hosts=None, log_level=logging.WARNING):
    hosts = hosts or "localhost:9092"
    self.group = "kafque"
    self.topic = "{}_{}".format(self.group, topic)
    self.client = KafkaClient(hosts)
    self.client.ensure_topic_exists(str(self.topic))
    self.consumer = SimpleConsumer(
        self.client, str(self.group), str(self.topic), auto_commit=False)
    self.consumer.provide_partition_info()
    self.consumer.fetch_last_known_offsets()
    self.logger = setup_logger(__name__, level=log_level)

    self.failed_queue = None
    if self.topic != "{}_failed".format(self.group):
        self.failed_queue = FailedQueue(hosts=hosts, log_level=logging.ERROR)
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(ListeningKafkaSpider, cls).from_crawler(crawler, *args, **kwargs)

    if not hasattr(spider, 'topic') or not spider.topic:
        spider.topic = '%s-starturls' % spider.name

    hosts = crawler.settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
    consumer_group = crawler.settings.get(
        'SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = SimpleClient(hosts)
    # wait at most 1sec for more messages. Otherwise continue
    spider.consumer = SimpleConsumer(_kafka, consumer_group, spider.topic,
                                     auto_commit=True, iter_timeout=1.0)

    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from kafka topic
    crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
    crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
    logger.info("Reading URLs from kafka topic '%s'" % spider.kafka_topic)

    return spider
def run(self): client = KafkaClient("172.17.8.101:9092") consumer = SimpleConsumer(client, "test-group", "topic") batch_size = 300 global_counter = 0 counter = 0 batch = BatchStatement() for message in consumer: if counter >= batch_size: session.execute(batch) batch = BatchStatement() counter = 0 temp = yaml.load(message[1][3]) # print temp global_counter += 1 print global_counter prepared = session.prepare(""" INSERT INTO testkeyspace.meter_data (timestamp, id, P_1, P_2, P_3, Q_1, Q_2, Q_3) VALUES (?, ?, ?, ?, ?, ?, ?, ?) """) batch.add(prepared, (temp["timestamp"], uuid.UUID( temp["id"]), temp["P_1"], temp["P_2"], temp["P_3"], temp["Q_1"], temp["Q_2"], temp["Q_3"])) counter += 1
class Consumer(object): def __init__(self, addr): self.client = KafkaClient(addr) self.topic = "steps_data_part4" self.consumer_group = 's3_consumer' self.consumer = SimpleConsumer(self.client, self.consumer_group, self.topic) def consume_message(self): while True: timestamp = time.strftime('%Y%m%d%H%M%S') temp_file_name = "%s_%s_%s.dat" %(self.topic, self.consumer_group, timestamp) temp_file = open("/home/ubuntu/rankMyStep/kafka/"+temp_file_name,"w") messages = self.consumer.get_messages(count=1000, block=False) for msg in messages: print msg.message.value + "\n" temp_file.write(msg.message.value + "\n") self.save_to_s3(temp_file_name) def save_to_s3(self, file_name): mybucket = "anurag-raw-data-store" aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default') aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default') s3_client = boto3.client('s3') s3_client.upload_file("/home/ubuntu/rankMyStep/kafka/"+file_name, mybucket,"rankmysteps/"+file_name) os.remove("/home/ubuntu/rankMyStep/kafka/"+file_name)
def __init__(self, info):
    self.host = info['attributes']['host']
    self.group = info['attributes']['group']
    self.topic = info['attributes']['topic']
    self.client = KafkaClient(self.host)
    self.consumer = SimpleConsumer(self.client, self.group, self.topic)
class KafkaConsumer: group = "python-lookahead-consumer" def __init__(self,conn_pool,topic,group): self.conn_pool = conn_pool self.topic = topic self.group = group self.kafka = KafkaClient(self.conn_pool) self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None) self.consumer.seek(0,2) # move to the tail of the queue def next(self): offsetAndMessage = self.consumer.get_messages(timeout=None)[0] message = offsetAndMessage.message.value return message
def get_message():
    try:
        kconn = KafkaClient(kafka_producer.hosts, timeout=10)
        getter = SimpleConsumer(kconn, 'test_group', kafka_producer.topic)
        # getter.seek(0, 0)
        while True:
            try:
                messages = getter.get_messages(200, timeout=3)
                if messages:
                    logging.info('get message from kafka done' + str(decode(messages)))
                import time
                time.sleep(0.1)
            except BaseException, e:
                logging.error(str(e))
    except BaseException, e:
        logging.error(str(e) + 'get message from kafka failed')
def run(self): client = KafkaClient("vsu-01:9092") consumer = SimpleConsumer(client, "test-group", "my.price") for message in consumer: print(message)
def __init__(self, conn_pool, topic, group):
    self.conn_pool = conn_pool
    self.topic = topic
    self.group = group
    self.kafka = KafkaClient(self.conn_pool)
    self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
    self.consumer.seek(0, 2)  # move to the tail of the queue
def run(self):
    client = None
    consumer = None
    try:
        prev = None
        # print("Starting Kafka Client")
        # print("Kafka topic: {}").format(self.topic)
        print get_kafka_hosts()
        client = KafkaClient(hosts=get_kafka_hosts())
        consumer = SimpleConsumer(client=client,
                                  group=self.groupName.encode('ascii', 'ignore'),
                                  topic=self.topic, iter_timeout=5)
        consumer.seek(0, 1)
        print '[Kafka Consumer] START'
        print 'Topic: {}'.format(self.topic)
        print 'Listening incoming message...'
        print '========================================================='
        # print("Listening kafka message...")

        while self.stopCpu is False:
            for message in consumer.get_messages(count=5, block=False):
                if self.stopCpu is True:
                    # print("Kafka Consumer Listening Stopped")
                    break

                if message:
                    offset = message.offset
                    value = message.message.value
                    print 'msg: {0}, offset: {1}'.format(value, offset)

                    if len(value) > 0:
                        # chartdata = []
                        # j_val = json.loads(value)
                        # j_val['offset'] = offset
                        # chartdata.append(j_val)
                        # print("destination => ws"+str(self.pid))
                        # self.parentOj.emit("ws"+str(self.type), chartdata)
                        # self.parentOj.emit(self.topic, value)
                        self.parentOj.emit("ws" + str(self.pid), value)

        print '[Kafka Consumer] STOP'
        print 'Topic: {}'.format(self.topic)
        print 'Stop listening...'
        print '========================================================'
        # print("Listening kafka Stopped")
        consumer.stop()
        client.close()
    except Exception as e:
        # guard against failures before consumer/client were created
        if consumer is not None:
            consumer.stop()
        if client is not None:
            client.close()
def listen(self):
    client = KafkaClient(hosts(self.server_list, self.kafka_port))
    client.ensure_topic_exists(self.topic_name)
    # print client.topic_partitions()
    consumer = SimpleConsumer(client, self.consumer_name, self.topic_name)
    for message in consumer:
        value = message.message.value
        print value
def register_consumer(self, callback, parse_json, topic_group, topic_name):
    consumer = SimpleConsumer(self.client, topic_group, topic_name, max_buffer_size=None)
    consumer_thread = ConsumerThread(consumer, callback, parse_json)
    print "Starting new subscriber for topic " + topic_name + ' with group ' + topic_group
    consumer_thread.start()
def __init__(self, cache):
    threading.Thread.__init__(self)
    self.kafka = KafkaClient(self.kafkaHost)
    self.consumer = SimpleConsumer(self.kafka, "test-group", "collector")
    self.cache = cache
def _hidden_setup():
    try:
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists(
            self.settings['KAFKA_INCOMING_TOPIC'])
        self.consumer = SimpleConsumer(
            self.kafka_conn,
            self.settings['KAFKA_GROUP'],
            self.settings['KAFKA_INCOMING_TOPIC'],
            auto_commit=True,
            iter_timeout=1.0)
    except KafkaUnavailableError as ex:
        message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
            .format(type(ex).__name__, ex.args)
        self.logger.error(message)
        sys.exit(1)

    return True
class Consumer(object): def __init__(self, addr, group, topic): self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000, auto_offset_reset='smallest') self.temp_file_path = None self.temp_file = None self.topic = topic self.group = group self.block_cnt = 0 def consume_topic(self): timestamp = time.strftime('%Y%m%d%H%M%S') #open file for writing self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % ( self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w") header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time' self.temp_file.write(header) while True: try: messages = self.consumer.get_messages(count=100, block=False) for message in messages: self.temp_file.write(message.message.value + "\n") if self.temp_file.tell() > 20000: self.save_to_hdfs() self.consumer.commit() except: self.consumer.seek(0, 2) self.consumer.commit() def save_to_hdfs(self): self.temp_file.close() timestamp = time.strftime('%Y%m%d%H%M%S') hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic, timestamp) print "Block " + str( self.block_cnt) + ": Saving file to HDFS " + hadoop_path self.block_cnt += 1 # place blocked messages into history and cached folders on hdfs os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path)) os.remove(self.temp_file_path) timestamp = time.strftime('%Y%m%d%H%M%S') self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % ( self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w")
def run(self): #client = KafkaClient("localhost:9092") client = KafkaClient("kafka_host:9092") # consumer = SimpleConsumer(client, "test-group", "my-topic") consumer = SimpleConsumer(client, "python-group", "test") for message in consumer: print(message)
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input: (timestamp, org, domain, user_id, url, html)
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                # offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                # the consumer iterator yields OffsetAndMessage objects,
                # so split the raw message value rather than the wrapper
                fields = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = fields
                context = {
                    'source': 'datawake-visited',
                    'domain': domain
                }
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
def __init__(self, addr, group, topic): self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/user/AdReport/%s/history" %(topic) self.cached_path = "/user/AdReport/%s/cached" % (topic) self.topic = topic self.group = group self.block_cnt = 0
class CrawlerSpout(Spout):
    group = 'datawake-crawler-in-consumer'.encode()

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                           max_buffer_size=None, fetch_size_bytes=2000000)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            json.dumps(dict(
                id = 'abcdefg',  # TODO generate UUID,
                appid = self.appid,
                url = url,
                priority = 50,
                depth = 0,
                attrs = dict(
                    userId = context['userId'],
                    org = context['org'],
                    domain = context['domain']
                )
            ))
        :return:
        """
        try:
            for message in self.consumer:
                # the consumer iterator yields OffsetAndMessage objects,
                # so decode the raw message value
                to_crawl = json.loads(message.message.value)
                self.emit([to_crawl])
        except:
            self.log(traceback.format_exc(), level='error')
def __init__(self, addr, group, topic): """Initialize Consumer with kafka broker IP, group, and topic.""" self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/user/parking_data/history" self.topic = topic self.group = group self.block_cnt = 0
def initialize(self, stormconf, context):
    try:
        settings = all_settings.get_settings(stormconf['topology.deployment'])
        self.topic = settings['crawler-out-topic'].encode()
        self.conn_pool = settings['conn_pool'].encode()
        self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
        self.kafka = KafkaClient(self.conn_pool)
        self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
        self.consumer.seek(0, 2)  # move to the tail of the queue
    except:
        self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
        self.log(traceback.format_exc(), level='error')
        raise
class Consumer(object): def __init__(self, addr, group, topic): self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000, auto_offset_reset='smallest') self.temp_file_path = None self.temp_file = None self.topic = topic self.group = group self.block_cnt = 0 def consume_topic(self): timestamp = time.strftime('%Y%m%d%H%M%S') #open file for writing self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path,"w") header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time' self.temp_file.write(header) while True: try: messages = self.consumer.get_messages(count=100, block=False) for message in messages: self.temp_file.write(message.message.value + "\n") if self.temp_file.tell() > 20000: self.save_to_hdfs() self.consumer.commit() except: self.consumer.seek(0, 2) self.consumer.commit() def save_to_hdfs(self): self.temp_file.close() timestamp = time.strftime('%Y%m%d%H%M%S') hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic, timestamp) print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path self.block_cnt += 1 # place blocked messages into history and cached folders on hdfs os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path)) os.remove(self.temp_file_path) timestamp = time.strftime('%Y%m%d%H%M%S') self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w")
def _hidden_setup():
    try:
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists(
            self.settings['KAFKA_INCOMING_TOPIC'])
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings['KAFKA_GROUP'],
                                       self.settings['KAFKA_INCOMING_TOPIC'],
                                       auto_commit=True,
                                       iter_timeout=1.0)
    except KafkaUnavailableError as ex:
        message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
            .format(type(ex).__name__, ex.args)
        self.logger.error(message)
        sys.exit(1)

    return True
def _init(self, topics):
    ret = False
    while not ret:
        ret = self.create_newpath(kafka_consts.CONSUMER_PATH + '/' + self.consumer_group)
        if not ret:
            sleep(1)
    ret = False
    while not ret:
        ret = self.create_newpath(kafka_consts.CONSUMER_PATH + '/' + self.consumer_group + '/ids')
        if not ret:
            sleep(1)

    self.register()
    self.get_consumer_list()
    self.populate_broker_info()

    temptopics = [x.strip() for x in topics]
    self.topics = []
    for t in temptopics:
        if t != '' and t not in self.topics:
            self.topics.append(t)
    if not self.topics:
        raise ValueError('no topics passed')

    ret = False
    broker_ports = []
    with self.lock:
        for brid in self.broker_details:
            broker_port = self.broker_details[brid]
            broker_ports.append('{}:{}'.format(broker_port['host'], broker_port['port']))

    self.kafka_client = nsclient(broker_ports)
    self.topic_part_ids = {}
    for topic in topics:
        pids = self.kafka_client.get_partition_ids_for_topic(topic)
        self.topic_part_ids[topic] = pids

    self.consumed = {}
    self.rebalance_consumers()

    try:
        topic_partitions = {t: None for t in self.topics}
        self.kconsumer = SimpleConsumer(self.kafka_client, self.consumer_group, None,
                                        topic_partitions=self.consumed.copy())
    except Exception as e:
        logging.exception(e)
        sys.exit(1)
def init_consumer(self, my_partitions):
    if self.consumer is None:
        self.logger.warn('Starting Kafka client')
        self.client = KafkaClient(self.broker_hosts,
                                  client_id=self.zkp._identifier)
    else:
        if self.consumer is None or \
           sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
            self.logger.warn('Partitions changed, restarting Kafka consumer.')
            self.consumer.stop()
        else:
            self.logger.info('Partitions unchanged, not restarting Kafka consumer.')
            return

    self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                   partitions=my_partitions,
                                   **self.consumer_kwargs)
    self.consumer.provide_partition_info()
    self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)
class PerfConsumerSync(threading.Thread):
    running = True

    def __init__(self, factory, destination):
        self.factory = factory
        self.destination = destination
        self.consumer = SimpleConsumer(self.factory, "test-group", self.destination)
        self.rate = PerfRate()
        threading.Thread.__init__(self)

    def run(self):
        while (self.running):
            textMessage = self.consumer.get_messages(block=True, timeout=1000000)
            if (textMessage != None):
                self.rate.increment()

    def stop(self):
        self.running = False

    def start(self):
        threading.Thread.start(self)
class KafkaMonitor:

    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)
        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request

        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

    def handle_action_request(self, dict):
        '''
        Processes a valid action request

        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"
                    except ValueError:
                        print "bad json received"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)

        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(
        self,
        zk_hosts,
        group,
        topic,
        nodes,
        zk_handler=None,
        logger=None,
        identifier=None,
        **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.

        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError("Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']")
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:
                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning("Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info('Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(lambda greenlet: self.zkp.join_group)
                if not self.nodes:
                    self.logger.info('Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk, self.group, self.topic, self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk, self.group, self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'], b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn('Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info('Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
                self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
def check(self, instance):
    consumer_groups = self.read_config(instance, 'consumer_groups',
                                       cast=self._validate_consumer_groups)
    kafka_host_ports = self.read_config(instance, 'kafka_connect_str')
    full_output = self.read_config(instance, 'full_output', cast=bool)
    dimensions = self.read_config(instance, 'dimensions', cast=dict, optional=True)

    new_dimensions = {'component': 'kafka', 'service': 'kafka'}
    if dimensions is not None:
        new_dimensions.update(dimensions.copy())

    try:
        # Connect to Kafka
        kafka_conn = KafkaClient(kafka_host_ports)

        # Query Kafka for consumer offsets
        consumer_offsets = {}
        topics = defaultdict(set)
        for consumer_group, topic_partitions in consumer_groups.iteritems():
            for topic, partitions in topic_partitions.iteritems():
                consumer = SimpleConsumer(kafka_conn, consumer_group, topic)
                # Remember the topic partitions that we've seen so that we can
                # look up their broker offsets later
                topics[topic].update(set(partitions))
                for partition in partitions:
                    consumer_offsets[(consumer_group, topic, partition)] = consumer.offsets[partition]
                consumer.stop()

        # Query Kafka for the broker offsets, done in a separate loop so only
        # one query is done per topic even if multiple consumer groups watch
        # the same topic
        broker_offsets = {}
        for topic, partitions in topics.items():
            offset_responses = kafka_conn.send_offset_request([
                OffsetRequest(topic, p, -1, 1) for p in partitions])

            for resp in offset_responses:
                broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0]
    finally:
        try:
            kafka_conn.close()
        except Exception:
            self.log.exception('Error cleaning up Kafka connection')

    # Report the broker data
    if full_output:
        for (topic, partition), broker_offset in broker_offsets.items():
            # build the dimensions dict first; dict.update() returns None,
            # so it cannot be passed inline to gauge()
            broker_dimensions = new_dimensions.copy()
            broker_dimensions.update({'topic': topic, 'partition': partition})
            broker_offset = broker_offsets.get((topic, partition))
            self.gauge('kafka.broker_offset', broker_offset,
                       dimensions=broker_dimensions)

    # Report the consumer data
    for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items():
        # Get the broker offset
        broker_offset = broker_offsets.get((topic, partition))

        # Report the consumer offset and lag
        consumer_dimensions = new_dimensions.copy()
        consumer_dimensions['topic'] = topic
        consumer_dimensions['partition'] = partition
        consumer_dimensions['consumer_group'] = consumer_group
        if full_output:
            self.gauge('kafka.consumer_offset', consumer_offset,
                       dimensions=consumer_dimensions)
        self.gauge('kafka.consumer_lag', broker_offset - consumer_offset,
                   dimensions=consumer_dimensions)
class KafkaSpiderMixin(object):
    """
    Mixin class to implement reading urls from a kafka queue.

    :type kafka_topic: str
    """

    kafka_topic = None

    def process_kafka_message(self, message):
        """
        Tell this spider how to extract urls from a kafka message

        :param message: A Kafka message object
        :type message: kafka.common.OffsetAndMessage
        :rtype: str or None
        """
        if not message:
            return None

        return message.message.value

    def setup_kafka(self, settings):
        """Setup kafka connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)

        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.topic)

    def next_request(self):
        """
        Returns a request to be scheduled.

        :rtype: str or None
        """
        message = self.consumer.get_message(True)
        url = self.process_kafka_message(message)
        if not url:
            return None
        return self.make_requests_from_url(url)

    def schedule_next_request(self):
        """Schedules a request if available"""
        req = self.next_request()
        if req:
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        self.schedule_next_request()
        raise DontCloseSpider

    def item_scraped(self, *args, **kwargs):
        """Avoids waiting for the spider to idle before scheduling the next request"""
        self.schedule_next_request()
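# A minimal sketch of a spider built on KafkaSpiderMixin. The spider name,
# import path, and parse body are illustrative assumptions, not part of the
# mixin above; the key point is that setup_kafka() is called once the crawler
# object is available, so the idle/item_scraped signals can be connected.
from scrapy.spider import Spider


class ExampleKafkaSpider(KafkaSpiderMixin, Spider):
    name = 'example-kafka-spider'

    def _set_crawler(self, crawler):
        super(ExampleKafkaSpider, self)._set_crawler(crawler)
        # connect the Kafka consumer and the idle/item_scraped signals
        self.setup_kafka(crawler.settings)

    def parse(self, response):
        self.log("Crawled %s" % response.url)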