def setUp(self):
    self.redis_monitor = RedisMonitor("localsettings.py")
    self.redis_monitor.settings = self.redis_monitor.wrapper.load("localsettings.py")
    self.redis_monitor.logger = MagicMock()
    self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
    self.redis_monitor.settings['STATS_TOTAL'] = False
    self.redis_monitor.settings['STATS_PLUGINS'] = False
    self.redis_monitor.settings['PLUGINS'] = {
        'plugins.info_monitor.InfoMonitor': None,
        'plugins.stop_monitor.StopMonitor': None,
        'plugins.expire_monitor.ExpireMonitor': None,
        'tests.tests_online.CustomMonitor': 100,
    }
    self.redis_monitor.redis_conn = redis.Redis(
        host=self.redis_monitor.settings['REDIS_HOST'],
        port=self.redis_monitor.settings['REDIS_PORT'])
    self.redis_monitor._load_plugins()
    self.redis_monitor.stats_dict = {}

    self.kafka_conn = KafkaClient(self.redis_monitor.settings['KAFKA_HOSTS'])
    self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")
    self.consumer = SimpleConsumer(self.kafka_conn, "demo-id",
                                   "demo_test.outbound_firehose")
def consume_topic(topic, group, output_dir, frequency): global timestamp, tempfile_path, tempfile print "Consumer Loading topic '%s' in consumer group %s into %s..." % ( topic, group, output_dir) timestamp = standardized_timestamp(frequency) kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000) #open file for writing tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp, batch_counter) tempfile = open(tempfile_path, "w") #log_has_at_least_one = False #did we log at least one entry? while True: # get 1000 messages at a time, non blocking messages = kafka_consumer.get_messages(count=100, block=False) if not messages: #print "no messages to read" continue # If no messages are received, wait until there are more for message in messages: #log_has_at_least_one = True print(message.message.value) #tempfile.write(message.message.value + "\n") # lose the '\n'? tempfile.write(message.message.value) tempfile.write("\n") if tempfile.tell() > 12000: # file size > 120MB print "Note: file is large enough to write to hdfs. Writing now..." flush_to_hdfs(output_dir, topic) kafka_consumer.commit( ) # inform zookeeper of position in the kafka queue
def test_switch_leader_simple_consumer(self):
    producer = Producer(self.client, async=False)
    consumer = SimpleConsumer(self.client, None, self.topic,
                              partitions=None, auto_commit=False,
                              iter_timeout=10)
    self._send_random_messages(producer, self.topic, 0, 2)
    consumer.get_messages()
    self._kill_leader(self.topic, 0)
    consumer.get_messages()
def assert_message_count(self, topic, check_count, timeout=10,
                         partitions=None, at_least=False):
    hosts = ','.join(['%s:%d' % (broker.host, broker.port)
                      for broker in self.brokers])

    client = SimpleClient(hosts, timeout=2)
    consumer = SimpleConsumer(client, None, topic,
                              partitions=partitions,
                              auto_commit=False,
                              iter_timeout=timeout)

    started_at = time.time()
    pending = -1
    while pending < check_count and (time.time() - started_at < timeout):
        try:
            pending = consumer.pending(partitions)
        except FailedPayloadsError:
            pass
        time.sleep(0.5)

    consumer.stop()
    client.close()

    if pending < check_count:
        self.fail('Too few pending messages: found %d, expected %d' %
                  (pending, check_count))
    elif pending > check_count and not at_least:
        self.fail('Too many pending messages: found %d, expected %d' %
                  (pending, check_count))
    return True
def consume_topic(topic, group, output_dir, frequency): global timestamp, tempfile_path, tempfile print "Consumer Loading topic '%s' in consumer group %s into %s..." % (topic, group, output_dir) timestamp = standardized_timestamp(frequency) kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000) #open file for writing tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp, batch_counter) tempfile = open(tempfile_path, "w") #log_has_at_least_one = False #did we log at least one entry? while True: # get 1000 messages at a time, non blocking messages = kafka_consumer.get_messages(count=100, block=False) if not messages: #print "no messages to read" continue # If no messages are received, wait until there are more for message in messages: #log_has_at_least_one = True #print(message.message.value) #tempfile.write(message.message.value + "\n") # lose the '\n'? tempfile.write(message.message.value) if tempfile.tell() > 120000000: # file size > 120MB print "Note: file is large enough to write to hdfs. Writing now..." flush_to_hdfs(output_dir, topic) kafka_consumer.commit() # inform zookeeper of position in the kafka queue
def assert_message_count(self, topic, check_count, timeout=10,
                         partitions=None, at_least=False):
    hosts = ','.join(['%s:%d' % (broker.host, broker.port)
                      for broker in self.brokers])

    client = KafkaClient(hosts)
    consumer = SimpleConsumer(client, None, topic,
                              partitions=partitions,
                              auto_commit=False,
                              iter_timeout=timeout)

    started_at = time.time()
    pending = consumer.pending(partitions)

    # Keep checking if it isn't immediately correct, subject to timeout
    while pending < check_count and (time.time() - started_at < timeout):
        pending = consumer.pending(partitions)
        time.sleep(0.5)

    consumer.stop()
    client.close()

    if pending < check_count:
        self.fail('Too few pending messages: found %d, expected %d' %
                  (pending, check_count))
    elif pending > check_count and not at_least:
        self.fail('Too many pending messages: found %d, expected %d' %
                  (pending, check_count))
    return True
def createConsumer(self):
    self.consumer = SimpleConsumer(self.client,
                                   topic=self.config["topic"],
                                   group=self.config["consumerGroup"],
                                   auto_commit=True,
                                   max_buffer_size=3000000,
                                   iter_timeout=5)
def __init__(self, settings, strategy_module):
    kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
    self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
    partition_id = settings.get('SCORING_PARTITION_ID')
    if partition_id == None or type(partition_id) != int:
        raise AttributeError("Scoring worker partition id isn't set.")
    self._in_consumer = SimpleConsumer(kafka,
                                       settings.get('SCORING_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760,
                                       partitions=[partition_id])

    self._manager = FrontierManager.from_settings(settings)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
    self.outgoing_topic = settings.get('SCORING_TOPIC')
    self.strategy = strategy_module.CrawlStrategy()
    self.backend = self._manager.backend
    self.stats = {}
    self.cache_flush_counter = 0
    self.job_id = 0
def test_ts(self):
    kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))

    # consumer = SimpleConsumer(kafka, "my-group112", "test")
    consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC,
                              fetch_size_bytes=3000000,
                              buffer_size=2000000000,
                              max_buffer_size=2000000000)

    while True:
        print("HELLO")
        # Prepare data for insert and copy to S3
        # data_str = StringIO()
        count = 0
        # last_offset = 2
        consumer.seek(2, 0)
        for message in consumer.get_messages(count=100, block=False, timeout=0.1):
            count += 1
            print(message.message.value)
            # # Write tweets to StringIO
            # self.write_to_data_str(message, data_str)

        # # Store batch tweets to S3
        # self.write_to_s3(data_str, last_offset)

        if count != 100:
            break
def setup_kafka(self):
    """Setup kafka and mongo connections and the idle signal.

    This should be called after the spider has set its crawler object.

    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name
        self.topic = 'general-starturls'

    _server = self.settings.get("KAFKA_LOCATION", 'localhost:9092')
    _partition_id = int(self.settings.get('SPIDER_PARTITION_ID', 0))
    _group = self.settings.get("GROUP", "scrapy-crawler")
    _conn = KafkaClient(_server)
    self.topic1 = self.settings.get('TOPIC', 'frontier-todo')

    mongo_server = self.settings.get("MONGODB_SERVER", 'localhost')
    mongo_port = self.settings.get("MONGODB_PORT", 27017)
    self.mng_client = MongoClient(mongo_server, mongo_port)

    self.consumer = SimpleConsumer(_conn, _group, self.topic1,
                                   partitions=[_partition_id],
                                   buffer_size=131072,
                                   max_buffer_size=1048576)
    self.producer = KafkaProducer(bootstrap_servers=[_server])

    self.MONGODB_DB = self.settings.get("MONGODB_DB")
    self.MONGODB_COLLECTION = "shop"
    self.SPIDER_NAME = self.settings.get("SPIDER_NAME")
    self.JOB_NAME = self.settings.get("JOB_NAME")
    self.LOCALE = self.settings.get("LOCALE", 'us')
    self.MONGODB_DB_INPUT = self.settings.get("MONGODB_DB_INPUT", "scr")
    self.NUM_REPETE = self.settings.get("NUMBER_REPETE_SCRAPE", 7)
    self.JOB_INPUT_COLLECTION = self.settings.get("JOB_INPUT_COLLECTION", "job_input3")
    self.ITEM_INPUT_COLLECTION = self.settings.get("ITEM_INPUT_COLLECTION", 'scrap_input4')

    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from kafka topic '%s'" % self.topic1)
def test_simple_consumer_commit_does_not_raise(self):
    client = MagicMock()
    client.get_partition_ids_for_topic.return_value = [0, 1]

    def mock_offset_fetch_request(group, payloads, **kwargs):
        return [OffsetFetchResponsePayload(p.topic, p.partition, 0, b'', 0)
                for p in payloads]

    client.send_offset_fetch_request.side_effect = mock_offset_fetch_request

    def mock_offset_commit_request(group, payloads, **kwargs):
        raise FailedPayloadsError(payloads[0])

    client.send_offset_commit_request.side_effect = mock_offset_commit_request

    consumer = SimpleConsumer(client, group='foobar',
                              topic='topic', partitions=[0, 1],
                              auto_commit=False)

    # Mock internal commit check
    consumer.count_since_commit = 10

    # This should not raise an exception
    self.assertFalse(consumer.commit(partitions=[0, 1]))
def setUp(self):
    self.settings = get_project_settings()
    self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")

    # set up redis
    self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                  port=self.settings['REDIS_PORT'])
    try:
        self.redis_conn.info()
    except ConnectionError:
        print "Could not connect to Redis"
        # plugin is essential to functionality
        sys.exit(1)

    # clear out older test keys if any
    keys = self.redis_conn.keys("test-spider:*")
    for key in keys:
        self.redis_conn.delete(key)

    # set up kafka to consume potential results
    self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
    self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
    self.consumer = SimpleConsumer(self.kafka_conn,
                                   "demo-id",
                                   "demo_test.crawled_firehose",
                                   buffer_size=1024 * 100,
                                   fetch_size_bytes=1024 * 100,
                                   max_buffer_size=None)
    # move cursor to end of kafka topic
    self.consumer.seek(0, 2)
def __init__(self, name, host='web14', port=51092, **kwargs):
    QueueBase.QueueBase.__init__(self, name, host, port)
    self.__queue = []
    self.__kafka = KafkaClient('%s:%d' % (host, port))
    self.__producer = SimpleProducer(self.__kafka, async=kwargs.get('async', False))
    self.__producer.client.ensure_topic_exists(self.name)
    self.__consumer = SimpleConsumer(self.__kafka, self.name + '_consumer',
                                     self.name, auto_commit_every_n=1)
def consume(kafka_host):
    kafka = KafkaClient(kafka_host)
    consumer = SimpleConsumer(kafka, 'fetcher', cfg['kafka']['pages'])
    producer = SimpleProducer(kafka)
    consumer.max_buffer_size = 20 * 1024 * 1024
    for msg in consumer:
        page = json.loads(msg.message.value)
        process(page, producer)
    kafka.close()
def consume_save(group, topic):
    # tmp_save = open(tmp_file_path, "w")
    while True:
        kafka_consumer = SimpleConsumer(kafka, group, topic)
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            print "Consumer didn't read any messages"
        for message in messages:
            # tmp_save.write(message.message.value + "\n")
            print message.message.value + "\n"
def consume_topic(self, topic, group, temp_dir):
    '''
    Receive messages from the Friendsquare topic, save them to a temporary
    file under temp_dir, then transfer the file to hdfs.
    '''
    # Create a kafka receiver to grab messages
    kafka_receiver = SimpleConsumer(kafka, group, topic,
                                    max_buffer_size=1310720000)

    # Create a temp file to store messages
    self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
    temp_file = open(self.temp_file_path, 'w')

    hdfs_output_dir = "%s/%s" % (self.hdfs_dir, topic)
    # Create a hdfs directory to store output files
    os.system("hdfs dfs -mkdir -p %s" % hdfs_output_dir)

    while self.count < self.max_count:
        # Get 1000 messages each time
        messages = kafka_receiver.get_messages(count=1000, block=False)
        if not messages:
            continue

        # Write the messages to a file, one message per line
        for message in messages:
            temp_file.write(message.message.value + '\n')

        # Cap each file at roughly 20 MB
        if temp_file.tell() > 20000000:
            temp_file.close()

            # Put the file to hdfs
            hdfs_path = "%s/%s.txt" % (hdfs_output_dir, self.count)
            os.system("hdfs dfs -put -f %s %s" % (self.temp_file_path, hdfs_path))

            # Remove the old file
            os.remove(self.temp_file_path)

            # Create a new temp file to store messages
            self.count += 1
            self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
            temp_file = open(self.temp_file_path, 'w')

        # Inform zookeeper of position in the kafka queue
        kafka_receiver.commit()

    temp_file.close()
class HBaseServer(threading.Thread):
    """
    HBase thread that will continuously read from the Kafka queue
    """

    def __init__(self, kafka_url, kafka_topic, hbase_url, hbase_thrift_port, hbase_table):
        threading.Thread.__init__(self)
        self.kafka = KafkaClient(kafka_url)
        self.cons = SimpleConsumer(self.kafka, None, kafka_topic)
        self.cons.seek(0, 2)
        self.hbase_connect = happybase.Connection(hbase_url, hbase_thrift_port)
        self.car_table = self.hbase_connect.table(hbase_table)
        self.server_on_flag = True
        self.m = None
        self.payload = None
        self.vin = None
        self.time = None
        self.data = None
        self.row_key = None
        self.count = 0

    def run(self):
        while self.server_on_flag:
            self.m = self.cons.get_message(block=False)
            if self.m is not None:
                self.payload = json.loads(self.m.message.value)
                self.vin = str(self.payload['vin'])
                self.time = str(self.payload['timestamp'])
                self.data = str(self.payload['data'])
                self.row_key = self.vin + self.time
                try:
                    self.car_table.put(self.vin, {'user:mostrecent': self.time})
                    self.car_table.put(self.row_key, {'car:data': self.data})
                    self.count = self.count + 1
                    logger.info('HBase Server: key: %s, table: %s, car{data: %s}. Message number: %s',
                                self.row_key, 'rvi', self.data, str(self.count))
                except Exception as e:
                    logger.info('%s, Data push into HBase unsuccessful...', e)
            else:
                sleep(0.2)  # note: the original 1/5 is integer division (0) under Python 2

    def shutdown(self):
        self.server_on_flag = False
        logger.info('HBase Server shutting down...')
def consume_topic(self, topic, group, temp_dir):
    '''
    Receive messages from Kafka, save them to a temporary file first,
    then transfer the file to hdfs.
    '''
    # Create a kafka receiver to grab messages
    kafka_receiver = SimpleConsumer(kafka, group, topic,
                                    max_buffer_size=1310720000)

    self.timestamp = self.getTimestamp()
    # Create a temp file to store messages
    self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp, str(self.count))
    temp_file = open(self.temp_file_path, 'w')

    while self.count < self.max_count:
        # Get 100 messages each time
        messages = kafka_receiver.get_messages(count=100, block=False)
        if not messages:
            continue

        # Write the messages to a file, one message per line
        for message in messages:
            temp_file.write(message.message.value + '\n')

        # For structured streaming, files need to be small at this point; cap size at 2 MB
        if temp_file.tell() > 2000000:
            temp_file.close()

            # Copy the file to hdfs
            output_dir = "%s/%s" % (self.hdfs_dir, topic)
            os.system("hdfs dfs -mkdir %s" % output_dir)
            hdfs_path = "%s/%s_%s.txt" % (output_dir, self.timestamp, self.count)
            os.system("hdfs dfs -put -f %s %s" % (self.temp_file_path, hdfs_path))

            # Remove the old file
            os.remove(self.temp_file_path)

            # Create a new temp file to store messages
            self.count += 1
            self.timestamp = self.getTimestamp()
            self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp, str(self.count))
            temp_file = open(self.temp_file_path, 'w')

        # Inform zookeeper of position in the kafka queue
        kafka_receiver.commit()

    temp_file.close()
def setup_capture_new_messages_consumer(topic):
    """Seeks to the tail of the topic then returns a function that can
    consume messages from that point.
    """
    kafka = KafkaClient(get_config().cluster_config.broker_list)
    group = str('data_pipeline_clientlib_test')
    consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=_ONE_MEGABYTE)
    consumer.seek(0, 2)  # seek to tail, 0 is the offset, and 2 is the tail

    yield consumer

    kafka.close()
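# A hedged usage sketch for the fixture above: it yields the consumer and then
# closes the client, which suggests it is wrapped with @contextlib.contextmanager.
# The topic name and polling parameters below are illustrative assumptions,
# not taken from the original.
with setup_capture_new_messages_consumer("my.topic") as consumer:
    # Only messages published after the seek-to-tail above are returned.
    for message in consumer.get_messages(count=10, block=True, timeout=5):
        print(message.message.value)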
def consume_save(group, topic):
    tmp_save = open(tmp_file_path, "w")
    kafka_consumer = SimpleConsumer(kafka, group, topic)
    messages = kafka_consumer.get_messages(count=1000, block=False)
    if not messages:
        print "Consumer didn't read any messages"
    for message in messages:
        tmp_save.write(message.message.value + "\n")
        # print message.message.value + "\n"
    kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
    print ".... ... .. .."
    print "Message from topic \"%s\" consumed \n" % topic
def run(self, options=None):
    # Create table if it doesn't exist in the database
    if self.REDSHIFT.if_table_exists(self.TABLE_NAME) is False:
        self.REDSHIFT.execute(self.CREATE_TRACKING_TABLE)

    kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))
    consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC,
                              fetch_size_bytes=3000000,
                              buffer_size=2000000000,
                              max_buffer_size=2000000000)

    while True:
        # Prepare data for insert and copy to S3
        data_str = StringIO()
        csv_str = StringIO()
        count = 0

        # Get offset from previous read
        s3_last_offset = self.get_s3_offset()
        last_offset = self.REDSHIFT.select(self.GET_OFFSET_QUERY)[0][0]
        last_offset = last_offset if last_offset else 0

        # Resolve difference in offset (s3 offset does not carry over from day to day)
        if s3_last_offset > last_offset:
            last_offset = s3_last_offset
            self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY %
                                  (self.GROUP_NAME, self.PARTITION, last_offset))
        print(last_offset)

        # Read from offset
        consumer.seek(last_offset, 0)
        for message in consumer.get_messages(count=self.BATCH_SIZE, block=False, timeout=5):
            # Write tweets to StringIO
            self.write_to_data_str(message, data_str, csv_str)
            count += 1
            last_offset += 1

        # Store batch tweets to S3
        self.write_to_s3(data_str, csv_str, last_offset)

        # Track Kafka offset
        self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY %
                              (self.GROUP_NAME, self.PARTITION, last_offset))

        if count != self.BATCH_SIZE:
            break
def test_simple_consumer_failed_payloads(self):
    client = MagicMock()
    consumer = SimpleConsumer(client, group=None,
                              topic='topic', partitions=[0, 1],
                              auto_commit=False)

    def failed_payloads(payload):
        return FailedPayloadsError(payload)

    client.send_fetch_request.side_effect = self.fail_requests_factory(failed_payloads)

    # This should not raise an exception
    consumer.get_messages(5)
def __init__(self, kafka_addr, topic, vin, web_url):
    threading.Thread.__init__(self)
    self.kafka = KafkaClient(kafka_addr)
    self.cons = SimpleConsumer(self.kafka, None, topic)
    self.cons.seek(0, 2)
    self.vin = vin
    self.web_url = web_url
    self.flag = True
    self.count = 0
    self.sleep_count = 0
    self.headers = {'Content-Type': 'application/json'}
def test_simple_consumer_reset_partition_offset(self):
    client = MagicMock()

    def mock_offset_request(payloads, **kwargs):
        raise FailedPayloadsError(payloads[0])

    client.send_offset_request.side_effect = mock_offset_request

    consumer = SimpleConsumer(client, group='foobar',
                              topic='topic', partitions=[0, 1],
                              auto_commit=False)

    # This should not raise an exception
    self.assertEqual(consumer.reset_partition_offset(0), None)
def _connect_consumer(self):
    if self._cons is None:
        try:
            self._cons = SimpleConsumer(self._conn,
                                        self._group,
                                        self._topic,
                                        partitions=self._partition_ids,
                                        buffer_size=1048576,
                                        max_buffer_size=10485760)
        except BrokerResponseError:
            self._cons = None
            logger.warning("Could not connect consumer to Kafka server")
            return False
    return True
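# A hedged sketch of how a caller might drive _connect_consumer() above:
# retry until the consumer connects, then poll. The poll_forever() helper,
# its `handle` callback and the sleep interval are assumptions, not part of
# the original class.
import time

def poll_forever(worker, handle, sleep_seconds=5.0):
    # `worker` is assumed to expose the _connect_consumer() and _cons
    # attributes shown above; `handle` is a hypothetical message callback.
    while True:
        if not worker._connect_consumer():
            time.sleep(sleep_seconds)  # broker unavailable, retry later
            continue
        for message in worker._cons.get_messages(count=100, block=False):
            handle(message.message.value)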
def kafka_consumer(kafka_hosts, schema_host, schema_port, topic, consumer_group="python"):
    """
    Consume records from the given Kafka topic (batch, not real-time, consumption).
    :param kafka_hosts:
    :param schema_host:
    :param schema_port:
    :param topic:
    :param consumer_group:
    :return:
    """
    # Fetch the latest schema for the topic
    topic_schema, topic_schema_id, schema_version = get_latest_schema_info(
        schema_host, schema_port, topic)

    # Consume Kafka records
    client = KafkaClient(hosts=kafka_hosts)
    simple_consumer = SimpleConsumer(client, consumer_group, topic,
                                     auto_offset_reset="smallest")
    collect_logs = []  # holds each record's partition, offset and value
    msg_exist = True
    while msg_exist:
        msg = simple_consumer.get_message(get_partition_info=True)
        # print "kafka log:", msg
        # Stop consuming once a fetch returns None
        if msg is None:
            msg_exist = False
        else:
            msg_partition = msg[0]
            msg_offset = msg[1].offset
            msg_value = msg[1].message.value

            # Decode a single record
            bytes_msg = io.BytesIO(msg_value[5:])
            decode_msg = avro.io.BinaryDecoder(bytes_msg)
            recode_msg = avro.io.DatumReader(avro.schema.parse(topic_schema)).read(decode_msg)

            # Collect the partition, offset and value of this record
            msg_collect = [msg_partition, msg_offset, recode_msg]
            collect_logs.append(msg_collect)

    collect_logs.sort(key=lambda x: x[0])  # sort by partition id
    print "+++++++Topic: %s+++++++" % topic
    for index, log in enumerate(collect_logs):
        print index, log
    print "Successfully received."
    return collect_logs
class RVIConsumer(threading.Thread):

    def __init__(self, kafka_addr, topic, vin, web_url):
        threading.Thread.__init__(self)
        self.kafka = KafkaClient(kafka_addr)
        self.cons = SimpleConsumer(self.kafka, None, topic)
        self.cons.seek(0, 2)
        self.vin = vin
        self.web_url = web_url
        self.flag = True
        self.count = 0
        self.sleep_count = 0
        self.headers = {'Content-Type': 'application/json'}

    def is_running(self):
        return self.flag

    def run(self):
        while self.flag:
            # cons = SimpleConsumer(kafka, None, 'rvi')
            m = self.cons.get_message(block=False)
            if m is not None:
                payload = json.loads(m.message.value)
                if payload['vin'] == self.vin:
                    self.sleep_count = 0
                    payloadtoweb = json.dumps(m.message.value)
                    r = requests.post(self.web_url, data=payloadtoweb, headers=self.headers)
                    if r.status_code == 200:
                        print m.message.value + " sent successfully\n"
                    else:
                        print "%s is not available, status code:%d...shutting down now..." % (self.web_url, r.status_code)
                        self.shutdown()
            else:
                if self.sleep_count > 100000:
                    print "No new data for %s... Timing out" % self.vin
                    self.shutdown()
                time.sleep(0.2)  # note: the original 1/5 is integer division (0) under Python 2
                self.sleep_count = self.sleep_count + 1

    def shutdown(self):
        self.flag = False
        requests.post(self.web_url,
                      data=json.dumps({'vin': self.vin, 'data': 'EOM'}),
                      headers=self.headers)
        print "%s consumer thread shutting down" % self.vin
def read_kafka():
    """
    Read socialSignal messages, keep them if none of the signals are zero,
    and save them to mongo social/socialSignal.
    :return:
    """
    msg_buffer = dict()
    ids = set()
    in_kafka = KafkaClient(settings.IN_SOCIAL_SIGNAL_KAFKA)
    consumer = SimpleConsumer(in_kafka, 'comment.pages1', 'comment.pages',
                              max_buffer_size=20 * 1024 * 1024,
                              fetch_size_bytes=2 * 1024 * 1024,
                              buffer_size=2 * 1024 * 1024)
    consumer.seek(0, 0)
    for msg in consumer:
        if "001WxC6D" in msg.message.value:
            print msg.message.value
def validate_samza_job():
    """
    Validates that negate-number negated all messages, and sent the output to
    samza-test-topic-output.
    """
    logger.info("Running validate_samza_job")
    kafka = util.get_kafka_client()
    kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
    consumer = SimpleConsumer(kafka, "samza-test-group", TEST_OUTPUT_TOPIC)
    messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=300)
    message_count = len(messages)
    assert NUM_MESSAGES == message_count, "Expected {0} lines, but found {1}".format(NUM_MESSAGES, message_count)
    for message in map(lambda m: m.message.value, messages):
        assert int(message) < 0, "Expected negative integer but received {0}".format(message)
    kafka.close()
def consume_save(group, topic):
    i = 0
    tmp_save = open(tmp_file_path, "w")
    while True:
        kafka_consumer = SimpleConsumer(kafka, group, topic)
        messages = kafka_consumer.get_messages(count=1000, block=False)
        # if not messages:
        #     print "Consumer didn't read any messages"
        for message in messages:
            tmp_save.write(message.message.value + "\n")
            print message.message.value + "\n"
        # file size > 20MB
        if tmp_save.tell() > 20000000:
            push_to_hdfs(tmp_file_path)
        kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
def validate_samza_job():
    """
    Validates that negate-number negated all messages, and sent the output to
    samza-test-topic-output.
    """
    logger.info('Running validate_samza_job')
    kafka = _get_kafka_client()
    kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
    consumer = SimpleConsumer(kafka, 'samza-test-group', TEST_OUTPUT_TOPIC)
    messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=60)
    message_count = len(messages)
    assert NUM_MESSAGES == message_count, 'Expected {0} lines, but found {1}'.format(NUM_MESSAGES, message_count)
    for message in map(lambda m: m.message.value, messages):
        assert int(message) < 0, 'Expected negative integer but received {0}'.format(message)
    kafka.close()
class Consumer(Thread):

    def __init__(self, args=()):
        super(Consumer, self).__init__()
        self.host = args[0]
        self.port = args[1]
        self.topic = args[2]
        print '[KafkaConsumer] host: {0}, port: {1}, topic: {2}'.format(self.host, self.port, self.topic)
        self.consumer = None
        self.consumer_keep_run = True
        self.consumer_paused = False
        self.consumer_subscribers = []

    def run(self):
        client = kafka_client(self.host, self.port)
        self.consumer = SimpleConsumer(client, None, self.topic)
        self.consumer.seek(0, 1)
        while self.consumer_keep_run:
            print '[KafkaConsumer] looping..'
            if not self.consumer_paused:
                for message in self.consumer.get_messages(block=False):
                    offset = message.offset
                    value = message.message.value
                    j_encoded = json.dumps({'offset': offset, 'message': value})
                    print '[KafkaConsumer] {}'.format(j_encoded)
                    for subscriber in self.consumer_subscribers:
                        IOLoop.instance().add_callback(partial(subscriber.send_message, j_encoded))
            time.sleep(1)

    def pause_consumer(self, paused):
        self.consumer_paused = paused

    def stop_consumer(self):
        self.consumer_keep_run = False

    def add_subscriber(self, subscriber):
        self.consumer_subscribers.append(subscriber)

    def remove_subscriber(self, subscriber):
        self.consumer_subscribers.remove(subscriber)

    def get_subscribers_length(self):
        return len(self.consumer_subscribers)

    def get_subscribers(self):
        return self.consumer_subscribers
def serve_user(user):
    consumer = SimpleConsumer(CLIENT, 'testing', 'user{}_sess{}'.format(user, user))
    msg = consumer.get_message()
    RECEIVE_TIME = time.time()
    color = 'yellow'
    S_R_LAG = RECEIVE_TIME - SEND_TIME if SEND_TIME else None
    if msg:
        print("received message: {} delay: {}".format(msg.message.value.decode(), S_R_LAG))
        if msg.message.value.decode() == 'True':
            color = 'green'
        else:
            color = 'red'
    return render_template('keylog.html', bgcolor=color)
class ScoringWorker(object):
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id == None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlingStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0

    def work(self):
        consumed = 0
        batch = []
        fingerprints = set()
        try:
            for m in self._in_consumer.get_messages(count=self.consumer_batch_size,
                                                    block=True, timeout=1.0):
                try:
                    msg = self._decoder.decode(m.message.value)
                except (KeyError, TypeError), e:
                    logger.error("Decoding error: %s", e)
                    continue
                else:
                    type = msg[0]
                    batch.append(msg)
                    if type == 'add_seeds':
                        _, seeds = msg
                        fingerprints.update(map(lambda x: x.meta['fingerprint'], seeds))
                        continue
                    if type == 'page_crawled':
                        _, response, links = msg
                        fingerprints.add(response.meta['fingerprint'])
                        fingerprints.update(map(lambda x: x.meta['fingerprint'], links))
                        continue
                    if type == 'request_error':
                        _, request, error = msg
                        fingerprints.add(request.meta['fingerprint'])
                        continue
                    raise TypeError('Unknown message type %s' % type)
                finally:
                    consumed += 1
def init_get_stream(self, get_message_stream, queue_name_spec, starting_marker,
                    echo_requested, include_claimed):
    self.logger.info("KafkaDriver prepare_to_get_messages got: queue_name=%s, echo_requested=%s, include_claimed=%s, starting_marker=%s"
                     % (queue_name_spec, str(echo_requested), str(include_claimed), starting_marker))
    self.logger.info("warning: KafkaDriver ignores echo_requested and include_claimed in GET requests")

    self.consume_group = "cg1"  # default consume group
    if len(starting_marker) > 0:
        self.consume_group = starting_marker
    self.logger.info("consume group=" + self.consume_group)

    # If the queue name ends with "/<n>", interpret <n> as the partition to read from
    queue_name, _, partition_part = queue_name_spec.partition("/")
    if not partition_part:
        partition = None
    else:
        partition = int(partition_part)
        self.logger.info("limiting topic %s to partition %d" % (queue_name, partition))

    self.get_message_stream = get_message_stream
    self.queue_name = str(queue_name)
    self.consumer = SimpleConsumer(
        client=self.kafka,
        group=self.consume_group,
        topic=self.queue_name,
        partitions=[partition],
        # No commits of any kind: with kafka-python 0.9.1 against Kafka older than 0.8.1,
        # OffsetFetchRequest/OffsetCommitRequest (request types 9/8) are not supported.
        auto_commit=False,
        fetch_size_bytes=self.MAX_KAFKA_REQ_BATCH_MSGS * 4096,  # in Marconi, messages can be up to 4k
        iter_timeout=None,
    )
    self.logger.debug("KafkaDriver: seeking to head of %s" % (self.queue_name))
    self.consumer.seek(0, 0)  # seek to head of topic; TODO: should get starting position from starting_marker param
    self.periodically_check_for_new_messages()  # kick off periodic retrieval of new messages (space permitting)
class KafkaConsumer(object):
    def __init__(self, conf):
        self.log = logging.getLogger(__name__)
        self.client = KafkaClient(conf["kafka_server"])
        self.total_inserts = 0
        self.inserts = 0
        self.listenstore = None

    def start_listens(self, listenstore):
        self.listenstore = listenstore
        return self.start(b"listen-group", b"listens")

    def start(self, group_name, topic_name):
        self.group_name = group_name
        self.topic_name = topic_name
        self.log.info("KafkaConsumer subscribed to %s -> %s" % (group_name, topic_name))
        self.consumer = SimpleConsumer(self.client, self.group_name, self.topic_name)

        t0 = 0
        last_offset = -1
        while True:
            listens = []
            if t0 == 0:
                t0 = time()

            messages = self.consumer.get_messages(count=CASSANDRA_BATCH_SIZE,
                                                  block=True,
                                                  timeout=KAFKA_READ_TIMEOUT)
            for message in messages:
                try:
                    data = ujson.loads(message.message.value)
                    listens.append(Listen.from_json(data))
                except ValueError as e:
                    self.log.error("Cannot parse JSON: %s\n'%s'" % (str(e), message.message.value))
                    continue
                last_offset = message.offset

            if listens:
                broken = True
                while broken:
                    try:
                        self.listenstore.insert_batch(listens)
                        broken = False
                    except ValueError as e:
                        self.log.error("Cannot insert listens: %s" % unicode(e))
                        broken = False
                    except NoHostAvailable as e:
                        self.log.error("Cannot insert listens: %s. Sleeping, trying again." % unicode(e))
                        sleep(5)

            self.inserts += len(messages)
            if self.inserts >= REPORT_FREQUENCY:
                t1 = time()
                self.total_inserts += self.inserts
                self.log.info("Inserted %d rows in %.1fs (%.2f listens/sec). Total %d rows. last offset: %d" %
                              (self.inserts, t1 - t0, self.inserts / (t1 - t0), self.total_inserts, last_offset))
                self.inserts = 0
                t0 = 0
def __init__(self, kafka_hostport, topic, group=None, **kwargs):
    if not group:
        group = str(uuid.uuid4())

    self.kafka = get_client(kafka_hostport)
    self.consumer = SimpleConsumer(self.kafka, group, topic,
                                   max_buffer_size=1048576 * 32,
                                   **kwargs)
def kafka_stream():
    # global visualization_topic
    # topic = visualization_topic
    # print "DEBUG stream topic: " + topic
    topic = "web"
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "python", topic)
    consumer.seek(offset=0, whence=2)
    # topic = None

    def gen():
        for message in consumer:
            yield 'data: %s\n\n' % str(message.message.value)

    print "DEBUG: Kafka Stream Connected"
    return Response(gen(), mimetype="text/event-stream")
def test_simple_consumer_unknown_topic_partition(self):
    client = MagicMock()
    consumer = SimpleConsumer(client, group=None,
                              topic='topic', partitions=[0, 1],
                              auto_commit=False)

    # Mock so that only the first request gets a valid response
    def unknown_topic_partition(request):
        return FetchResponsePayload(request.topic, request.partition,
                                    UnknownTopicOrPartitionError.errno, -1, ())

    client.send_fetch_request.side_effect = self.fail_requests_factory(unknown_topic_partition)

    # The unknown topic/partition error should propagate to the caller
    with self.assertRaises(UnknownTopicOrPartitionError):
        consumer.get_messages(20)
def __init__(self, settings, no_batches, no_scoring, no_incoming):
    self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
    self._producer = KeyedProducer(self._kafka, partitioner=Crc32NamePartitioner,
                                   codec=CODEC_SNAPPY)

    self._in_consumer = SimpleConsumer(self._kafka,
                                       settings.get('FRONTIER_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760)
    if not no_scoring:
        self._scoring_consumer = SimpleConsumer(self._kafka,
                                                settings.get('FRONTIER_GROUP'),
                                                settings.get('SCORING_TOPIC'),
                                                buffer_size=262144,
                                                max_buffer_size=1048576)

    self._offset_fetcher = Fetcher(self._kafka, settings.get('OUTGOING_TOPIC'),
                                   settings.get('FRONTIER_GROUP'))
    self._manager = FrontierManager.from_settings(settings)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
    self.outgoing_topic = settings.get('OUTGOING_TOPIC')
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring,
                     no_batches, no_scoring, settings.get('NEW_BATCH_DELAY', 60.0),
                     no_incoming)
    self.job_id = 0
    self.stats = {}
def read_kafka(docid):
    """
    Read socialSignal messages and print those whose value contains docid.
    :return:
    """
    msg_buffer = dict()
    ids = set()
    in_kafka = KafkaClient(settings.IN_SOCIAL_SIGNAL_KAFKA)
    consumer = SimpleConsumer(in_kafka, 'test0', TOPIC,
                              max_buffer_size=20 * 1024 * 1024,
                              fetch_size_bytes=2 * 1024 * 1024,
                              buffer_size=2 * 1024 * 1024)
    consumer.seek(6000000, 0)
    for msg in consumer:
        if msg.offset % 100000 == 0:
            print 'working on ', msg.offset
        if docid in msg.message.value:
            print msg.message.value
def __init__(self, topic, kafka_broker, consumer_group):
    self.kafka = KafkaClient(kafka_broker)
    self.consumer = SimpleConsumer(self.kafka, consumer_group, topic,
                                   fetch_size_bytes=self.__max_buffer_size,
                                   buffer_size=self.__max_buffer_size,
                                   max_buffer_size=self.__max_buffer_size)
def main():
    """
    Usage: dump_to_mongodb dump <topic> --host=<host> [--consumer=<consumer>]
    """
    args = docopt(main.__doc__)
    host = args["--host"]
    print "=> Connecting to {0}...".format(host)
    logger.info("=> Connecting to {0}...".format(host))
    kafka = KafkaClient(host)
    print "=> Connected."
    logger.info("=> Connected.")
    if args["dump"]:
        topic = args["<topic>"]
        consumer_id = args["--consumer"] or "dump_to_mongodb"
        consumer = SimpleConsumer(kafka, consumer_id, topic,
                                  buffer_size=1024 * 200,       # 200 KB
                                  fetch_size_bytes=1024 * 200,  # 200 KB
                                  max_buffer_size=None)         # eliminate big message errors
        consumer.seek(0, 1)
        while True:
            try:
                message = consumer.get_message()
                if message is None:
                    time.sleep(1)
                    continue
                val = message.message.value
                logger.info("message.message.value == %s" % val)
                print('val==', val)
                try:
                    item = json.loads(val)
                except:
                    continue
                if 'meta' in item and 'collection_name' in item['meta']:
                    _insert_item_to_monggodb(item)
            except:
                traceback.print_exc()
                break
    kafka.close()
    return 0
def consume_topic(topic, group, output_dir, frequency): global timestamp, tempfile_path, tempfile print "Consuming from topic '%s' in consumer group %s into %s..." % ( topic, group, output_dir) #get timestamp kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000) while True: messages = kafka_consumer.get_messages( count=1000, block=False) #get 5000 messages at a time, non blocking if not messages: os.system("sleep 30s") continue #break for message in messages: #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message')) print message kafka_consumer.commit() #save position in the kafka queue #exit loop if log_has_at_least_one: flush_to_hdfs(output_dir, topic) kafka_consumer.commit() #save position in the kafka queue return 0
def consume_topic(topic, group, output_dir, frequency): global timestamp, tempfile_path, tempfile print "Consuming from topic '%s' in consumer group %s into %s..." % ( topic, group, output_dir) #get timestamp timestamp = standardized_timestamp(frequency) kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000) #open file for writing tempfile_path = "/tmp/kafka_%s_%s_%s_%s.dat" % (topic, group, timestamp, batch_counter) tempfile = open(tempfile_path, "w") log_has_at_least_one = False #did we log at least one entry? while True: messages = kafka_consumer.get_messages( count=1000, block=False) #get 1000 messages at a time, non blocking if not messages: break for message in messages: #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message')) log_has_at_least_one = True #print(message.message.value) tempfile.write(message.message.value + "\n") if tempfile.tell() > 10000000: #file size > 10MB flush_to_hdfs(output_dir, topic) kafka_consumer.commit() #exit loop if log_has_at_least_one: flush_to_hdfs(output_dir, topic) kafka_consumer.commit() #save position in the kafka queue return 0
def connection_time():
    cnt = 0
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "test", "twitter")
    start_time = time.time()
    for msg in consumer:
        cnt += 1
        if cnt > 0:
            return time.time() - start_time
class KafkaSpout(Spout):

    def initialize(self, stormconf, context):
        # self.words = itertools.cycle(['dog', 'cat', 'zebra', 'elephant'])
        self.kafka = KafkaClient("cloud.soumet.com:9092")
        self.consumer = SimpleConsumer(self.kafka, "storm", "realtime",
                                       max_buffer_size=1310720000)

    def next_tuple(self):
        for message in self.consumer.get_messages(count=500, block=False):  # , timeout=1):
            # transaction_data = TransactionFull()
            # transaction_data.ParseFromString(base64.b64decode(message.message.value))
            # self.emit([transaction_data])
            self.emit([message.message.value])
        self.consumer.commit()
class QueueKafka(QueueBase.QueueBase):

    @QueueBase.catch
    def __init__(self, name, host='web14', port=51092, **kwargs):
        QueueBase.QueueBase.__init__(self, name, host, port)
        self.__queue = []
        self.__kafka = KafkaClient('%s:%d' % (host, port))
        self.__producer = SimpleProducer(self.__kafka, async=kwargs.get('async', False))
        self.__producer.client.ensure_topic_exists(self.name)
        self.__consumer = SimpleConsumer(self.__kafka, self.name + '_consumer', self.name,
                                         auto_commit_every_n=1)

    def __del__(self):
        if self.__kafka:
            [self.put(x.message.value) for x in self.__queue]
            self.__kafka.close()

    @QueueBase.catch
    def put(self, value, *args, **kwargs):
        if isinstance(value, dict) or isinstance(value, list):
            self.__producer.send_messages(self.name, json.dumps(value))
        else:
            self.__producer.send_messages(self.name,
                                          value.encode('utf-8') if isinstance(value, unicode) else value)

    @QueueBase.catch
    def get(self, *args, **kwargs):
        if not self.__queue:
            self.__consumer._fetch()
            kq = self.__consumer.queue
            while not kq.empty():
                partition, result = kq.get_nowait()
                self.__queue.append(result)
                self.__consumer.offsets[partition] += 1
                self.__consumer.count_since_commit += 1
            self.__consumer.queue = Queue()
            self.__consumer.commit()
        return self.__queue.pop().message.value if self.__queue else None

    @QueueBase.catch
    def size(self, *args, **kwargs):
        count = 0
        for k, v in self.__consumer.offsets.items():
            reqs = [common.OffsetRequest(self.name, k, -1, 1)]
            (resp,) = self.__consumer.client.send_offset_request(reqs)
            count += (resp.offsets[0] - v)
        return count + len(self.__queue)