class SimpleProducer(BaseStreamProducer):
    def __init__(self, location, enable_ssl, cert_path, topic, compression, **kwargs):
        self._location = location
        self._topic = topic
        self._compression = compression
        self._create(enable_ssl, cert_path, **kwargs)

    def _create(self, enable_ssl, cert_path, **kwargs):
        max_request_size = kwargs.pop('max_request_size', DEFAULT_MAX_REQUEST_SIZE)
        kwargs.update(_prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {})
        self._producer = KafkaProducer(bootstrap_servers=self._location,
                                       retries=5,
                                       compression_type=self._compression,
                                       max_request_size=max_request_size,
                                       **kwargs)

    def send(self, key, *messages):
        for msg in messages:
            self._producer.send(self._topic, value=msg)

    def flush(self):
        self._producer.flush()

    def close(self):
        self._producer.close()
def test_end_to_end(kafka_broker):
    connect_str = 'localhost:' + str(kafka_broker.port)
    producer = KafkaProducer(bootstrap_servers=connect_str,
                             max_block_ms=10000,
                             value_serializer=str.encode)
    consumer = KafkaConsumer(bootstrap_servers=connect_str,
                             group_id=None,
                             consumer_timeout_ms=10000,
                             auto_offset_reset='earliest',
                             value_deserializer=bytes.decode)

    topic = random_string(5)

    for i in range(1000):
        producer.send(topic, 'msg %d' % i)
    producer.flush()
    producer.close()

    consumer.subscribe([topic])
    msgs = set()
    for i in range(1000):
        try:
            msgs.add(next(consumer).value)
        except StopIteration:
            break

    assert msgs == set(['msg %d' % i for i in range(1000)])
def run():
    parser = get_args_parser()
    try:
        parse_result = parser.parse_args()

        topic_name = parse_result.topic
        num_records = parse_result.num_records
        record_size = parse_result.record_size
        producer_props = parse_result.producer_config

        props = {}
        for prop in producer_props:
            k, v = prop.split('=')
            try:
                v = int(v)
            except ValueError:
                pass
            props[k] = v

        producer = KafkaProducer(**props)
        record = bytes(bytearray(record_size))
        stats = Stats(num_records, 5000)
        for i in xrange(num_records):
            send_start_ms = get_time_millis()
            future = producer.send(topic=topic_name, value=record)
            future.add_callback(stats.next_completion(send_start_ms, record_size, stats))

        producer.close()
        stats.print_total()
    except Exception as e:
        exc_info = sys.exc_info()
        traceback.print_exception(*exc_info)
        sys.exit(1)
def sendSingleMsg2Kafka(msg):
    if not msg:
        return
    producer = KafkaProducer(bootstrap_servers='10.128.184.167:9092')
    producer.send('topic_lpr', msg.encode('utf8'))
    producer.flush()
    producer.close(timeout=5)
def produce_to_bruce(schema, args, config):
    topic = config['kafka']['topic']

    if args.partition_count:
        partition_count = args.partition_count
    else:
        print 'fetch partition info for topic ' + topic
        producer = KafkaProducer(bootstrap_servers=config['kafka']['brokers'])
        partition_count = 1 + max(producer.partitions_for(topic))
        producer.close()

    socket = bruce.open_bruce_socket()

    # batching socket send
    buff = []

    def flush_buff():
        for msg in buff:
            socket.sendto(msg, '/var/run/bruce/bruce.socket')
        del buff[:]

    def f_produce(topic, partition, key, value):
        if len(buff) < 1000:
            buff.append(bruce.create_msg(partition, topic, bytes(key), bytes(value)))
        else:
            flush_buff()

    try:
        bootstrap(f_produce, partition_count, schema, args.database, args.table, config)
        flush_buff()
    except KeyboardInterrupt:
        sys.exit(1)
    finally:
        socket.close()
def send_message(self, message, topic_partition):
    self._logger.info("Sending message to: Topic: {0} Partition:{1}".format(self._topic, topic_partition))
    kafka_brokers = '{0}:{1}'.format(self._server, self._port)
    producer = KafkaProducer(bootstrap_servers=[kafka_brokers], api_version_auto_timeout_ms=3600000)
    future = producer.send(self._topic, message, partition=topic_partition)
    producer.flush()
    producer.close()
def run(self): producer = KafkaProducer(bootstrap_servers='localhost:9092') while not self.stop_event.is_set(): producer.send('my-topic', b"test") producer.send('my-topic', b"\xc2Hola, mundo!") time.sleep(1) producer.close()
def run(self):
    producer = KafkaProducer(bootstrap_servers='localhost:9092')

    while not self.stop_event.is_set():
        print("Sending message from: " + str(threading.get_ident()))
        producer.send('my-topic', b"test")
        producer.send('my-topic', b"\xc2Hola, mundo!")
        time.sleep(0.2)

    producer.close()
def _push(self, payload):
    if super(KafkaService, self)._push(payload):
        LOGGER.info("Pushing payload to kafka: %s", str(payload))
        brokers = self.destination_config['brokers'].split(',')
        topic = self.destination_config['topic']
        kafka_producer = KafkaProducer(bootstrap_servers=brokers)
        for values in payload:
            kafka_producer.send(topic, str(values).encode('utf-8'))
        kafka_producer.flush(3)
        kafka_producer.close(3)
    else:
        LOGGER.warn("Payload is none, nothing to push.")
def producer_():
    from kafka import KafkaProducer
    producer = KafkaProducer(bootstrap_servers='192.168.1.101:9092')
    msg_dict = {
        'msg': 'Hello World',
        'sleep_time': 10,
        'db_config': 'retry'
    }
    msg = json.dumps(msg_dict)
    producer.send('test_rhj', msg, partition=0)
    producer.close()
def main(directory, topic, byline):
    # get a hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)

    # a global variable
    global producer

    # Get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"], compression_type='gzip', acks=1, retries=2)

    for myfile in myPath:
        # Skip directories (no recursion)
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" % (myfile))
            continue
        elif myfile["kind"] == "file":
            pass
        else:
            raise Exception, "Unknown kind %s for %s" % (myfile["kind"], myfile["name"])

        # Skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" % (myfile))
            continue

        # Skip 0 size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" % (myfile))
            continue

        logger.info("Working on %s" % (myfile["name"]))

        # call processChunk if I want to submit chunks
        if byline is False:
            processChunk(myfile, topic)
        else:
            # Otherwise submit line by line
            processLine(myfile, topic)

        # with file open
        logger.info("Completed %s" % (myfile["name"]))

        # sleep some time
        time.sleep(1)

    # for all files in HDFS
    producer.close()
def produce_to_kafka(schema, args, config):
    topic = config['kafka']['topic']
    producer = KafkaProducer(bootstrap_servers=config['kafka']['brokers'])

    def f_produce(topic, partition, key, value):
        producer.send(topic, key=key, value=value, partition=partition)

    partition_count = 1 + max(producer.partitions_for(topic))
    try:
        bootstrap(f_produce, partition_count, schema, args.database, args.table, config)
    except KeyboardInterrupt:
        sys.exit(1)
    producer.flush()
    producer.close()
class KafkaPublisher(AbstractPublisher):
    """
    Publisher that uses Kafka as the message broker.
    """

    # noinspection PyAttributeOutsideInit
    def custom_init(self):
        self._producer = KafkaProducer(bootstrap_servers=frame_config.KAFKA_BOOTSTRAP_SERVERS)
        self._admin_client = KafkaAdminClient(bootstrap_servers=frame_config.KAFKA_BOOTSTRAP_SERVERS)
        try:
            self._admin_client.create_topics([NewTopic(self._queue_name, 10, 1)])
            # admin_client.create_partitions({self._queue_name: NewPartitions(total_count=16)})
        except TopicAlreadyExistsError:
            pass
        except Exception as e:
            self.logger.exception(e)
        atexit.register(self.close)  # If the producer is not closed before the program exits, an error is raised.

    def concrete_realization_of_publish(self, msg):
        # noinspection PyTypeChecker
        # self.logger.debug(msg)
        # print(msg)
        self._producer.send(self._queue_name, msg.encode(), )

    def clear(self):
        self.logger.warning('Clearing Kafka messages is not implemented yet')
        # self._consumer.seek_to_end()
        # self.logger.warning('Reset the Kafka offset to the latest position')

    def get_message_count(self):
        # return -1  # No way found yet to get the unconsumed message count across all partitions.
        # print(self._admin_client.list_consumer_group_offsets('frame_group'))
        # print(self._admin_client.describe_consumer_groups('frame_group'))
        return -1

    def close(self):
        self._producer.close()

    def _at_exit(self):
        self._producer.flush()
        super()._at_exit()
class KafkaTweetSerializer:
    _producer = None

    def __init__(self, host='localhost', port='9092'):
        kafka_server = "{0}:{1}".format(host, str(port))
        self._producer = KafkaProducer(bootstrap_servers=[kafka_server],
                                       value_serializer=lambda v: json.dumps(v).encode('utf-8'))

    def write(self, message):
        self._producer.send(topic='tweets', value=message)
        self._producer.flush()
        print "Tweet!"

    def end(self):
        self._producer.close()
class SessionProducer(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.producer = KafkaProducer(bootstrap_servers=['localhost:9092'])

    def close(self):
        self.producer.close()

    def run(self):
        for session_file in glob.glob('./entree/session/session.*'):
            session = Utilities.getSessionData(session_file)
            print(session)
            msg = json.dumps(session).encode('utf-8')
            self.producer.send('session_data', msg)
            time.sleep(1)
class Producer:
    def __init__(self, bootstrap_servers):
        self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

    def send(self, topic, key=None, value=None):
        try:
            key = key.encode() if key else None
            value = value.encode() if value else None
            self.producer.send(topic, key=key, value=value)
        except Exception:
            raise
        finally:
            self.producer.flush()

    def close(self):
        self.producer.close()
def run(self, topic_name):
    producer = KafkaProducer(bootstrap_servers=self.bootstrap_server,
                             acks=self.ack,
                             retries=self.retries,
                             linger_ms=self.linger_ms,
                             key_serializer=str.encode,
                             value_serializer=str.encode)

    for key in range(10):
        # Since a key is specified, the message is routed to the partition derived from that key.
        producer.send(topic_name, key=str(key), value="message sent %s" % key)

    producer.close()
def print_lines(filepath, kafka_topic=None):
    pr = print
    producer = None
    if kafka_topic:
        from kafka import KafkaProducer
        producer = KafkaProducer(bootstrap_servers="kafka:9092")
        pr = lambda line: producer.send(kafka_topic, line.encode('utf-8'))
    try:
        for line in generate_lines(filepath):
            pr(line)
            time.sleep(0.2)
    finally:
        if producer:
            producer.close()
def run(self):
    # Bootstraps an instance of a Kafka producer.
    # Initializes the producer and identifies the docker server.
    # kafka-spotify is listed in /etc/hosts with the ip of the container
    producer = KafkaProducer(bootstrap_servers='kafka-spotify:9092')

    # loop until the thread is stopped by checking the stop event
    while not self.stop_event.is_set():
        # Send two messages of type binary to the 'test' Topic
        producer.send('test', b"test")
        producer.send('test', b"Hola, mundo!")
        # Sleep for 3 seconds
        time.sleep(3)

    # Close the TCP stream to Kafka
    producer.close()
def produce(self):
    producer = KafkaProducer(bootstrap_servers='10.110.87.202:9092')
    msg_dict = {
        "sleep_time": 10,
        "db_config": {
            "database": "test_1",
            "host": "xxxx",
            "user": "******",
            "password": "******"
        },
        "table": "msg",
        "msg": "Hello World"
    }
    msg = json.dumps(msg_dict)
    producer.send('test_rhj', msg, partition=0)
    producer.close()
def Producer():
    # producer = KafkaProducer(bootstrap_servers='10.42.2.92:9092')
    producer = KafkaProducer(
        bootstrap_servers='10.42.2.92:9092',
        value_serializer=lambda v: json.dumps(v).encode('utf-8'))
    print('send to kafka start!')
    # msg = json.dumps(msg1)
    # print type(msg)
    # msg = json.dumps(mess)
    # print type(msg)
    # producer.send(topic, msg.encode())
    producer.send(topic, mess)
    # producer.send(topic, msg1)
    time.sleep(1)
    print('send to kafka finished!')
    producer.close()
def transfer_msg(self, msg):
    _vcenter_mq_url = "{}:{}".format(
        self._vcenter_cfg["message-queue"]["ipaddr"],
        self._vcenter_cfg["message-queue"]["port"])
    _vcenter_mq_topic = self._vcenter_cfg["message-queue"]["topic-map"]["flow.measure"]

    try:
        encoded_msg = json.dumps(msg).encode('utf-8')
        producer = KafkaProducer(bootstrap_servers=[_vcenter_mq_url])
        producer.send(_vcenter_mq_topic, encoded_msg)
        producer.close()
    except NoBrokersAvailable as noBrokerExt:
        self._logger.error("Kafka Broker in Security Post is not accessible")
def send_json_kafka(self):
    producer = KafkaProducer(
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        bootstrap_servers=self.bstrap_servers)
    for i in range(10000):
        time.sleep(1)
        data = {
            "id": i,
            "name": "zhangsan" + str(i),
            "sex": "m",
            "age": 30,
            "dates": timestamp_now()
        }
        log.info("Data sent to Kafka: %s", data)
        producer.send(self.json_topic, data)
    producer.close()
def send_goal_event(timestamp: int, topic: str):
    # Hacky
    # TODO: Fix this
    # Create producer for the kafka topic to get ready to publish
    kafka_producer = KafkaProducer(bootstrap_servers=KAFKA_SERVER, api_version=(2, 5, 0))
    payload = dict(timestamp=timestamp)
    # JSON to bytearray
    payload_bytes = json.dumps(payload).encode('utf-8')
    print('Sending payload:', payload, payload_bytes)
    kafka_producer.send(topic, payload_bytes)
    kafka_producer.close()
def shutdown_hook(producer: KafkaProducer):
    """
    A shutdown hook to be called before the shutdown.
    """
    try:
        logger.info("Flushing pending messages to kafka, timeout is set to 10s.")
        producer.flush(10)
        logger.info("Finished flushing pending messages to kafka.")
    except KafkaError as kafka_error:
        logger.warning(f"Failed to flush pending messages to kafka, caused by: {kafka_error}")
    finally:
        try:
            logger.info("Closing kafka connection.")
            producer.close()
        except Exception as e:
            logger.warning(f"Failed to close kafka connection, caused by {e}")
def produce(brokers: list, topic: str):
    producer = KafkaProducer(bootstrap_servers=brokers,
                             key_serializer=string_to_bytes,
                             value_serializer=json_serialization)
    i = 0
    while i < 1000:
        sleep(1)
        key = "message_key_" + str(i)
        value = {"id": i, "date": 100000 * (2 + i**3), "info": "sensor_" + str(i)}
        print("Message Produced: key = {} value = {}".format(key, value))
        producer.send(topic=topic, key=key, value=value)
        i += 1
    producer.flush()
    producer.close()
class Kafka:
    def __init__(self, hosts, topicName):
        self.hosts = hosts
        self.topic = topicName
        self.kafkaProducer = KafkaProducer(bootstrap_servers=self.hosts)

    def producer(self, value, key=None, sleepTime=1):
        if key is None:
            key = str(datetime.now()).encode()
        self.kafkaProducer.send(self.topic, value, key)
        print("%s: message written to Kafka" % datetime.now())
        time.sleep(sleepTime)

    # Close the producer
    def producerClose(self):
        self.kafkaProducer.close()
class KafkaProduceServer(object):
    def __init__(self, topic, server):
        if type(server) != list:
            server = [server]
        self._topic = topic
        self._producer = KafkaProducer(bootstrap_servers=server,
                                       value_serializer=lambda m: json.dumps(m).encode('ascii'))

    def getProducer(self):
        return self._producer

    def sendMsg(self, msg):
        self._producer.send(self._topic, msg)
        self._producer.flush()

    def sendJson(self, key, json):
        self._producer.send(self._topic, key=key, value=json)
        self._producer.flush()

    def close(self):
        self._producer.close()
class KafkaHandler(logging.Handler):
    """Class to instantiate the kafka logging facility."""

    def __init__(self, hostlist, topic='logs', tls=None):
        """Initialize an instance of the kafka handler."""
        logging.Handler.__init__(self)
        self.producer = KafkaProducer(
            bootstrap_servers=hostlist,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            linger_ms=10)
        self.topic = topic
        self.record = None

    def emit(self, record):
        """Emit the provided record to the kafka_client producer."""
        # drop kafka logging to avoid infinite recursion
        if 'kafka.' in record.name:
            return

        try:
            # apply the logger formatter
            msg = self.format(record)
            self.producer.send(self.topic, {
                't': int(time.time()),
                'source': record.name,
                'level': record.levelname,
                'message': msg
            })
            self.flush(timeout=1.0)
        except Exception:
            logging.Handler.handleError(self, record)

    def flush(self, timeout=None):
        """Flush the objects."""
        self.producer.flush(timeout=timeout)

    def close(self):
        """Close the producer and clean up."""
        self.acquire()
        try:
            if self.producer:
                self.producer.close()
            logging.Handler.close(self)
        finally:
            self.release()
class KafkaP:
    """
    Producer module: messages are distinguished by their key.
    """

    def __init__(self, bootstrap_servers, compression_type='gzip'):
        self.bootstrap_servers = bootstrap_servers
        self.retries = 3
        self.ack = 0
        self.linger_ms = 0
        self.compression_type = compression_type
        if self.compression_type is None:
            self.producer = KafkaProducer(
                bootstrap_servers=self.bootstrap_servers,
                retries=self.retries,
                acks=self.ack,
                linger_ms=self.linger_ms,
            )
        else:
            self.producer = KafkaProducer(
                bootstrap_servers=self.bootstrap_servers,
                retries=self.retries,
                acks=self.ack,
                linger_ms=self.linger_ms,
                compression_type=self.compression_type)

    def send_data(self, message, topic, key=None):
        self.producer.send(topic=topic, key=key, value=message)
        # print message

    def reconnection_producer(self):
        if self.compression_type is None:
            self.producer = KafkaProducer(
                bootstrap_servers=self.bootstrap_servers,
                retries=self.retries,
                acks=self.ack,
                linger_ms=self.linger_ms)
        else:
            self.producer = KafkaProducer(
                bootstrap_servers=self.bootstrap_servers,
                retries=self.retries,
                acks=self.ack,
                linger_ms=self.linger_ms,
                compression_type=self.compression_type)

    def close_producer(self):
        self.producer.flush()
        self.producer.close()
def process_SEC_rss(item):
    index_rss = 'http://www.sec.gov/Archives/edgar/monthly/xbrlrss-{}.xml'.format(item)
    producer = KafkaProducer(bootstrap_servers=kafka_url)
    rss_feed = urllib2.urlopen(index_rss)
    index_data = rss_feed.read()
    rss_feed.close()
    index_doc = xmltodict.parse(index_data)
    item_list = index_doc['rss']['channel']['item']
    msg_count = 0
    for entry in item_list:
        formType = entry['edgar:xbrlFiling']['edgar:formType']
        filingInfo = entry['edgar:xbrlFiling']
        if (formType == '10-Q' or formType == '10-K'):
            newRow = {
                'companyName': get_value(filingInfo, 'edgar:companyName'),
                'guid': get_value(entry, 'guid'),
                'xml_filing': index_rss,
                'pubDate': get_value(entry, 'pubDate'),
                'formType': formType,
                'filingDate': get_value(filingInfo, 'edgar:filingDate'),
                'cikNumber': get_value(filingInfo, 'edgar:cikNumber'),
                'accessionNumber': get_value(filingInfo, 'edgar:accessionNumber'),
                'fileNumber': get_value(filingInfo, 'edgar:fileNumber'),
                'filingInfo': get_value(filingInfo, 'edgar:period'),
                'fiscalYearEnd': get_value(filingInfo, 'edgar:fiscalYearEnd'),
            }
            # cols = newRow.keys()
            # vals = [newRow[x] for x in cols]
            # vals_str_list = ["%s"] * len(vals)
            # vals_str = ", ".join(vals_str_list)
            try:
                jsec = json.dumps(newRow)
                producer.send(topic_name, jsec)
                producer.flush()
                msg_count = msg_count + 1
                print("Added {} sec filings".format(msg_count))
            except Exception as e:
                print "Exception encountered: %s" % e
    metrics = producer.metrics()
    print metrics
    producer.close()
class KafkaEventHandler(EventHandlerInterface):
    """This class implements an event record listener, that will forward Json-objects to a Kafka queue."""

    def __init__(self, analysis_context, topic, options):
        self.analysis_context = analysis_context
        self.options = options
        self.topic = topic
        self.producer = None
        self.kafka_imported = False

    def receive_event(self, _event_type, _event_message, _sorted_log_lines, event_data, _log_atom, event_source):
        """Receive information about a detected event in json format."""
        if hasattr(event_source, 'output_event_handlers') and event_source.output_event_handlers is not None and self not in \
                event_source.output_event_handlers:
            return True
        component_name = self.analysis_context.get_name_by_component(event_source)
        if component_name in self.analysis_context.suppress_detector_list:
            return True
        if self.kafka_imported is False:
            try:
                from kafka import KafkaProducer
                from kafka.errors import KafkaError
                self.producer = KafkaProducer(**self.options, value_serializer=lambda v: v.encode())
                self.kafka_imported = True
            except ImportError:
                msg = 'Kafka module not found.'
                logging.getLogger(AminerConfig.DEBUG_LOG_NAME).error(msg)
                print('ERROR: ' + msg, file=sys.stderr)
                return False
        if not isinstance(event_data, str) and not isinstance(event_data, bytes):
            msg = 'KafkaEventHandler received non-string event data. Use the JsonConverterHandler to serialize it first.'
            logging.getLogger(AminerConfig.DEBUG_LOG_NAME).warning(msg)
            print('WARNING: ' + msg, file=sys.stderr)
            return False
        try:
            self.producer.send(self.topic, event_data)
        except KafkaError as err:
            msg = str(err)
            logging.getLogger(AminerConfig.DEBUG_LOG_NAME).error(msg)
            print("Error: " + msg, file=sys.stderr)
            self.producer.close()
            self.producer = None
            return False
        return True
def get_stats(UDP_IP, UDP_PORT):
    # The following strings are the linux commands
    cpu_cmd = 'grep cpu /proc/stat | awk \'{usage=($2+$4)*100/($2+$4+$5)} END {print usage "%"}\''
    mem_cmd = 'free -tmh | grep Mem | tr -s \' \' | awk \'{print $3","$7","$2}\''
    network_cmd = 'cat /proc/net/dev | tr -s \' \' | grep eth0 | awk \'{print $2","$10}\''
    ps_cmd = 'ps'

    # Call cmd_output function to get the output of the executed commands
    cpu = cmd_output(cpu_cmd)
    mem = cmd_output(mem_cmd)
    network = cmd_output(network_cmd)
    ps = cmd_output(ps_cmd)

    parse_mem = mem.split(",")
    used_mem = parse_mem[0]
    available_mem = parse_mem[1]
    total_mem = parse_mem[2]

    parse_network = network.split(",")
    rcv_bytes = parse_network[0]
    trans_bytes = parse_network[1]

    # build json object
    data = {}
    data['Date'] = datetime.datetime.now().strftime("%y-%m-%d %H:%M:%S")
    data['CPU'] = cpu
    data['Memory_used'] = used_mem
    data['Memory_available'] = available_mem
    data['Network_received_bytes'] = rcv_bytes
    data['Network_transmitted_bytes'] = trans_bytes
    data['ps_command_output'] = str(ps)
    print(str(data))
    json_data = json.dumps(data)

    # send data to Kafka consumer
    try:
        kafka_server = UDP_IP + ":" + str(UDP_PORT)
        producer = KafkaProducer(bootstrap_servers=kafka_server)
        producer.send('stats', json_data.encode('utf8'))
        producer.flush()
        producer.close()
    except Exception, e:
        print("Error occurred during data transmission process!")
        print(str(e))
        pass
def push_to_pandas(df):
    import pygeohash
    from cassandra.cluster import Cluster
    from kafka import KafkaProducer
    import timeit

    cluster = Cluster()
    session = cluster.connect('xweather')
    producer = KafkaProducer(bootstrap_servers=['vm1:9092'])
    name = multiprocessing.current_process().name

    # df = pd.read_csv(filename)
    df1 = df[['id', 'lat', 'lon', 'src', 'elev', 'timezone', 'tzoffset']].drop_duplicates()
    df1.src.fillna('NA')

    # Adding Geohash Id
    df1['geohash_id'] = df.apply(lambda row: pygeohash.encode(row['lat'], row['lon']), axis=1)

    # Now loop through the Dataframe
    for row in df1.itertuples():
        j = ','.join((row[8], str(row[1]), str(row[5]), row[8][:3], str(row[2]), str(row[3]),
                      str(row[4]), str(row[6]), str(row[7])))
        future = producer.send('topic-weather-stations', j)
    print('Completed insert into weather stations', name)

    # Now to the facts
    # Remove the descriptive columns
    df.drop(df.columns[[1, 2, 3, 4, 5, 6]], axis=1, inplace=True)
    # Unpivot the dataset
    df = pd.melt(df, id_vars=['id', 'timestamp', 'dateTime'])
    df = df.dropna()

    # Kafka it
    ctr = 0
    producer = KafkaProducer(bootstrap_servers=['vm1:9092'], batch_size=20000, linger_ms=50,
                             buffer_memory=952108864)
    # producer = KafkaProducer(bootstrap_servers=['vm1:9092'])
    start_time = timeit.default_timer()
    for row in df.itertuples():
        k = list(row)
        k = k[1:]
        j = ','.join(str(x) for x in k)
        future = producer.send('topic-weather-data', j)
        ctr += 1
    print('Producer timing is ', name, timeit.default_timer() - start_time, 'Rows:', ctr)
    producer.flush()
    producer.close()
class KafkaProducerPipeline(object):
    def __init__(self, kafka_bootstrap_server):
        self.kafka_bootstrap_server = []
        self.kafka_bootstrap_server.append(kafka_bootstrap_server)
        self.collection_name = 'articles'
        self.encoder = ScrapyJSONEncoder()
        self.index = 0  # counter of items sent to Kafka

    @classmethod
    def from_crawler(cls, crawler):
        # pull in information from settings.py
        return cls(kafka_bootstrap_server=crawler.settings.get('KAFKA_BOOTSTRAP_SERVER'), )

    def open_spider(self, spider):
        print("spider name: ", spider.name)
        # initializing py-Kafka producer
        self.producer = KafkaProducer(bootstrap_servers=self.kafka_bootstrap_server)
        print("kafka_bootstrap_server: ", self.kafka_bootstrap_server)
        if hasattr(spider, 'collection_name'):
            print("spider collection_name: ", spider.collection_name)
            self.collection_name = spider.collection_name

    def close_spider(self, spider):
        # clean up when spider is closed
        self.producer.flush(timeout=60)
        self.producer.close(timeout=60)

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            print("valid - inside process_item...", item['source'], ': ', item['headline'])
            # self.producer.send('articles', self.encoder.encode(item).encode())
            key = str(ord(item['source'][0])) + str(ord(item['source'][1]))
            self.producer.send('articles',
                               value=self.encoder.encode(item).encode(),
                               key=key.encode())
            self.index += 1
            logging.debug("News item sent by Kafka Producer!")
        return item
def send_kafka(msg):
    try:
        producer = KafkaProducer(bootstrap_servers=bootstrap_servers, retries=retries)
    except Exception as e:
        logger.error(e)
        raise Exception("caught an exception when creating KafkaProducer")
    try:
        producer.send(topic, msg)
        producer.flush()
        producer.close()
    except Exception as e:
        logger.error(e)
        if producer:
            producer.close()
        raise Exception("caught an exception when sending message: %s" % msg)
def chat_input():
    producer = KafkaProducer(bootstrap_servers=KAFKA_IP)
    try:
        chat_name = request.form['chat_name']
        chat_text = request.form['chat_text']
        chat_time = time.strftime('%d/%m/%Y %H:%M')
        if not chat_name:
            chat_name = 'Anonymous'
        msg_json = json.dumps({
            "chat_name": chat_name,
            "chat_text": chat_text,
            "chat_time": chat_time
        })
        producer.send(topic, msg_json.encode('utf-8'))
        return index()
    finally:
        producer.close()
def send2Kafka(msgs):
    if not msgs:
        return
    producer = KafkaProducer(bootstrap_servers='10.128.184.167:9092')
    global count
    for msg in msgs:
        tmp = format_msg(msg)
        # print 'Send ==> ', tmp
        producer.send('topic_lpr', tmp.encode('utf8'))
        if count % 100 == 0:
            print u'==>[{}] {}'.format(count, tmp)
            producer.flush()
        count += 1
    producer.flush()
    producer.close(timeout=5)
def run(self):
    producer = KafkaProducer(bootstrap_servers=['kafka:9092'])
    url_col = -1
    i = -1
    with open(self.filename, 'rU') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            i += 1
            if i == 0:
                try:
                    url_col = row.index("URL")
                except ValueError:
                    print("ERROR: The csv must have a column header titled 'URL'")
                    return
                continue
            producer.send('demo.incoming',
                          '{"url": "' + row[url_col] + '", "appid":"testapp", "crawlid":"' +
                          self.filename + "_" + str(i) + '", "spiderid":"parsing_link", "maxdepth": 2}')
    producer.close()
def send_data(value):
    producer = KafkaProducer(bootstrap_servers=['testdb.ibuildingsh.com:9092'])
    data = {
        'source_id': '6012283db2296c000167711d',
        'key': 'document-data',
        'value': value
    }
    future = producer.send(
        topic='t-document-data',
        key='document-data'.encode('utf-8'),
        # key=str(datetime.now()).encode('utf-8'),
        value=json.dumps(data).encode('utf-8'),
        partition=0)
    producer.close()
def test_data():
    producer = KafkaProducer(bootstrap_servers=['testdb.ibuildingsh.com:9092'])  # , security_protocol="SSL"
    types = ['doc', 'pdf', 'ppt']
    filetypes = ['施工', '运维', '进度', '方案']
    usernames = ['我', '你', '你', '他']
    data = {
        'source_id': '6010c605186e4e0001e27ab1',
        'key': 'test-file',
        'value': []
    }
    for i in range(10):
        cursize = np.random.rand() * 100
        value = {
            'filetype': np.random.choice(filetypes),
            'filename': str(i + 300000),
            'type': np.random.choice(types),
            'sizeMb': int(cursize),
            'sizeByte': int(cursize * 1024 * 1024),
            'username': np.random.choice(usernames),
            'timestamp': int(datetime(year=2021, month=1, day=np.random.randint(1, 19)).timestamp())
        }
        data['value'].append(value)
    future = producer.send(
        topic='t-file-data-multi',
        key='file-test-multi'.encode('utf-8'),
        # key=str(datetime.now()).encode('utf-8'),
        value=json.dumps(data).encode('utf-8'),
        partition=0)
    # result = future.get(timeout=10)
    # print(data)
    producer.close()
class SenderKafka:
    def __init__(self, topic: str, bootstrap_servers):
        if isinstance(bootstrap_servers, str):
            bootstrap_servers = [bootstrap_servers]
        assert isinstance(bootstrap_servers, list)
        self._producer = KafkaProducer(bootstrap_servers=bootstrap_servers)
        self._closed = False
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers

    def send(self, message, topic=None):
        if self._closed:
            log.error('The sender is closed, create a new one to send a message')
            return None
        if topic is None:
            topic = self.topic
        if isinstance(message, dict):
            try:
                message = json.dumps(message)
            except ValueError:
                log.error('Could not convert {} to json string'.format(message))
                return None
        if isinstance(message, str):
            try:
                message = message.encode('utf-8')
            except ValueError:
                log.error('Could not convert {} to bytes'.format(message))
                return None
        if isinstance(message, bytes):
            log.debug('Sending {}'.format(message))
            result = self._producer.send(topic, message)
            self._producer.flush()
            return result
        else:
            log.error('The message must be either bytes, utf-8 string or a dict')
            return None

    def close(self):
        if not self._closed:
            self._closed = True
            self._producer.close()
def send_kafka_method1():
    """
    Sending method 1: fire-and-forget
    (do not check whether the message arrived, do not process the returned future)
    :return:
    """
    producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS)
    start_time = time.time()
    for i in range(0, 10000):
        msg = 'echo %s' % i
        # print(msg)
        future = producer.send(TOPIC, msg.encode(), partition=0)
    # Push all buffered messages to the broker
    producer.flush()
    producer.close()
    time_cost = time.time() - start_time
    print('Sending took %s seconds' % time_cost)
def send_message_group():
    producer = KafkaProducer(bootstrap_servers=common.KAFKA_BROKET_LIST)
    for i in range(100):
        key = tobytes(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        value = tobytes("hadoop" if i % 3 == 0 else "hive")
        print(str(key) + "," + str(value))
        producer.send("group_withwatermark", key=key, value=value)
        time.sleep(2)
    producer.flush()
    producer.close()
def send2Kafka(msgs):
    if not msgs:
        return
    producer = KafkaProducer(bootstrap_servers='10.128.184.167:9092')
    global count
    for msg in msgs:
        tmp = format_msg(msg)
        # print 'Send ==> ', tmp
        producer.send('topic_taxi', tmp.encode('utf8'))
        if count % 2000 == 0:
            print u'==>[{}] {}'.format(count, tmp)
            producer.flush()
        count += 1
    producer.flush()
    producer.close(timeout=5)
def try_send():
    producer = KafkaProducer(bootstrap_servers="ip-172-31-12-78.us-west-1.compute.internal:6667")
    # client = KafkaClient("ip-172-31-12-78.us-west-1.compute.internal:6667")
    # producer = SimpleProducer(client, async=True, batch_send_every_n=100, batch_send_every_t=60, random_start=False)
    # producer = SimpleProducer(client)
    # connect_str = 'ip-172-31-12-78.us-west-1.compute.internal:6667'
    # producer = KafkaProducer(bootstrap_servers=connect_str,
    #                          max_block_ms=10000,
    #                          value_serializer=str.encode)
    topic = '2008'
    with open('/home/ec2-user/data/2008.csv') as f:
        for line in f:
            producer.send(topic, line)
    producer.flush()
    producer.close()
def setUpClass(cls):
    cls.filename = os.path.join(os.getcwd(), "python_temp.pickle")
    cls.kafka_host = "localhost:9092"
    millis = int(round(time.time() * 1000))
    cls.topic = "test_{}".format(millis)
    kafka_params = {
        "kafkaParam": {"topic": cls.topic, "bootstrap.servers": cls.kafka_host,
                       "group_id": "group-jj", "debug": False},
        "systemParam": {},
        "internalSystemParam": {"stopFlagNum": 3},
        "fitParam": {"alg": "RandomForestClassifier", "debug": "True"}
    }
    pickle.dump(kafka_params, open(cls.filename, "wb"), 2)
    producer = KafkaProducer(bootstrap_servers=cls.kafka_host)
    for i in xrange(10):
        producer.send(cls.topic, pickle.dumps("{}", 2))
    producer.close()
def test_end_to_end(kafka_broker, compression):
    if compression == 'lz4':
        # LZ4 requires 0.8.2
        if version() < (0, 8, 2):
            return
        # python-lz4 crashes on older versions of pypy
        elif platform.python_implementation() == 'PyPy':
            return

    connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)])
    producer = KafkaProducer(bootstrap_servers=connect_str,
                             retries=5,
                             max_block_ms=30000,
                             compression_type=compression,
                             value_serializer=str.encode)
    consumer = KafkaConsumer(bootstrap_servers=connect_str,
                             group_id=None,
                             consumer_timeout_ms=30000,
                             auto_offset_reset='earliest',
                             value_deserializer=bytes.decode)

    topic = random_string(5)

    messages = 100
    futures = []
    for i in range(messages):
        futures.append(producer.send(topic, 'msg %d' % i))
    ret = [f.get(timeout=30) for f in futures]
    assert len(ret) == messages
    producer.close()

    consumer.subscribe([topic])
    msgs = set()
    for i in range(messages):
        try:
            msgs.add(next(consumer).value)
        except StopIteration:
            break

    assert msgs == set(['msg %d' % i for i in range(messages)])
    consumer.close()
def test_end_to_end(kafka_broker, compression):
    if compression == 'lz4':
        # LZ4 requires 0.8.2
        if version() < (0, 8, 2):
            return
        # LZ4 python libs dont work on python2.6
        elif sys.version_info < (2, 7):
            return

    connect_str = 'localhost:' + str(kafka_broker.port)
    producer = KafkaProducer(bootstrap_servers=connect_str,
                             retries=5,
                             max_block_ms=10000,
                             compression_type=compression,
                             value_serializer=str.encode)
    consumer = KafkaConsumer(bootstrap_servers=connect_str,
                             group_id=None,
                             consumer_timeout_ms=10000,
                             auto_offset_reset='earliest',
                             value_deserializer=bytes.decode)

    topic = random_string(5)

    messages = 100
    futures = []
    for i in range(messages):
        futures.append(producer.send(topic, 'msg %d' % i))
    ret = [f.get(timeout=30) for f in futures]
    assert len(ret) == messages
    producer.close()

    consumer.subscribe([topic])
    msgs = set()
    for i in range(messages):
        try:
            msgs.add(next(consumer).value)
        except StopIteration:
            break

    assert msgs == set(['msg %d' % i for i in range(messages)])
def process_data(self, msg):
    result = 'ok'
    _data = msg['filename'] + ': ' + msg['data']
    self.log.debug(msg['collectors'] + _data)
    producer = KafkaProducer(bootstrap_servers=self.kfk_server)
    future = producer.send(self.topic, _data)
    # Block for 'synchronous' sends
    try:
        record_metadata = future.get(timeout=10)
    except KafkaError:
        # Decide what to do if produce request failed...
        self.log.error(traceback.format_exc())
        result = 'Fail'
    finally:
        producer.close()
    # return record_metadata.topic, record_metadata.partition, record_metadata.offset
    return result,
class SimpleProducer(BaseStreamProducer):
    def __init__(self, location, topic, compression):
        self._location = location
        self._topic = topic
        self._compression = compression
        self._create()

    def _create(self):
        self._producer = KafkaProducer(bootstrap_servers=self._location,
                                       retries=5,
                                       compression_type=self._compression)

    def send(self, key, *messages):
        for msg in messages:
            self._producer.send(self._topic, value=msg)

    def flush(self):
        self._producer.flush()

    def close(self):
        self._producer.close()
def run(argv):
    old_client = False
    if len(argv) > 4:
        test_duration = argv[1]
        msg_batch = argv[2]
        msg_requested_rate = argv[3]
        topic_name = argv[4]
        acks = argv[5]
        linger_ms = argv[6]
        msg_batch = int(msg_batch)
        msg_requested_rate = float(msg_requested_rate)
        test_duration = float(test_duration)
        topic_name = str(topic_name)
        acks = int(acks)
        linger_ms = int(linger_ms)

    # Initialize Kafka PUB Server
    l.info("Starting Kafka Publisher (producer)")

    # Estimate average message size to compute batch_size in [bytes] / Requested by Kafka
    min_message_size = len(str(0) + ' msg' + str(0))
    max_message_size = len(str(msg_requested_rate) + ' msg' + str(msg_requested_rate))
    average_message_size = (min_message_size + max_message_size) / 2
    batch_estimated_size = (average_message_size) * msg_batch
    l.info("Message Average Size is: [%s]. Kafka Batch Size in Bytes set to: [%s]" %
           (average_message_size, batch_estimated_size))

    if old_client:
        producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                                 batch_size=batch_estimated_size,
                                 linger_ms=linger_ms,
                                 acks=acks)
    else:
        client = KafkaClient(hosts='localhost:9092')
        topic = client.topics[topic_name]
        producer = topic.get_producer(min_queued_messages=batch_estimated_size,
                                      linger_ms=linger_ms,
                                      required_acks=acks)

    # Initialize simple Rep server, this is used to listen
    # for the signal to start sending data
    pub_rep_port = os.environ.get('PORT0')
    l.info("STARTING KAFKA REP server at port [%s].", pub_rep_port)
    run_data = {'start': False,
                'stats': {'rate': 0, 'msg_cnt': 0},
                'test_status': 'stopped'}
    pub_metrics = {'test_duration': test_duration,
                   'msg_batch': msg_batch,
                   'msg_requested_rate': msg_requested_rate}
    hd = HDKafkapRepSrv(pub_rep_port, run_data, pub_metrics)
    hd.run()

    while True:
        # Wait for 'signal' to start sending messages to Kafka Broker
        if not run_data['start']:
            l.debug("KAFKA PUB WAITING FOR SIGNAL...")
            time.sleep(1)
            continue

        l.info('PUB server initiating... Test Duration [%f] secs. Messages with batches [%d] '
               'and requested msg rate [%f]' % (hd.test_duration, hd.msg_batch, hd.msg_requested_rate))
        cnt = 0
        msg_cnt = 0
        start_time = time.time()

        # Start Publishing Messages to Broker
        while True:
            # Build 'message'
            messagedata = "msg%d" % msg_cnt
            message = "%d %s" % (msg_cnt, messagedata)
            try:
                # Publish message to the Kafka Cluster
                # topic: specifies the 'topic' where the message will be published
                if old_client:
                    producer.send(topic=topic_name, value=message)
                else:
                    producer.produce(message)
            except KafkaTimeoutError as e:
                l.error("Unable to publish message to the Kafka Cluster. ERROR: %s" % e.message)

            # Insert a 'delay' if tx rate between batches outperforms the expected
            # (minimum) rate to achieve requested tx rate
            cnt += 1
            msg_cnt += 1
            if cnt >= hd.msg_batch:
                # Compute the delay
                duration = time.time() - start_time
                expected_time = msg_cnt / hd.msg_requested_rate
                delay = 0.0
                if expected_time > duration:
                    delay = expected_time - duration
                if delay > 1:
                    delay = 1
                time.sleep(delay)
                cnt = 0
            elapsed_time = time.time() - start_time
            if elapsed_time >= hd.test_duration:
                break

        # Update 'stats' to 'hd' (HDaemon)
        run_data['stats']['time:end'] = json.dumps(time.time())
        run_data['stats']['rate'] = msg_cnt / elapsed_time
        run_data['stats']['msg_cnt'] = msg_cnt
        process = psutil.Process()
        run_data['stats']['net:end'] = json.dumps(psutil.net_io_counters())
        run_data['stats']['cpu:end'] = json.dumps(process.cpu_times())
        run_data['stats']['mem:end'] = json.dumps(process.memory_info())
        run_data['test_status'] = 'stopping'
        # Go back to waiting for the next test
        run_data['start'] = False
        continue

    producer.close()
    l.info("PUB Server stopping after sending %d messages elapsed time %f and message rate %f" %
           (msg_cnt, elapsed_time, run_data['stats']['rate']))
class KafkaSender(LogSender):
    def __init__(self, config, msg_buffer, stats):
        super().__init__(config=config, msg_buffer=msg_buffer, stats=stats,
                         max_send_interval=config.get("max_send_interval", 0.3))
        self.config = config
        self.msg_buffer = msg_buffer
        self.stats = stats
        self.kafka_producer = None

        topic = self.config["kafka_topic"]
        if isinstance(self.config["kafka_topic"], bytes):
            topic = topic.decode("utf8")
        self.topic = topic

    def _init_kafka(self):
        self.log.info("Initializing Kafka producer, address: %r", self.config["kafka_address"])
        while self.running:
            try:
                if self.kafka_producer:
                    self.kafka_producer = self.kafka_producer.close()
                    self.kafka_producer = None

                producer_config = {"bootstrap_servers": self.config["kafka_address"],
                                   "security_protocol": "SSL" if self.config.get("ssl") else "PLAINTEXT",
                                   "ssl_certfile": self.config.get("certfile"),
                                   "ssl_keyfile": self.config.get("keyfile"),
                                   "ssl_cafile": self.config.get("ca"),
                                   "compression_type": "snappy" if snappy else None}
                self.kafka_producer = KafkaProducer(**producer_config)
                self.log.info("Initialized Kafka producer, address: %r", self.config["kafka_address"])
                break
            except KAFKA_CONN_ERRORS as ex:
                self.log.warning("Retriable error during Kafka initialization: %s: %s, sleeping",
                                 ex.__class__.__name__, ex)
                self.kafka_producer.close()
                self.kafka_producer = None
                time.sleep(5.0)

    def send_messages(self, message_batch):
        if not self.kafka_producer:
            self._init_kafka()
        try:
            for message in message_batch:
                self.kafka_producer.send(self.topic, message)
            return True
        except KAFKA_CONN_ERRORS as ex:
            self.log.info("Kafka retriable error during send: %s: %s, waiting",
                          ex.__class__.__name__, ex)
            time.sleep(0.5)
            self._init_kafka()
        except Exception as ex:  # pylint: disable=broad-except
            self.log.exception("Unexpected exception during send to kafka")
            self.stats.unexpected_exception(ex=ex, where="sender", tags={"app": "journalpump"})
            time.sleep(5.0)
            self._init_kafka()

    def _cleanup(self):
        if self.kafka_producer:
            self.kafka_producer.close()
class KafkaSource(StoqSourcePlugin):

    def __init__(self):
        super().__init__()

    def activate(self, stoq):
        self.stoq = stoq
        super().activate()
        self.producer = None

    def ingest(self):
        """
        Monitor Kafka for messages
        """

        # Define our Kafka topic
        topic = self.stoq.worker.name

        # If this is an error message, let's make sure our topic
        # has "-errors" affixed to it
        if self.stoq.worker.error_queue is True:
            topic = topic + "-errors".strip()

        consumer = KafkaConsumer(topic,
                                 group_id=self.group,
                                 auto_offset_reset='earliest',
                                 bootstrap_servers=self.servers_list)

        self.log.info("Monitoring {} topic for messages...".format(topic))

        for message in consumer:
            # Set up the message for parsing
            msg = self.stoq.loads(message.value)

            # Send the message to the worker
            self.stoq.worker.multiprocess_put(**msg)

    def producer_connect(self):
        """
        Connect to Kafka to publish a message
        """
        self.producer = KafkaProducer(bootstrap_servers=self.servers_list,
                                      retries=self.retries)

    def producer_release(self):
        """
        Release the Kafka connection used for publishing
        """
        return self.producer.close()

    def publish(self, msg, topic, err=False, **kwargs):
        """
        Publish a message to Kafka

        :param dict msg: Message to be published
        :param str topic: Topic to be used, should be name of worker
        :param bool err: Define whether we should process error topic
        """

        # Make sure we have a valid connection to Kafka
        if not self.producer:
            self.producer_connect()

        # If this is an error message, let's make sure our queue
        # has "-errors" affixed to it
        if err:
            topic = topic + "-errors".strip()

        try:
            self.producer.send(topic, self.stoq.dumps(msg).encode())
        except:
            self.log.error("Unable to publish message to Kafka server: {}".format(msg))
class KafkaPipeline(BasePipeline):
    TOPIC = 'craigslist'
    SERIALIZER = MsgPackSerializer()

    def start(self, crawler):
        # TODO: remove this hack
        # HACK
        log.debug("Wait 5s to allow kafka node to be ready")
        time.sleep(5)

        endpoints = list(get_kafka_endpoints())
        log.debug("Connect to kafka as producer - %s", endpoints)
        if not endpoints:
            raise RuntimeError("Kafka endpoints not defined")
        self.producer = KafkaProducer(bootstrap_servers=endpoints)

    def process(self, crawler, item):
        self.producer.send(
            self.TOPIC,
            self.SERIALIZER.dumps(item),
        )
        return item

    def stop(self, crawler):
        self.producer.flush()
        self.producer.close()

    @classmethod
    def dump_data(cls, topic=None, timeout=None, poll_timeout=None, enable_auto_commit=False):
        # TODO: remove this hack
        # HACK
        log.debug("Wait 5s to allow kafka node to be ready")
        time.sleep(5)

        topic = topic or cls.TOPIC
        endpoints = list(get_kafka_endpoints())
        log.debug("Connect to kafka as consumer - %s", endpoints)
        if not endpoints:
            raise RuntimeError("Kafka endpoints not defined")

        consumer = KafkaConsumer(
            topic,
            auto_offset_reset='earliest',
            enable_auto_commit=enable_auto_commit,
            value_deserializer=cls.SERIALIZER.loads,
            bootstrap_servers=endpoints,
            consumer_timeout_ms=timeout or -1,
        )

        # TODO use native kafka-python poll
        if poll_timeout:
            while True:
                yield list(data.value for data in consumer)
                time.sleep(poll_timeout / 1000.0)
        else:
            for data in consumer:
                yield data.value
        consumer.close()

    @classmethod
    def dump_to_csv(cls, to_file, topic=None, timeout=None):
        log.debug("Dump topic <%s> to %s", topic, to_file)

        csv_pipeline = CsvPipeline(to_file)
        csv_pipeline.start(None)

        for item in cls.dump_data(topic, timeout):
            # we must reinitialize item to restore fields and values ordering
            csv_pipeline.process(
                None,
                CraigsListItem(**dict(
                    # convert dict byte keys to string keys and use it as
                    # keywords
                    (k.decode(), v) for k, v in item.items()
                ))
            )

        csv_pipeline.stop(None)
# producer
import time
from kafka import KafkaProducer

# multiple brokers can be listed here, e.g. ['0.0.0.1:9092', '0.0.0.2:9092', '0.0.0.3:9092']
producer = KafkaProducer(bootstrap_servers=['localhost:9092'])
for i in range(300):
    ts = int(time.time() * 1000)
    msg = "produce yao + msg%d" % i
    print(msg)
    producer.send("test", msg.encode('utf-8'))
    time.sleep(1)
producer.close()
from kafka import KafkaProducer
import avro.schema
import io
from avro.io import DatumWriter

data = {'name': 'Tony', 'favorite_number': 8, 'favorite_color': 'green'}
schema = avro.schema.parse(open('./schema.avsc').read())


def serialize(data):
    writer = DatumWriter(schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(data, encoder)
    return bytes_writer.getvalue()


producer = KafkaProducer(bootstrap_servers=['localhost:9092'], value_serializer=serialize)
producer.send('test2', data)
producer.flush()
producer.close()  # close will also flush, but I'm leaving it in here for demonstration purposes
class DocManager(DocManagerBase):
    def __init__(self, url, auto_commit_interval=1, unique_key='_id', chunk_size=10):
        try:
            from kafka import KafkaProducer
        except ImportError:
            raise SystemError
        self.producer = KafkaProducer(bootstrap_servers=[url])

    def get_topic_key(self, namespace):
        if namespace == 'timetracker.session':
            topic_key = 'session'
        else:
            topic_key = 'activity'
        return topic_key

    def doc_to_message_data(self, doc, namespace, timestamp):
        data = {'timestamp': timestamp,
                'namespace': namespace,
                'action': 'upsert',
                'data': doc}
        return str(data)

    def update_to_message_data(self, doc_id, update_spec, namespace, timestamp):
        data = {'timestamp': timestamp,
                'namespace': namespace,
                'document_id': doc_id,
                'action': 'update',
                'data': update_spec['$set']}
        return str(data)

    def remove_to_message_data(self, doc_id, namespace, timestamp):
        data = {'timestamp': timestamp,
                'namespace': namespace,
                'document_id': doc_id,
                'action': 'remove'}
        return str(data)

    def publish_data(self, data, topic_key):
        self.producer.send(TOPICS[topic_key], data)
        self.producer.flush()

    def stop(self):
        self.producer.close()

    def upsert(self, doc, namespace, timestamp):
        data = self.doc_to_message_data(doc, namespace, timestamp)
        topic_key = self.get_topic_key(namespace)
        self.publish_data(data, topic_key)

    def update(self, document_id, update_spec, namespace, timestamp):
        data = self.update_to_message_data(document_id, update_spec, namespace, timestamp)
        topic_key = self.get_topic_key(namespace)
        self.publish_data(data, topic_key)

    def remove(self, document_id, namespace, timestamp):
        data = self.remove_to_message_data(document_id, namespace, timestamp)
        topic_key = self.get_topic_key(namespace)
        self.publish_data(data, topic_key)

    def search(self, start_ts, end_ts):
        raise NotImplementedError

    def commit(self):
        raise NotImplementedError

    def get_last_doc(self):
        raise NotImplementedError

    def handle_command(self, doc, namespace, timestamp):
        pass
class JournaldStream(object):
    messages_steps = 100
    logs_topic_name = "logs"
    kafka_sleep = 1

    def __init__(self, kafka_hosts, journald_path, sincedb_path):
        # Sincedb is a file where the __CURSOR of Journald is stored
        self.sincedb_path = self._force_type_value(str, sincedb_path)
        self._read_or_create_sincedb(self.sincedb_path)

        # /run/log/journal
        self.journald_path = self._force_type_value(str, journald_path)
        self._is_journal_dir(self.journald_path)
        self.reader = journal.Reader(path=self.journald_path, converters=BASIC_CONVERTERS)

        # Kafka hosts
        self.kafka_hosts = self._force_type_value(list, kafka_hosts)
        self.producer = KafkaProducer(
            bootstrap_servers=self.kafka_hosts,
            value_serializer=lambda v: json.dumps(v))

        self.cursor = ""
        self.read_messages = 0

        self.key_filters = self._build_key_filters()
        self.value_filters = lambda x: x

    @staticmethod
    def _read_or_create_sincedb(sincedb_path):
        if os.path.isfile(sincedb_path):
            with open(sincedb_path, 'r') as db:
                db.read()
        else:
            with open(sincedb_path, "w") as empty_db:
                empty_db.write("")

    @staticmethod
    def _is_journal_dir(journald_path):
        if not os.path.isdir(journald_path):
            raise IOError("%s not here" % journald_path)

    @staticmethod
    def _build_key_filters():
        """
        Transform the keys of a dict
        :return: list of functions
        """

        def remove_prefix(key, prefix="_"):
            """
            Journald create keys with '_', '__' prefix
            :param key:
            :param prefix:
            :return: Key reformatted
            """
            new = key
            while new[0] == prefix:
                new = new[1:]
            return new

        def lower_key(key):
            return key.lower()

        def aggregate_filters(key):
            for f in [remove_prefix, lower_key]:
                key = f(key)
            return key

        return aggregate_filters

    @staticmethod
    def _force_type_value(type_want, variable):
        """
        Raise TypeError is the type is not matching
        :param type_want:
        :param variable:
        :return: variable
        """
        if type_want is not type(variable):
            raise TypeError("%s is not type(%s)" % (type_want, type(variable)))
        return variable

    def _save_cursor(self):
        if self.cursor != "":
            with open(self.sincedb_path, 'w') as f:
                f.write(self.cursor)
        else:
            os.write(2, "invalid cursor\n")

    def _get_cursor(self):
        try:
            with open(self.sincedb_path, 'r') as f:
                self.cursor = f.read()
            return True if self.cursor else False
        except IOError:
            return False

    def _stream_to_seek(self):
        if self._get_cursor():
            os.write(1, "using saved cursor \"%s\"\n" % self.cursor)
            self.reader.seek_cursor(self.cursor)
            self.reader.get_next()
        else:
            os.write(1, "using new cursor\n")
        for log in self.reader:
            self._kafka_send(log)
        os.write(1, "seeked journal after %d messages\n" % self.read_messages)

    def _stream_poller(self):
        i = 0
        os.write(1, "start polling realtime messages\n")
        while self.reader.get_events():
            i += 1
            if self.reader.process() == journal.APPEND:
                for log in self.reader:
                    self._kafka_send(log)
            else:
                time.sleep(self.kafka_sleep)
            self._periodic_stream_task(i)

    def stream(self):
        """
        Public method
        """
        self._stream_to_seek()
        self._stream_poller()

    def _periodic_send_task(self):
        if self.read_messages % self.messages_steps == 0:
            os.write(1, "read %d messages, process flush\n" % self.read_messages)
            ts = time.time()
            self.producer.flush()
            os.write(1, "flush done in %d\n" % (time.time() - ts))

    @staticmethod
    def _periodic_stream_task(nb_message):
        pass

    def _filters(self, full_log):
        # Keys
        filter_data = {self.key_filters(k): self.value_filters(v) for k, v in full_log.iteritems()}
        # Values
        # Handled by the BASIC_CONVERTERS Journal builtin
        return filter_data

    def _kafka_send(self, full_log):
        # Transform the log
        filter_data = self._filters(full_log)

        # Send it to Kafka
        self.producer.send(self.logs_topic_name, filter_data)

        # Save the cursor
        self.cursor = full_log["__CURSOR"]
        self._save_cursor()

        # Internal instance stats
        self.read_messages += 1
        self._periodic_send_task()

    def close(self):
        os.write(1, "closing journald.Reader\n")
        self.reader.close()
        os.write(1, "closing kafka connection\n")
        self.producer.close()
def run(args):
    try:
        props = {}
        for prop in args.consumer_config:
            k, v = prop.split('=')
            try:
                v = int(v)
            except ValueError:
                pass
            if v == 'None':
                v = None
            props[k] = v

        if args.brokers:
            brokers = start_brokers(args.brokers)
            props['bootstrap_servers'] = ['{0}:{1}'.format(broker.host, broker.port)
                                          for broker in brokers]
            print('---> bootstrap_servers={0}'.format(props['bootstrap_servers']))
            print()

            print('-> Producing records')
            record = bytes(bytearray(args.record_size))
            producer = KafkaProducer(compression_type=args.fixture_compression, **props)
            for i in xrange(args.num_records):
                producer.send(topic=args.topic, value=record)
            producer.flush()
            producer.close()
            print('-> OK!')
            print()

        print('Initializing Consumer...')
        props['auto_offset_reset'] = 'earliest'
        if 'consumer_timeout_ms' not in props:
            props['consumer_timeout_ms'] = 10000
        props['metrics_sample_window_ms'] = args.stats_interval * 1000
        for k, v in props.items():
            print('---> {0}={1}'.format(k, v))
        consumer = KafkaConsumer(args.topic, **props)
        print('---> group_id={0}'.format(consumer.config['group_id']))
        print('---> report stats every {0} secs'.format(args.stats_interval))
        print('---> raw metrics? {0}'.format(args.raw_metrics))
        timer_stop = threading.Event()
        timer = StatsReporter(args.stats_interval, consumer, event=timer_stop,
                              raw_metrics=args.raw_metrics)
        timer.start()
        print('-> OK!')
        print()

        records = 0
        for msg in consumer:
            records += 1
            if records >= args.num_records:
                break

        print('Consumed {0} records'.format(records))
        timer_stop.set()
    except Exception:
        exc_info = sys.exc_info()
        traceback.print_exception(*exc_info)
        sys.exit(1)