import json

from kafka import KafkaProducer

import config  # local settings module providing KAFKA_SERVERS


def main():
    producer = KafkaProducer(bootstrap_servers=config.KAFKA_SERVERS,
                             retries=5,  # retry failed sends up to 5 times
                             value_serializer=lambda v: json.dumps(v).encode())
    headers = ['CMTE_ID', 'AMNDT_IND', 'RPT_TP', 'TRANSACTION_PGI', 'IMAGE_NUM',
               'TRANSACTION_TP', 'ENTITY_TP', 'NAME', 'CITY', 'STATE', 'ZIP_CODE',
               'EMPLOYER', 'OCCUPATION', 'TRANSACTION_DT', 'TRANSACTION_AMT',
               'OTHER_ID', 'TRAN_ID', 'FILE_NUM', 'MEMO_CD', 'MEMO_TEXT', 'SUB_ID']
    with open('/home/ubuntu/2016/by_date/itcont_2016_10151005_20150726.txt') as f:
        data = [line.split('|') for line in f]
    for row in data:
        record = {h: x for h, x in zip(headers, row)}
        producer.send('data', record)
    producer.flush()
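# A minimal sketch (not from the original) of observing per-record delivery
# results with kafka-python: producer.send() returns a future that accepts
# success/error callbacks. 'producer' and 'record' are the names used in the
# loop above; on_success/on_error are hypothetical helpers.
from kafka.errors import KafkaError

def on_success(record_metadata):
    # record_metadata carries the topic, partition and offset of the write
    print(record_metadata.topic, record_metadata.partition, record_metadata.offset)

def on_error(exc):
    print('send failed:', exc)

future = producer.send('data', record)
future.add_callback(on_success)
future.add_errback(on_error)

# Alternatively, block on a single send and surface any KafkaError directly:
try:
    metadata = future.get(timeout=10)
except KafkaError as exc:
    print('send failed:', exc)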
import logging

from kafka import KafkaProducer


class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic, **kwargs):
        logging.Handler.__init__(self)
        self.kafka_topic_name = topic
        self.producer = KafkaProducer(bootstrap_servers=hosts_list)

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            msg = msg.encode()
            self.producer.send(self.kafka_topic_name, msg)
            self.producer.flush()
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self.handleError(record)

    def close(self):
        if self.producer is not None:
            self.producer.close()
        logging.Handler.close(self)
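# A minimal usage sketch, assuming a local broker and an 'app-logs' topic
# (both hypothetical): attach the handler to a logger so formatted records
# are shipped to Kafka.
logger = logging.getLogger('myapp')  # hypothetical logger name
handler = KafkaLoggingHandler(['localhost:9092'], 'app-logs')
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info('handler attached')  # this record is sent to Kafka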
import time

import boto3
from kafka import KafkaProducer


def run(self):
    # 'DNS from master' is a placeholder left in the original; substitute
    # the master node's hostname
    producer = KafkaProducer(bootstrap_servers='DNS from master:9092')
    for i in x:  # x: iterable of values to send, defined elsewhere
        producer.send('my_topic', str(i).encode())  # encode each value as bytes
        producer.flush()
        time.sleep(0.5)
    while not self.stop_event.is_set():
        s3 = boto3.client('s3')  # low-level functional API
        my_bucket = resource.Bucket(BUCKET_NAME)  # resource and BUCKET_NAME defined elsewhere
        for datum in docs:  # docs: records to forward, defined elsewhere
            producer.send('docs', datum)
        time.sleep(1)
    producer.close()
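# A minimal sketch (assumption, not from the original) of the host object
# run() expects: a Thread subclass whose stop_event lets another thread end
# the polling loop cleanly. ProducerWorker is a hypothetical name; run()
# above would become its run method.
import threading

class ProducerWorker(threading.Thread):
    def __init__(self):
        super().__init__()
        self.stop_event = threading.Event()

    def stop(self):
        self.stop_event.set()

# worker = ProducerWorker(); worker.start()
# ... later: worker.stop(); worker.join()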
def main(): producer = KafkaProducer(bootstrap_servers=["localhost:9092"],value_serializer= lambda v: json.dumps(v).encode()) headers = ['CMTE_ID', 'AMNDT_IND', 'RPT_TP', 'TRANSACTION_PGI', 'IMAGE_NUM', \ 'TRANSACTION_TP','ENTITY_TP','NAME', 'CITY', 'STATE', 'ZIP_CODE', \ 'EMPLOYER', 'OCCUPATION', 'TRANSACTION_DT','TRANSACTION_AMT', \ 'OTHER_ID', 'TRAN_ID', 'FILE_NUM', 'MEMO_CD', 'MEMO_TEXT', 'SUB_ID' ] split_counter = len(glob.glob('/home/ubuntu/manip_data/split_*')) for j in range(1): for i in range(split_counter): with open('/home/ubuntu/manip_data/split_'+file_number(i)) as f: reader = csv.reader(f, delimiter='|') for row in reader: row = {h:x for h,x in zip(headers,row)} producer.send('datatwo', row) producer.flush() producer.close()
import io

import avro.schema
from avro.io import BinaryEncoder, DatumWriter
from kafka import KafkaProducer


def send_avro_record_to_kafka(topic, value, bootstrap_servers, avro_schema_json):
    value_schema = avro.schema.parse(avro_schema_json)
    producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

    # Avro-encode the record into an in-memory buffer
    writer = DatumWriter(value_schema)
    bytes_writer = io.BytesIO()
    encoder = BinaryEncoder(bytes_writer)
    writer.write(value, encoder)

    try:
        producer.send(topic=topic, value=bytes_writer.getvalue())
    except Exception as e:
        print(f"Exception while producing record value - {value} to topic - {topic}: {e}")
    else:
        print(f"Successfully produced record value - {value} to topic - {topic}")
    producer.flush()
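# A minimal counterpart sketch (assumption, not from the original): decoding
# one of the messages produced above with the same schema, using avro's
# BinaryDecoder/DatumReader.
from avro.io import BinaryDecoder, DatumReader

def decode_avro_record(raw_bytes, avro_schema_json):
    schema = avro.schema.parse(avro_schema_json)
    decoder = BinaryDecoder(io.BytesIO(raw_bytes))
    return DatumReader(schema).read(decoder)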
import json

from kafka import KafkaProducer
from sqlalchemy import MetaData, and_, create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

# GOBModel, EventDataBuilder, get_relations_for_collection, Base,
# LastSentEvent, _to_bytes, KAFKA_TOPIC and the *_CONFIG dicts come from
# the surrounding application.


class KafkaEventProducer:
    FLUSH_PER = 10000
    gobmodel = GOBModel()

    def __init__(self, catalogue: str, collection: str, logger):
        self.catalogue = catalogue
        self.collection = collection
        self.logger = logger
        self.gob_db_session = None
        self.db_session = None
        self.gob_db_base = None
        self.Event = None
        self.producer = None
        self.total_cnt = 0
        self._init_connections()
        self.event_builder = EventDataBuilder(self.gob_db_session, self.gob_db_base,
                                              catalogue, collection)

    def _get_tables_to_reflect(self):
        """Returns tables to reflect:
        - events
        - object table (e.g. gebieden_buurten)
        - relation tables (e.g. rel_gb_brt_gbd_wijk_ligt_in_wijk, ...)

        :return:
        """
        relations = get_relations_for_collection(self.gobmodel, self.catalogue,
                                                 self.collection)
        relation_tables = [self.gobmodel.get_table_name('rel', rel_table)
                           for rel_table in relations.values()]
        return ['events',
                self.gobmodel.get_table_name(self.catalogue, self.collection)] + relation_tables

    def _init_gob_db_session(self):
        """Inits db session for gob db (to access events)

        :return:
        """
        engine = create_engine(URL(**GOB_DATABASE_CONFIG),
                               connect_args={'sslmode': 'require'})
        self.gob_db_session = Session(engine)
        meta = MetaData()
        meta.reflect(engine, only=self._get_tables_to_reflect())
        base = automap_base(metadata=meta)
        base.prepare()
        self.Event = base.classes.events
        self.gob_db_base = base
        self.logger.info("Initialised events storage")

    def _init_local_db_session(self):
        """Inits db session for local (gob_kafka) db

        :return:
        """
        engine = create_engine(URL(**DATABASE_CONFIG),
                               connect_args={'sslmode': 'require'})
        Base.metadata.bind = engine
        self.db_session = Session(engine)

    def _init_kafka(self):
        self.producer = KafkaProducer(
            **KAFKA_CONNECTION_CONFIG,
            # With retries, max_in_flight should always be 1 to ensure
            # ordering of batches!
            max_in_flight_requests_per_connection=1,
            retries=3)
        self.logger.info("Initialised Kafka connection")

    def _init_connections(self):
        self._init_gob_db_session()
        self._init_local_db_session()
        self._init_kafka()

    def _get_last_event(self):
        last_event = self.db_session \
            .query(LastSentEvent) \
            .filter_by(catalogue=self.catalogue, collection=self.collection) \
            .first()
        return last_event

    def _get_last_eventid(self):
        last_event = self._get_last_event()
        return last_event.last_event if last_event else -1

    def _set_last_eventid(self, eventid: int):
        last_event = self._get_last_event()
        if last_event:
            last_event.last_event = eventid
        else:
            last_event = LastSentEvent(catalogue=self.catalogue,
                                       collection=self.collection,
                                       last_event=eventid)
            self.db_session.add(last_event)
        self.db_session.commit()

    def _get_events(self, min_eventid: int):
        return self.gob_db_session \
            .query(self.Event) \
            .yield_per(10000) \
            .filter(and_(self.Event.catalogue == self.catalogue,
                         self.Event.entity == self.collection,
                         self.Event.eventid > min_eventid)) \
            .order_by(self.Event.eventid.asc())

    def _add_event(self, event):
        header = {
            'event_type': event.action,
            'event_id': event.eventid,
            'tid': event.tid,
            'catalog': event.catalogue,
            'collection': event.entity,
        }
        headers = [(k, _to_bytes(str(v)) if v else b'') for k, v in header.items()]
        data = self.event_builder.build_event(event.tid)
        self.producer.send(KAFKA_TOPIC,
                           key=_to_bytes(header['tid']),
                           value=_to_bytes(json.dumps(data)),
                           headers=headers)

    def _flush(self, last_eventid: int):
        self.producer.flush(timeout=120)
        self._set_last_eventid(last_eventid)
        print(f"Flushed Kafka events. Total events: {self.total_cnt}. "
              f"Last event id: {last_eventid}")

    def produce(self):
        last_eventid = self._get_last_eventid()
        self.logger.info(f"Start producing. Last event was {last_eventid}")
        events = self._get_events(last_eventid)
        for event in events:
            self._add_event(event)
            self.total_cnt += 1
            last_eventid = event.eventid
            self.gob_db_session.expunge(event)
            if self.total_cnt % self.FLUSH_PER == 0:
                self._flush(last_eventid)
        self._flush(last_eventid)
        self.logger.info(f"Produced {self.total_cnt} Kafka events")
from kafka.producer import KafkaProducer
import json
import csv
from time import sleep

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))

with open('blacklist.csv') as file:
    reader = csv.DictReader(file, delimiter=",")
    for row in reader:
        producer.send(topic='blacklist', value=row)
        producer.flush()
        sleep(1)
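# A minimal consumer-side sketch (assumption, not from the original):
# reading the same 'blacklist' topic back, mirroring the JSON
# value_serializer above with a value_deserializer.
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'blacklist',
    bootstrap_servers='localhost:9092',
    auto_offset_reset='earliest',
    value_deserializer=lambda v: json.loads(v.decode('utf-8')))

for message in consumer:
    print(message.offset, message.value)  # message.value is the original dict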
import asyncio
from datetime import datetime
from time import time, mktime, strptime

from kafka import KafkaConsumer, KafkaProducer

# MESSAGE_SEGMENT_END_BYTE (the HL7 segment separator) is defined elsewhere.


class AIMSDownsamplingTCPServerConsumer:
    LOG_FORMAT = "{} UTC_TS\t" \
                 "{}"
    INTERVAL = 60
    DELAY = 0
    MAX_CONNECTION = 32

    def __init__(self, kafka_host, kafka_port, tcp_host, tcp_port,
                 topic, log_topic):
        self.kafka_host = kafka_host
        self.kafka_port = kafka_port
        self.tcp_host = tcp_host
        self.tcp_port = tcp_port
        self.topic = topic
        self.log_topic = log_topic
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)],
            enable_auto_commit=False,
            max_poll_records=1024 * 1024,
            max_partition_fetch_bytes=1024 * 1024 * 100)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)])
        self.connections = {}
        self.sample_end_time = self.get_end_time(time())
        self.lastPolled = []

    def run(self):
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        tcpServer = await asyncio.start_server(self.connection_handler,
                                               self.tcp_host, self.tcp_port)
        await asyncio.gather(tcpServer.serve_forever(),
                             self.poll_from_kafka())

    async def connection_handler(self, reader, writer):
        addr = str(writer.get_extra_info("peername"))

        # A new connection, but we can accept no more
        if addr not in self.connections and \
           len(self.connections) >= self.MAX_CONNECTION:
            self.refuse_client(addr, writer)
            return

        # Add connection
        self.add_client(addr, writer)

        # Read data from connection
        try:
            while True:
                data = await reader.read(1)  # 1024*8 bytes
                if not data:
                    break
        except BrokenPipeError:
            """
            Catches connection reset by peer when we are sending the
            batched data, which is also when we cannot check for reader.
            The broken connection on the writer side will ultimately
            lead to BrokenPipeError on the reader side. Hence it is
            caught and ignored here.
            """
            pass
        finally:
            self.remove_client(addr)

    async def poll_from_kafka(self):
        polled = self.consumer.poll(timeout_ms=self.INTERVAL * 1000 / 2)
        self.lastPolled = polled
        while True:
            t = time()
            if t >= self.sample_end_time + self.DELAY:
                polled = self.consumer.poll(timeout_ms=self.INTERVAL * 1000 / 2)
                lastPolled = self.lastPolled
                start_time = self.sample_end_time - self.INTERVAL
                end_time = self.sample_end_time
                self.lastPolled = polled
                self.sample_end_time = self.get_end_time(time())
                if len(self.connections) != 0:
                    # run on lastPolled first to hit the cache
                    parsed_records = self.get_parsed_records(lastPolled) + \
                                     self.get_parsed_records(polled)
                    parsed_records = list(
                        filter(lambda rec: rec["observation_date_time"] is not None,
                               parsed_records))
                    ds_records = self.down_sample(parsed_records,
                                                  start_time, end_time)
                    messages = [rec["message"] for rec in ds_records]
                    for addr in list(self.connections.keys()):
                        await self.send_or_ignore_message(addr, messages)
            await asyncio.sleep(0.1)

    def get_parsed_records(self, polled):
        # Create cache
        if "_get_parsed_records__polled" not in self.__dict__:
            self._get_parsed_records__polled = []
        if "_get_parsed_records__ret" not in self.__dict__:
            self._get_parsed_records__ret = []

        # Cache hit
        if self._get_parsed_records__polled == polled:
            return self._get_parsed_records__ret

        # Cache miss
        self._get_parsed_records__polled = polled
        self._get_parsed_records__ret = []
        records = []
        for recordList in polled.values():
            records.extend([rec.value for rec in recordList])
        for rec in records:
            self._get_parsed_records__ret.append(self.parse_hl7(rec))
        return self._get_parsed_records__ret

    def parse_hl7(self, message):
        segments = message.decode(errors="ignore") \
            .strip() \
            .split(MESSAGE_SEGMENT_END_BYTE)
        location = None
        date_time = None
        observation_types = []
        observation_type = None
        for seg in segments:
            fields = seg.split('|')
            if fields[0] == "PV1":
                try:
                    location = fields[3]
                except IndexError:
                    pass
            if fields[0] == "OBR":
                try:
                    date_time = mktime(strptime(fields[7], "%Y%m%d%H%M%S"))
                except IndexError:
                    pass
            if fields[0] == "OBX":
                try:
                    observation_types.append(fields[13])
                except IndexError:
                    observation_types.append(None)
        observation_type_set = set(observation_types)
        if len(observation_type_set) == 1 and \
           "APERIODIC" in observation_type_set:
            observation_type = "aperiodic"
        if len(observation_type_set) == 1 and \
           None in observation_type_set:
            observation_type = "default"
        return {
            "assigned_patient_location": location,
            "observation_date_time": date_time,
            "observation_type": observation_type,
            "message": message
        }

    def down_sample(self, parsed_records, start_time, end_time):
        records = []
        tmp = {}
        sorted_records = sorted(parsed_records,
                                key=lambda rec: rec["observation_date_time"])
        for rec in sorted_records:
            date_time = rec["observation_date_time"]
            location = rec["assigned_patient_location"]
            observation_type = rec["observation_type"]
            if date_time < start_time or \
               date_time >= end_time:
                continue
            # keep only the latest record per (location, observation type)
            tmp[location] = tmp.get(location, {})
            tmp[location][observation_type] = rec
        for d in tmp.values():
            for rec in d.values():
                records.append(rec)
        return records

    def log(self, msg):
        self.producer.send(
            self.log_topic,
            self.LOG_FORMAT.format(datetime.now().timestamp(), msg).encode())

    def get_end_time(self, current_time):
        interval = self.INTERVAL
        return current_time - current_time % interval + interval

    async def send_or_ignore_message(self, addr, messages):
        writer = self.connections[addr]
        try:
            for msg in messages:
                writer.write(msg)
                await writer.drain()
        except ConnectionResetError:
            """
            The error is not thrown reliably. If a connection is broken
            and one tries to
                writer.write(record)
                await writer.drain()
            this error may not manifest. It is thrown more often when one
            tries to repeatedly write to and drain a broken connection.
            """
            self.remove_client(addr)

    def refuse_client(self, addr, writer):
        self.log("{} refused".format(addr))
        writer.close()

    def add_client(self, addr, writer):
        if addr not in self.connections:
            self.log("{} accepted".format(addr))
            self.connections[addr] = writer
        else:
            self.remove_client(addr)
            self.add_client(addr, writer)  # re-register with the new writer

    def remove_client(self, addr):
        if addr in self.connections:
            self.log("{} closed".format(addr))
            writer = self.connections.pop(addr)
            try:
                writer.close()
            except ConnectionResetError:
                pass

    def cleanup(self):
        self.log("shutdown")
        # copy the keys: remove_client mutates self.connections while we iterate
        for addr in list(self.connections.keys()):
            self.remove_client(addr)
        self.producer.flush()
        self.producer.close()
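# A minimal usage sketch (assumption, not from the original); the hosts,
# ports and topic names are illustrative values only.
aims_consumer = AIMSDownsamplingTCPServerConsumer(
    kafka_host='localhost', kafka_port=9092,
    tcp_host='0.0.0.0', tcp_port=9000,
    topic='aims-hl7', log_topic='aims-log')
try:
    aims_consumer.run()  # blocks: serves TCP clients and polls Kafka
finally:
    aims_consumer.cleanup()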
import asyncio
from datetime import datetime

from kafka import KafkaConsumer, KafkaProducer, TopicPartition


class IBUSStreamingDownsamplingConsumer:
    LOG_FORMAT = "{} UTC_TS\t" \
                 "{}"

    def __init__(self, kafkaHost, kafkaPort, tcpHost, tcpPort,
                 group_id, topic, logTopic, interval):
        self.kafkaHost = kafkaHost
        self.kafkaPort = kafkaPort
        self.tcpHost = tcpHost
        self.tcpPort = tcpPort
        self.group_id = group_id
        self.topic = topic
        self.logTopic = logTopic
        self.interval = int(interval)
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)],
            group_id=group_id,
            enable_auto_commit=False)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)])
        self.tcpWriter = None

    def getTopicPartitions(self):
        self.consumer.topics()  # this ensures the local cache is updated with
                                # information about partitions, offsets etc.
        pids = self.consumer.partitions_for_topic(self.topic)
        tps = [TopicPartition(self.topic, pid) for pid in pids]
        return tps

    def getTopicPartitionsCommittedPositions(self):
        tps = self.getTopicPartitions()
        ret = [(tp, self.consumer.committed(tp)) for tp in tps]
        return ret

    async def tcp_server_handler(self, reader, writer):
        addr = str(writer.get_extra_info("socket").getpeername())
        if self.tcpWriter is not None:
            self.log("refused " + addr)
            writer.write(b"Connection limit reached; connection refused.")
            writer.close()
            return
        self.log("accepted " + addr)
        self.tcpWriter = writer

        t1 = asyncio.create_task(self.poll_from_Kafka(writer))
        try:
            while True:
                data = await reader.read(1)  # 1024*16 bytes
                if not data:
                    break
        except BrokenPipeError:
            """
            Catches connection reset by peer when we are sending the
            batched data, which is also when we cannot check for reader.
            The broken connection on the writer side will ultimately
            lead to BrokenPipeError on the reader side. Hence it is
            caught and ignored here.
            """
            pass
        finally:
            t1.cancel()
            self.log("closed " + addr)
            writer.close()
            self.tcpWriter = None

    async def poll_from_Kafka(self, writer):
        while True:
            prevPos = self.getTopicPartitionsCommittedPositions()
            polled = self.consumer.poll(timeout_ms=1000)
            records = [record.value
                       for recordList in polled.values()
                       for record in recordList]
            try:
                for record in records:
                    writer.write(record)
                    await writer.drain()
            except ConnectionResetError:
                """
                The error is not thrown reliably. If a connection is
                broken and one tries to
                    writer.write(record)
                    await writer.drain()
                this error may not manifest. It is thrown more often when
                one tries to repeatedly write to and drain a broken
                connection.
                """
                print("Last batch not fully sent, not committed.")
                for tp, pos in prevPos:
                    self.consumer.seek(tp, pos)
                break
            else:
                self.consumer.commit()
            await asyncio.sleep(self.interval)

    def log(self, msg):
        self.producer.send(
            self.logTopic,
            self.LOG_FORMAT.format(datetime.now().timestamp(), msg).encode())

    def cleanup(self):
        self.log("shutdown")
        self.consumer.close()
        self.producer.flush()
        self.producer.close()

    def run(self):
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        tcpServer = await asyncio.start_server(self.tcp_server_handler,
                                               self.tcpHost, self.tcpPort)
        await tcpServer.serve_forever()
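# A minimal usage sketch (assumption, not from the original); the hosts,
# ports, group id and topic names are illustrative values only.
ibus_consumer = IBUSStreamingDownsamplingConsumer(
    kafkaHost='localhost', kafkaPort=9092,
    tcpHost='0.0.0.0', tcpPort=9001,
    group_id='ibus-downsampler', topic='ibus-stream',
    logTopic='ibus-log', interval=10)
try:
    ibus_consumer.run()  # blocks: forwards each polled batch to one TCP client
finally:
    ibus_consumer.cleanup()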