class Producer():

    def __init__(self):
        self.producer = KafkaProducer(
            bootstrap_servers=["52.41.44.90:9092",
                               "52.36.206.57:9092",
                               "52.40.205.225:9092"],
            acks=0,
            linger_ms=500)

    def produce_msgs(self, msg_list):
        while True:
            index = random.randrange(0, 999)
            json_msg = json.dumps(msg_list[index]).encode('utf-8')
            self.producer.send(topic, json_msg)
class Producer():

    def __init__(self):
        self.producer = KafkaProducer(
            bootstrap_servers=["52.41.44.90:9092",
                               "52.36.206.57:9092",
                               "52.40.205.225:9092"],
            acks=0,
            linger_ms=500)

    def produce_msgs(self, msg_list):
        while True:
            index = random.randrange(0, 999)
            info = person_pb2.PersonInfo()
            # keep a handle on the added sub-message so it can be serialized
            user = info.user.add()
            serialize_protobuf(user, msg_list[index])
            _msg = user.SerializeToString()
            self.producer.send(topic, _msg)
class Producer():

    def __init__(self):
        self.producer = KafkaProducer(bootstrap_servers='localhost:9092')

    def produce_msgs(self, source):
        for drop in source:
            if 'text' in drop:
                message = json.dumps(drop)
                # KafkaProducer expects bytes unless a value_serializer is configured
                self.producer.send('Twitter-Stream', message.encode('utf-8'))
                print(message)
                self.producer.send('message-size', str(len(message)).encode('utf-8'))
def on_status(self, status):
    print(status.text, "\n")
    #~ data = {}
    #~ data['text'] = status.text
    #~ data['created_at'] = status.created_at
    #~ data['geo'] = status.geo
    #~ data['source'] = status.source
    #~ self.db.Tweets.insert(data)
    msg = status.text.encode('utf-8')
    producer = KafkaProducer(bootstrap_servers='0.0.0.0:9092')
    #~ producer = KafkaProducer(bootstrap_servers=['broker1:1234'])
    #print(msg)
    try:
        producer.send('twitterstream', msg)
    except Exception as err:
        print(err)
        return False
    return True
def main():
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('nyc-tlc')

    # Iterates through all the objects, doing the pagination for you. Each obj
    # is an ObjectSummary, so it doesn't contain the body. You'll need to call
    # get to get the whole body.
    kafka_params = config('kafka')
    dataset_params = config('dataset')
    producer = KafkaProducer(bootstrap_servers=kafka_params['broker'])

    for obj in bucket.objects.all():
        key = obj.key
        print(key)
        if dataset_params['driver'] not in key:
            continue

        # building absolute file name
        file_name = 's3://nyc-tlc/' + key
        # skipping header
        firstline = True

        # Processing each row in file
        for line in smart_open(file_name):
            # print(line.decode('utf8'))
            if firstline:  # skip first line
                firstline = False
                continue
            line_split = line.decode('utf8').split(",")
            if len(line_split) < 20:
                # skipping rows with too few columns
                continue
            if line_split[5] == '0' or line_split[6] == '0' or \
                    line_split[7] == '0' or line_split[8] == '0':
                continue
            else:
                start_point = (float(line_split[5]), float(line_split[6]))
                end_point = (float(line_split[7]), float(line_split[8]))
                # print(start_point, end_point)
                intermediate_points = getEquidistantPoints(
                    start_point, end_point, 100)
                # print(intermediate_points)

                # message when trip is started
                trip_id = 'drive:' + str(datetime.now()) + ":" + str(
                    random.randint(1, 1000))
                formatted_message = format_message(trip_id, start_point,
                                                   start_point, end_point,
                                                   "New")
                producer.send(kafka_params['driver_topic'],
                              formatted_message.encode('utf8'))

                # Simulating moving car by sending intermediate points
                for int_point in intermediate_points:
                    # print(int_point)
                    formatted_message = format_message(trip_id, start_point,
                                                       int_point, end_point,
                                                       "In Progress")
                    producer.send(kafka_params['driver_topic'],
                                  formatted_message.encode('utf8'))

                # Ending the driver trip
                formatted_message = format_message(trip_id, start_point,
                                                   end_point, end_point,
                                                   "Closed")
                print(formatted_message.encode('utf8'))
                producer.send(kafka_params['driver_topic'],
                              formatted_message.encode('utf8'))
row_i = 0
time.sleep(config.PRODUCER_SLEEP_TIME)
#print('column:', column)
for row in df[column]:
    #print('row: ', row)
    #print('col:', column)
    #timestamp = datetime.now().strftime("%H%M%S%f")
    timestamp_s = float(datetime.now().strftime("%M")) * 60 + float(
        datetime.now().strftime("%S.%f"))
    longitude = row.split(',')[0]
    latitude = row.split(',')[1]
    #user_id = filenames[row_i]
    #message_to_send = str_fmt.format(users[row_i], timestamp, longitude, latitude, int(column==0))
    message_to_send = str_fmt.format(users[row_i], timestamp_s, longitude,
                                     latitude, int(column == 0))
    #print(users[row_i], message_to_send)
    producer.send(topic=config.KAFKA_TOPIC,
                  value=message_to_send,
                  key=users[row_i].encode('utf-8'))
    row_i += 1

end_time = float(datetime.now().strftime("%M")) * 60 + float(
    datetime.now().strftime("%S.%f"))
print('end time: ', end_time)
total_time = end_time - start_time
messages_per_second = (num_rows * num_files) / total_time
print('total_time: ', total_time)
print('messages_per_second: ', messages_per_second)
logger.info(
    "Start MongoDB Change Streams Service for table {}...".format(col_tmp))
with col.watch(pipeline, full_document='updateLookup') as stream:
    for change in stream:
        update_operations = list()
        #delete_operations = list()
        logger.info(change)
        msg = str(change.get('documentKey').get('_id')) + ',' + str(
            change.get('clusterTime').time)
        topic = change.get('ns').get('coll')
        producer = KafkaProducer(bootstrap_servers=['172.16.42.3:9092'])
        producer.send(topic,
                      key=bytes(json.dumps(change.get('ns')).encode('utf-8')),
                      value=bytes(json.dumps(msg).encode('utf-8')),
                      partition=0)
        producer.close()
        record = change.get('fullDocument')
        record.pop('_id')
        record.pop('createdTime')
        update_op = UpdateOne({'id': record['id']}, {
            '$set': record,
            '$setOnInsert': {
                'createdTime': datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
            }
        }, upsert=True)
        #delete_op = DeleteOne({'id': record['id']})
        update_operations.append(update_op)
import sys
import json

from twitter import OAuth, TwitterStream  # Python Twitter Tools
from kafka.producer import KafkaProducer

config = {
    'access_key': 'your_access_key_here',
    'access_secret': 'your_access_secret_key_here',
    'consumer_key': 'your_consumer_key_here',
    'consumer_secret': 'your_consumer_secret_key_here'
}

auth = OAuth(config["access_key"], config["access_secret"],
             config["consumer_key"], config["consumer_secret"])
stream = TwitterStream(domain='userstream.twitter.com', auth=auth, secure=True)

search_term = "coffee, cappuccino, espresso, frappuccino, mocha, \
    tea, matcha, chai, oolong, pu-erh, tisane, \
    milk, dairy, half-and-half, \
    soda, coke, cola, fanta, sprite, pepsi, Dr pepper, soft drink, \
    juice, oj, cider, \
    wine, riesling, merlot, syrah, chardonnay, sauvignon, pinot noir, \
    beer, brew, amber, ipa, bud light, budweiser, miller lite, corona extra, heineken, \
    liquor, sake, shochu, whisky, tequila, gin, cognac, rum"

tweet_iter = stream.statuses.filter(track=search_term, language='en')

producer = KafkaProducer(bootstrap_servers='your_aws_cluster_public_IP:9092')
for tweet in tweet_iter:
    print(tweet)
    producer.send('insight_topic', json.dumps(tweet).encode('utf-8'))
class MyKafkaProducer(object):
    """
    class that implements Kafka producers that ingest data from an S3 bucket
    """

    def __init__(self, kafka_configfile, schema_file, s3_configfile):
        """
        class constructor that initializes the instance according to the
        configurations of the S3 bucket and Kafka
        :type kafka_configfile: str   path to kafka config file
        :type schema_file     : str   path to schema file
        :type s3_configfile   : str   path to S3 config file
        """
        self.kafka_config = utility.parse_config(kafka_configfile)
        self.schema = utility.parse_config(schema_file)
        self.s3_config = utility.parse_config(s3_configfile)

        self.producer = KafkaProducer(
            bootstrap_servers=self.kafka_config["BROKERS_IP"])

    def get_key(self, msg):
        """
        produces the key for a message sent to the Kafka topic
        :type msg: dict   message for which to generate the key
        :rtype   : bytes  key (Kafka requires keys to be bytes)
        """
        msgwithkey = utility.add_block_fields(msg)
        if msgwithkey is None:
            return
        x, y = msgwithkey["block_lonid"], msgwithkey["block_latid"]
        return str((x * 137 + y) % 77703).encode()

    def produce_msgs(self):
        """
        produces messages and sends them to the topic
        """
        msg_cnt = 0
        while True:
            s3 = boto3.client('s3')
            # obj = s3.get_object(Bucket=self.s3_config["BUCKET"],
            #                     Key="{}/{}".format(self.s3_config["FOLDER"],
            #                                        self.s3_config["STREAMING_FILE"]))
            obj = s3.get_object(Bucket='nyctaxitrip',
                                Key="{}/{}".format('yellow_trip',
                                                   'yellow_tripdata_sample.csv'))
            for line in lazyreader.lazyread(obj['Body'], delimiter='\n'):
                message_info = line.strip()
                msg = utility.map_schema(message_info, self.schema)
                if msg is not None:
                    # serialize the dict to bytes before sending
                    self.producer.send(self.kafka_config["TOPIC"],
                                       value=json.dumps(msg).encode('utf-8'),
                                       key=self.get_key(msg))
                    msg_cnt += 1
                time.sleep(0.001)
class AIMSDownsamplingTCPServerConsumer:
    LOG_FORMAT = "{} UTC_TS\t" \
                 "{}"
    INTERVAL = 60
    DELAY = 0
    MAX_CONNECTION = 32

    def __init__(self, kafka_host, kafka_port, tcp_host, tcp_port, topic,
                 log_topic):
        self.kafka_host = kafka_host
        self.kafka_port = kafka_port
        self.tcp_host = tcp_host
        self.tcp_port = tcp_port
        self.topic = topic
        self.log_topic = log_topic
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)],
            enable_auto_commit=False,
            max_poll_records=1024 * 1024,
            max_partition_fetch_bytes=1024 * 1024 * 100)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)])
        self.connections = {}
        self.sample_end_time = self.get_end_time(time())
        self.lastPolled = []

    def run(self):
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        tcpServer = await asyncio.start_server(self.connection_handler,
                                               self.tcp_host, self.tcp_port)
        await asyncio.gather(tcpServer.serve_forever(),
                             self.poll_from_kafka())

    async def connection_handler(self, reader, writer):
        addr = str(writer.get_extra_info("peername"))

        # A new connection, but we can accept no more
        if addr not in self.connections and \
           len(self.connections) >= self.MAX_CONNECTION:
            self.refuse_client(addr, writer)
            return

        # Add connection
        self.add_client(addr, writer)

        # Read data from connection
        remaining_data = b""
        try:
            while True:
                data = await reader.read(1)  # 1024*8 bytes
                if not data:
                    break
        except BrokenPipeError:
            """
            Catches connection reset by peer when we are sending the batched
            data, which is also when we cannot check for reader. The broken
            connection on the writer side will ultimately lead to
            BrokenPipeError on the reader side. Hence we pass here and clean
            up in the finally clause.
            """
            pass
        finally:
            self.remove_client(addr)

    async def poll_from_kafka(self):
        polled = self.consumer.poll(timeout_ms=self.INTERVAL * 1000 / 2)
        self.lastPolled = polled
        while True:
            t = time()
            if t >= self.sample_end_time + self.DELAY:
                polled = self.consumer.poll(timeout_ms=self.INTERVAL * 1000 / 2)
                lastPolled = self.lastPolled
                start_time = self.sample_end_time - self.INTERVAL
                end_time = self.sample_end_time
                self.lastPolled = polled
                self.sample_end_time = self.get_end_time(time())
                if len(self.connections) != 0:
                    # run on lastPolled first to hit the cache
                    parsed_records = self.get_parsed_records(lastPolled) + \
                                     self.get_parsed_records(polled)
                    parsed_records = list(
                        filter(
                            lambda rec: rec["observation_date_time"] is not None,
                            parsed_records))
                    ds_records = self.down_sample(parsed_records, start_time,
                                                  end_time)
                    messages = [rec["message"] for rec in ds_records]
                    for addr in self.connections.keys():
                        await self.send_or_ignore_message(addr, messages)
            await asyncio.sleep(0.1)

    def get_parsed_records(self, polled):
        # Create cache
        if "_get_parsed_records__polled" not in self.__dict__:
            self._get_parsed_records__polled = []
        if "_get_parsed_records__ret" not in self.__dict__:
            self._get_parsed_records__ret = []

        # Cache hit
        if self._get_parsed_records__polled == polled:
            return self._get_parsed_records__ret

        # Cache not hit
        self._get_parsed_records__polled = polled
        self._get_parsed_records__ret = []
        records = []
        for recordList in polled.values():
            records.extend([rec.value for rec in recordList])
        for rec in records:
            self._get_parsed_records__ret.append(self.parse_hl7(rec))
        return self._get_parsed_records__ret

    def parse_hl7(self, message):
        segments = message.decode(errors="ignore") \
                          .strip() \
                          .split(MESSAGE_SEGMENT_END_BYTE)
        location = None
        date_time = None
        observation_types = []
        observation_type = None
        for seg in segments:
            fields = seg.split('|')
            if fields[0] == "PV1":
                try:
                    location = fields[3]
                except IndexError:
                    pass
            if fields[0] == "OBR":
                try:
                    date_time = mktime(strptime(fields[7], "%Y%m%d%H%M%S"))
                except IndexError:
                    pass
            if fields[0] == "OBX":
                try:
                    observation_types.append(fields[13])
                except IndexError:
                    observation_types.append(None)
        observation_type_set = set(observation_types)
        if len(observation_type_set) == 1 and \
           "APERIODIC" in observation_type_set:
            observation_type = "aperiodic"
        if len(observation_type_set) == 1 and \
           None in observation_type_set:
            observation_type = "default"
        return {
            "assigned_patient_location": location,
            "observation_date_time": date_time,
            "observation_type": observation_type,
            "message": message
        }

    def down_sample(self, parsed_records, start_time, end_time):
        records = []
        tmp = {}
        sorted_records = sorted(parsed_records,
                                key=lambda rec: rec["observation_date_time"])
        for rec in sorted_records:
            date_time = rec["observation_date_time"]
            location = rec["assigned_patient_location"]
            observation_type = rec["observation_type"]
            if date_time < start_time or \
               date_time >= end_time:
                continue
            # keep only the latest record per (location, observation_type)
            tmp[location] = tmp.get(location, {})
            tmp[location][observation_type] = rec
        for d in tmp.values():
            for rec in d.values():
                records.append(rec)
        return records

    def log(self, msg):
        self.producer.send(
            self.log_topic,
            self.LOG_FORMAT.format(
                datetime.now().timestamp(),
                msg
            ).encode()
        )

    def get_end_time(self, current_time):
        interval = self.INTERVAL
        return current_time - current_time % interval + interval

    async def send_or_ignore_message(self, addr, messages):
        writer = self.connections[addr]
        try:
            for msg in messages:
                writer.write(msg)
                await writer.drain()
        except ConnectionResetError:
            """
            The error is not thrown reliably. If a connection is broken, and
            one tries to
                writer.write(record)
                await writer.drain()
            this error may not manifest. It is thrown more often when one
            tries to repeatedly write to and drain a broken connection.
            """
            self.remove_client(addr)

    def refuse_client(self, addr, writer):
        self.log("{} refused".format(addr))
        writer.close()

    def add_client(self, addr, writer):
        if addr not in self.connections:
            self.log("{} accepted".format(addr))
            self.connections[addr] = writer
        else:
            self.remove_client(addr)
            self.add_client(addr, writer)

    def remove_client(self, addr):
        if addr in self.connections:
            self.log("{} closed".format(addr))
            writer = self.connections.pop(addr)
            try:
                writer.close()
            except ConnectionResetError:
                pass

    def cleanup(self):
        self.log("shutdown")
        for addr in self.connections.keys():
            self.remove_client(addr)
        self.producer.flush()
        self.producer.close()
from kafka.producer import KafkaProducer

bootstrap_servers = ['localhost:9092']
topicName = 'myTopic'

producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

ack = producer.send(topicName, b'Hello World!!!!!!!!')
metadata = ack.get()
print(metadata.topic)
print(metadata.partition)
          encoding='utf8') as s3_taxi_data:
    read_taxi = csv.reader(s3_taxi_data, delimiter=',')
    next(read_taxi, None)
    for line in read_taxi:
        # data cleanup; to reduce the string length of taxi id
        key_string = line[1][:10]
        line_string = ''
        # to skip invalid data
        if not line[0] or not line[1] or not line[2] or not line[3] or not line[4]:
            continue
        for i in range(5):
            if i == 0 or i == 1:
                line[i] = line[i][:10]
            # to change the time format to be recognized in KSQL
            if i == 2 or i == 3:
                line[i] = pd.to_datetime(line[i])
                line[i] = line[i].strftime("%Y-%m-%d-%H:%M")
            if i == 4:
                delimiter = ''
            else:
                delimiter = ','
            line_string = line_string + ''.join(line[i]) + delimiter
        print(key_string)
        print(line_string)
        producer.send('topic_fatigue', value=line_string, key=key_string)
        # optional; to control ingestion rate
        # time.sleep(1)
# Get observations from AoT
observations = client.list_observations(filters=f)

# Iterate through records
try:
    for page in observations:
        print(f'Page {page_num}')
        # data_stream = []
        for obs in page.data:
            ts = ciso8601.parse_datetime(obs["timestamp"])
            prev_record_timestamp = obs["timestamp"]
            data_stream = {
                'ts': int(time.mktime(ts.timetuple())),
                'node_id': obs["node_vsn"],
                'sensor_path': obs["sensor_path"],
                'value_hrf': obs["value"]
            }
            producer.send(topic, value=data_stream)
        # Block until all the messages have been sent
        producer.flush()
        page_num += 1
except (Exception, HTTPError) as error:
    print(error)
finally:
    # Write latest processed timestamp to the file
    fh = open("state.txt", "w+")
    fh.write(prev_record_timestamp)
    print(prev_record_timestamp)
    fh.close()
import time
import json
import boto3
from kafka.producer import KafkaProducer

if __name__ == '__main__':
    s3 = boto3.client('s3')
    producer = KafkaProducer(bootstrap_servers="127.0.0.1:9092")
    obj = s3.get_object(Bucket='nyctaxitrip',
                        Key="{}/{}".format('yellow_trip',
                                           'yellow_tripdata_sample.csv'))
    # read s3 csv
    lines = str(obj['Body'].read())
    for line in lines.split("\\n"):
        print(json.dumps(line))
        producer.send(
            "new_topic",
            # encode the value so the Kafka consumer can receive the stream
            value=line.encode(),
            # encode the key so the Kafka consumer can receive the stream
            key=b'key')
        time.sleep(0.1)
# coding=utf-8
import logging

from kafka.producer import KafkaProducer

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    producer = KafkaProducer(bootstrap_servers=["192.168.120.90:9092"])
    producer.send("wangybnet", b"Hello, World!")
from kafka.producer import KafkaProducer
import ConfigParser
import socket

if __name__ == "__main__":

    config = ConfigParser.ConfigParser()
    config.read('configuration.cfg')
    urlKafkaProducer = config.get('StreamingProperties', 'URLKafkaProducer')
    topicName = config.get('StreamingProperties', 'TopicName')

    virtualMachine = 'local'
    if socket.gethostname() == 'ubuntu':
        virtualMachine = socket.gethostname()

    if virtualMachine == 'local':
        fileName = config.get('StreamingProperties', 'StreamingFileLocal')
    else:
        fileName = config.get('StreamingProperties', 'StreamingFileVirtual')

    producer = KafkaProducer(bootstrap_servers=urlKafkaProducer)

    infile = open(fileName, 'r')
    for line in infile:
        producer.send(topicName, line)
        #time.sleep(0.000000001)
    infile.close()
class IBUSStreamingDownsamplingConsumer:
    LOG_FORMAT = "{} UTC_TS\t" \
                 "{}"

    def __init__(self, kafkaHost, kafkaPort, tcpHost, tcpPort, group_id,
                 topic, logTopic, interval):
        self.kafkaHost = kafkaHost
        self.kafkaPort = kafkaPort
        self.tcpHost = tcpHost
        self.tcpPort = tcpPort
        self.group_id = group_id
        self.topic = topic
        self.logTopic = logTopic
        self.interval = int(interval)
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)],
            group_id=group_id,
            enable_auto_commit=False)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)])
        self.tcpWriter = None

    def getTopicPartitions(self):
        # This ensures the local cache is updated with information about
        # partitions, offsets etc.
        self.consumer.topics()
        pids = self.consumer.partitions_for_topic(self.topic)
        tps = [TopicPartition(self.topic, pid) for pid in pids]
        return tps

    def getTopicPartitionsCommittedPositions(self):
        tps = self.getTopicPartitions()
        ret = [(tp, self.consumer.committed(tp)) for tp in tps]
        return ret

    async def tcp_server_handler(self, reader, writer):
        addr = str(writer.get_extra_info("socket").getpeername())
        if self.tcpWriter is not None:
            self.log("refused " + addr)
            writer.write(b"Connection limit reached; connection refused.")
            writer.close()
            return
        self.log("accepted " + addr)
        self.tcpWriter = writer

        t1 = asyncio.create_task(self.poll_from_Kafka(writer))
        try:
            while True:
                data = await reader.read(1)  # 1024*16 bytes
                if not data:
                    break
        except BrokenPipeError:
            """
            Catches connection reset by peer when we are sending the batched
            data, which is also when we cannot check for reader. The broken
            connection on the writer side will ultimately lead to
            BrokenPipeError on the reader side. Hence we pass here and clean
            up in the finally clause.
            """
            pass
        finally:
            t1.cancel()
            self.log("closed " + addr)
            writer.close()
            self.tcpWriter = None

    async def poll_from_Kafka(self, writer):
        while True:
            prevPos = self.getTopicPartitionsCommittedPositions()
            polled = self.consumer.poll(timeout_ms=1000)
            records = [
                record.value for recordList in polled.values()
                for record in recordList
            ]
            try:
                for record in records:
                    writer.write(record)
                    await writer.drain()
            except ConnectionResetError:
                """
                The error is not thrown reliably. If a connection is broken,
                and one tries to
                    writer.write(record)
                    await writer.drain()
                this error may not manifest. It is thrown more often when one
                tries to repeatedly write to and drain a broken connection.
                """
                print("Last batch not fully sent, not committed.")
                for tp, pos in prevPos:
                    self.consumer.seek(tp, pos)
                break
            else:
                self.consumer.commit()
            await asyncio.sleep(self.interval)

    def log(self, msg):
        self.producer.send(
            self.logTopic,
            self.LOG_FORMAT.format(
                datetime.now().timestamp(),
                msg
            ).encode()
        )

    def cleanup(self):
        self.log("shutdown")
        self.consumer.close()
        self.producer.flush()
        self.producer.close()

    def run(self):
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        tcpServer = await asyncio.start_server(self.tcp_server_handler,
                                               self.tcpHost, self.tcpPort)
        await tcpServer.serve_forever()
class GenerateData:
    # __init__ (setting self.increment) and station_list() are defined elsewhere

    def run(self):
        DataId = self.increment
        MeterCode = random.choice(self.station_list())
        TransactionId = self.increment
        TransactionDateTime = datetime.now(pacific_time).strftime(fmt)
        Amount = random.choice([0.25, 0.5, 1, 1.5, 2, 3])
        PaymentMean = random.choice(['CREDIT CARD', 'PHONE', 'CASH'])
        max_duration_sec = 60 * 60 * 6
        PaidDuration = random.randint(1, max_duration_sec)
        ElementKey = MeterCode

        record = [
            DataId, MeterCode, TransactionId, TransactionDateTime, Amount, '',
            PaymentMean, PaidDuration, ElementKey, '2019', '4', ''
        ]
        data_send = ",".join(map(str, record))
        self.increment += 1
        print(data_send)

        key = str(MeterCode).encode()
        value = data_send.encode()
        return key, value


limit = 0
datagen = GenerateData()
while True:
    # send() takes (topic, value=None, key=None, ...), so pass key and value by keyword
    key, value = datagen.run()
    producer.send('paid-transaction', key=key, value=value)
    limit += 1
    time.sleep(.1)
from time import sleep
from kafka.producer import KafkaProducer
import json

producer = KafkaProducer(
    # set host and port that the producer should contact to bootstrap initial cluster metadata
    bootstrap_servers=['localhost:9092'],
    # how data should be serialized before sending to the broker
    # (convert the data to JSON and encode it as utf-8)
    value_serializer=lambda x: json.dumps(x).encode('utf-8'))

for e in range(1000):
    # key:value pairs to send (nb: this is not the topic key).
    # Use a key for hashed partitioning.
    data = {'number': e}
    future = producer.send('numtest', value=data)
    # How to make sure the message is received by the broker?
    print(f'sent {data}')
    sleep(5)  # option 1: take a break
    # result = future.get(timeout=60)  # option 2: block until a single message is sent (or timeout)
    # option 3: block until all pending messages are at least put on the network.
    # This does not guarantee delivery or success! It is really only useful if
    # you configure internal batching using linger_ms.
    # producer.flush()
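# A minimal sketch (not part of the snippets above) of "option 2" from the
# previous example: block on the future returned by send() to confirm delivery
# and surface broker-side errors. It assumes a broker at localhost:9092 and a
# hypothetical topic 'numtest'; adjust both for your cluster.
import json

from kafka import KafkaProducer
from kafka.errors import KafkaError

producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda x: json.dumps(x).encode('utf-8'))

future = producer.send('numtest', value={'number': 0})
try:
    # get() blocks until the broker acknowledges the record or the timeout expires
    metadata = future.get(timeout=10)
    print(metadata.topic, metadata.partition, metadata.offset)
except KafkaError as err:
    # raised when the record could not be delivered, e.g. the broker is unreachable
    print('delivery failed:', err)
finally:
    producer.flush()
    producer.close()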
import sys
import time
import json
import boto3
import lazyreader
#import helpers
from kafka.producer import KafkaProducer

producer = KafkaProducer(bootstrap_servers=['localhost:9092'])

while True:
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket='nyctaxi-trip-data',
                        Key="{}/{}".format('test_data', 'test1.txt'))
    for line in lazyreader.lazyread(obj['Body'], delimiter='\n'):
        #message_info = line.strip()
        #msg = helpers.map_schema(message_info, self.schema)
        # data = {'number' : line}
        producer.send('read_s3', value=line)
        time.sleep(0.1)